In [37]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV

In [49]:
##############################################################
## READING IN TRAINING DATA THAT HAS BEEN PREVIOSLY CLEANED ##
##############################################################

# Creating two empty lists that are used to store the data that is read in.
x_train_list = []
y_train_list = []
# Iterating through the data for all the years.
for i in range(2000, 2021):
    # We ignore the year of 2004, as there seems to me something wrong with the formatting of the data in out dataset.
    if i != 2004:
        # Assembling in the name of the file that contains the x data that needs to be trained.
        file_name = "season_stats/east" + str(i) + ".csv"
        # Reading the x data from the filename created above.
        df_x = pd.read_csv(file_name, index_col=0)
        # Sorting the data according to the values in the TEAM column.
        df_x = df_x.sort_values("TEAM")
        # Resetting the index of the data.
        df_x = df_x.reset_index()
        # Dropping the TEAM column in the data.
        df_x = df_x.drop("TEAM", axis=1)
        # Dropping the extra index column in the data.
        df_x = df_x.drop("index", axis=1)
        # Appending the current round of data to the overall list holding all the x data.
        x_train_list.append(df_x)
        # Assembling in the name of the file that contains the y data that needs to be trained.
        file_name = "playoff_labels/east" + str(i) + "playoff.csv"
        # Reading the y data from the filename created above.
        df_y = pd.read_csv(file_name, index_col=0)
        # Sorting the data according to the values in the TEAM column.
        df_y = df_y.sort_values("TEAM")
        # Resetting the index of the data.
        df_y = df_y.reset_index()
        # Dropping the TEAM column in the data.
        df_y = df_y.drop("TEAM", axis=1)
        # Dropping the extra index column in the data.
        df_y = df_y.drop("index", axis=1)
        # Appending the current round of data to the overall list holding all the y data.
        y_train_list.append(df_y)

# Concatentating the list of x data to a dataframe to hold all the x data.
x_train = pd.concat(x_train_list)
x_train = x_train.drop("MATCHUP", axis=1)
# x_train = x_train.drop("WLPCT", axis=1)
# Concatentating the list of y data to a dataframe to hold all the y data.
y_train = pd.concat(y_train_list)

In [50]:
##############################
## READING IN THE TEST DATA ##
##############################

# Assembling in the name of the file that contains the x data that needs to be tested against.
x_test = pd.read_csv("season_stats/east2021.csv", index_col=0)
# Sorting the data according to the values in the TEAM column.
x_test = x_test.sort_values("TEAM")
# Resetting the index of the data.
x_test = x_test.reset_index()
# Dropping the extra index column in the data.
x_test = x_test.drop("index", axis=1)
x_test = x_test.drop("MATCHUP", axis=1)
# x_test = x_test.drop("WLPCT", axis=1)
# Assigning the test data to a variable that tells us that this is the prediction that should be made.
x_test_prediction = x_test
# Dropping the TEAM column in the data.
x_test = x_test.drop("TEAM", axis=1)
# Assembling in the name of the file that contains the y data that needs to be tested against.
y_test = pd.read_csv("playoff_labels/east2021playoff.csv", index_col=0)
# Sorting the data according to the values in the TEAM column.
y_test = y_test.sort_values("TEAM")
# Resetting the index of the data.
y_test = y_test.reset_index()
# Dropping the extra index column in the data.
y_test = y_test.drop("index", axis=1)
# Assigning the test data to a variable that tells us that this is the prediction that should be made.
y_test_prediction = y_test
# Dropping the TEAM column in the data.
y_test = y_test.drop("TEAM", axis=1)

In [51]:
##############################
## STANDARDIZING THE X DATA ##
##############################
# Creating a scaler object.
sc = StandardScaler()
# Scaling the x training data.
x_train_scaled = sc.fit_transform(x_train)
# Scaling the x test data.
x_test_scaled = sc.transform(x_test)

In [52]:
############################
## NORMALIZING THE X DATA ##
############################
# Normalizing the x scaled training data.
a = preprocessing.normalize(x_train_scaled, axis = 0)
# Creating a new dataframe to hold the saled, normalized x training data.
x_train_normalized = pd.DataFrame(a, columns = x_train.columns)
# Normalizing the x scaled testing data.
b = preprocessing.normalize(x_test_scaled, axis = 0)
# Creating a new dataframe to hold the saled, normalized x testing data.
x_test_normalized = pd. DataFrame (b, columns = x_test.columns)

In [55]:
clf = DecisionTreeClassifier(max_depth=12, max_features=8, min_samples_leaf=20)
# clf = DecisionTreeClassifier()
params = {'max_depth': [4, 6, 8, 10, 12], 'min_samples_leaf': [5, 10, 15, 20], 'max_features': [4, 6, 8, 10, 12]}
grid_search = GridSearchCV(clf, params, cv = 10, scoring = 'accuracy')
grid_search.fit(x_train_normalized, y_train)

print(grid_search.best_params_)
print("Accuracy:", grid_search.best_score_*100)

clf.fit(x_train_normalized, y_train)


{'max_depth': 8, 'max_features': 10, 'min_samples_leaf': 15}
Accuracy: 87.66666666666667


In [56]:
train_feature_names = x_train.columns

dt_importances = clf.feature_importances_
dt_importances = sorted(zip(dt_importances, train_feature_names), reverse=True)
for i in range (5):
    print (dt_importances[i])

(0.9143691659056865, 'WLPCT')
(0.05253063290553241, 'PM')
(0.013649039677921214, 'PTS')
(0.009926574311215393, 'FG3PCT')
(0.005128353537521887, 'FGM')


In [57]:
# Getting the score from the normalized x training and y trianing data.
east_dt_train_score = clf.score(x_train_normalized, y_train)
# east_logreg_train_score = east_logreg.score(x_train_normalized, y_train)
# Printing out this score.
print("Score for training data: " + str(east_dt_train_score))
# Getting the score from the normalized x testing and y testing data.
east_dt_test_score = clf.score(x_test_normalized, y_test)
# Printing out this score.
print("Score for testing data: " + str(east_dt_test_score))
print()
print("Predictions for which teams makes the playoffs for 2021 in the Eastern Conference")
print()
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
east_predictions_2021 = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
dt_probability = clf.predict_proba(x_test_normalized)[:, 1].tolist()
# rf_probability = rf.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
dt_prediction = clf.predict(x_test_normalized).tolist()
# Assinging the prediciotn value to the prediciotn colum in the list.
east_predictions_2021["PREDICTION"] = dt_prediction
# Assigning the probabilities to the column in the list..
east_predictions_2021["PROBABILITY"] = dt_probability
# Sorting the list according to the porbability values.
east_predictions_2021 = east_predictions_2021.sort_values("PROBABILITY", ascending=False)
print(east_predictions_2021)

Score for training data: 0.8833333333333333
Score for testing data: 0.8666666666666667

Predictions for which teams makes the playoffs for 2021 in the Eastern Conference

   TEAM  PLAYOFF  PREDICTION  PROBABILITY
0   ATL        1           1     1.000000
1   BKN        1           1     1.000000
2   BOS        1           1     1.000000
3   CHA        0           1     1.000000
8   MIA        1           1     1.000000
9   MIL        1           1     1.000000
10  NYK        1           1     1.000000
12  PHI        1           1     1.000000
4   CHI        0           0     0.181818
7   IND        0           0     0.181818
13  TOR        0           0     0.181818
6   DET        0           0     0.068966
11  ORL        0           0     0.068966
14  WAS        1           0     0.068966
5   CLE        0           0     0.000000
