In [1]:
# THIS IS THE MAIN FILE USED TO CREATE ALL MODELS
# We have so far implemented two models: Logistic Regression and SVM.

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
#####################################################
## TRAINED MODELS FOR PREDICTING THE 2022 PLAYOFFS ##
#####################################################


#####################################################
## PREDICTING THE 2022 EASTERN CONFERENCE PLAYOFFS ##
######################################################


##############################################################
## READING IN TRAINING DATA THAT HAS BEEN PREVIOSLY CLEANED ##
##############################################################

# Creating two empty lists that are used to store the data that is read in.
x_train_list = []
y_train_list = []
# Iterating through the data for all the years.
for i in range(2000, 2021):
    # We ignore the year of 2004, as there seems to me something wrong with the formatting of the data in out dataset.
    if i != 2004:
        # Assembling in the name of the file that contains the x data that needs to be trained.
        file_name = "season_stats/east" + str(i) + ".csv"
        # Reading the x data from the filename created above.
        df_x = pd.read_csv(file_name, index_col=0)
        # Sorting the data according to the values in the TEAM column.
        df_x = df_x.sort_values("TEAM")
        # Resetting the index of the data.
        df_x = df_x.reset_index()
        # Dropping the TEAM column in the data.
        df_x = df_x.drop("TEAM", axis=1)
        # Dropping the extra index column in the data.
        df_x = df_x.drop("index", axis=1)
        # Appending the current round of data to the overall list holding all the x data.
        x_train_list.append(df_x)
        # Assembling in the name of the file that contains the y data that needs to be trained.
        file_name = "playoff_labels/east" + str(i) + "playoff.csv"
        # Reading the y data from the filename created above.
        df_y = pd.read_csv(file_name, index_col=0)
        # Sorting the data according to the values in the TEAM column.
        df_y = df_y.sort_values("TEAM")
        # Resetting the index of the data.
        df_y = df_y.reset_index()
        # Dropping the TEAM column in the data.
        df_y = df_y.drop("TEAM", axis=1)
        # Dropping the extra index column in the data.
        df_y = df_y.drop("index", axis=1)
        # Appending the current round of data to the overall list holding all the y data.
        y_train_list.append(df_y)

# Concatentating the list of x data to a dataframe to hold all the x data.
x_train = pd.concat(x_train_list)
x_train = x_train.drop("MATCHUP", axis=1)
x_train = x_train.drop("WLPCT", axis=1)
# Concatentating the list of y data to a dataframe to hold all the y data.
y_train = pd.concat(y_train_list)

In [3]:
##############################
## READING IN THE TEST DATA ##
##############################

# Assembling in the name of the file that contains the x data that needs to be tested against.
x_test = pd.read_csv("season_stats/east2022.csv", index_col=0)
# Sorting the data according to the values in the TEAM column.
x_test = x_test.sort_values("TEAM")
# Resetting the index of the data.
x_test = x_test.reset_index()
# Dropping the extra index column in the data.
x_test = x_test.drop("index", axis=1)
x_test = x_test.drop("MATCHUP", axis=1)
x_test = x_test.drop("WLPCT", axis=1)
# Assigning the test data to a variable that tells us that this is the prediction that should be made.
x_test_prediction = x_test
# Dropping the TEAM column in the data.
x_test = x_test.drop("TEAM", axis=1)
# Assembling in the name of the file that contains the y data that needs to be tested against.
y_test = pd.read_csv("playoff_labels/east2022playoff.csv", index_col=0)
# Sorting the data according to the values in the TEAM column.
y_test = y_test.sort_values("TEAM")
# Resetting the index of the data.
y_test = y_test.reset_index()
# Dropping the extra index column in the data.
y_test = y_test.drop("index", axis=1)
# Assigning the test data to a variable that tells us that this is the prediction that should be made.
y_test_prediction = y_test
# Dropping the TEAM column in the data.
y_test = y_test.drop("TEAM", axis=1)

In [4]:
##############################
## STANDARDIZING THE X DATA ##
##############################
# Creating a scaler object.
sc = StandardScaler()
# Scaling the x training data.
x_train_scaled = sc.fit_transform(x_train)
# Scaling the x test data.
x_test_scaled = sc.transform(x_test)

In [5]:
############################
## NORMALIZING THE X DATA ##
############################
# Normalizing the x scaled training data.
a = preprocessing.normalize(x_train_scaled, axis = 0)
# Creating a new dataframe to hold the saled, normalized x training data.
x_train_normalized = pd.DataFrame(a, columns = x_train.columns)
# Normalizing the x scaled testing data.
b = preprocessing.normalize(x_test_scaled, axis = 0)
# Creating a new dataframe to hold the saled, normalized x testing data.
x_test_normalized = pd. DataFrame (b, columns = x_test.columns)

In [86]:
###############################
## LOGISTIC REGRESSION MODEL ##
###############################

print("LOGISTIC REGRESSION MODEL FOR THE 2022 NBA SEASON IN THE EASTERN CONFERENCE")
print()
# Creating a logistic regression object.
east_logreg = LogisticRegression()
# Fitting the model with the normalized x training and y training data.
east_logreg.fit(x_train_normalized, np.ravel(y_train))
# Getting the score from the normalized x training and y trianing data.
east_logreg_train_score = east_logreg.score(x_train_normalized, y_train)
# Printing out this score.
print("Score for training data: " + str(east_logreg_train_score))
# Getting the score from the normalized x testing and y testing data.
east_logreg_test_score = east_logreg.score(x_test_normalized, y_test)
# Printing out this score.
print("Score for testing data: " + str(east_logreg_test_score))
print()
# Getting the names of the columns in the x training dataset.
train_feature_names = x_train.columns
print("Most important features for 2022 in the Eastern Conference")
# Calculating the coeffeicnt for the logistic regression model.
coefficient_logreg = east_logreg.coef_
# Getting the most important coefficient value which would be the first one in the list.
importance_logreg = coefficient_logreg[0]
# Taking the absolute value of this coeffficient.
abs_importance_logreg = abs(importance_logreg)
# Making a list of the feature names, the importance coeffeicient and the absolute value of this coefficient.
importance_list_log = list(zip(train_feature_names, importance_logreg, abs_importance_logreg))
# Sorting this list according to the absolute values of the coefficient.
importance_list_log.sort(key=lambda x: x[2], reverse=True)
for i in range(5):
    print(importance_list_log[i])
print()
print("Predictions for which teams makes the playoffs for 2022 in the Eastern Conference")
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
east_predictions_2022_LR = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
logreg_probability = east_logreg.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
logreg_prediction = east_logreg.predict(x_test_normalized).tolist()
# Assigning the prediction value to the prediciotn column in the list.
east_predictions_2022_LR["PREDICTION"] = logreg_prediction
# Assigning the probabilities to the column in the list.
east_predictions_2022_LR["PROBABILITY"] = logreg_probability
# Sorting the list according to the porbability values.
east_predictions_2022_LR = east_predictions_2022_LR.sort_values("PROBABILITY", ascending=False)
print(east_predictions_2022_LR)
print()
print()

LOGISTIC REGRESSION MODEL FOR THE 2022 NBA SEASON IN THE EASTERN CONFERENCE

Score for training data: 0.8066666666666666
Score for testing data: 0.6666666666666666

Most important features for 2022 in the Eastern Conference
('PM', 4.192892339967322, 4.192892339967322)
('FGPCT', 1.5612884719530524, 1.5612884719530524)
('FG3PCT', 1.2539959951474446, 1.2539959951474446)
('TOV', -1.2041265228950269, 1.2041265228950269)
('STL', 1.1156588648834882, 1.1156588648834882)

Predictions for which teams makes the playoffs for 2022 in the Eastern Conference
   TEAM  PLAYOFF  PREDICTION  PROBABILITY
4   CHI        1           1     0.964768
12  PHI        1           1     0.958279
1   BKN        1           1     0.956483
9   MIL        1           1     0.946853
5   CLE        0           1     0.933985
3   CHA        0           1     0.923703
8   MIA        1           1     0.903250
2   BOS        1           1     0.845746
0   ATL        1           1     0.829961
7   IND        0           1  

In [87]:
###############
## SVM MODEL ##
###############

print("SVM MODEL FOR THE 2022 NBA SEASON IN THE EASTERN CONFERENCE")
print()
# Making SVM model.
east_svm = SVC(kernel="linear", probability=True)
# Fitting the data to the model.
east_svm.fit(x_train_normalized, np.ravel(y_train))
# Calculating the score from the x and y training data.
east_svm_train_score = east_svm.score(x_train_normalized, y_train)
# Printing the score for the training data.
print("Score for training data: " + str(east_svm_train_score))
# Printing the score for the test data.
east_svm_test_score = east_svm.score(x_test, y_test)
# Printing the score for the training data.
print("Score for testing data: " + str(east_svm_test_score))
print()
# Getting the names of the columns in the x training dataset.
train_feature_names_SVM = x_train.columns
print("Most important features for 2022 in the Eastern Conference")
# Calculating the coefficient for the SVM.
coefficient_svm = east_svm.coef_
# Getting the most important coefficient value which would be the first one in the list.
importance_svm = coefficient_svm[0]
# Taking the absolute value of this coeffficient.
abs_importance_svm = abs(importance_svm)
# Making a list of the feature names, the importance coeffeicient and the absolute value of this coefficient.
importance_list_svm = list(zip(train_feature_names_SVM, importance_svm, abs_importance_svm))
# Sorting this list according to the absolute values of the coefficient.
importance_list_svm.sort(key=lambda x: x[2], reverse=True)
for i in range(5):
    print(importance_list_svm[i])
print()
print("Predictions for which teams makes the playoffs for 2022 in the Eastern Conference")
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
east_predictions_2022_SVM = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
svm_probability = east_svm.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
svm_prediction = east_svm.predict(x_test_normalized).tolist()
# Assigning the prediction value to the prediciotn colum in the list.
east_predictions_2022_SVM["PREDICTION"] = svm_prediction
# Assigning the probabilities to the column in the list.
east_predictions_2022_SVM["PROBABILITY"] = svm_probability
# Sorting the list according to the porbability values.
east_predictions_2022_SVM = east_predictions_2022_SVM.sort_values("PROBABILITY", ascending=False)
print(east_predictions_2022_SVM)

SVM MODEL FOR THE 2022 NBA SEASON IN THE EASTERN CONFERENCE

Score for training data: 0.83
Score for testing data: 0.8

Most important features for 2022 in the Eastern Conference
('PM', 6.619356851213325, 6.619356851213325)
('TOV', -2.128579788769111, 2.128579788769111)
('STL', 1.8783485152102481, 1.8783485152102481)
('FGA', -1.8565698743163028, 1.8565698743163028)
('FGPCT', 1.8146080083443126, 1.8146080083443126)

Predictions for which teams makes the playoffs for 2022 in the Eastern Conference
   TEAM  PLAYOFF  PREDICTION   PROBABILITY
12  PHI        1           1  1.000000e+00
4   CHI        1           1  1.000000e+00
1   BKN        1           1  1.000000e+00
9   MIL        1           1  1.000000e+00
5   CLE        0           1  1.000000e+00
3   CHA        0           1  1.000000e+00
8   MIA        1           1  9.999999e-01
2   BOS        1           1  9.999990e-01
0   ATL        1           1  9.958712e-01
13  TOR        1           1  9.609481e-01
7   IND        0          

In [6]:
####################################
## Random Forest Classifier Model ##
####################################


east_rf = RandomForestClassifier()
east_rf.fit(x_train_normalized, y_train)
# Getting the score from the normalized x training and y trianing data.
east_rf_train_score = east_rf.score(x_train_normalized, y_train)
# Printing out this score.
print("Score for training data: " + str(east_rf_train_score))
# Getting the score from the normalized x testing and y testing data.
east_rf_test_score = east_rf.score(x_test_normalized, y_test)
# Printing out this score.
print("Score for testing data: " + str(east_rf_test_score))
print()

# Getting the names of the columns in the x training dataset.
train_feature_names = x_train.columns

# Getting the feature importances for all of the features trained in the model
rf_importances = east_rf.feature_importances_
# Making a list of the feature importances and feature names
rf_importances = sorted(zip(rf_importances, train_feature_names), reverse=True)
# Sorting this list to find the most important features from the model
for i in range(5):
    print(rf_importances[i])

print()
print("Predictions for which teams makes the playoffs for 2021 in the Eastern Conference")
print()
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
east_predictions_2022 = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
rf_probability = east_rf.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
rf_prediction = east_rf.predict(x_test_normalized).tolist()
# Assinging the prediciotn value to the prediciotn colum in the list.
east_predictions_2022["PREDICTION"] = rf_prediction
# Assigning the probabilities to the column in the list..
east_predictions_2022["PROBABILITY"] = rf_probability
# Sorting the list according to the porbability values.
east_predictions_2022 = east_predictions_2022.sort_values("PROBABILITY", ascending=False)
with open('./results/east_predictions_2022_RF', 'wb') as f:
    pickle.dump(east_predictions_2022, f)
    f.close()
print(east_predictions_2022)

Score for training data: 1.0
Score for testing data: 0.7333333333333333

(0.3754902506143919, 'PM')
(0.06095798170249698, 'FGPCT')
(0.05649556207732384, 'FG3PCT')
(0.05407501161569508, 'DREB')
(0.04182972421969604, 'TOV')

Predictions for which teams makes the playoffs for 2021 in the Eastern Conference

   TEAM  PLAYOFF  PREDICTION  PROBABILITY
3   CHA        0           1         0.66
12  PHI        1           1         0.64
9   MIL        1           1         0.62
1   BKN        1           1         0.61
4   CHI        1           1         0.59
7   IND        0           1         0.59
5   CLE        0           1         0.57
8   MIA        1           1         0.56
2   BOS        1           1         0.55
14  WAS        0           1         0.55
0   ATL        1           1         0.51
13  TOR        1           1         0.51
10  NYK        0           0         0.48
11  ORL        0           0         0.43
6   DET        0           0         0.28


  east_rf.fit(x_train_normalized, y_train)


In [151]:
####################################
## Decision Tree Classifier Model ##
####################################

clf = DecisionTreeClassifier(max_depth=6, max_features=10, min_samples_leaf=20)
# Create dictionary of parameters to find the most optimal hyperparameters for the model
params = {'max_depth': [2, 4, 6, 8, 10, 12],
          'min_samples_leaf': [5, 10, 15, 20, 25, 30],
          'max_features': [10, 12, 14, 16, 18, 20]}
# Use GridSearchCV to find most optimal hyperparameters
grid_search = GridSearchCV(clf, params, cv = 10, scoring = 'accuracy')
# Fit the model to calculate the most optimal hyperparameters
grid_search.fit(x_train_normalized, y_train)

print(grid_search.best_params_)
print("Accuracy: ", grid_search.best_score_ * 100)

{'max_depth': 6, 'max_features': 10, 'min_samples_leaf': 20}
Accuracy:  87.66666666666666


In [7]:
# Fit the model based on the optimized hyperparameters
clf = DecisionTreeClassifier(max_depth=6, max_features=18, min_samples_leaf=20)
clf.fit(x_train_normalized, y_train)

train_feature_names = x_train.columns

# Getting the feature importances for all of the features trained in the model
dt_importances = clf.feature_importances_
# Sorting this list to find the most important features from the model
dt_importances = sorted(zip(dt_importances, train_feature_names), reverse=True)
# Sorting this list to find the most important features from the model
for i in range (5):
    print (dt_importances[i])

# Getting the score from the normalized x training and y trianing data.
east_dt_train_score = clf.score(x_train_normalized, y_train)
# east_logreg_train_score = east_logreg.score(x_train_normalized, y_train)
# Printing out this score.
print()
print("Score for training data: " + str(east_dt_train_score))
# Getting the score from the normalized x testing and y testing data.
east_dt_test_score = clf.score(x_test_normalized, y_test)
# Printing out this score.
print("Score for testing data: " + str(east_dt_test_score))
print()
print("Predictions for which teams makes the playoffs for 2021 in the Eastern Conference")
print()
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
east_predictions_2022 = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
dt_probability = clf.predict_proba(x_test_normalized)[:, 1].tolist()
# rf_probability = rf.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
dt_prediction = clf.predict(x_test_normalized).tolist()
# Assinging the prediciotn value to the prediciotn colum in the list.
east_predictions_2022["PREDICTION"] = dt_prediction
# Assigning the probabilities to the column in the list..
east_predictions_2022["PROBABILITY"] = dt_probability
# Sorting the list according to the porbability values.
east_predictions_2022 = east_predictions_2022.sort_values("PROBABILITY", ascending=False)
with open('./results/east_predictions_2022_DT', 'wb') as f:
    pickle.dump(east_predictions_2022, f)
    f.close()
print(east_predictions_2022)



(0.9364809235518072, 'PM')
(0.032103779892645776, 'DREB')
(0.023118101512993153, 'TOV')
(0.005462775856840685, 'OREB')
(0.002089595140941444, 'AST')

Score for training data: 0.8766666666666667
Score for testing data: 0.7333333333333333

Predictions for which teams makes the playoffs for 2021 in the Eastern Conference

   TEAM  PLAYOFF  PREDICTION  PROBABILITY
1   BKN        1           1     1.000000
2   BOS        1           1     1.000000
4   CHI        1           1     1.000000
5   CLE        0           1     1.000000
7   IND        0           1     1.000000
8   MIA        1           1     1.000000
9   MIL        1           1     1.000000
12  PHI        1           1     1.000000
13  TOR        1           1     1.000000
3   CHA        0           1     0.642857
10  NYK        0           0     0.166667
0   ATL        1           0     0.125000
6   DET        0           0     0.125000
11  ORL        0           0     0.125000
14  WAS        0           0     0.125000


In [8]:
#####################################################
## PREDICTING THE 2022 WESTERN CONFERENCE PLAYOFFS ##
######################################################


##############################################################
## READING IN TRAINING DATA THAT HAS BEEN PREVIOSLY CLEANED ##
##############################################################

# Creating two empty lists that are used to store the data that is read in.
x_train_list = []
y_train_list = []
# Iterating through the data for all the years.
for i in range(2000, 2021):
    # We ignore the year of 2004, as there seems to me something wrong with the formatting of the data in out dataset.
    if i != 2004:
        # Assembling in the name of the file that contains the x data that needs to be trained.
        file_name = "season_stats/west" + str(i) + ".csv"
        # Reading the x data from the filename created above.
        df_x = pd.read_csv(file_name, index_col=0)
        # Sorting the data according to the values in the TEAM column.
        df_x = df_x.sort_values("TEAM")
        # Resetting the index of the data.
        df_x = df_x.reset_index()
        # Dropping the TEAM column in the data.
        df_x = df_x.drop("TEAM", axis=1)
        # Dropping the extra index column in the data.
        df_x = df_x.drop("index", axis=1)
        # Appending the current round of data to the overall list holding all the x data.
        x_train_list.append(df_x)
        # Assembling in the name of the file that contains the y data that needs to be trained.
        file_name = "playoff_labels/west" + str(i) + "playoff.csv"
        # Reading the y data from the filename created above.
        df_y = pd.read_csv(file_name, index_col=0)
        # Sorting the data according to the values in the TEAM column.
        df_y = df_y.sort_values("TEAM")
        # Resetting the index of the data.
        df_y = df_y.reset_index()
        # Dropping the TEAM column in the data.
        df_y = df_y.drop("TEAM", axis=1)
        # Dropping the extra index column in the data.
        df_y = df_y.drop("index", axis=1)
        # Appending the current round of data to the overall list holding all the y data.
        y_train_list.append(df_y)

# Concatentating the list of x data to a dataframe to hold all the x data.
x_train = pd.concat(x_train_list)
x_train = x_train.drop("MATCHUP", axis=1)
# x_train = x_train.drop("WLPCT", axis=1)
# Concatentating the list of y data to a dataframe to hold all the y data.
y_train = pd.concat(y_train_list)

In [9]:
##############################
## READING IN THE TEST DATA ##
##############################

# Assembling in the name of the file that contains the x data that needs to be tested against.
x_test = pd.read_csv("season_stats/west2022.csv", index_col=0)
# Sorting the data according to the values in the TEAM column.
x_test = x_test.sort_values("TEAM")
# Resetting the index of the data.
x_test = x_test.reset_index()
# Dropping the extra index column in the data.
x_test = x_test.drop("index", axis=1)
x_test = x_test.drop("MATCHUP", axis=1)
# x_test = x_test.drop("WLPCT", axis=1)
# Assigning the test data to a variable that tells us that this is the prediction that should be made.
x_test_prediction = x_test
# Dropping the TEAM column in the data.
x_test = x_test.drop("TEAM", axis=1)
# Assembling in the name of the file that contains the y data that needs to be tested against.
y_test = pd.read_csv("playoff_labels/west2022playoff.csv", index_col=0)
# Sorting the data according to the values in the TEAM column.
y_test = y_test.sort_values("TEAM")
# Resetting the index of the data.
y_test = y_test.reset_index()
# Dropping the extra index column in the data.
y_test = y_test.drop("index", axis=1)
# Assigning the test data to a variable that tells us that this is the prediction that should be made.
y_test_prediction = y_test
# Dropping the TEAM column in the data.
y_test = y_test.drop("TEAM", axis=1)

In [10]:
##############################
## STANDARDIZING THE X DATA ##
##############################
# Creating a scaler object.
sc = StandardScaler()
# Scaling the x training data.
x_train_scaled = sc.fit_transform(x_train)
# Scaling the x test data.
x_test_scaled = sc.transform(x_test)

In [11]:
############################
## NORMALIZING THE X DATA ##
############################
# Normalizing the x scaled training data.
a = preprocessing.normalize(x_train_scaled, axis = 0)
# Creating a new dataframe to hold the saled, normalized x training data.
x_train_normalized = pd.DataFrame(a, columns = x_train.columns)
# Normalizing the x scaled testing data.
b = preprocessing.normalize(x_test_scaled, axis = 0)
# Creating a new dataframe to hold the saled, normalized x testing data.
x_test_normalized = pd. DataFrame (b, columns = x_test.columns)

In [120]:
###############################
## LOGISTIC REGRESSION MODEL ##
###############################

print("LOGISTIC REGRESSION MODEL FOR THE 2022 NBA SEASON IN THE WESTERN CONFERENCE")
print()
# Creating a logistic regression object.
west_logreg = LogisticRegression()
# Fitting the model with the normalized x training and y training data.
west_logreg.fit(x_train_normalized, np.ravel(y_train))
# Getting the score from the normalized x training and y trianing data.
west_logreg_train_score = west_logreg.score(x_train_normalized, y_train)
# Printing out this score.
print("Score for training data: " + str(west_logreg_train_score))
# Getting the score from the normalized x testing and y testing data.
west_logreg_test_score = west_logreg.score(x_test_normalized, y_test)
# Printing out this score.
print("Score for testing data: " + str(west_logreg_test_score))
print()
# Getting the names of the columns in the x training dataset.
train_feature_names = x_train.columns
print("Most important features for 2022 in the Western Conference")
# Calculating the coeffeicnt for the logistic regression model.
coefficient_logreg = west_logreg.coef_
# Getting the most important coefficient value which would be the first one in the list.
importance_logreg = coefficient_logreg[0]
# Taking the absolute value of this coeffficient.
abs_importance_logreg = abs(importance_logreg)
# Making a list of the feature names, the importance coeffeicient and the absolute value of this coefficient.
importance_list_log = list(zip(train_feature_names, importance_logreg, abs_importance_logreg))
# Sorting this list according to the absolute values of the coefficient.
importance_list_log.sort(key=lambda x: x[2], reverse=True)
for i in range(5):
    print(importance_list_log[i])
print()
print("Predictions for which teams makes the playoffs for 2022 in the Western Conference")
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
west_predictions_2022_LR = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
logreg_probability = west_logreg.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
logreg_prediction = west_logreg.predict(x_test_normalized).tolist()
# Assigning the prediction value to the prediciotn column in the list.
west_predictions_2022_LR["PREDICTION"] = logreg_prediction
# Assigning the probabilities to the column in the list.
west_predictions_2022_LR["PROBABILITY"] = logreg_probability
# Sorting the list according to the porbability values.
west_predictions_2022_LR = west_predictions_2022_LR.sort_values("PROBABILITY", ascending=False)
print(west_predictions_2022_LR)
print()
print()

LOGISTIC REGRESSION MODEL FOR THE 2022 NBA SEASON IN THE WESTERN CONFERENCE

Score for training data: 0.8513513513513513
Score for testing data: 0.7333333333333333

Most important features for 2022 in the Western Conference
('WLPCT', 3.9489615636330027, 3.9489615636330027)
('PM', 3.7495904218670852, 3.7495904218670852)
('FGPCT', 1.7897564091541547, 1.7897564091541547)
('TOV', -1.2066008015955436, 1.2066008015955436)
('AST', 1.0476024095510832, 1.0476024095510832)

Predictions for which teams makes the playoffs for 2022 in the Western Conference
   TEAM  PLAYOFF  PREDICTION  PROBABILITY
10  PHX        1           1     0.994456
14  UTA        1           1     0.989021
2   GSW        1           1     0.987769
6   MEM        1           1     0.962776
5   LAL        0           1     0.748907
1   DEN        1           1     0.746379
0   DAL        1           1     0.598530
13  SAS        0           1     0.549802
4   LAC        0           0     0.471918
7   MIN        1           0 

In [123]:
###############
## SVM MODEL ##
###############

print("SVM MODEL FOR THE 2022 NBA SEASON IN THE WESTERN CONFERENCE")
print()
# Making SVM model.
west_svm = SVC(kernel="linear", probability=True)
# Fitting the data to the model.
west_svm.fit(x_train_normalized, np.ravel(y_train))
# Calculating the score from the x and y training data.
west_svm_train_score = west_svm.score(x_train_normalized, y_train)
# Printing the score for the training data.
print("Score for training data: " + str(west_svm_train_score))
# Printing the score for the test data.
west_svm_test_score = west_svm.score(x_test, y_test)
# Printing the score for the training data.
print("Score for testing data: " + str(west_svm_test_score))
print()
# Getting the names of the columns in the x training dataset.
train_feature_names_SVM = x_train.columns
print("Most important features for 2022 in the Western Conference")
# Calculating the coefficient for the SVM.
coefficient_svm = west_svm.coef_
# Getting the most important coefficient value which would be the first one in the list.
importance_svm = coefficient_svm[0]
# Taking the absolute value of this coeffficient.
abs_importance_svm = abs(importance_svm)
# Making a list of the feature names, the importance coeffeicient and the absolute value of this coefficient.
importance_list_svm = list(zip(train_feature_names_SVM, importance_svm, abs_importance_svm))
# Sorting this list according to the absolute values of the coefficient.
importance_list_svm.sort(key=lambda x: x[2], reverse=True)
for i in range(5):
    print(importance_list_svm[i])
print()
print("Predictions for which teams makes the playoffs for 2022 in the Western Conference")
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
west_predictions_2022_SVM = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
svm_probability = west_svm.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
svm_prediction = west_svm.predict(x_test_normalized).tolist()
# Assigning the prediction value to the prediction column in the list.
west_predictions_2022_SVM["PREDICTION"] = svm_prediction
# Assigning the probabilities to the column in the list.
west_predictions_2022_SVM["PROBABILITY"] = svm_probability
# Sorting the list according to the probability values.
west_predictions_2022_SVM = west_predictions_2022_SVM.sort_values("PROBABILITY", ascending=False)
print(west_predictions_2022_SVM)

SVM MODEL FOR THE 2022 NBA SEASON IN THE WESTERN CONFERENCE

Score for training data: 0.8513513513513513
Score for testing data: 0.8666666666666667

Most important features for 2022 in the Western Conference
('WLPCT', 5.789988836508184, 5.789988836508184)
('PM', 5.168403969489876, 5.168403969489876)
('FGPCT', 2.228850098482319, 2.228850098482319)
('TOV', -1.642327804536691, 1.642327804536691)
('STL', 1.5457691263662912, 1.5457691263662912)

Predictions for which teams makes the playoffs for 2022 in the Western Conference
   TEAM  PLAYOFF  PREDICTION   PROBABILITY
2   GSW        1           1  1.000000e+00
10  PHX        1           1  1.000000e+00
14  UTA        1           1  1.000000e+00
6   MEM        1           1  1.000000e+00
1   DEN        1           1  9.901427e-01
5   LAL        0           1  9.338743e-01
0   DAL        1           1  8.834405e-01
13  SAS        0           0  2.079276e-01
4   LAC        0           0  8.104037e-02
7   MIN        1           0  1.582461e-02


In [12]:
####################################
## Random Forest Classifier Model ##
####################################


west_rf = RandomForestClassifier()
west_rf.fit(x_train_normalized, y_train)
# Getting the score from the normalized x training and y trianing data.
east_rf_train_score = west_rf.score(x_train_normalized, y_train)
# Printing out this score.
print("Score for training data: " + str(east_rf_train_score))
# Getting the score from the normalized x testing and y testing data.
east_rf_test_score = west_rf.score(x_test_normalized, y_test)
# Printing out this score.
print("Score for testing data: " + str(east_rf_test_score))
print()

# Getting the names of the columns in the x training dataset.
train_feature_names = x_train.columns

# Getting the feature importances for all of the features trained in the model
rf_importances = west_rf.feature_importances_
# Making a list of the feature importances and feature names
rf_importances = sorted(zip(rf_importances, train_feature_names), reverse=True)
# Sorting this list to find the most important features from the model
for i in range(5):
    print(rf_importances[i])

print()
print("Predictions for which teams makes the playoffs for 2021 in the Eastern Conference")
print()
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
west_predictions_2022 = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
rf_probability = west_rf.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
rf_prediction = west_rf.predict(x_test_normalized).tolist()
# Assinging the prediciotn value to the prediciotn colum in the list.
west_predictions_2022["PREDICTION"] = rf_prediction
# Assigning the probabilities to the column in the list..
west_predictions_2022["PROBABILITY"] = rf_probability
# Sorting the list according to the porbability values.
west_predictions_2022 = west_predictions_2022.sort_values("PROBABILITY", ascending=False)
with open('./results/west_predictions_2022_RF', 'wb') as f:
    pickle.dump(west_predictions_2022, f)
    f.close()
print(west_predictions_2022)

Score for training data: 1.0
Score for testing data: 0.8666666666666667

(0.2814213143973307, 'PM')
(0.2569024772496241, 'WLPCT')
(0.0601612565253827, 'FGPCT')
(0.036013231918746086, 'AST')
(0.031539430738704105, 'DREB')

Predictions for which teams makes the playoffs for 2021 in the Eastern Conference

   TEAM  PLAYOFF  PREDICTION  PROBABILITY
2   GSW        1           1         0.80
10  PHX        1           1         0.77
14  UTA        1           1         0.75
1   DEN        1           1         0.70
6   MEM        1           1         0.68
0   DAL        1           1         0.65
5   LAL        0           0         0.33
7   MIN        1           0         0.30
8   NOP        1           0         0.29
9   OKC        0           0         0.28
11  POR        0           0         0.27
12  SAC        0           0         0.27
4   LAC        0           0         0.25
13  SAS        0           0         0.23
3   HOU        0           0         0.16


  west_rf.fit(x_train_normalized, y_train)


In [168]:
####################################
## Decision Tree Classifier Model ##
####################################

clf = DecisionTreeClassifier()
# Create dictionary of parameters to find the most optimal hyperparameters for the model
params = {'max_depth': [2, 4, 6, 8, 10, 12],
          'min_samples_leaf': [5, 10, 15, 20, 25, 30],
          'max_features': [10, 12, 14, 16, 18, 20]}
# Use GridSearchCV to find most optimal hyperparameters
grid_search = GridSearchCV(clf, params, cv = 10, scoring = 'accuracy')
# Fit the model to calculate the most optimal hyperparameters
grid_search.fit(x_train_normalized, y_train)

print(grid_search.best_params_)
print("Accuracy: ", grid_search.best_score_ * 100)

{'max_depth': 4, 'max_features': 10, 'min_samples_leaf': 20}
Accuracy:  88.49425287356321


In [13]:
# Fit the model based on the optimized hyperparameters
clf = DecisionTreeClassifier(max_depth=6, max_features=10, min_samples_leaf=20)
clf.fit(x_train_normalized, y_train)

train_feature_names = x_train.columns

# Getting the feature importances for all of the features trained in the model
dt_importances = clf.feature_importances_
# Sorting this list to find the most important features from the model
dt_importances = sorted(zip(dt_importances, train_feature_names), reverse=True)
# Sorting this list to find the most important features from the model
for i in range (5):
    print (dt_importances[i])

# Getting the score from the normalized x training and y trianing data.
east_dt_train_score = clf.score(x_train_normalized, y_train)
# east_logreg_train_score = east_logreg.score(x_train_normalized, y_train)
# Printing out this score.
print()
print("Score for training data: " + str(east_dt_train_score))
# Getting the score from the normalized x testing and y testing data.
east_dt_test_score = clf.score(x_test_normalized, y_test)
# Printing out this score.
print("Score for testing data: " + str(east_dt_test_score))
print()
print("Predictions for which teams makes the playoffs for 2021 in the Eastern Conference")
print()
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
west_predictions_2022 = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
dt_probability = clf.predict_proba(x_test_normalized)[:, 1].tolist()
# rf_probability = rf.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
dt_prediction = clf.predict(x_test_normalized).tolist()
# Assinging the prediciotn value to the prediciotn colum in the list.
west_predictions_2022["PREDICTION"] = dt_prediction
# Assigning the probabilities to the column in the list..
west_predictions_2022["PROBABILITY"] = dt_probability
# Sorting the list according to the porbability values.
west_predictions_2022 = west_predictions_2022.sort_values("PROBABILITY", ascending=False)
with open('./results/ewest_predictions_2022_DT', 'wb') as f:
    pickle.dump(west_predictions_2022, f)
    f.close()
print(west_predictions_2022)


(0.9645211145210472, 'WLPCT')
(0.016873520771006068, 'STL')
(0.010371711516909583, 'OREB')
(0.00569555022391009, 'DREB')
(0.002538102967127175, 'FTPCT')

Score for training data: 0.8885135135135135
Score for testing data: 0.8666666666666667

Predictions for which teams makes the playoffs for 2021 in the Eastern Conference

   TEAM  PLAYOFF  PREDICTION  PROBABILITY
0   DAL        1           1     1.000000
1   DEN        1           1     1.000000
10  PHX        1           1     1.000000
14  UTA        1           1     1.000000
2   GSW        1           1     0.916667
6   MEM        1           1     0.916667
5   LAL        0           0     0.161290
3   HOU        0           0     0.000000
4   LAC        0           0     0.000000
7   MIN        1           0     0.000000
8   NOP        1           0     0.000000
9   OKC        0           0     0.000000
11  POR        0           0     0.000000
12  SAC        0           0     0.000000
13  SAS        0           0     0.000000
