In [1]:
# THIS IS THE MAIN FILE USED TO CREATE ALL MODELS
# We have so far implemented two models: Logistic Regression and SVM.

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.svm import SVC

In [33]:
#####################################################
## TRAINED MODELS FOR PREDICTING THE 2022 PLAYOFFS ##
#####################################################


#####################################################
## PREDICTING THE 2022 EASTERN CONFERENCE PLAYOFFS ##
######################################################


##############################################################
## READING IN TRAINING DATA THAT HAS BEEN PREVIOSLY CLEANED ##
##############################################################

# Creating two empty lists that are used to store the data that is read in.
x_train_list = []
y_train_list = []
# Iterating through the data for all the years.
for i in range(2000, 2021):
    # We ignore the year of 2004, as there seems to me something wrong with the formatting of the data in out dataset.
    if i != 2004:
        # Assembling in the name of the file that contains the x data that needs to be trained.
        file_name = "season_stats/east" + str(i) + ".csv"
        # Reading the x data from the filename created above.
        df_x = pd.read_csv(file_name, index_col=0)
        # Sorting the data according to the values in the TEAM column.
        df_x = df_x.sort_values("TEAM")
        # Resetting the index of the data.
        df_x = df_x.reset_index()
        # Dropping the TEAM column in the data.
        df_x = df_x.drop("TEAM", axis=1)
        # Dropping the extra index column in the data.
        df_x = df_x.drop("index", axis=1)
        # Appending the current round of data to the overall list holding all the x data.
        x_train_list.append(df_x)
        # Assembling in the name of the file that contains the y data that needs to be trained.
        file_name = "playoff_labels/east" + str(i) + "playoff.csv"
        # Reading the y data from the filename created above.
        df_y = pd.read_csv(file_name, index_col=0)
        # Sorting the data according to the values in the TEAM column.
        df_y = df_y.sort_values("TEAM")
        # Resetting the index of the data.
        df_y = df_y.reset_index()
        # Dropping the TEAM column in the data.
        df_y = df_y.drop("TEAM", axis=1)
        # Dropping the extra index column in the data.
        df_y = df_y.drop("index", axis=1)
        # Appending the current round of data to the overall list holding all the y data.
        y_train_list.append(df_y)

# Concatentating the list of x data to a dataframe to hold all the x data.
x_train = pd.concat(x_train_list)
x_train = x_train.drop("MATCHUP", axis=1)
x_train = x_train.drop("WLPCT", axis=1)
# Concatentating the list of y data to a dataframe to hold all the y data.
y_train = pd.concat(y_train_list)

In [34]:
##############################
## READING IN THE TEST DATA ##
##############################

# Assembling in the name of the file that contains the x data that needs to be tested against.
x_test = pd.read_csv("season_stats/east2022.csv", index_col=0)
# Sorting the data according to the values in the TEAM column.
x_test = x_test.sort_values("TEAM")
# Resetting the index of the data.
x_test = x_test.reset_index()
# Dropping the extra index column in the data.
x_test = x_test.drop("index", axis=1)
x_test = x_test.drop("MATCHUP", axis=1)
x_test = x_test.drop("WLPCT", axis=1)
# Assigning the test data to a variable that tells us that this is the prediction that should be made.
x_test_prediction = x_test
# Dropping the TEAM column in the data.
x_test = x_test.drop("TEAM", axis=1)
# Assembling in the name of the file that contains the y data that needs to be tested against.
y_test = pd.read_csv("playoff_labels/east2022playoff.csv", index_col=0)
# Sorting the data according to the values in the TEAM column.
y_test = y_test.sort_values("TEAM")
# Resetting the index of the data.
y_test = y_test.reset_index()
# Dropping the extra index column in the data.
y_test = y_test.drop("index", axis=1)
# Assigning the test data to a variable that tells us that this is the prediction that should be made.
y_test_prediction = y_test
# Dropping the TEAM column in the data.
y_test = y_test.drop("TEAM", axis=1)

In [35]:
# x_train.head
x_train.columns
# x_test.columns

Index(['FGM', 'FGA', 'FGPCT', 'FG3M', 'FG3A', 'FG3PCT', 'FTM', 'FTA', 'FTPCT',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PM'],
      dtype='object')

In [36]:
##############################
## STANDARDIZING THE X DATA ##
##############################
# Creating a scaler object.
sc = StandardScaler()
# Scaling the x training data.
x_train_scaled = sc.fit_transform(x_train)
# Scaling the x test data.
x_test_scaled = sc.transform(x_test)

In [37]:
############################
## NORMALIZING THE X DATA ##
############################
# Normalizing the x scaled training data.
a = preprocessing.normalize(x_train_scaled, axis = 0)
# Creating a new dataframe to hold the saled, normalized x training data.
x_train_normalized = pd.DataFrame(a, columns = x_train.columns)
# Normalizing the x scaled testing data.
b = preprocessing.normalize(x_test_scaled, axis = 0)
# Creating a new dataframe to hold the saled, normalized x testing data.
x_test_normalized = pd. DataFrame (b, columns = x_test.columns)

In [38]:
###############################
## LOGISTIC REGRESSION MODEL ##
###############################

print("LOGISTIC REGRESSION MODEL FOR THE 2022 NBA SEASON IN THE EASTERN CONFERENCE")
print()
# Creating a logistic regression object.
east_logreg = LogisticRegression()
# Fitting the model with the normalized x training and y training data.
east_logreg.fit(x_train_normalized, np.ravel(y_train))
# Getting the score from the normalized x training and y trianing data.
east_logreg_train_score = east_logreg.score(x_train_normalized, y_train)
# Printing out this score.
print("Score for training data: " + str(east_logreg_train_score))
# Getting the score from the normalized x testing and y testing data.
east_logreg_test_score = east_logreg.score(x_test_normalized, y_test)
# Printing out this score.
print("Score for testing data: " + str(east_logreg_test_score))
print()
# Getting the names of the columns in the x training dataset.
train_feature_names = x_train.columns
print("Most important features for 2022 in the Eastern Conference")
# Calculating the coeffeicnt for the logistic regression model.
coefficient_logreg = east_logreg.coef_
# Getting the most important coefficient value which would be the first one in the list.
importance_logreg = coefficient_logreg[0]
# Taking the absolute value of this coeffficient.
abs_importance_logreg = abs(importance_logreg)
# Making a list of the feature names, the importance coeffeicient and the absolute value of this coefficient.
importance_list_log = list(zip(train_feature_names, importance_logreg, abs_importance_logreg))
# Sorting this list according to the absolute values of the coefficient.
importance_list_log.sort(key=lambda x: x[2], reverse=True)
for i in range(5):
    print(importance_list_log[i])
print()
print("Predictions for which teams makes the playoffs for 2022 in the Eastern Conference")
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
east_predictions_2022_LR = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
logreg_probability = east_logreg.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
logreg_prediction = east_logreg.predict(x_test_normalized).tolist()
# Assigning the prediction value to the prediciotn column in the list.
east_predictions_2022_LR["PREDICTION"] = logreg_prediction
# Assigning the probabilities to the column in the list.
east_predictions_2022_LR["PROBABILITY"] = logreg_probability
# Sorting the list according to the porbability values.
east_predictions_2022_LR = east_predictions_2022_LR.sort_values("PROBABILITY", ascending=False)
print(east_predictions_2022_LR)
print()
print()

LOGISTIC REGRESSION MODEL FOR THE 2022 NBA SEASON IN THE EASTERN CONFERENCE

Score for training data: 0.8066666666666666
Score for testing data: 0.6666666666666666

Most important features for 2022 in the Eastern Conference
('PM', 4.192892339967322, 4.192892339967322)
('FGPCT', 1.5612884719530524, 1.5612884719530524)
('FG3PCT', 1.2539959951474446, 1.2539959951474446)
('TOV', -1.2041265228950269, 1.2041265228950269)
('STL', 1.1156588648834882, 1.1156588648834882)

Predictions for which teams makes the playoffs for 2022 in the Eastern Conference
   TEAM  PLAYOFF  PREDICTION  PROBABILITY
4   CHI        1           1     0.964768
12  PHI        1           1     0.958279
1   BKN        1           1     0.956483
9   MIL        1           1     0.946853
5   CLE        0           1     0.933985
3   CHA        0           1     0.923703
8   MIA        1           1     0.903250
2   BOS        1           1     0.845746
0   ATL        1           1     0.829961
7   IND        0           1  

In [39]:
###############
## SVM MODEL ##
###############

print("SVM MODEL FOR THE 2022 NBA SEASON IN THE EASTERN CONFERENCE")
print()
# Making SVM model.
east_svm = SVC(kernel="linear", probability=True)
# Fitting the data to the model.
east_svm.fit(x_train_normalized, np.ravel(y_train))
# Calculating the score from the x and y training data.
east_svm_train_score = east_svm.score(x_train_normalized, y_train)
# Printing the score for the training data.
print("Score for training data: " + str(east_svm_train_score))
# Printing the score for the test data.
east_svm_test_score = east_svm.score(x_test, y_test)
# Printing the score for the training data.
print("Score for testing data: " + str(east_svm_test_score))
print()
# Getting the names of the columns in the x training dataset.
train_feature_names_SVM = x_train.columns
print("Most important features for 2022 in the Eastern Conference")
# Calculating the coefficient for the SVM.
coefficient_svm = east_svm.coef_
# Getting the most important coefficient value which would be the first one in the list.
importance_svm = coefficient_svm[0]
# Taking the absolute value of this coeffficient.
abs_importance_svm = abs(importance_svm)
# Making a list of the feature names, the importance coeffeicient and the absolute value of this coefficient.
importance_list_svm = list(zip(train_feature_names_SVM, importance_svm, abs_importance_svm))
# Sorting this list according to the absolute values of the coefficient.
importance_list_svm.sort(key=lambda x: x[2], reverse=True)
for i in range(5):
    print(importance_list_svm[i])
print()
print("Predictions for which teams makes the playoffs for 2022 in the Eastern Conference")
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
east_predictions_2022_SVM = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
svm_probability = east_svm.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
svm_prediction = east_svm.predict(x_test_normalized).tolist()
# Assigning the prediction value to the prediciotn colum in the list.
east_predictions_2022_SVM["PREDICTION"] = svm_prediction
# Assigning the probabilities to the column in the list.
east_predictions_2022_SVM["PROBABILITY"] = svm_probability
# Sorting the list according to the porbability values.
east_predictions_2022_SVM = east_predictions_2022_SVM.sort_values("PROBABILITY", ascending=False)
print(east_predictions_2022_SVM)

SVM MODEL FOR THE 2022 NBA SEASON IN THE EASTERN CONFERENCE

Score for training data: 0.83
Score for testing data: 0.8

Most important features for 2022 in the Eastern Conference
('PM', 6.619356851213325, 6.619356851213325)
('TOV', -2.128579788769111, 2.128579788769111)
('STL', 1.8783485152102481, 1.8783485152102481)
('FGA', -1.8565698743163028, 1.8565698743163028)
('FGPCT', 1.8146080083443126, 1.8146080083443126)

Predictions for which teams makes the playoffs for 2022 in the Eastern Conference
   TEAM  PLAYOFF  PREDICTION   PROBABILITY
12  PHI        1           1  1.000000e+00
4   CHI        1           1  1.000000e+00
1   BKN        1           1  1.000000e+00
9   MIL        1           1  1.000000e+00
5   CLE        0           1  1.000000e+00
3   CHA        0           1  1.000000e+00
8   MIA        1           1  1.000000e+00
2   BOS        1           1  9.999996e-01
0   ATL        1           1  9.971350e-01
13  TOR        1           1  9.687925e-01
7   IND        0          

In [40]:
#####################################################
## PREDICTING THE 2022 WESTERN CONFERENCE PLAYOFFS ##
######################################################


##############################################################
## READING IN TRAINING DATA THAT HAS BEEN PREVIOSLY CLEANED ##
##############################################################

# Creating two empty lists that are used to store the data that is read in.
x_train_list = []
y_train_list = []
# Iterating through the data for all the years.
for i in range(2000, 2021):
    # We ignore the year of 2004, as there seems to me something wrong with the formatting of the data in out dataset.
    if i != 2004:
        # Assembling in the name of the file that contains the x data that needs to be trained.
        file_name = "season_stats/west" + str(i) + ".csv"
        # Reading the x data from the filename created above.
        df_x = pd.read_csv(file_name, index_col=0)
        # Sorting the data according to the values in the TEAM column.
        df_x = df_x.sort_values("TEAM")
        # Resetting the index of the data.
        df_x = df_x.reset_index()
        # Dropping the TEAM column in the data.
        df_x = df_x.drop("TEAM", axis=1)
        # Dropping the extra index column in the data.
        df_x = df_x.drop("index", axis=1)
        # Appending the current round of data to the overall list holding all the x data.
        x_train_list.append(df_x)
        # Assembling in the name of the file that contains the y data that needs to be trained.
        file_name = "playoff_labels/west" + str(i) + "playoff.csv"
        # Reading the y data from the filename created above.
        df_y = pd.read_csv(file_name, index_col=0)
        # Sorting the data according to the values in the TEAM column.
        df_y = df_y.sort_values("TEAM")
        # Resetting the index of the data.
        df_y = df_y.reset_index()
        # Dropping the TEAM column in the data.
        df_y = df_y.drop("TEAM", axis=1)
        # Dropping the extra index column in the data.
        df_y = df_y.drop("index", axis=1)
        # Appending the current round of data to the overall list holding all the y data.
        y_train_list.append(df_y)

# Concatentating the list of x data to a dataframe to hold all the x data.
x_train = pd.concat(x_train_list)
x_train = x_train.drop("MATCHUP", axis=1)
x_train = x_train.drop("WLPCT", axis=1)
# Concatentating the list of y data to a dataframe to hold all the y data.
y_train = pd.concat(y_train_list)

In [41]:
##############################
## READING IN THE TEST DATA ##
##############################

# Assembling in the name of the file that contains the x data that needs to be tested against.
x_test = pd.read_csv("season_stats/west2022.csv", index_col=0)
# Sorting the data according to the values in the TEAM column.
x_test = x_test.sort_values("TEAM")
# Resetting the index of the data.
x_test = x_test.reset_index()
# Dropping the extra index column in the data.
x_test = x_test.drop("index", axis=1)
x_test = x_test.drop("MATCHUP", axis=1)
x_test = x_test.drop("WLPCT", axis=1)
# Assigning the test data to a variable that tells us that this is the prediction that should be made.
x_test_prediction = x_test
# Dropping the TEAM column in the data.
x_test = x_test.drop("TEAM", axis=1)
# Assembling in the name of the file that contains the y data that needs to be tested against.
y_test = pd.read_csv("playoff_labels/west2022playoff.csv", index_col=0)
# Sorting the data according to the values in the TEAM column.
y_test = y_test.sort_values("TEAM")
# Resetting the index of the data.
y_test = y_test.reset_index()
# Dropping the extra index column in the data.
y_test = y_test.drop("index", axis=1)
# Assigning the test data to a variable that tells us that this is the prediction that should be made.
y_test_prediction = y_test
# Dropping the TEAM column in the data.
y_test = y_test.drop("TEAM", axis=1)

In [42]:
##############################
## STANDARDIZING THE X DATA ##
##############################
# Creating a scaler object.
sc = StandardScaler()
# Scaling the x training data.
x_train_scaled = sc.fit_transform(x_train)
# Scaling the x test data.
x_test_scaled = sc.transform(x_test)

In [43]:
############################
## NORMALIZING THE X DATA ##
############################
# Normalizing the x scaled training data.
a = preprocessing.normalize(x_train_scaled, axis = 0)
# Creating a new dataframe to hold the saled, normalized x training data.
x_train_normalized = pd.DataFrame(a, columns = x_train.columns)
# Normalizing the x scaled testing data.
b = preprocessing.normalize(x_test_scaled, axis = 0)
# Creating a new dataframe to hold the saled, normalized x testing data.
x_test_normalized = pd. DataFrame (b, columns = x_test.columns)

In [44]:
###############################
## LOGISTIC REGRESSION MODEL ##
###############################

print("LOGISTIC REGRESSION MODEL FOR THE 2022 NBA SEASON IN THE WESTERN CONFERENCE")
print()
# Creating a logistic regression object.
west_logreg = LogisticRegression()
# Fitting the model with the normalized x training and y training data.
west_logreg.fit(x_train_normalized, np.ravel(y_train))
# Getting the score from the normalized x training and y trianing data.
west_logreg_train_score = west_logreg.score(x_train_normalized, y_train)
# Printing out this score.
print("Score for training data: " + str(west_logreg_train_score))
# Getting the score from the normalized x testing and y testing data.
west_logreg_test_score = west_logreg.score(x_test_normalized, y_test)
# Printing out this score.
print("Score for testing data: " + str(west_logreg_test_score))
print()
# Getting the names of the columns in the x training dataset.
train_feature_names = x_train.columns
print("Most important features for 2022 in the Western Conference")
# Calculating the coeffeicnt for the logistic regression model.
coefficient_logreg = west_logreg.coef_
# Getting the most important coefficient value which would be the first one in the list.
importance_logreg = coefficient_logreg[0]
# Taking the absolute value of this coeffficient.
abs_importance_logreg = abs(importance_logreg)
# Making a list of the feature names, the importance coeffeicient and the absolute value of this coefficient.
importance_list_log = list(zip(train_feature_names, importance_logreg, abs_importance_logreg))
# Sorting this list according to the absolute values of the coefficient.
importance_list_log.sort(key=lambda x: x[2], reverse=True)
for i in range(5):
    print(importance_list_log[i])
print()
print("Predictions for which teams makes the playoffs for 2022 in the Western Conference")
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
west_predictions_2022_LR = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
logreg_probability = west_logreg.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
logreg_prediction = west_logreg.predict(x_test_normalized).tolist()
# Assigning the prediction value to the prediciotn column in the list.
west_predictions_2022_LR["PREDICTION"] = logreg_prediction
# Assigning the probabilities to the column in the list.
west_predictions_2022_LR["PROBABILITY"] = logreg_probability
# Sorting the list according to the porbability values.
west_predictions_2022_LR = west_predictions_2022_LR.sort_values("PROBABILITY", ascending=False)
print(west_predictions_2022_LR)
print()
print()

LOGISTIC REGRESSION MODEL FOR THE 2022 NBA SEASON IN THE WESTERN CONFERENCE

Score for training data: 0.8209459459459459
Score for testing data: 0.6666666666666666

Most important features for 2022 in the Western Conference
('PM', 4.28481160964705, 4.28481160964705)
('FGPCT', 2.0474860354445084, 2.0474860354445084)
('TOV', -1.3837647012875676, 1.3837647012875676)
('FG3PCT', 1.2054626567489557, 1.2054626567489557)
('AST', 1.1817391543816471, 1.1817391543816471)

Predictions for which teams makes the playoffs for 2022 in the Western Conference
   TEAM  PLAYOFF  PREDICTION  PROBABILITY
14  UTA        1           1     0.980645
10  PHX        1           1     0.980516
2   GSW        1           1     0.966208
6   MEM        1           1     0.934063
13  SAS        0           1     0.808435
5   LAL        0           1     0.787030
1   DEN        1           1     0.749589
0   DAL        1           1     0.579765
4   LAC        0           1     0.530997
7   MIN        1           0    

In [45]:
###############
## SVM MODEL ##
###############

print("SVM MODEL FOR THE 2022 NBA SEASON IN THE WESTERN CONFERENCE")
print()
# Making SVM model.
west_svm = SVC(kernel="linear", probability=True)
# Fitting the data to the model.
west_svm.fit(x_train_normalized, np.ravel(y_train))
# Calculating the score from the x and y training data.
west_svm_train_score = west_svm.score(x_train_normalized, y_train)
# Printing the score for the training data.
print("Score for training data: " + str(west_svm_train_score))
# Printing the score for the test data.
west_svm_test_score = west_svm.score(x_test, y_test)
# Printing the score for the training data.
print("Score for testing data: " + str(west_svm_test_score))
print()
# Getting the names of the columns in the x training dataset.
train_feature_names_SVM = x_train.columns
print("Most important features for 2022 in the Western Conference")
# Calculating the coefficient for the SVM.
coefficient_svm = west_svm.coef_
# Getting the most important coefficient value which would be the first one in the list.
importance_svm = coefficient_svm[0]
# Taking the absolute value of this coeffficient.
abs_importance_svm = abs(importance_svm)
# Making a list of the feature names, the importance coeffeicient and the absolute value of this coefficient.
importance_list_svm = list(zip(train_feature_names_SVM, importance_svm, abs_importance_svm))
# Sorting this list according to the absolute values of the coefficient.
importance_list_svm.sort(key=lambda x: x[2], reverse=True)
for i in range(5):
    print(importance_list_svm[i])
print()
print("Predictions for which teams makes the playoffs for 2022 in the Western Conference")
# Getting the teams and whether they made it to the playoffs in our testing data. This will be the prediction that we would like to match.
west_predictions_2022_SVM = y_test_prediction[["TEAM", "PLAYOFF"]]
# Getting the probability of this predction, using the x data.
svm_probability = west_svm.predict_proba(x_test_normalized)[:, 1].tolist()
# Making a new predcition.
svm_prediction = west_svm.predict(x_test_normalized).tolist()
# Assigning the prediction value to the prediction column in the list.
west_predictions_2022_SVM["PREDICTION"] = svm_prediction
# Assigning the probabilities to the column in the list.
west_predictions_2022_SVM["PROBABILITY"] = svm_probability
# Sorting the list according to the probability values.
west_predictions_2022_SVM = west_predictions_2022_SVM.sort_values("PROBABILITY", ascending=False)
print(west_predictions_2022_SVM)

SVM MODEL FOR THE 2022 NBA SEASON IN THE WESTERN CONFERENCE

Score for training data: 0.831081081081081
Score for testing data: 0.6666666666666666

Most important features for 2022 in the Western Conference
('PM', 6.744170276898257, 6.744170276898257)
('FGPCT', 2.9210255416927127, 2.9210255416927127)
('TOV', -2.325008375441481, 2.325008375441481)
('FGA', -1.7192620654198394, 1.7192620654198394)
('AST', 1.571019523353721, 1.571019523353721)

Predictions for which teams makes the playoffs for 2022 in the Western Conference
   TEAM  PLAYOFF  PREDICTION   PROBABILITY
10  PHX        1           1  1.000000e+00
14  UTA        1           1  1.000000e+00
2   GSW        1           1  1.000000e+00
6   MEM        1           1  1.000000e+00
13  SAS        0           1  9.936167e-01
1   DEN        1           1  9.810084e-01
5   LAL        0           1  9.795443e-01
0   DAL        1           1  7.960683e-01
4   LAC        0           0  1.724981e-01
7   MIN        1           0  3.293828e-02
