## Import necessary libraries

In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import joblib

## Read and Display NBA data

In [3]:
nba2015 = pd.read_csv(r"C:\Users\User\Downloads\nba_data_science_project\data\transformed_data\nba_2015.csv")
nba16_23 = pd.read_csv(r"C:\Users\User\Downloads\nba_data_science_project\data\transformed_data\nba_16-23.csv")

In [4]:
# Merge NBA 2015 and 2016 to 2023 season data

nbadata = pd.concat([nba2015, nba16_23], axis=0, ignore_index=True)
nbadata.head()

Unnamed: 0,season,team,elo_rating,points,L,W,W%,conf,playoffs_y_n
0,2015,GSW,1707.0,9016.0,15,67,81.71,west,1
1,2015,LAC,1648.0,8751.0,26,56,68.29,west,1
2,2015,HOU,1618.0,8522.0,26,56,68.29,west,1
3,2015,SAS,1672.0,8461.0,27,55,67.07,west,1
4,2015,MEM,1622.0,8062.0,27,55,67.07,west,1


## West Conference data

In [5]:
# Separate West conference data from original dataframe for ML model

westdf = nbadata.loc[(nbadata['conf'] == 'west')]
westdf.reset_index(inplace=True, drop=True)
westdf.drop(columns=['season', 'team', 'playoffs_y_n', 'conf'], inplace=True)
westdf.head()

Unnamed: 0,elo_rating,points,L,W,W%
0,1707.0,9016.0,15,67,81.71
1,1648.0,8751.0,26,56,68.29
2,1618.0,8522.0,26,56,68.29
3,1672.0,8461.0,27,55,67.07
4,1622.0,8062.0,27,55,67.07


In [6]:
d = nba16_23.loc[(nba16_23['conf'] == 'west') & (nba16_23["season"] == 2021)]
d

Unnamed: 0,season,team,elo_rating,points,L,W,W%,conf,playoffs_y_n
120,2021,UTA,1647.0,8382.0,20,52,72.22,west,1
121,2021,PHO,1611.0,8300.0,21,51,70.83,west,1
122,2021,DEN,1603.0,8284.0,25,47,65.28,west,1
123,2021,LAC,1619.0,8209.0,25,47,65.28,west,1
124,2021,LAL,1628.0,7990.0,30,43,58.9,west,1
125,2021,DAL,1548.0,8096.0,30,42,58.33,west,1
126,2021,POR,1519.0,8360.0,30,42,58.33,west,1
127,2021,MEM,1540.0,8374.0,34,40,54.05,west,0
128,2021,GSW,1471.0,8399.0,35,39,52.7,west,0
129,2021,SAS,1500.0,8094.0,40,33,45.21,west,0


In [7]:
# Normalize the dataframe values

scaler = MinMaxScaler(feature_range=(0,1))
scaledwestdf = scaler.fit_transform(westdf)

In [8]:
westdf1 = pd.DataFrame(scaledwestdf)
westdata = nbadata.loc[(nbadata['conf'] == 'west')]
westdata.reset_index(inplace=True, drop=True)
westdf1 = westdf1.merge(westdata['season'], left_index=True, right_index=True)
westdf1.head()

Unnamed: 0,0,1,2,3,4,season
0,0.815109,0.622754,0.105263,0.894737,0.894835,2015
1,0.697813,0.50941,0.298246,0.701754,0.70177,2015
2,0.638171,0.411463,0.298246,0.701754,0.70177,2015
3,0.745527,0.385372,0.315789,0.684211,0.684218,2015
4,0.646123,0.214713,0.315789,0.684211,0.684218,2015


## Train-Test Split

In [9]:
# Split the dataframe into training and testing data

train_west = westdf1.loc[westdf1["season"] < 2023]
test_west = westdf1.loc[westdf1["season"] == 2023]

X_train_west = train_west.drop(columns=['season'])
X_test_west = test_west.drop(columns=['season'])

unscaledtrain_west = westdata.loc[westdata["season"] < 2023]
unscaledtest_west = westdata.loc[westdata["season"] == 2023]

y_train_west = unscaledtrain_west[['playoffs_y_n']]
y_test_west = unscaledtest_west[['playoffs_y_n']]

## Logistic Regression Model

In [10]:
# Create logistic regression instance and train the model
# Calculate model accuracy on training and testing data

print ("logreg - West")
logreg_west = LogisticRegression()
logreg_west.fit(X_train_west, y_train_west)

log_west_train_score = logreg_west.score(X_train_west, y_train_west)
log_west_test_score = logreg_west.score(X_test_west, y_test_west)

print(f"Training Data Score: {log_west_train_score}")
print(f"Testing Data Score: {log_west_test_score}")

logreg - West
Training Data Score: 0.9238095238095239
Testing Data Score: 0.8666666666666667


## Random Forest Classifier Model

In [11]:
# Create Random Forest Classifier instance and train the model
# Calculate model accuracy on training and testing data

print ("RFC - West")
rfc_west= RandomForestClassifier()
rfc_west.fit(X_train_west, y_train_west)

rfc_west_train_score = rfc_west.score(X_train_west, y_train_west)
rfc_west_test_score = rfc_west.score(X_test_west, y_test_west)

print(f"Training Data Score: {rfc_west_train_score}")
print(f"Testing Data Score: {rfc_west_test_score}")

RFC - West
Training Data Score: 1.0
Testing Data Score: 0.8666666666666667


In [12]:
# Present model accuracy results in a dataframe

west_scores = [["Train", log_west_train_score, rfc_west_train_score],
               ["Test", log_west_test_score, rfc_west_test_score]]
df_west_scores = pd.DataFrame(west_scores, columns = ['Type','Logistic', 'RFC']) 
df_west_scores

Unnamed: 0,Type,Logistic,RFC
0,Train,0.92381,1.0
1,Test,0.866667,0.866667


## Model Predictions

In [13]:
# Predict West conference 2023 season data
# Calculate probability of each team making it to playoffs

west_predictions_2023 = unscaledtest_west[["season", "team", "playoffs_y_n"]]

log_probability = logreg_west.predict_proba(X_test_west)[:,1].tolist()
rfc_probability = rfc_west.predict_proba(X_test_west)[:,1].tolist()

log_prediction = logreg_west.predict(X_test_west).tolist()
rfc_prediction = rfc_west.predict(X_test_west).tolist()

west_predictions_2023["prediction_log"] = log_prediction
west_predictions_2023["prob_log"] = log_probability
west_predictions_2023["prediction_rfc"] = rfc_prediction
west_predictions_2023["prob_rfc"] = rfc_probability

west_predictions_2023.sort_values("prob_log", ascending = False)
west_predictions_2023.sort_values("prob_rfc", ascending = False)

Unnamed: 0,season,team,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc
105,2023,DEN,1,1,0.789798,1,1.0
106,2023,MEM,1,1,0.771724,1,1.0
107,2023,SAC,1,1,0.63205,1,0.87
110,2023,LAC,1,0,0.494768,1,0.58
111,2023,LAL,1,0,0.469566,1,0.54
112,2023,MIN,0,0,0.457171,0,0.43
109,2023,GSW,1,1,0.575525,0,0.39
108,2023,PHO,1,1,0.584972,0,0.17
113,2023,NOP,0,0,0.46815,0,0.1
115,2023,DAL,0,0,0.359809,0,0.02


In [14]:
# Save trained models

#west_log = r'C:\Users\User\Downloads\nba_data_science_project\models\conference_qualifiers_models\west_log_model.h5'
#west_rfc = r'C:\Users\User\Downloads\nba_data_science_project\models\conference_qualifiers_models\west_rfc_model.h5'

#joblib.dump(logreg_west, west_log)
#joblib.dump(rfc_west, west_rfc)

['C:\\Users\\User\\Downloads\\nba_data_science_project\\models\\conference_qualifiers_models\\west_rfc_model.h5']