## Import necessary libraries

In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

import joblib

## Read and Display NBA data

In [3]:
nba2015 = pd.read_csv(r"C:\Users\User\Downloads\nba_data_science_project\data\transformed_data\nba_2015.csv")
nba16_23 = pd.read_csv(r"C:\Users\User\Downloads\nba_data_science_project\data\transformed_data\nba_16-23.csv")

In [4]:
# Merge NBA 2015 and 2016 to 2023 season data

nbadata = pd.concat([nba2015, nba16_23], axis=0, ignore_index=True)
nbadata.head()

Unnamed: 0,season,team,elo_rating,points,L,W,W%,conf,playoffs_y_n
0,2015,GSW,1707.0,9016.0,15,67,81.71,west,1
1,2015,LAC,1648.0,8751.0,26,56,68.29,west,1
2,2015,HOU,1618.0,8522.0,26,56,68.29,west,1
3,2015,SAS,1672.0,8461.0,27,55,67.07,west,1
4,2015,MEM,1622.0,8062.0,27,55,67.07,west,1


## East Conference data

In [5]:
# Separate East conference data from original dataframe for ML model

eastdf = nbadata.loc[(nbadata['conf'] == 'east')]
eastdf.reset_index(inplace=True, drop=True)
eastdf.drop(columns=['season', 'team', 'playoffs_y_n', 'conf'], inplace=True)
eastdf.head()

Unnamed: 0,elo_rating,points,L,W,W%
0,1608.0,8409.0,22,60,73.17
1,1542.0,8457.0,29,53,64.63
2,1567.0,8265.0,32,50,60.98
3,1574.0,8527.0,33,49,59.76
4,1538.0,8080.0,36,46,56.1


In [6]:
# Normalize the dataframe values

scaler = MinMaxScaler(feature_range=(0,1))
scaledeastdf = scaler.fit_transform(eastdf)

In [7]:
eastdf1 = pd.DataFrame(scaledeastdf)
eastdata = nbadata.loc[(nbadata['conf'] == 'east')]
eastdata.reset_index(inplace=True, drop=True)
eastdf1 = eastdf1.merge(eastdata['season'], left_index=True, right_index=True)
eastdf1.head()

Unnamed: 0,0,1,2,3,4,season
0,0.871359,0.396852,0.0,1.0,1.0,2015
1,0.711165,0.417269,0.14,0.86,0.859931,2015
2,0.771845,0.335602,0.2,0.8,0.800066,2015
3,0.788835,0.447044,0.22,0.78,0.780056,2015
4,0.701456,0.256912,0.28,0.72,0.720026,2015


## Train-Test Split

In [8]:
# Split the dataframe into training and testing data

train_east = eastdf1.loc[eastdf1["season"] < 2023]
test_east = eastdf1.loc[eastdf1["season"] == 2023]

X_train_east = train_east.drop(columns=['season'])
X_test_east = test_east.drop(columns=['season'])

unscaledtrain_east = eastdata.loc[eastdata["season"] < 2023]
unscaledtest_east = eastdata.loc[eastdata["season"] == 2023]

y_train_east = unscaledtrain_east[['playoffs_y_n']]
y_test_east = unscaledtest_east[['playoffs_y_n']]

## Logistic Regression Model

In [9]:
# Create logistic regression instance and train the model
# Calculate model accuracy on training and testing data

print ("logreg - East")
logreg_east = LogisticRegression()
logreg_east.fit(X_train_east, y_train_east)

log_east_train_score = logreg_east.score(X_train_east, y_train_east)
log_east_test_score = logreg_east.score(X_test_east, y_test_east)

print(f"Training Data Score: {log_east_train_score}")
print(f"Testing Data Score: {log_east_test_score}")

logreg - East
Training Data Score: 0.8952380952380953
Testing Data Score: 0.9333333333333333


## Random Forest Classifier Model

In [10]:
# Create Random Forest Classifier instance and train the model
# Calculate model accuracy on training and testing data

print ("RFC - East")
rfc_east= RandomForestClassifier()
rfc_east.fit(X_train_east, y_train_east)

rfc_east_train_score = rfc_east.score(X_train_east, y_train_east)
rfc_east_test_score = rfc_east.score(X_test_east, y_test_east)

print(f"Training Data Score: {rfc_east_train_score}")
print(f"Testing Data Score: {rfc_east_test_score}")

RFC - East
Training Data Score: 1.0
Testing Data Score: 0.9333333333333333


In [11]:
# Present model accuracy results in a dataframe

east_scores = [["Train", log_east_train_score, rfc_east_train_score],
               ["Test", log_east_test_score, rfc_east_test_score]]
df_east_scores = pd.DataFrame(east_scores, columns = ['Type','Logistic', 'RFC']) 
df_east_scores

Unnamed: 0,Type,Logistic,RFC
0,Train,0.895238,1.0
1,Test,0.933333,0.933333


## Model Predictions

In [12]:
# Predict East conference 2023 season data
# Calculate probability of each team making it to playoffs

east_predictions_2023 = unscaledtest_east[["season", "team", "playoffs_y_n"]]

log_probability = logreg_east.predict_proba(X_test_east)[:,1].tolist()
rfc_probability = rfc_east.predict_proba(X_test_east)[:,1].tolist()

log_prediction = logreg_east.predict(X_test_east).tolist()
rfc_prediction = rfc_east.predict(X_test_east).tolist()

east_predictions_2023["prediction_log"] = log_prediction
east_predictions_2023["prob_log"] = log_probability
east_predictions_2023["prediction_rfc"] = rfc_prediction
east_predictions_2023["prob_rfc"] = rfc_probability

east_predictions_2023.sort_values("prob_log", ascending = False)
east_predictions_2023.sort_values("prob_rfc", ascending = False)

Unnamed: 0,season,team,playoffs_y_n,prediction_log,prob_log,prediction_rfc,prob_rfc
107,2023,PHI,1,1,0.885344,1,1.0
108,2023,CLE,1,1,0.824814,1,1.0
105,2023,MIL,1,1,0.930539,1,0.97
106,2023,BOS,1,1,0.934649,1,0.97
110,2023,BRK,1,1,0.649784,1,0.96
109,2023,NYK,1,1,0.717123,1,0.94
111,2023,MIA,1,1,0.621433,0,0.25
112,2023,ATL,0,1,0.524262,0,0.18
113,2023,TOR,0,0,0.491349,0,0.15
114,2023,CHI,0,0,0.445871,0,0.15


In [13]:
# Save trained models

#east_log = r'C:\Users\User\Downloads\nba_data_science_project\models\conference_qualifiers_models\east_log_model.h5'
#east_rfc = r'C:\Users\User\Downloads\nba_data_science_project\models\conference_qualifiers_models\east_rfc_model.h5'

#joblib.dump(logreg_east, east_log)
#joblib.dump(rfc_east, east_rfc)

['C:\\Users\\User\\Downloads\\nba_data_science_project\\models\\conference_qualifiers_models\\east_rfc_model.h5']