## Import necessary libraries

In [1]:
import warnings
warnings.simplefilter('ignore')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

import joblib

## Read and Display NBA data

In [3]:
nbaplayoffsdata = pd.read_csv(r"C:\Users\User\Downloads\nba_data_science_project\data\transformed_data\nba_playoffs.csv")

In [4]:
nbaplayoffsdata.head()

Unnamed: 0,season,team,elo_rating,points,L,W,W%,playoffs_winner
0,2015,GSW,1788.0,2169.0,5,16,76.19,1
1,2015,CLE,1676.0,1980.0,6,14,70.0,0
2,2015,WAS,1547.0,1011.0,4,6,60.0,0
3,2015,MEM,1611.0,1042.0,5,6,54.55,0
4,2015,HOU,1647.0,1839.0,8,9,52.94,0


## Train-Test Split

In [5]:
# Split the dataframe into training and testing data

train_data = nbaplayoffsdata.loc[nbaplayoffsdata["season"] < 2023]
test_data = nbaplayoffsdata.loc[nbaplayoffsdata["season"] == 2023]

xtrain_data = train_data.drop(columns=['season', 'team', 'playoffs_winner'])
xtest_data = test_data.drop(columns=['season', 'team', 'playoffs_winner'])

y_train = train_data[['playoffs_winner']]
y_test = test_data[['playoffs_winner']]

In [6]:
# Normalize the dataframe values

scaler = MinMaxScaler(feature_range=(0,1))

scaled_train_data = scaler.fit_transform(xtrain_data)
scaled_test_data = scaler.fit_transform(xtest_data)

X_train = pd.DataFrame(scaled_train_data)
X_test = pd.DataFrame(scaled_test_data)

## Logistic Regression Model

In [7]:
# Create logistic regression instance and train the model
# Calculate model accuracy on training and testing data

print ("logreg model")
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

logreg_train_score = logreg.score(X_train, y_train)
logreg_test_score = logreg.score(X_test, y_test)

print(f"Training Data Score: {logreg_train_score}")
print(f"Testing Data Score: {logreg_test_score}")

logreg model
Training Data Score: 0.9375
Testing Data Score: 0.9375


## Random Forest Classifier Model

In [8]:
# Create Random Forest Classifier instance and train the model
# Calculate model accuracy on training and testing data

print ("RFC model")
rfc= RandomForestClassifier()
rfc.fit(X_train, y_train)

rfc_train_score = rfc.score(X_train, y_train)
rfc_test_score = rfc.score(X_test, y_test)

print(f"Training Data Score: {rfc_train_score}")
print(f"Testing Data Score: {rfc_test_score}")

RFC model
Training Data Score: 1.0
Testing Data Score: 0.875


## Support Vector Machine Model

In [9]:
# Create Support Vector Classifier instance and train the model
# Calculate model accuracy on training and testing data

print ("SVM model")
svc= SVC(kernel='linear', probability=True)
svc.fit(X_train, y_train)

svc_train_score = svc.score(X_train, y_train)
svc_test_score = svc.score(X_test, y_test)

print(f"Training Data Score: {svc_train_score}")
print(f"Testing Data Score: {svc_test_score}")

SVM model
Training Data Score: 0.9464285714285714
Testing Data Score: 0.9375


In [10]:
# Present model accuracy results in a dataframe

model_scores = [["Train", logreg_train_score, rfc_train_score, svc_train_score],
               ["Test", logreg_test_score, rfc_test_score, svc_test_score]]
df_model_scores = pd.DataFrame(model_scores, columns = ['Type','Logistic', 'RFC', 'SVC']) 
df_model_scores

Unnamed: 0,Type,Logistic,RFC,SVC
0,Train,0.9375,1.0,0.946429
1,Test,0.9375,0.875,0.9375


## Model Predictions

In [11]:
# Predict NBA 2023 playoffs winner
# Calculate probability of each team winning the playoffs

playoffs_predictions_2023 = test_data[["season", "team", "playoffs_winner"]]

log_probability = logreg.predict_proba(X_test)[:,1].tolist()
rfc_probability = rfc.predict_proba(X_test)[:,1].tolist()
svc_probability = svc.predict_proba(X_test)[:,1].tolist()

log_prediction = logreg.predict(X_test).tolist()
rfc_prediction = rfc.predict(X_test).tolist()
svc_prediction = svc.predict(X_test).tolist()

playoffs_predictions_2023["prediction_log"] = log_prediction
playoffs_predictions_2023["prob_log"] = log_probability
playoffs_predictions_2023["prediction_rfc"] = rfc_prediction
playoffs_predictions_2023["prob_rfc"] = rfc_probability
playoffs_predictions_2023["prediction_svc"] = svc_prediction
playoffs_predictions_2023["prob_svc"] = svc_probability

playoffs_predictions_2023.sort_values("prob_log", ascending = False)
playoffs_predictions_2023.sort_values("prob_rfc", ascending = False)
playoffs_predictions_2023.sort_values("prob_svc", ascending = False)

Unnamed: 0,season,team,playoffs_winner,prediction_log,prob_log,prediction_rfc,prob_rfc,prediction_svc,prob_svc
112,2023,DEN,1,0,0.273205,0,0.2,0,0.513647
114,2023,PHI,0,0,0.117929,0,0.18,0,0.342847
113,2023,MIA,0,0,0.271319,1,0.91,0,0.22257
117,2023,PHO,0,0,0.071454,0,0.15,0,0.091314
116,2023,NYK,0,0,0.067841,0,0.15,0,0.081481
115,2023,BOS,0,0,0.214752,0,0.33,0,0.062751
120,2023,SAC,0,0,0.029277,0,0.05,0,0.029332
118,2023,LAL,0,0,0.093827,0,0.11,0,0.023126
119,2023,GSW,0,0,0.061384,0,0.0,0,0.016761
122,2023,MEM,0,0,0.022871,0,0.05,0,0.013521


In [13]:
# Save the trained models

#logreg_model = r'C:\Users\User\Downloads\nba_data_science_project\models\playoffs_prediction_models\playoffs_logreg_model.h5'
#rfc_model = r'C:\Users\User\Downloads\nba_data_science_project\models\playoffs_prediction_models\playoffs_rfc_model.h5'
#svc_model = r'C:\Users\User\Downloads\nba_data_science_project\models\playoffs_prediction_models\playoffs_svc_model.h5'

#joblib.dump(logreg, logreg_model)
#joblib.dump(rfc, rfc_model)
#joblib.dump(svc, svc_model)

['C:\\Users\\User\\Downloads\\nba_data_science_project\\models\\playoffs_prediction_models\\playoffs_svc_model.h5']

## ML Pipeline

In [14]:
# Create a ML Pipeline to scale the input data and train the chosen ML model

input = [('scale', MinMaxScaler()), ('model', RandomForestClassifier())]
pipe = Pipeline(input)
pipe.fit(xtrain_data, y_train)
predictions = pipe.predict(xtest_data)
pred_probability = pipe.predict_proba(xtest_data)[:,1].tolist()

# ML Pipeline Predictions

In [15]:
# Create dataframe to contain; actual data and predicted data

predictions_2023 = test_data[["season", "team", "playoffs_winner"]]
predictions_2023["prediction_rfc"] = predictions
predictions_2023["prob_rfc"] = pred_probability
predictions_2023.sort_values("prob_rfc", ascending = False)

Unnamed: 0,season,team,playoffs_winner,prediction_rfc,prob_rfc
113,2023,MIA,0,0,0.47
112,2023,DEN,1,0,0.08
114,2023,PHI,0,0,0.01
115,2023,BOS,0,0,0.01
116,2023,NYK,0,0,0.0
117,2023,PHO,0,0,0.0
118,2023,LAL,0,0,0.0
119,2023,GSW,0,0,0.0
120,2023,SAC,0,0,0.0
121,2023,ATL,0,0,0.0
