In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing 
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('nbaHomeWinLossModelDataset.csv').drop(['Unnamed: 0'],axis=1)
data = data.dropna()
data.head(10)

Unnamed: 0,HOME_LAST_GAME_OE,HOME_LAST_GAME_HOME_WIN_PCTG,HOME_NUM_REST_DAYS,HOME_LAST_GAME_AWAY_WIN_PCTG,HOME_LAST_GAME_TOTAL_WIN_PCTG,HOME_LAST_GAME_ROLLING_SCORING_MARGIN,HOME_LAST_GAME_ROLLING_OE,HOME_W,SEASON,AWAY_LAST_GAME_OE,AWAY_LAST_GAME_HOME_WIN_PCTG,AWAY_NUM_REST_DAYS,AWAY_LAST_GAME_AWAY_WIN_PCTG,AWAY_LAST_GAME_TOTAL_WIN_PCTG,AWAY_LAST_GAME_ROLLING_SCORING_MARGIN,AWAY_LAST_GAME_ROLLING_OE
0,0.677419,0.361111,3.0,0.277778,0.319444,-4.0,0.583732,1,2020-21,0.59375,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
1,0.677419,0.361111,3.0,0.277778,0.319444,-4.0,0.583732,1,2020-21,0.59375,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
2,0.677419,0.361111,3.0,0.277778,0.319444,-4.0,0.583732,1,2020-21,0.59375,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
3,0.677419,0.361111,3.0,0.277778,0.319444,-4.0,0.583732,1,2020-21,0.59375,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
4,0.677419,0.361111,3.0,0.277778,0.319444,-4.0,0.583732,1,2020-21,0.59375,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
5,0.677419,0.361111,3.0,0.277778,0.319444,-4.0,0.583732,1,2020-21,0.59375,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
6,0.677419,0.361111,3.0,0.277778,0.319444,-4.0,0.583732,1,2020-21,0.59375,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
7,0.677419,0.361111,3.0,0.277778,0.319444,-4.0,0.583732,1,2020-21,0.59375,0.583333,2.0,0.583333,0.583333,2.333333,0.585312
8,0.677419,0.361111,3.0,0.277778,0.319444,-4.0,0.583732,1,2020-21,0.568966,0.583333,2.0,0.6,0.591549,-2.333333,0.567575
9,0.677419,0.361111,3.0,0.277778,0.319444,-4.0,0.583732,1,2020-21,0.59375,0.583333,2.0,0.583333,0.583333,2.333333,0.585312


In [3]:
validation = data[data['SEASON'] == '2024-25']  # Predicting 2024-25 season
modelData = data[data['SEASON'] != '2024-25'].sample(frac=1)  # Training on past seasons

In [4]:
X = modelData.drop(['HOME_W','SEASON'], axis=1)  # Features (independent variables)
y = modelData['HOME_W']  # Target variable (dependent variable)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Standard Scaling Prediction Variables
scaler = preprocessing.StandardScaler()

scaler.fit(X_train)
scaled_data_train = scaler.transform(X_train)

scaled_data_test = scaler.transform(X_test)

In [5]:
model = LogisticRegression()
model.fit(scaled_data_train, y_train)
model.score(scaled_data_test, y_test)

0.8670050235641411

In [6]:
#Instead of a single test set evaluation, this splits scaled_data_test into 12 folds,
#  training the model on 11 and testing on 1, repeating for all 12 combinations
F1Score = cross_val_score(model,scaled_data_test,y_test,cv=12,scoring='f1_macro')
print("Logistic Model F1 Accuracy: %0.2f (+/- %0.2f)"%(F1Score.mean(), F1Score.std() *2))

Logistic Model F1 Accuracy: 0.87 (+/- 0.01)


In [7]:
#Uses the trained logistic regression model to predict outcomes (HOME_W).
#y_pred contains 0s (losses) and 1s (wins) for the test set.
y_pred = model.predict(scaled_data_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.85      0.85     26403
           1       0.88      0.88      0.88     31524

    accuracy                           0.87     57927
   macro avg       0.87      0.87      0.87     57927
weighted avg       0.87      0.87      0.87     57927



In [8]:
#Validation Set review
#scaled_val_data = scaler.transform(validation.drop(['HOME_W','SEASON'],axis=1))
#Validation Set review

# Standard Scaling Prediction Variables
scaler = preprocessing.StandardScaler()
scaler.fit(validation.drop(['HOME_W','SEASON'],axis=1))
scaled_val_data = scaler.transform(validation.drop(['HOME_W','SEASON'],axis=1))

In [11]:
# Predict outcomes for the 2024-25 season
y_pred = model.predict(scaled_val_data)

# Print classification report for the validation set
print(classification_report(validation['HOME_W'], y_pred))

              precision    recall  f1-score   support

           0       0.53      0.55      0.54       422
           1       0.63      0.61      0.62       526

    accuracy                           0.58       948
   macro avg       0.58      0.58      0.58       948
weighted avg       0.58      0.58      0.58       948

