In [192]:
import pandas as pd
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, make_scorer, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from lists import cols
from sklearn import tree
from sklearn.linear_model import RidgeClassifier
import matplotlib.pyplot as plt

In [193]:
games_df = pd.read_csv("nba_games.csv", index_col=0)
games_df.sort_values('date', inplace=True)
games_df.reset_index(drop=True, inplace=True)
games_df = games_df.drop(['mp.1', 'mp_MAX','mp_MAX.1', 'mp_MAX_OPP', 'mp_MAX_OPP.1'], axis=1)
games_df['won'] = games_df['won'].astype(int, errors= 'ignore')
games_df = games_df[cols]
games_df

Unnamed: 0,date,season,won,team,team_OPP,score,score_OPP,mp,fg,fga,...,drb%_MAX_OPP,trb%_MAX_OPP,ast%_MAX_OPP,stl%_MAX_OPP,blk%_MAX_OPP,tov%_MAX_OPP,usg%_MAX_OPP,ortg_MAX_OPP,drtg_MAX_OPP,home_OPP
0,2019-10-22,2020,1,LAC,LAL,112,102,240.0,42.0,81.0,...,74.4,47.7,54.1,4.2,14.0,25.0,100.0,300.0,125.0,0
1,2019-10-22,2020,0,NOP,TOR,122,130,265.0,43.0,102.0,...,71.9,51.8,54.8,6.2,5.3,22.2,100.0,158.0,114.0,1
2,2019-10-22,2020,1,TOR,NOP,130,122,265.0,42.0,103.0,...,69.8,48.2,69.8,7.9,14.3,25.0,100.0,146.0,124.0,0
3,2019-10-22,2020,0,LAL,LAC,102,112,240.0,37.0,85.0,...,79.1,52.3,57.1,8.4,9.6,22.2,100.0,203.0,116.0,1
4,2019-10-23,2020,1,MIA,MEM,120,101,240.0,40.0,85.0,...,73.8,44.0,91.9,15.6,15.1,30.4,100.0,136.0,115.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9335,2024-06-12,2024,0,BOS,DAL,106,99,240.0,38.0,82.0,...,85.7,54.4,39.5,5.5,8.2,50.0,100.0,205.0,124.0,1
9336,2024-06-14,2024,1,BOS,DAL,84,122,240.0,29.0,80.0,...,90.7,62.7,45.7,7.5,12.5,100.0,100.0,212.0,99.0,1
9337,2024-06-14,2024,0,DAL,BOS,122,84,240.0,46.0,91.0,...,67.5,37.3,62.1,2.2,16.4,50.0,100.0,200.0,138.0,0
9338,2024-06-17,2024,1,DAL,BOS,88,106,240.0,35.0,78.0,...,100.0,88.1,65.8,10.2,4.9,15.5,100.0,300.0,110.0,1


In [194]:
games_df['won'].value_counts()

won
1    4670
0    4670
Name: count, dtype: int64

In [195]:
games_df = games_df.dropna()

In [196]:
nulls = pd.isnull(games_df).sum()
print(nulls[nulls > 0])  

gdf = games_df.copy()

columns_to_drop = ['date', 'index_OPP']
gdf.drop(columns=columns_to_drop, inplace=True)
gdf.dropna(inplace=True)
removed_cols = ['won', 'team', 'season' ,'team_OPP']
selected = [col for col in gdf.columns if col not in removed_cols]
gdf_grouped = gdf.groupby(['season', 'team'])

scaler = MinMaxScaler()

def scale_group(group):
    group[selected] = scaler.fit_transform(group[selected])
    return group

gdf_scaled = gdf_grouped.apply(scale_group)
gdf_scaled.reset_index(drop=True, inplace=True)

gdf_scaled

Series([], dtype: int64)


  gdf_scaled = gdf_grouped.apply(scale_group)


Unnamed: 0,season,won,team,score,score_OPP,mp,fg,fga,fg%,3p,...,drb%_MAX_OPP,trb%_MAX_OPP,ast%_MAX_OPP,stl%_MAX_OPP,blk%_MAX_OPP,tov%_MAX_OPP,usg%_MAX_OPP,ortg_MAX_OPP,drtg_MAX_OPP,home_OPP
0,2020,1,ATL,0.507042,0.265823,0.0,0.583333,0.352941,0.732558,0.333333,...,0.449086,0.137874,0.512195,0.136986,0.289809,0.286339,0.0,0.055556,0.677419,1.0
1,2020,0,ATL,0.309859,0.253165,0.0,0.541667,0.294118,0.732558,0.200000,...,0.381201,0.117940,0.245779,0.308219,0.449045,0.180328,0.0,0.022222,0.435484,0.0
2,2020,0,ATL,0.309859,0.329114,0.0,0.250000,0.294118,0.410853,0.200000,...,0.535248,0.204319,0.557223,0.472603,0.614650,0.182514,0.0,0.038889,0.322581,0.0
3,2020,1,ATL,0.225352,0.417722,0.0,0.208333,0.264706,0.383721,0.333333,...,0.477807,0.332226,0.521576,0.445205,0.433121,0.562842,0.0,0.244444,0.258065,1.0
4,2020,1,ATL,0.225352,0.341772,0.0,0.250000,0.411765,0.333333,0.066667,...,0.130548,0.088040,0.801126,0.568493,0.531847,0.276503,0.0,0.277778,0.419355,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9331,2024,0,WAS,0.300000,0.235294,0.0,0.227273,0.379310,0.295337,0.000000,...,0.339888,0.206349,0.698565,0.063309,0.236434,0.126437,0.0,0.118644,0.461538,0.0
9332,2024,0,WAS,0.700000,0.666667,0.0,0.409091,0.413793,0.502591,0.687500,...,0.542135,0.176367,0.617225,0.077698,0.313953,0.137931,0.0,0.124294,0.615385,1.0
9333,2024,0,WAS,0.680000,0.666667,0.0,0.500000,0.413793,0.616580,0.937500,...,0.519663,0.149912,0.677033,0.089209,0.558140,0.425287,0.0,0.429379,0.897436,1.0
9334,2024,0,WAS,0.800000,0.647059,0.0,0.545455,0.310345,0.766839,0.812500,...,0.738764,0.275132,0.401914,0.083453,0.496124,0.106897,0.0,0.203390,0.794872,0.0


In [197]:
rr = RidgeClassifier(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=30, 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )

sfs.fit(gdf_scaled[selected], gdf_scaled["won"])

In [198]:
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["won"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["won"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

predictions = backtest(gdf_scaled, rr, selected)
accuracy_score(predictions["actual"], predictions["prediction"])

0.6138194150063586