In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("nba_games.csv", index_col=0) # read in the CSV file and setting the first column as the index 
df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,38.0,72.0,0.528,16.0,37.0,0.432,18.0,21.0,...,26.2,26.8,155.0,123.0,MIA,107,1,2023,2022-11-07,True
1,240.0,240.0,40.0,84.0,0.476,14.0,39.0,0.359,13.0,15.0,...,41.0,37.3,160.0,121.0,POR,110,0,2023,2022-11-07,False
2,240.0,240.0,41.0,78.0,0.526,8.0,24.0,0.333,15.0,19.0,...,28.6,41.1,250.0,125.0,DAL,90,1,2023,2022-12-14,True
3,240.0,240.0,29.0,74.0,0.392,13.0,38.0,0.342,19.0,26.0,...,12.6,33.0,183.0,110.0,CLE,105,0,2023,2022-12-14,False
4,240.0,240.0,39.0,81.0,0.481,6.0,20.0,0.300,14.0,18.0,...,22.8,29.0,178.0,111.0,DAL,95,1,2016,2015-12-09,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20357,240.0,240.0,35.0,81.0,0.432,11.0,26.0,0.423,27.0,36.0,...,34.2,33.7,160.0,118.0,OKC,92,0,2019,2018-10-19,True
20358,240.0,240.0,37.0,74.0,0.500,13.0,25.0,0.520,26.0,37.0,...,25.0,30.0,139.0,129.0,ORL,108,1,2017,2016-12-14,True
20359,240.0,240.0,42.0,89.0,0.472,14.0,33.0,0.424,10.0,20.0,...,25.6,29.9,175.0,126.0,LAC,113,0,2017,2016-12-14,False
20360,240.0,240.0,41.0,85.0,0.482,9.0,26.0,0.346,26.0,30.0,...,27.7,27.1,150.0,126.0,MIA,106,1,2020,2020-09-19,True


In [3]:
df = df.sort_values("date") # sorting the DataFrame by date
df = df.reset_index(drop=True) # reseting a new index and dropping the old one

In [4]:
del df["mp.1"] # deleting columns that are not needed
del df["mp_opp.1"]
del df["index_opp"]

In [7]:
# this function creats a new column called 'target' which is created by shifting the "won" column up by one
# row, so that the target for each row is whether or not the team won the next game.
# this target column is what the machine learning model will be trained to predict
def add_target(group):
    group["target"] = group["won"].shift(-1)
    return group

df = df.groupby("team", group_keys=False).apply(add_target) # applying the above function

In [8]:
df["target"][pd.isnull(df["target"])] = 2 # handling null values
df["target"] = df["target"].astype(int, errors="ignore")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2


In [9]:
df["won"].value_counts() # data exploration

True     10181
False    10181
Name: won, dtype: int64

In [10]:
df["target"].value_counts() # data exploration

1    10167
0    10165
2       30
Name: target, dtype: int64

In [11]:
nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]

In [12]:
# select valid columns (those without nulls) and subset the DataFrame with these columns
valid_columns = df.columns[~df.columns.isin(nulls.index)]
valid_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp', 'team_opp', 'total_opp',
       'home_opp', 'season', 'date', 'won', 'target'],
      dtype='object', length=140)

In [13]:
df = df[valid_columns].copy()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,240.0,37.0,87.0,0.425,7.0,19.0,0.368,16.0,23.0,0.696,...,29.0,138.0,105.0,CLE,95,0,2016,2015-10-27,True,1
1,240.0,38.0,94.0,0.404,9.0,29.0,0.310,10.0,17.0,0.588,...,34.6,162.0,104.0,CHI,97,1,2016,2015-10-27,False,1
2,240.0,37.0,82.0,0.451,8.0,27.0,0.296,12.0,15.0,0.800,...,23.6,132.0,104.0,DET,106,0,2016,2015-10-27,False,1
3,240.0,37.0,96.0,0.385,12.0,29.0,0.414,20.0,26.0,0.769,...,33.8,258.0,121.0,ATL,94,1,2016,2015-10-27,True,1
4,240.0,41.0,96.0,0.427,9.0,30.0,0.300,20.0,22.0,0.909,...,38.9,201.0,120.0,NOP,95,0,2016,2015-10-27,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20357,240.0,42.0,85.0,0.494,13.0,27.0,0.481,21.0,28.0,0.750,...,34.8,207.0,130.0,PHO,102,0,2023,2023-05-09,True,2
20358,240.0,42.0,87.0,0.483,10.0,27.0,0.370,12.0,15.0,0.800,...,35.5,244.0,113.0,GSW,121,1,2023,2023-05-10,False,2
20359,240.0,35.0,71.0,0.493,13.0,34.0,0.382,29.0,40.0,0.725,...,29.9,135.0,131.0,MIA,103,0,2023,2023-05-10,True,2
20360,240.0,37.0,88.0,0.420,13.0,43.0,0.302,16.0,19.0,0.842,...,28.7,227.0,120.0,NYK,112,1,2023,2023-05-10,False,2


In [14]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [18]:
# initializing models
rr = RidgeClassifier(alpha=1)
tss = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv= tss, n_jobs=1)
scaler = MinMaxScaler()

In [19]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]
df[selected_columns] = scaler.fit_transform(df[selected_columns])
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won,target
0,0.0,0.391304,0.397059,0.373206,0.241379,0.227273,0.437055,0.348837,0.349206,0.645274,...,0.089744,0.232227,0.329412,CLE,0.276786,0.0,2016,2015-10-27,True,1
1,0.0,0.413043,0.500000,0.322967,0.310345,0.378788,0.368171,0.209302,0.253968,0.519253,...,0.161538,0.345972,0.317647,CHI,0.294643,1.0,2016,2015-10-27,False,1
2,0.0,0.391304,0.323529,0.435407,0.275862,0.348485,0.351544,0.255814,0.222222,0.766628,...,0.020513,0.203791,0.317647,DET,0.375000,0.0,2016,2015-10-27,False,1
3,0.0,0.391304,0.529412,0.277512,0.413793,0.378788,0.491686,0.441860,0.396825,0.730455,...,0.151282,0.800948,0.517647,ATL,0.267857,1.0,2016,2015-10-27,True,1
4,0.0,0.478261,0.529412,0.377990,0.310345,0.393939,0.356295,0.441860,0.333333,0.893816,...,0.216667,0.530806,0.505882,NOP,0.276786,0.0,2016,2015-10-27,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20357,0.0,0.500000,0.367647,0.538278,0.448276,0.348485,0.571259,0.465116,0.428571,0.708285,...,0.164103,0.559242,0.623529,PHO,0.339286,0.0,2023,2023-05-09,True,2
20358,0.0,0.500000,0.397059,0.511962,0.344828,0.348485,0.439430,0.255814,0.222222,0.766628,...,0.173077,0.734597,0.423529,GSW,0.508929,1.0,2023,2023-05-10,False,2
20359,0.0,0.347826,0.161765,0.535885,0.448276,0.454545,0.453682,0.651163,0.619048,0.679113,...,0.101282,0.218009,0.635294,MIA,0.348214,0.0,2023,2023-05-10,True,2
20360,0.0,0.391304,0.411765,0.361244,0.448276,0.590909,0.358670,0.348837,0.285714,0.815636,...,0.085897,0.654028,0.505882,NYK,0.428571,1.0,2023,2023-05-10,False,2


In [20]:
sfs.fit(df[selected_columns], df["target"]) # fitting SequentialFeatureSelector to find 30 best features

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=1),
                          n_features_to_select=30, n_jobs=1)

In [21]:
predictors = list(selected_columns[sfs.get_support()])

In [22]:
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season] # train data will be data before the current season
        test = data[data["season"] == season] # test data will be data in the current season
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [23]:
predictions = backtest(df, rr, predictors)

In [24]:
accuracy_score(predictions["actual"], predictions["prediction"])

0.5396930404869013

In [37]:
df.groupby('home').apply(lambda x:x[x['won'] == 1].shape[0] / x.shape[0])

home
0.0    0.424569
1.0    0.574662
dtype: float64

In [25]:
df_rolling = df[list(selected_columns) + ["won", "team", "season"]]

# this function will create rolling averages of 10 games for each team and each season and adds these as 
# new features to the DataFrame.
def find_team_averages(team):
    rolling = team.rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)

In [26]:
df_rolling

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,total_opp,home_opp,won,season
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20357,0.025,0.521739,0.422059,0.518660,0.400000,0.401515,0.450119,0.379070,0.328571,0.757176,...,0.0615,0.0739,0.375577,0.194103,0.451659,0.608235,0.386607,0.4,0.7,2023.0
20358,0.025,0.476087,0.427941,0.456459,0.331034,0.416667,0.358907,0.406977,0.352381,0.759627,...,0.0626,0.0653,0.397065,0.174615,0.409005,0.478824,0.387500,0.5,0.6,2023.0
20359,0.000,0.371739,0.330882,0.407416,0.327586,0.431818,0.346200,0.423256,0.401587,0.678646,...,0.0491,0.0628,0.627778,0.184615,0.405687,0.503529,0.322321,0.5,0.6,2023.0
20360,0.025,0.497826,0.414706,0.496890,0.482759,0.504545,0.456888,0.376744,0.334921,0.732789,...,0.0316,0.0647,0.284172,0.187436,0.540284,0.554118,0.415179,0.6,0.7,2023.0


In [27]:
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis=1)
df = df.dropna()
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,stl%_max_opp_10,blk%_max_opp_10,tov%_max_opp_10,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10
243,0.0,0.500000,0.382353,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.0628,0.0679,0.413522,0.125256,0.361611,0.449412,0.322321,0.4,0.8,2016.0
249,0.0,0.630435,0.426471,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.0613,0.0772,0.469497,0.220641,0.394787,0.531765,0.300893,0.5,1.0,2016.0
258,0.0,0.456522,0.500000,0.375598,0.379310,0.348485,0.483373,0.441860,0.396825,0.730455,...,0.0657,0.1032,0.437212,0.126026,0.404739,0.408235,0.398214,0.2,0.3,2016.0
263,0.0,0.304348,0.132353,0.500000,0.275862,0.272727,0.432304,0.581395,0.444444,0.879813,...,0.0741,0.0982,0.313312,0.181026,0.500000,0.471765,0.353571,0.5,0.4,2016.0
264,0.0,0.282609,0.235294,0.363636,0.344828,0.348485,0.439430,0.627907,0.476190,0.886814,...,0.0747,0.0742,0.303564,0.131667,0.387678,0.410588,0.325893,0.5,0.5,2016.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20357,0.0,0.500000,0.367647,0.538278,0.448276,0.348485,0.571259,0.465116,0.428571,0.708285,...,0.0615,0.0739,0.375577,0.194103,0.451659,0.608235,0.386607,0.4,0.7,2023.0
20358,0.0,0.500000,0.397059,0.511962,0.344828,0.348485,0.439430,0.255814,0.222222,0.766628,...,0.0626,0.0653,0.397065,0.174615,0.409005,0.478824,0.387500,0.5,0.6,2023.0
20359,0.0,0.347826,0.161765,0.535885,0.448276,0.454545,0.453682,0.651163,0.619048,0.679113,...,0.0491,0.0628,0.627778,0.184615,0.405687,0.503529,0.322321,0.5,0.6,2023.0
20360,0.0,0.391304,0.411765,0.361244,0.448276,0.590909,0.358670,0.348837,0.285714,0.815636,...,0.0316,0.0647,0.284172,0.187436,0.540284,0.554118,0.415179,0.6,0.7,2023.0


In [28]:
# adding information about the next game (like home team, opponent team, and date) as new features to the DataFrame
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

In [29]:
df

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,usg%_max_opp_10,ortg_max_opp_10,drtg_max_opp_10,total_opp_10,home_opp_10,won_10,season_10,home_next,team_opp_next,date_next
243,0.0,0.500000,0.382353,0.523923,0.344828,0.333333,0.457245,0.255814,0.238095,0.708285,...,0.125256,0.361611,0.449412,0.322321,0.4,0.8,2016.0,0.0,BOS,2015-11-13
249,0.0,0.630435,0.426471,0.645933,0.620690,0.515152,0.562945,0.325581,0.238095,0.927655,...,0.220641,0.394787,0.531765,0.300893,0.5,1.0,2016.0,1.0,BRK,2015-11-14
258,0.0,0.456522,0.500000,0.375598,0.379310,0.348485,0.483373,0.441860,0.396825,0.730455,...,0.126026,0.404739,0.408235,0.398214,0.2,0.3,2016.0,1.0,TOR,2015-11-15
263,0.0,0.304348,0.132353,0.500000,0.275862,0.272727,0.432304,0.581395,0.444444,0.879813,...,0.181026,0.500000,0.471765,0.353571,0.5,0.4,2016.0,0.0,CHO,2015-11-15
264,0.0,0.282609,0.235294,0.363636,0.344828,0.348485,0.439430,0.627907,0.476190,0.886814,...,0.131667,0.387678,0.410588,0.325893,0.5,0.5,2016.0,0.0,WAS,2015-11-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20357,0.0,0.500000,0.367647,0.538278,0.448276,0.348485,0.571259,0.465116,0.428571,0.708285,...,0.194103,0.451659,0.608235,0.386607,0.4,0.7,2023.0,,,
20358,0.0,0.500000,0.397059,0.511962,0.344828,0.348485,0.439430,0.255814,0.222222,0.766628,...,0.174615,0.409005,0.478824,0.387500,0.5,0.6,2023.0,,,
20359,0.0,0.347826,0.161765,0.535885,0.448276,0.454545,0.453682,0.651163,0.619048,0.679113,...,0.184615,0.405687,0.503529,0.322321,0.5,0.6,2023.0,,,
20360,0.0,0.391304,0.411765,0.361244,0.448276,0.590909,0.358670,0.348837,0.285714,0.815636,...,0.187436,0.540284,0.554118,0.415179,0.6,0.7,2023.0,,,


In [30]:
# merging the original DataFrame with the DataFrame containing rolling averages, based on team and date
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])
full

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,...,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,season_10_y,team_opp_next_y,team_y
0,0.00,0.456522,0.500000,0.375598,0.379310,0.348485,0.483373,0.441860,0.396825,0.730455,...,0.380294,0.274359,0.270616,0.478824,0.286607,0.6,0.7,2016.0,SAC,TOR
1,0.00,0.326087,0.250000,0.413876,0.310345,0.257576,0.509501,0.511628,0.412698,0.827305,...,0.437212,0.126026,0.404739,0.408235,0.398214,0.2,0.3,2016.0,TOR,SAC
2,0.25,0.478261,0.558824,0.356459,0.068966,0.212121,0.131829,0.325581,0.238095,0.927655,...,0.421593,0.134615,0.345498,0.454118,0.318750,0.4,0.7,2016.0,BRK,ATL
3,0.00,0.413043,0.397059,0.401914,0.137931,0.212121,0.263658,0.418605,0.365079,0.757293,...,0.468868,0.124231,0.332227,0.407059,0.343750,0.5,0.4,2016.0,ORL,MIN
4,0.25,0.521739,0.544118,0.416268,0.413793,0.454545,0.419240,0.186047,0.142857,0.883314,...,0.467505,0.277436,0.352607,0.482353,0.293750,0.7,0.6,2016.0,GSW,TOR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18079,0.25,0.478261,0.397059,0.483254,0.586207,0.606061,0.458432,0.348837,0.333333,0.681447,...,0.334591,0.138077,0.444076,0.574118,0.384821,0.6,0.8,2023.0,BOS,PHI
18080,0.00,0.434783,0.382353,0.440191,0.206897,0.318182,0.285036,0.441860,0.301587,1.000000,...,0.375681,0.159359,0.399052,0.523529,0.426786,0.5,0.5,2023.0,LAL,GSW
18081,0.00,0.456522,0.382353,0.468900,0.413793,0.560606,0.347981,0.186047,0.174603,0.708285,...,0.431971,0.243077,0.363507,0.503529,0.379464,0.5,0.7,2023.0,GSW,LAL
18082,0.00,0.391304,0.235294,0.521531,0.310345,0.363636,0.381235,0.395349,0.365079,0.708285,...,0.257966,0.193077,0.500948,0.552941,0.396429,0.5,0.8,2023.0,NYK,MIA


In [31]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,SAC,TOR,TOR,SAC,2015-11-15
1,TOR,SAC,SAC,TOR,2015-11-15
2,BRK,ATL,ATL,BRK,2015-11-17
3,ORL,MIN,MIN,ORL,2015-11-18
4,GSW,TOR,TOR,GSW,2015-11-17
...,...,...,...,...,...
18079,BOS,PHI,PHI,BOS,2023-05-09
18080,LAL,GSW,GSW,LAL,2023-05-10
18081,GSW,LAL,LAL,GSW,2023-05-10
18082,NYK,MIA,MIA,NYK,2023-05-10


In [32]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns
removed_columns

['team_x',
 'team_opp',
 'date',
 'team_opp_next_x',
 'date_next',
 'team_opp_next_y',
 'team_y',
 'season',
 'date',
 'won',
 'target',
 'team',
 'team_opp']

In [33]:
# feature selection, again, but this time with the new features added to the DataFrame.
selected_columns = full.columns[~full.columns.isin(removed_columns)]
sfs.fit(full[selected_columns], full["target"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RidgeClassifier(alpha=1),
                          n_features_to_select=30, n_jobs=1)

In [34]:
predictors = list(selected_columns[sfs.get_support()])
predictors

['mp',
 '3p%',
 'fta',
 'trb%',
 'usg%',
 'mp_opp',
 'trb%_opp',
 'usg%_opp',
 'ft%_max_opp',
 'fg_10_x',
 'ftr_10_x',
 'usg%_10_x',
 'pts_max_10_x',
 'trb%_max_10_x',
 'stl%_max_10_x',
 'blk_opp_10_x',
 'usg%_opp_10_x',
 'fg%_max_opp_10_x',
 '3p_max_opp_10_x',
 'stl_max_opp_10_x',
 'pts_max_opp_10_x',
 'won_10_x',
 'home_next',
 'ftr_10_y',
 'usg%_10_y',
 'pts_max_10_y',
 'fg%_opp_10_y',
 'trb_opp_10_y',
 'blk_max_opp_10_y',
 'won_10_y']

In [35]:
# making predictions/backtest again
# finding new accuracy score
predictions = backtest(full, rr, predictors)
accuracy_score(predictions["actual"], predictions["prediction"])

0.635821340690478