# Prediction (DataQuest Version)

In [98]:
import pandas as pd

In [99]:
# Get data
df_2020 = pd.read_csv("nba_games_2020.csv")
df_2021 = pd.read_csv("nba_games_2021.csv")
df_2022 = pd.read_csv("nba_games_2022.csv")
df_2023 = pd.read_csv("nba_games_2023.csv")
df_2024 = pd.read_csv("nba_games_2024 (1).csv")

In [100]:
# Concatenate all seasons in one df
df = pd.concat([df_2020,df_2021,df_2022,df_2023,df_2024])
df = df.sort_values("date")
df = df.reset_index(drop=True)
df

Unnamed: 0,season,date,home_team,home_total,away_team,away_total,mp,home_fg,home_fga,home_fg%,...,away_drb%,away_trb%,away_ast%,away_stl%,away_blk%,away_tov%,away_usg%,away_ortg,away_drtg,home_won
0,2020,2019-10-22,TOR,130,NOP,122,265.0,42.0,103.0,0.408,...,69.8,48.2,69.8,3.5,14.3,14.6,100.0,108.1,115.2,True
1,2020,2019-10-22,NOP,122,TOR,130,265.0,43.0,102.0,0.422,...,71.9,51.8,54.8,6.2,5.3,12.4,100.0,114.7,107.6,False
2,2020,2019-10-22,LAC,112,LAL,102,240.0,42.0,81.0,0.519,...,74.4,47.7,54.1,4.2,14.0,13.7,100.0,106.5,116.9,True
3,2020,2019-10-22,LAL,102,LAC,112,240.0,37.0,85.0,0.435,...,79.1,52.3,57.1,8.4,9.6,13.3,100.0,117.5,107.0,False
4,2020,2019-10-23,DEN,108,POR,100,240.0,34.0,81.0,0.420,...,76.1,51.6,36.1,9.0,8.2,17.1,100.0,100.1,108.1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12537,2024,2024-06-12,BOS,106,DAL,99,240.0,38.0,82.0,0.463,...,85.7,54.4,39.5,5.5,2.8,7.9,100.0,109.7,117.4,True
12538,2024,2024-06-14,BOS,84,DAL,122,240.0,29.0,80.0,0.363,...,90.7,62.7,45.7,7.6,5.1,7.4,100.0,131.7,90.7,False
12539,2024,2024-06-14,DAL,122,BOS,84,240.0,46.0,91.0,0.505,...,67.5,37.3,62.1,2.2,9.3,13.2,100.0,90.7,131.7,True
12540,2024,2024-06-17,BOS,106,DAL,88,240.0,38.0,89.0,0.427,...,65.1,40.7,51.4,4.6,8.0,13.4,100.0,101.3,122.1,True


In [101]:
# Adding column that indicates whether the home team wins the next game they play
def add_target(group):
    group["target"] = group["home_won"].shift(-1)
    return group

df = df.groupby("home_team", group_keys=False)[df.columns].apply(add_target)

In [102]:
# Assign NaN values in 'target' to be 2
df["target"][pd.isnull(df["target"])] = 2
df["target"] = df["target"].astype(int, errors="ignore")

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df["target"][pd.isnull(df["target"])] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnul

In [103]:
# Removing columns with NaN
nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]
valid_columns = df.columns[~df.columns.isin(nulls.index)]

# Resetting df to avoid accidental self-references
df = df[valid_columns].copy()

In [104]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = RidgeClassifier(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr,
                                n_features_to_select=50,
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )

In [105]:
# Removing non-numeric columns
removed_columns = ["season", "date", "home_won", "target", "home_team", "away_team"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [106]:
# Scaling so that everything is between 0 and 1
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [107]:
# Fitting the sfs model
sfs.fit(df[selected_columns], df["target"])

In [108]:
# Get a list of the 30 best predictors of 'target'
predictors = list(selected_columns[sfs.get_support()])
predictors

['home_total',
 'away_total',
 'mp',
 'home_fg',
 'home_fga',
 'home_fg%',
 'home_3p',
 'home_3pa',
 'home_3p%',
 'home_ft',
 'home_fta',
 'home_ft%',
 'home_orb',
 'home_drb',
 'home_trb',
 'home_ast',
 'home_stl',
 'home_blk',
 'home_tov',
 'home_pts',
 'home_ts%',
 'home_efg%',
 'home_3par',
 'home_ftr',
 'home_orb%',
 'home_drb%',
 'home_trb%',
 'home_ast%',
 'home_stl%',
 'home_blk%',
 'home_tov%',
 'home_usg%',
 'home_ortg',
 'home_drtg',
 'away_fg',
 'away_3p%',
 'away_orb',
 'away_drb',
 'away_trb',
 'away_stl',
 'away_tov',
 'away_pts',
 'away_3par',
 'away_orb%',
 'away_drb%',
 'away_trb%',
 'away_stl%',
 'away_usg%',
 'away_ortg',
 'away_drtg']

In [109]:
# Backtest so no future games are used to predict past games
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []

    seasons = sorted(data["season"].unique())

    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]

        model.fit(train[predictors], train["target"])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [110]:
# Predictions
predictions = backtest(df, rr, predictors)

In [111]:
# Getting the accuracy score of our model
from sklearn.metrics import accuracy_score

accuracy_score(predictions["actual"], predictions["prediction"])

0.5358404846037355

## Improving our model
We will add extra columns to try and improve the model through heuristics. There are the columns that we will add:
- Rolling averages over the past 5 games for each team
- The next opponent of each team

In [112]:
# Get selected columns
df_rolling = df[list(selected_columns) + ["home_won", "home_team", "season"]]

# Find rolling averages by team and season
def find_team_averages(team):
    rolling = team.loc[:, team.columns != 'home_team'].rolling(10).mean()
    return rolling

df_rolling = df_rolling.groupby(["home_team", "season"], group_keys=False)[df_rolling.columns].apply(find_team_averages)

# Creating the rolling averages columns
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis=1)

df = df.dropna() # Remove missing values (the first 5 games)

df

Unnamed: 0,season,date,home_team,home_total,away_team,away_total,mp,home_fg,home_fga,home_fg%,...,away_trb%_10,away_ast%_10,away_stl%_10,away_blk%_10,away_tov%_10,away_usg%_10,away_ortg_10,away_drtg_10,home_won_10,season_10
248,2020,2019-11-08,DET,0.339623,IND,0.396226,0.000000,0.409091,0.396552,0.440191,...,0.459722,0.555136,0.416038,0.185891,0.417668,0.0,0.434282,0.407766,0.4,2020.0
255,2020,2019-11-09,CHI,0.226415,HOU,0.443396,0.000000,0.363636,0.603448,0.260766,...,0.620278,0.504532,0.328774,0.327475,0.560424,0.0,0.389205,0.343011,0.3,2020.0
260,2020,2019-11-09,GSW,0.358491,OKC,0.415094,0.000000,0.477273,0.379310,0.538278,...,0.541667,0.492296,0.355189,0.216089,0.436749,0.0,0.499526,0.401673,0.2,2020.0
262,2020,2019-11-10,NYK,0.160377,CLE,0.358491,0.000000,0.227273,0.396552,0.217703,...,0.505833,0.513444,0.370283,0.199010,0.444876,0.0,0.421827,0.305496,0.2,2020.0
267,2020,2019-11-10,POR,0.509434,ATL,0.405660,0.333333,0.545455,0.706897,0.392344,...,0.523611,0.479758,0.356132,0.248515,0.406714,0.0,0.413879,0.412903,0.4,2020.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12537,2024,2024-06-12,BOS,0.339623,DAL,0.273585,0.000000,0.386364,0.327586,0.464115,...,0.449444,0.443353,0.288208,0.126485,0.365018,0.0,0.427046,0.557348,1.0,2024.0
12538,2024,2024-06-14,BOS,0.132075,DAL,0.490566,0.000000,0.181818,0.293103,0.224880,...,0.500556,0.441088,0.302358,0.128465,0.361837,0.0,0.456346,0.520072,0.9,2024.0
12539,2024,2024-06-14,DAL,0.490566,BOS,0.132075,0.000000,0.568182,0.482759,0.564593,...,0.406944,0.550755,0.347170,0.241089,0.328622,0.0,0.457888,0.510275,0.6,2024.0
12540,2024,2024-06-17,BOS,0.339623,DAL,0.169811,0.000000,0.386364,0.448276,0.377990,...,0.502500,0.422961,0.283019,0.132178,0.385512,0.0,0.444247,0.523536,0.9,2024.0


In [113]:
# Get the next opponent of teams
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

# Adding the columns to the df
def add_col(df, col_name):
    return df.groupby("home_team", group_keys=False).apply(lambda x: shift_col(x, col_name))

# Adding columns
df["home_team_next"] = add_col(df, "home_team")
df["away_team_next"] = add_col(df, "away_team")
df["date_next"] = add_col(df, "date")

df

  return df.groupby("home_team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("home_team", group_keys=False).apply(lambda x: shift_col(x, col_name))
  return df.groupby("home_team", group_keys=False).apply(lambda x: shift_col(x, col_name))


Unnamed: 0,season,date,home_team,home_total,away_team,away_total,mp,home_fg,home_fga,home_fg%,...,away_blk%_10,away_tov%_10,away_usg%_10,away_ortg_10,away_drtg_10,home_won_10,season_10,home_team_next,away_team_next,date_next
248,2020,2019-11-08,DET,0.339623,IND,0.396226,0.000000,0.409091,0.396552,0.440191,...,0.185891,0.417668,0.0,0.434282,0.407766,0.4,2020.0,DET,MIN,2019-11-11
255,2020,2019-11-09,CHI,0.226415,HOU,0.443396,0.000000,0.363636,0.603448,0.260766,...,0.327475,0.560424,0.0,0.389205,0.343011,0.3,2020.0,CHI,NYK,2019-11-12
260,2020,2019-11-09,GSW,0.358491,OKC,0.415094,0.000000,0.477273,0.379310,0.538278,...,0.216089,0.436749,0.0,0.499526,0.401673,0.2,2020.0,GSW,UTA,2019-11-11
262,2020,2019-11-10,NYK,0.160377,CLE,0.358491,0.000000,0.227273,0.396552,0.217703,...,0.199010,0.444876,0.0,0.421827,0.305496,0.2,2020.0,NYK,CHI,2019-11-12
267,2020,2019-11-10,POR,0.509434,ATL,0.405660,0.333333,0.545455,0.706897,0.392344,...,0.248515,0.406714,0.0,0.413879,0.412903,0.4,2020.0,POR,SAC,2019-11-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12537,2024,2024-06-12,BOS,0.339623,DAL,0.273585,0.000000,0.386364,0.327586,0.464115,...,0.126485,0.365018,0.0,0.427046,0.557348,1.0,2024.0,BOS,DAL,2024-06-14
12538,2024,2024-06-14,BOS,0.132075,DAL,0.490566,0.000000,0.181818,0.293103,0.224880,...,0.128465,0.361837,0.0,0.456346,0.520072,0.9,2024.0,BOS,DAL,2024-06-17
12539,2024,2024-06-14,DAL,0.490566,BOS,0.132075,0.000000,0.568182,0.482759,0.564593,...,0.241089,0.328622,0.0,0.457888,0.510275,0.6,2024.0,DAL,BOS,2024-06-17
12540,2024,2024-06-17,BOS,0.339623,DAL,0.169811,0.000000,0.386364,0.448276,0.377990,...,0.132178,0.385512,0.0,0.444247,0.523536,0.9,2024.0,,,


In [114]:
# Merging the dataframes to see whether both home and away teams that are meant to play each other in the next game are accurate
full = df.merge(df[rolling_cols + ["away_team_next", "date_next", "home_team"]],
                left_on=["home_team", "date_next"],
                right_on=["away_team_next", "date_next"])

full[["home_team_x", "away_team_next_x", "home_team_y", "away_team_next_y", "date_next"]]

Unnamed: 0,home_team_x,away_team_next_x,home_team_y,away_team_next_y,date_next
0,CHI,NYK,NYK,CHI,2019-11-12
1,NYK,CHI,CHI,NYK,2019-11-12
2,IND,OKC,OKC,IND,2019-11-12
3,ORL,PHI,PHI,ORL,2019-11-13
4,MIL,CHI,CHI,MIL,2019-11-14
...,...,...,...,...,...
11105,DAL,BOS,BOS,DAL,2024-06-12
11106,DAL,BOS,BOS,DAL,2024-06-14
11107,BOS,DAL,DAL,BOS,2024-06-14
11108,BOS,DAL,DAL,BOS,2024-06-17


In [115]:
# Remove more non-numeric columns
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns
removed_columns

['date',
 'home_team_x',
 'away_team',
 'home_team_next',
 'away_team_next_x',
 'date_next',
 'away_team_next_y',
 'home_team_y',
 'season',
 'date',
 'home_won',
 'target',
 'home_team',
 'away_team']

In [116]:
# Running the feature selection again
selected_columns = full.columns[~full.columns.isin(removed_columns)]
sfs.fit(full[selected_columns], full["target"])

In [117]:
# Getting a list of the new set of predictors
predictors = list(selected_columns[sfs.get_support()])
predictors

['home_total',
 'home_fga',
 'home_3p',
 'home_3pa',
 'home_fta',
 'home_ft%',
 'home_orb',
 'home_tov',
 'home_pts',
 'home_3par',
 'home_ftr',
 'home_drb%',
 'home_usg%',
 'away_3par',
 'away_orb%',
 'away_usg%',
 'home_fga_10_x',
 'home_fta_10_x',
 'home_ftr_10_x',
 'home_usg%_10_x',
 'home_drtg_10_x',
 'away_orb_10_x',
 'away_ast_10_x',
 'away_stl_10_x',
 'away_stl%_10_x',
 'away_usg%_10_x',
 'away_ortg_10_x',
 'home_won_10_x',
 'away_total_10_y',
 'mp_10_y',
 'home_3p%_10_y',
 'home_orb%_10_y',
 'home_drb%_10_y',
 'home_ast%_10_y',
 'home_usg%_10_y',
 'home_drtg_10_y',
 'away_ft_10_y',
 'away_fta_10_y',
 'away_drb_10_y',
 'away_blk_10_y',
 'away_pts_10_y',
 'away_ts%_10_y',
 'away_efg%_10_y',
 'away_orb%_10_y',
 'away_drb%_10_y',
 'away_ast%_10_y',
 'away_blk%_10_y',
 'away_usg%_10_y',
 'away_ortg_10_y',
 'home_won_10_y']

In [118]:
# Running the backtest model
predictions = backtest(full, rr, predictors)

In [119]:
# New accuracy score
accuracy_score(predictions["actual"], predictions["prediction"])

0.6069102237326537