# Importing libraries

In [138]:
import pandas as pd

In [139]:
df = pd.read_csv("nba_games.csv", index_col=0)
df = df.sort_values("date")
df = df.reset_index(drop=True)

# Data Cleaning

In [140]:
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [141]:
def add_target(team):
    team["target"] = team["won"].shift(-1)
    return team

df = df.groupby("team", group_keys=False).apply(add_target)

In [142]:
df["target"][pd.isnull(df["target"])] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2


In [143]:
df["target"] = df["target"].astype(int, errors="ignore")

In [144]:
nulls = pd.isnull(df).sum()
nulls = nulls[nulls>0]

In [145]:
valid_columns = df.columns[~df.columns.isin(nulls.index)]

In [146]:
df = df[valid_columns].copy()

# Scaling Data

In [147]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]

In [148]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

df[selected_columns] = scaler.fit_transform(df[selected_columns])

# Feature Selection

In [149]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

In [101]:
rr = RidgeClassifier(alpha=1)
split = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(rr, n_features_to_select=30, direction="forward", cv=split)

In [103]:
sfs.fit(df[selected_columns], df["target"])

In [104]:
predictors = list(selected_columns[sfs.get_support()])

# Baseline Model

In [171]:
def backtest(df, model, predictors, start=2, step=1):
    all_predictions = []

    seasons=sorted(df["season"].unique())
    for i in range(start, len(seasons), step):
        season = seasons[i]

        train = df[df["season"]<season]
        test = df[df["season"]==season]

        model.fit(train[predictors], train["target"])

        preds =model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)

        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [151]:
predictions = backtest(df, rr, predictors)

# Baseline Model Accuracy
from sklearn.metrics import accuracy_score
predictions = predictions[predictions["actual"]!=2]
accuracy_score(predictions["actual"], predictions["prediction"])

0.5485110470701249

In [152]:
# Baseline accuracy based on home court advantage
# 57 percent of the matches were won by home team - We should look to beat this score
df.groupby("home").apply(lambda x:x[x["won"]==1].shape[0]/x.shape[0])

home
0.0    0.428314
1.0    0.571686
dtype: float64

In [153]:
df_rolling = df[list(selected_columns)+["won", "team", "season"]]

In [154]:
# Rolling averages method
def find_team_averages(team):
    rolling = team.rolling(10).mean()
    return rolling

# Performance of a given team in it's previous 10 games of a specific season - Mean or average of it's previous 10 performances
df_rolling = df_rolling.groupby(["team", "season"], group_keys=False).apply(find_team_averages)
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols

df = pd.concat([df, df_rolling], axis=1)

  rolling = team.rolling(10).mean()


In [155]:
df = df.dropna()

In [156]:
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

# Adding information of next game for a team to the previous row for it to be trained on it
# This is only for information we know in advanced such as home court or away court, next team opponent and next match date
df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

In [157]:
full = df.merge(df[rolling_cols+["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])

In [158]:
removed_columns = list(full.columns[full.dtypes=="object"])+removed_columns

In [160]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]

In [161]:
sfs.fit(full[selected_columns], full["target"])

In [162]:
predictors = list(selected_columns[sfs.get_support()])

In [172]:
predictions = backtest(full, rr, predictors)

In [174]:
# This is a significant improvement over our baseline model
accuracy_score(predictions["actual"], predictions["prediction"])

0.6296296296296297