In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import yaml
from IPython.display import Markdown, display
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from scipy.stats import linregress
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the run configuration (data location and Colab/Drive switch) from config.yaml.
with open("config.yaml", "r") as f:
    # An empty YAML file parses to None; fall back to an empty mapping so .get() works.
    config = yaml.safe_load(f) or {}

DATA_PATH = config.get("data_path")
if DATA_PATH is None:
    # Fail fast: every later load calls os.path.join(DATA_PATH, ...), which would
    # otherwise stop with an unrelated TypeError far from the real cause.
    raise ValueError("ERROR: No data path provided")
USE_DRIVE = bool(config.get("use_drive", False))

In [None]:
# load from drive if requested
# USE_DRIVE comes from the `use_drive` key of config.yaml. The google.colab import
# sits inside the branch, so it is only executed when Drive access was requested.
if USE_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive')

# Prepare Data

In [None]:
# Load the per-team game log and index it chronologically by game date.
team_stat_df = (
    pd.read_csv(os.path.join(DATA_PATH, "important_features.csv"))
    .assign(gameDate=lambda frame: pd.to_datetime(frame["gameDate"]))
    .set_index("gameDate")
    # Sort only once gameDate is the index: sorting beforehand merely ordered the
    # default integer index and left the rows in file order.
    .sort_index(ascending=True)
)

In [None]:
# Preview the first rows to sanity-check the load and the gameDate index.
team_stat_df.head()

In [None]:
# Put the rows in chronological order (the index is gameDate)
team_stat_df = team_stat_df.sort_index(ascending=True)

# Cross-correlation vs. win

In [None]:
# Lagged cross-correlation of each stat with the game outcome: for k = 1..max_lag,
# corr(stat shifted by k rows, win), drawn as one stem plot per stat.
# NOTE(review): shift() runs over the whole frame; the data carries a teamName column,
# so a k-row lag may pair games of different teams -- confirm this is intended.
max_lag = 20
cols = [
    "teamScore",
    "opponentScore",
    "threePointersPercentage",
    "freeThrowsPercentage",
    "assistsPerPossession",
    "blocksPerPossession",
    "stealsPerPossession",
    "threePointersAttemptedPerPossession",
    "freeThrowsAttemptedPerPossession",
    "reboundsDefensivePerPossession",
    "reboundsOffensivePerPossession",
    "foulsPersonalPerPossession",
    "turnoversPerPossession",
    "effectiveFieldGoalPercentage",
    "trueShootingPercentage"
]

# The +/- 2/sqrt(N) reference band does not depend on the stat, so compute it once.
N = team_stat_df.shape[0] - max_lag
sig = 2 / np.sqrt(N)
lag_range = range(1, max_lag + 1)

for col in cols:
    cors = [
        team_stat_df[col].shift(lag).corr(team_stat_df["win"])
        for lag in lag_range
    ]

    plt.figure(figsize=(8,4))
    # No `use_line_collection` argument: it was removed from plt.stem in Matplotlib 3.8
    # (line collections are always used now) and passing it raises a TypeError.
    plt.stem(lag_range, cors)
    plt.axhline(+sig, color='gray', linestyle='--', label=r'$+2/\sqrt{N}$')
    plt.axhline(-sig, color='gray', linestyle='--', label=r'$-2/\sqrt{N}$')
    plt.xlabel("Lag (games)")
    # Balanced `$` pairs so mathtext renders the subscripts; the previous label had an
    # odd number of `$` and was therefore shown as raw text.
    plt.ylabel(f"Correlation ({col}$_{{t-k}}$ vs win$_t$)")
    plt.title(f"Cross-correlation: {col} vs. win")
    plt.legend()
    plt.show()

In [None]:
# For every stat, list the lags whose |correlation with win| exceeds 2/sqrt(N),
# then propose the largest such lag as the look-back window.
n = len(team_stat_df)
max_lag = 20
N = n - max_lag
thr = 2 / np.sqrt(N)

recommended = {}

for feat in cols:
    cors = [
        team_stat_df[feat].shift(k).corr(team_stat_df["win"])
        for k in range(1, max_lag + 1)
    ]
    # lags are 1-based, so enumerate from 1 instead of adding 1 afterwards
    recommended[feat] = [k for k, c in enumerate(cors, start=1) if abs(c) > thr]

print("Significant lags by feature:")
for feat, lags in recommended.items():
    print(f"  - {feat:8s}: lags = {lags or ['none']}")

all_lags = [lag for feat_lags in recommended.values() for lag in feat_lags]
if not all_lags:
    print("\nNo feature shows significant lagged correlation up to k =", max_lag)
else:
    X = max(all_lags)
    print(f"\nRecommended window size X = {X} (max across all stats)")

# "Random" Model (Home Team win probability)

In [None]:
# Baseline: share of home games each team wins, averaged over teams.
home_results = team_stat_df.loc[team_stat_df["home"] == 1].groupby("teamName")["win"]
home_wins = home_results.sum()
home_games = home_results.count()
home_win_pct = home_wins / home_games
mean_home_win_pct = home_win_pct.mean()
print(f"Mean home win percentage: {mean_home_win_pct:.2f}")

# Logistic Regression to evaluate window size

In [None]:
def make_team_lags(df, feature_cols, window):
    """
    Return the features plus their 1..`window` game lags, computed per team and season.

    Rows are grouped by (teamName, season) and ordered by gameDate inside each group,
    so a lag never reaches into another team or another season. The output keeps
    `feature_cols`, 'home' and 'win' followed by one `<feat>_lag<k>` column per
    feature and lag, and drops rows whose deepest lag (`_lag<window>`) is missing.
    """
    per_group = []
    for _, games in df.groupby(['teamName', 'season'], sort=False):
        games = games.sort_values('gameDate')
        # All lagged series are built first and joined in a single concat
        lagged = [
            games[feat].shift(k).rename(f"{feat}_lag{k}")
            for feat in feature_cols
            for k in range(1, window + 1)
        ]
        originals = games[feature_cols + ['home', 'win']]
        per_group.append(pd.concat([originals, *lagged], axis=1))

    stacked = pd.concat(per_group, axis=0)
    deepest = [f"{feat}_lag{window}" for feat in feature_cols]
    return stacked.dropna(subset=deepest)



In [None]:
def make_lags(df, features, W):
    """
    Append `<feat>_lag<k>` columns (k = 1..W) to every column of `df`.

    Lags are computed inside each (teamName, season) group, ordered by gameDate, so
    they never cross a team or season boundary. Unlike `make_team_lags`, all original
    columns are kept and rows with a missing value in ANY column are dropped.

    The lagged series are collected and joined with a single concat per group;
    inserting them one at a time fragments the frame and triggers pandas'
    PerformanceWarning for larger windows.
    """
    parts = []
    for _, g in df.groupby(["teamName", "season"], sort=False):
        g = g.sort_values("gameDate")
        lagged = [
            g[feat].shift(k).rename(f"{feat}_lag{k}")
            for feat in features
            for k in range(1, W + 1)
        ]
        parts.append(pd.concat([g, *lagged], axis=1))
    return pd.concat(parts).dropna()

In [None]:
# Reload the game log in team-major, chronological order and tag each game with its season.
df = (
    pd.read_csv(os.path.join(DATA_PATH, "important_features.csv"))
    .assign(gameDate=lambda frame: pd.to_datetime(frame['gameDate']))
    .sort_values(['teamName', 'gameDate'])
)
# A season is labelled by the calendar year in which it starts: games played from
# October onward belong to that year, earlier months to the previous year.
game_year = df['gameDate'].dt.year
df['season'] = np.where(df['gameDate'].dt.month >= 10, game_year, game_year - 1)

# Own-team stats that get lagged
features = [
    'teamScore',
    'threePointersPercentage',
    'freeThrowsPercentage',
    'assistsPerPossession',
    'blocksPerPossession',
    'stealsPerPossession',
    'threePointersAttemptedPerPossession',
    'freeThrowsAttemptedPerPossession',
    'reboundsDefensivePerPossession',
    'reboundsOffensivePerPossession',
    'foulsPersonalPerPossession',
    'turnoversPerPossession',
    'effectiveFieldGoalPercentage',
    'trueShootingPercentage',
]
target_col = 'win'

# Hold out every season from 2023 onward for testing
train_df = df.loc[df['season'] < 2023]
test_df = df.loc[df['season'] >= 2023]

In [None]:
# Logistic regression on lagged own-team stats: for each look-back window, tune C with
# time-ordered cross-validation on the training seasons, then score on the held-out seasons.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf',    LogisticRegression(solver='liblinear', max_iter=1000))
])

windows = [1, 3, 5, 10, 15, 20, 30]
C = [0.01, 0.1, 1, 10, 100]
tscv = TimeSeriesSplit(n_splits=5)
scoring = {'auc': 'roc_auc', 'accuracy': 'accuracy'}

results = []
best_models = {}
for W in windows:
    lag_cols = [f"{f}_lag{k}" for f in features for k in range(1,W+1)] + ['home']

    dfW = make_team_lags(train_df, features, W)
    # make_team_lags returns the rows team by team, but TimeSeriesSplit cuts folds by
    # row position and therefore needs chronological order. The lag frame no longer
    # has gameDate, so look the dates up through the shared index labels.
    # (Assumes train_df has unique index labels, as produced by read_csv.)
    chrono_index = (
        train_df.loc[dfW.index, 'gameDate']
        .sort_values(kind='mergesort')
        .index
    )
    dfW = dfW.loc[chrono_index]
    X_train  = dfW[lag_cols]
    y_train  = dfW[target_col]

    gs = GridSearchCV(pipe,
                      {'clf__C': C},
                      cv=tscv,
                      scoring='roc_auc',
                      n_jobs=-1)
    gs.fit(X_train, y_train)
    best_models[W] = gs.best_estimator_

    # Row order is irrelevant for scoring, so the test frame is used as returned.
    dfWt    = make_team_lags(test_df, features, W)
    X_test  = dfWt[lag_cols]
    y_test  = dfWt[target_col]
    y_prob  = gs.predict_proba(X_test)[:,1]
    y_pred  = (y_prob >= .5).astype(int)

    results.append({
        'window':   W,
        'test_AUC': roc_auc_score(y_test, y_prob),
        'test_acc': accuracy_score(y_test, y_pred),
    })


res_df = pd.DataFrame(results).set_index("window")
print(res_df)

In [None]:
from sklearn.metrics import roc_curve, auc

# Diagnostics (confusion matrix and ROC curve) for the window with the best test AUC.
bestW = res_df["test_AUC"].idxmax()
model = best_models[bestW]

# Build the test set with make_team_lags, the same builder that produced res_df.
# make_lags drops rows with a missing value in ANY column, so it can yield a different
# sample and the plots would then not match the AUC shown in the legend.
best_lag_cols = [f"{f}_lag{k}" for f in features for k in range(1,bestW+1)] + ["home"]
dfWb = make_team_lags(test_df, features, bestW)
Xb   = dfWb[best_lag_cols]
yb   = dfWb[target_col]
y_prob = model.predict_proba(Xb)[:,1]
y_pred = (y_prob >= 0.5).astype(int)

# Confusion matrix
cm = confusion_matrix(yb, y_pred)
plt.figure(figsize=(5,4))
plt.imshow(cm, cmap="Blues", interpolation="nearest")
plt.title(f"Confusion Matrix (W={bestW})")
plt.xticks([0,1], ["Loss","Win"])
plt.yticks([0,1], ["Loss","Win"])
for i in (0,1):
    for j in (0,1):
        # white text on dark cells, black on light ones
        plt.text(j, i, cm[i,j], ha="center", va="center",
                 color="white" if cm[i,j] > cm.max()/2 else "black")
plt.xlabel("Predicted"); plt.ylabel("Actual")
plt.colorbar()
plt.tight_layout()
plt.show()

# ROC curve
fpr, tpr, _ = roc_curve(yb, y_prob)
plt.figure(figsize=(5,4))
plt.plot(fpr, tpr, label=f"AUC = {res_df.loc[bestW,'test_AUC']:.3f}")
plt.plot([0,1],[0,1],"--", color="gray")
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.tight_layout()
plt.show()

Based on the AUC, the best window size is 5-10 games; larger windows do not yield a significant increase in AUC. We will loosely base our future models on these window sizes. Also, using only a team's own stats, it is basically impossible to predict the outcome of a game: the model does about as well as simply using the home-team winning percentage. Next, we will include opponent stats to get a better model.

# Include Opponent Averages

In [None]:
# Reload the raw game log for the opponent-aware models.
base = pd.read_csv(os.path.join(DATA_PATH, "important_features.csv"))
# Parse the dates as the other loads in this notebook do, so that later
# sort_values('gameDate') calls order games chronologically instead of by string.
base['gameDate'] = pd.to_datetime(base['gameDate'])

In [None]:
# Stats that are mirrored onto each row as the opponent's values for the same game.
opp_cols = [
    'teamScore',
    'threePointersPercentage',
    'freeThrowsPercentage',
    'assistsPerPossession',
    'blocksPerPossession',
    'stealsPerPossession',
    'threePointersAttemptedPerPossession',
    'freeThrowsAttemptedPerPossession',
    'reboundsDefensivePerPossession',
    'reboundsOffensivePerPossession',
    'foulsPersonalPerPossession',
    'turnoversPerPossession',
    'effectiveFieldGoalPercentage',
    'trueShootingPercentage',
]

# Re-key each team's row by the name its opponent knows it under, prefixing the stats
opp = base[['gameId', 'teamName'] + opp_cols].rename(
    columns={'teamName': 'opponentTeamName', **{c: f"opp_{c}" for c in opp_cols}}
)

# One row per (game, team), now carrying both sides' stats
df_full = pd.merge(base, opp, how='left', on=['gameId', 'opponentTeamName'])

df_full.head()



In [None]:
# Order the merged frame team by team, chronologically within each team.
# NOTE(review): this rebinds `df`; the frame previously held under that name (with the
# derived `season` column) is replaced -- confirm no later cell still expects it.
df = df_full.sort_values(['teamName','gameDate'])
df.head()

In [None]:
# Own-team stats, followed by the same stats for the opponent (prefixed with `opp_`).
_own_team_stats = [
    'teamScore',
    'threePointersPercentage',
    'freeThrowsPercentage',
    'assistsPerPossession',
    'blocksPerPossession',
    'stealsPerPossession',
    'threePointersAttemptedPerPossession',
    'freeThrowsAttemptedPerPossession',
    'reboundsDefensivePerPossession',
    'reboundsOffensivePerPossession',
    'foulsPersonalPerPossession',
    'turnoversPerPossession',
    'effectiveFieldGoalPercentage',
    'trueShootingPercentage',
]
features = _own_team_stats + [f'opp_{stat}' for stat in _own_team_stats]

# CV on different window sizes

In [None]:
def evaluate_window(window, df=base):
    """
    Cross-validated accuracy of a logistic regression on `window`-game rolling averages.

    For every team, each stat in the module-level `features` list is averaged over the
    team's previous `window` games (the current game is excluded via shift()). Each row
    receives its own team's averages and, under an `_opp` suffix, the opponent's
    averages for the same game. Rows with a missing feature or target are dropped, the
    rest are ordered by gameDate and scored with a 5-fold TimeSeriesSplit.

    Parameters:
        window (int): number of previous games in the rolling mean
        df (pd.DataFrame): game log with gameId, teamName, opponentTeamName, gameDate,
            home, win and the `features` columns. The default is the `base` frame as
            it existed when this function was defined.

    Returns:
        (mean_accuracy, std_accuracy) over the CV folds
    """
    ordered = df.sort_values(['teamName','gameDate'])
    avg_cols = [f'{feat}_avg_{window}' for feat in features]
    opp_avg_cols = [f'{col}_opp' for col in avg_cols]

    # transform() keeps the original row labels and never touches the grouping column.
    # The previous groupby.apply + reset_index(drop=True) relied on apply receiving
    # teamName, which pandas deprecated in 2.2.
    rolling_avgs = (
        ordered
        .groupby('teamName')[features]
        .transform(lambda g: g.shift().rolling(window).mean())
    )
    rolling_avgs.columns = avg_cols
    roll = pd.concat([ordered[['gameId','teamName']], rolling_avgs], axis=1)

    # own-team averages
    df_w = df.merge(roll, on=['gameId','teamName'], how='left')
    # opponent averages: the same table, keyed by the opponent's name
    opp = roll.rename(columns={
        'teamName': 'opponentTeamName',
        **dict(zip(avg_cols, opp_avg_cols))
    })
    df_w = df_w.merge(
        opp,
        on=['gameId','opponentTeamName'], how='left'
    )

    feat_cols = avg_cols + opp_avg_cols + ['home']
    df_model = df_w.dropna(subset=feat_cols + ['win'])

    # TimeSeriesSplit folds by row position, so rows must be chronological
    df_model = df_model.sort_values('gameDate')
    X = df_model[feat_cols]
    y = df_model['win']

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=2000, solver='saga'))
    ])
    tscv = TimeSeriesSplit(n_splits=5)
    scores = cross_val_score(pipe, X, y, cv=tscv, scoring='accuracy', n_jobs=-1)

    return scores.mean(), scores.std()

# Score the logistic regression for a range of rolling-window sizes
windows = [1, 3, 5, 10, 15, 20, 30]
results = []
for W in windows:
    results.append((W, *evaluate_window(W, df)))

results_lr_df = pd.DataFrame(results, columns=['window', 'mean', 'std'])
print(results_lr_df)

A window size of around 10-15 games works best.

In [None]:
def make_window_df(df_games, features, window):
    """
    Attach `window`-game rolling averages for a team and its opponent to every game row.

    For each team (games ordered by gameDate) every stat in `features` is averaged over
    the previous `window` games; shift() excludes the current game. The result has the
    rows and columns of `df_games` plus `<feat>_avg_<window>` (own team) and
    `<feat>_avg_<window>_opp` (opponent in the same game). Averages are NaN until a
    team has played `window` earlier games.

    df_games must be the untouched raw data with gameId, teamName, opponentTeamName,
    gameDate and the `features` columns.
    """
    ordered = df_games.sort_values(['teamName','gameDate'])
    avg_cols = [f'{feat}_avg_{window}' for feat in features]

    # transform() keeps the original row labels and never touches the grouping column.
    # The previous groupby.apply + reset_index(drop=True) relied on apply receiving
    # teamName, which pandas deprecated in 2.2.
    rolling_avgs = (
        ordered
        .groupby('teamName')[features]
        .transform(lambda g: g.shift().rolling(window).mean())
    )
    rolling_avgs.columns = avg_cols
    roll = pd.concat([ordered[['gameId','teamName']], rolling_avgs], axis=1)

    # merge own-team avgs
    df_w = df_games.merge(roll, on=['gameId','teamName'], how='left')

    # opponent avgs: the same table, keyed by the opponent's name
    opp = roll.rename(columns={
        'teamName': 'opponentTeamName',
        **{col: f'{col}_opp' for col in avg_cols}
    })
    df_w = df_w.merge(opp, on=['gameId','opponentTeamName'], how='left')
    return df_w

# Pre-compute the 10-game and 5-game rolling-average frames.
df_10 = make_window_df(df, features, 10)
df_5  = make_window_df(df, features, 5)

# Random Forest

In [None]:
# Rolling-window sizes (in games) evaluated by the model loops that follow.
windows = [5, 10, 15, 20]

In [None]:
# Random forest on rolling averages, cross-validated for every window size.
results = []

for W in windows:
    df_w = make_window_df(df, features, W)
    feat_cols = [f'{feat}_avg_{W}' for feat in features] + \
                [f'{feat}_avg_{W}_opp' for feat in features] + ['home']
    df_model = df_w.dropna(subset=feat_cols + ['win'])

    # TimeSeriesSplit cuts folds by row position, so the rows must be chronological.
    # Without this sort the frame keeps the team-by-team order of `df` and the folds
    # would separate teams rather than time periods (the other model loops sort too).
    df_model = df_model.sort_values('gameDate')
    X = df_model[feat_cols]
    y = df_model['win']

    pipe_rf = Pipeline([
        ('rf', RandomForestClassifier(
            n_estimators=200,
            max_depth=None,
            min_samples_leaf=5,
            random_state=42,
            n_jobs=-1
        ))
    ])

    scores_rf = cross_val_score(
        pipe_rf,
        X, y,
        cv=tscv,
        scoring='accuracy',
        n_jobs=-1
    )

    results.append((W, scores_rf.mean(), scores_rf.std()))

results_rf_df = pd.DataFrame(results, columns=['window', 'mean', 'std'])
print(results_rf_df)

Not better than Logistic Regression...

# XGBoost

In [None]:
# Randomised hyper-parameter search for XGBoost, repeated for every window size.
tscv = TimeSeriesSplit(n_splits=5)

param_dist = {
    'xgb__n_estimators': [100, 200, 400, 800],
    'xgb__max_depth': [3, 5, 7, 9],
    'xgb__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'xgb__subsample': [0.6, 0.8, 1.0],
    'xgb__colsample_bytree': [0.6, 0.8, 1.0],
    'xgb__gamma': [0, 0.1, 0.3, 0.5],
    'xgb__min_child_weight': [1, 3, 5],
    'xgb__reg_alpha': [0, 0.01, 0.1],
    'xgb__reg_lambda': [1, 1.5, 2]
}

results_xgb = []
for W in windows:
    own_avgs = [f'{feat}_avg_{W}' for feat in features]
    opp_avgs = [f'{feat}_avg_{W}_opp' for feat in features]
    feat_cols = own_avgs + opp_avgs + ['home']

    df_w = make_window_df(df, features, W)
    # keep complete rows only, in chronological order for the time-series folds
    df_model = (
        df_w
        .dropna(subset=feat_cols + ['win'])
        .sort_values('gameDate')
    )
    X, y = df_model[feat_cols], df_model['win']

    xgb_model = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    )
    pipe_xgb = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('xgb', xgb_model)
    ])

    # 30 random parameter combinations per window
    search = RandomizedSearchCV(
        estimator=pipe_xgb,
        param_distributions=param_dist,
        n_iter=30,
        cv=tscv,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1,
        random_state=42
    )
    search.fit(X, y)

    # mean and spread of the fold scores for the winning combination
    results_xgb.append({
        'window': W,
        'best_cv_accuracy': search.best_score_,
        'std_cv_accuracy': search.cv_results_['std_test_score'][search.best_index_],
        'best_params': search.best_params_
    })

results_xgb_df = pd.DataFrame(results_xgb)
print(results_xgb_df)

# LightGBM & CatBoost

In [None]:
from lightgbm import LGBMClassifier

# Randomised hyper-parameter search for LightGBM, repeated for every window size.
param_dist_lgbm = {
    'lgbm__n_estimators': [100, 200, 400, 800],
    'lgbm__max_depth': [3, 5, 7, -1],
    'lgbm__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'lgbm__subsample': [0.6, 0.8, 1.0],
    'lgbm__colsample_bytree': [0.6, 0.8, 1.0],
    'lgbm__reg_alpha': [0, 0.01, 0.1],
    'lgbm__reg_lambda': [1, 1.5, 2],
    'lgbm__min_child_samples': [5, 10, 20],
    'lgbm__min_split_gain': [0.0, 0.001, 0.01]
}
tscv = TimeSeriesSplit(n_splits=5)

results_lgbm = []

for W in windows:
    df_w = make_window_df(df, features, W)
    feat_cols = [f'{feat}_avg_{W}' for feat in features] + \
                [f'{feat}_avg_{W}_opp' for feat in features] + ['home']
    df_model = df_w.dropna(subset=feat_cols + ['win'])
    # chronological order for the time-series folds
    df_model = df_model.sort_values('gameDate')

    X = df_model[feat_cols]
    y = df_model['win']

    pipe_lgbm = Pipeline([
        # LightGBM only applies `subsample` when subsample_freq > 0 (the default 0
        # disables bagging), so without it the searched subsample values do nothing.
        ('lgbm', LGBMClassifier(random_state=42, subsample_freq=1))
    ])

    search = RandomizedSearchCV(
        estimator=pipe_lgbm,
        param_distributions=param_dist_lgbm,
        n_iter=30,
        cv=tscv,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1,
        random_state=42
    )
    search.fit(X, y)

    # mean and spread of the fold scores for the winning combination
    best_idx = search.best_index_
    results_lgbm.append({
        'window': W,
        'best_cv_accuracy': search.best_score_,
        'std_cv_accuracy': search.cv_results_['std_test_score'][best_idx],
        'best_params': search.best_params_
    })

results_lgbm_df = pd.DataFrame(results_lgbm)
print(results_lgbm_df)


In [None]:
from catboost import CatBoostClassifier

# Randomised hyper-parameter search for CatBoost, repeated for every window size.
tscv = TimeSeriesSplit(n_splits=5)

param_dist_cat = {
    'cat__iterations': [100, 200, 400, 800],
    'cat__depth': [3, 5, 7, 9],
    'cat__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'cat__l2_leaf_reg': [1, 3, 5, 7, 9],
    'cat__border_count': [32, 64, 128]
}

results_cat = []
for W in windows:
    own_avgs = [f'{feat}_avg_{W}' for feat in features]
    opp_avgs = [f'{feat}_avg_{W}_opp' for feat in features]
    feat_cols = own_avgs + opp_avgs + ['home']

    df_w = make_window_df(df, features, W)
    # keep complete rows only, in chronological order for the time-series folds
    df_model = (
        df_w
        .dropna(subset=feat_cols + ['win'])
        .sort_values('gameDate')
    )
    X, y = df_model[feat_cols], df_model['win']

    cat_model = CatBoostClassifier(verbose=0, random_state=42)
    pipe_cat = Pipeline(steps=[('cat', cat_model)])

    search = RandomizedSearchCV(
        estimator=pipe_cat,
        param_distributions=param_dist_cat,
        n_iter=30,
        cv=tscv,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1,
        random_state=42
    )
    search.fit(X, y)

    # mean and spread of the fold scores for the winning combination
    results_cat.append({
        'window': W,
        'best_cv_accuracy': search.best_score_,
        'std_cv_accuracy': search.cv_results_['std_test_score'][search.best_index_],
        'best_params': search.best_params_
    })

results_cat_df = pd.DataFrame(results_cat)
print(results_cat_df)


# Ensemble of XGBoost, LightGBM & Catboost

In [None]:
# Reduced search spaces for the three base learners of the ensemble.
# Keys are bare estimator parameter names (no pipeline step prefix).
param_grid_xgb = dict(
    n_estimators=[200, 400],
    max_depth=[5, 7],
    learning_rate=[0.01, 0.05],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

param_grid_lgbm = dict(
    n_estimators=[200, 400],
    max_depth=[5, 7, -1],
    learning_rate=[0.01, 0.05],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

param_grid_cat = dict(
    iterations=[200, 400],
    depth=[5, 7],
    learning_rate=[0.01, 0.05],
    l2_leaf_reg=[3, 5],
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

def tune_model(model_class, param_grid, X, y, cv, name, verbose=0):
    """
    Run a 10-candidate randomised search and return the best refitted estimator.

    `model_class` is called with no arguments to create the estimator, so it may be a
    class or any zero-argument factory. Candidates are ranked by accuracy under `cv`;
    the best score is printed, labelled with `name`.
    """
    search_options = dict(
        param_distributions=param_grid,
        n_iter=10,
        cv=cv,
        scoring='accuracy',
        random_state=42,
        verbose=verbose,
        n_jobs=-1,
    )
    fitted_search = RandomizedSearchCV(model_class(), **search_options).fit(X, y)
    print(f"{name} best score: {fitted_search.best_score_:.4f}")
    return fitted_search.best_estimator_

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
import numpy as np

In [None]:


def train_stacking_ensemble(X, y, cv, use_passthrough=True, scoring='accuracy', random_state=42):
    """
    Build, cross-validate and fit a stacking ensemble of XGBoost, LightGBM and CatBoost.

    A logistic regression combines the three base learners. `cv` is used only to score
    the whole ensemble; the StackingClassifier itself is created without a `cv`
    argument and so uses its library default for the internal folds.

    Parameters:
        X (pd.DataFrame): Feature matrix
        y (pd.Series or np.array): Target labels
        cv (cross-validation splitter): e.g., TimeSeriesSplit or StratifiedKFold
        use_passthrough (bool): Also feed the original features to the meta-model
        scoring (str): e.g., 'accuracy', 'roc_auc', or 'neg_log_loss'
        random_state (int): Seed shared by all learners

    Returns:
        trained_model: StackingClassifier fitted on all of X, y
        cv_scores: Per-fold scores from cross_val_score
    """
    # Base learners
    xgb_learner = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=random_state,
        n_jobs=-1
    )
    lgbm_learner = LGBMClassifier(random_state=random_state, n_jobs=-1)
    cat_learner = CatBoostClassifier(verbose=0, random_state=random_state)

    # Stack them under a logistic-regression meta-model
    stack_model = StackingClassifier(
        estimators=[
            ('xgb', xgb_learner),
            ('lgbm', lgbm_learner),
            ('cat', cat_learner),
        ],
        final_estimator=LogisticRegression(max_iter=1000, random_state=random_state),
        passthrough=use_passthrough,
        n_jobs=-1
    )

    # Report out-of-fold performance before the final fit
    cv_scores = cross_val_score(stack_model, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    print(f"{scoring} (mean ± std): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    # Final fit on everything
    stack_model.fit(X, y)

    return stack_model, cv_scores



In [None]:
# Data and tuned base learners for the stacked ensemble (15-game rolling averages).
# The window is named explicitly: the column names previously used `W`, a variable
# left over from the last model loop (value 20), which does not match the 15-game
# frame and makes dropna(subset=...) fail on missing columns.
ENSEMBLE_WINDOW = 15

df_15 = make_window_df(df, features, ENSEMBLE_WINDOW)
feat_cols = [f'{feat}_avg_{ENSEMBLE_WINDOW}' for feat in features] + \
            [f'{feat}_avg_{ENSEMBLE_WINDOW}_opp' for feat in features] + ['home']
# Keep complete rows only and order them chronologically: the TimeSeriesSplit in
# `tscv` cuts folds by row position.
df_model = df_15.dropna(subset=feat_cols + ['win']).sort_values('gameDate')

X = df_model[feat_cols]
y = df_model['win']

# tune_model calls its first argument to create the estimator, hence the lambdas.
xgb_best = tune_model(lambda: XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), param_grid_xgb, X, y, tscv, 'XGBoost')
# subsample_freq=1 enables bagging; with LightGBM's default of 0 the `subsample`
# values in param_grid_lgbm would have no effect.
lgbm_best = tune_model(lambda: LGBMClassifier(random_state=42, subsample_freq=1), param_grid_lgbm, X, y, tscv, 'LightGBM')
cat_best  = tune_model(lambda: CatBoostClassifier(verbose=0, random_state=42), param_grid_cat, X, y, tscv, 'CatBoost')

In [None]:
from sklearn.ensemble import StackingClassifier

# Tune the logistic-regression meta-model, then stack the tuned base learners under it.
meta_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],  # L1 may need solver='liblinear'
    'solver': ['lbfgs']
}

# The grid has only four candidates, so search it exhaustively. A randomised search
# with n_iter=5 warned that the space is smaller than n_iter and ran all four anyway.
# NOTE(review): the meta-model is tuned on the raw features X, not on the stacked
# base-learner outputs it is finally trained on -- confirm this is intended.
meta_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=meta_grid,
    cv=tscv,
    scoring='accuracy',
    n_jobs=-1
)
meta_search.fit(X, y)
meta_best = meta_search.best_estimator_


stack_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_best),
        ('lgbm', lgbm_best),
        ('cat', cat_best)
    ],
    final_estimator=meta_best,
    passthrough=True,
    n_jobs=-1
)

# cross_val_score clones and refits the ensemble on every fold, so it does not need
# a prior fit; the single fit on all data afterwards leaves `stack_model` usable.
cv_scores = cross_val_score(stack_model, X, y, cv=tscv, scoring='accuracy')
print(f"Stacking CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
stack_model.fit(X, y)

In [None]:
# One-row summary of the ensemble's cross-validated accuracy
ensemble_summary = {
    'accuracy': cv_scores.mean(),
    'std_error': cv_scores.std()
}
results_ensemble = [ensemble_summary]
results_ensemble_df = pd.DataFrame(results_ensemble)
print(results_ensemble_df)

# Check if all columns make a difference

In [None]:
# Load the full, un-engineered statistics to test whether the extra columns help.
base = pd.read_csv(os.path.join(DATA_PATH, "team_statistics_advanced.csv"))
base['gameDate'] = pd.to_datetime(base['gameDate'])
base = base.sort_values(['teamName','gameDate'])
# A season is labelled by the year it starts in: October onward belongs to that
# calendar year, earlier months to the previous one.
base['season'] = np.where(
    base['gameDate'].dt.month >= 10,
    base['gameDate'].dt.year,
    base['gameDate'].dt.year - 1
)

features = [
    'fieldGoalsMade', 'fieldGoalsAttempted', 'fieldGoalsPercentage',
    'threePointersMade', 'threePointersAttempted', 'threePointersPercentage',
    'freeThrowsMade', 'freeThrowsAttempted', 'freeThrowsPercentage',
    'reboundsOffensive', 'reboundsDefensive', 'reboundsTotal',
    'assists', 'steals', 'blocks', 'turnovers', 'foulsPersonal',
    'teamScore', 'plusMinusPoints', 'opponentScore', 'win',
    'possessions', 'teamScorePerPossession', 'assistsPerPossession',
    'blocksPerPossession', 'stealsPerPossession',
    'fieldGoalsAttemptedPerPossession', 'fieldGoalsMadePerPossession',
    'threePointersAttemptedPerPossession', 'threePointersMadePerPossession',
    'freeThrowsAttemptedPerPossession', 'freeThrowsMadePerPossession',
    'reboundsDefensivePerPossession', 'reboundsOffensivePerPossession',
    'reboundsTotalPerPossession', 'foulsPersonalPerPossession',
    'turnoversPerPossession', 'effectiveFieldGoalPercentage',
    'trueShootingPercentage', 'pointsPerShotAttempt', 'freeThrowRate',
    'assistRate', 'turnoverRate', 'assistToTurnover'
]

target_col = 'win'

# Split the frame loaded in this cell. The split previously read `df`, a frame left
# over from an earlier section that was not built from this file.
train_df = base[base['season'] < 2023]
test_df  = base[base['season'] >= 2023]

In [None]:

# 1) columns to mirror onto each row as the opponent's stats for the same game
opp_cols = [
    'fieldGoalsMade', 'fieldGoalsAttempted', 'fieldGoalsPercentage',
    'threePointersMade', 'threePointersAttempted', 'threePointersPercentage',
    'freeThrowsMade', 'freeThrowsAttempted', 'freeThrowsPercentage',
    'reboundsOffensive', 'reboundsDefensive', 'reboundsTotal',
    'assists', 'steals', 'blocks', 'turnovers', 'foulsPersonal',
    'teamScore', 'plusMinusPoints', 'win',
    'possessions', 'teamScorePerPossession', 'assistsPerPossession',
    'blocksPerPossession', 'stealsPerPossession',
    'fieldGoalsAttemptedPerPossession', 'fieldGoalsMadePerPossession',
    'threePointersAttemptedPerPossession', 'threePointersMadePerPossession',
    'freeThrowsAttemptedPerPossession', 'freeThrowsMadePerPossession',
    'reboundsDefensivePerPossession', 'reboundsOffensivePerPossession',
    'reboundsTotalPerPossession', 'foulsPersonalPerPossession',
    'turnoversPerPossession', 'effectiveFieldGoalPercentage',
    'trueShootingPercentage', 'pointsPerShotAttempt', 'freeThrowRate',
    'assistRate', 'turnoverRate', 'assistToTurnover'
]

# 2) re-key each team's row by the name its opponent knows it under, prefixing the stats
opp = base[['gameId', 'teamName'] + opp_cols].rename(
    columns={'teamName': 'opponentTeamName', **{c: f"opp_{c}" for c in opp_cols}}
)

# 3) join on gameId & opponentTeamName so every row carries both sides' stats
df_full = pd.merge(base, opp, how='left', on=['gameId', 'opponentTeamName'])

df_full.head()



In [None]:
# stats to average: every own-team column plus its `opp_`-prefixed counterpart
features = [*opp_cols, *(f"opp_{col}" for col in opp_cols)]

def evaluate_window(window, df=base):
    """
    Cross-validated accuracy of a logistic regression on `window`-game rolling averages.

    For every team, each stat in the module-level `features` list is averaged over the
    team's previous `window` games (the current game is excluded via shift()). Each row
    receives its own team's averages and, under an `_opp` suffix, the opponent's
    averages for the same game. Rows with a missing feature or target are dropped, the
    rest are ordered by gameDate and scored with a 5-fold TimeSeriesSplit.

    Note: this redefines the `evaluate_window` declared earlier in the notebook; from
    here on the name refers to this version.

    Parameters:
        window (int): number of previous games in the rolling mean
        df (pd.DataFrame): game log with gameId, teamName, opponentTeamName, gameDate,
            home, win and the `features` columns. The default is the `base` frame as
            it existed when this function was defined.

    Returns:
        (mean_accuracy, std_accuracy) over the CV folds
    """
    ordered = df.sort_values(['teamName','gameDate'])
    avg_cols = [f'{feat}_avg_{window}' for feat in features]
    opp_avg_cols = [f'{col}_opp' for col in avg_cols]

    # transform() keeps the original row labels and never touches the grouping column.
    # The previous groupby.apply + reset_index(drop=True) relied on apply receiving
    # teamName, which pandas deprecated in 2.2.
    rolling_avgs = (
        ordered
        .groupby('teamName')[features]
        .transform(lambda g: g.shift().rolling(window).mean())
    )
    rolling_avgs.columns = avg_cols
    roll = pd.concat([ordered[['gameId','teamName']], rolling_avgs], axis=1)

    # own-team averages
    df_w = df.merge(roll, on=['gameId','teamName'], how='left')
    # opponent averages: the same table, keyed by the opponent's name
    opp = roll.rename(columns={
        'teamName': 'opponentTeamName',
        **dict(zip(avg_cols, opp_avg_cols))
    })
    df_w = df_w.merge(
        opp,
        on=['gameId','opponentTeamName'], how='left'
    )

    feat_cols = avg_cols + opp_avg_cols + ['home']
    df_model = df_w.dropna(subset=feat_cols + ['win'])

    # TimeSeriesSplit folds by row position, so rows must be chronological
    df_model = df_model.sort_values('gameDate')
    X = df_model[feat_cols]
    y = df_model['win']

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=2000, solver='saga'))
    ])
    tscv = TimeSeriesSplit(n_splits=5)
    scores = cross_val_score(pipe, X, y, cv=tscv, scoring='accuracy', n_jobs=-1)

    return scores.mean(), scores.std()

# Score the all-columns model for a few window sizes
windows = [5, 10, 15]
results = []
for W in windows:
    results.append((W, *evaluate_window(W, df_full)))

results_df = pd.DataFrame(results, columns=['window', 'mean', 'std'])
print(results_df)

--> No improvement from using all columns instead of our engineered columns --> Success

# Conclusions

Realistically, a perfect predictor will only achieve ~77% accuracy, as the NBA has around 23% upset rate, where the 'better' team loses. Since we are dealing with sport, it is not perfectly predictable. (Source: https://www.bruinsportsanalytics.com/post/nba-odds-upsets)

In [None]:
# 1. Label every result table with its model name (adds a 'model' column in place)
for frame, label in [
    (results_ensemble_df, 'Ensemble'),
    (results_cat_df, 'CatBoost'),
    (results_lgbm_df, 'LightGBM'),
    (results_xgb_df, 'XGBoost'),
    (results_rf_df, 'RandomForest'),
    (results_lr_df, 'LogisticRegression'),
]:
    frame['model'] = label

# 2. The ensemble was evaluated for a single window only
results_ensemble_df['window'] = 15

# 3. Bring all tables onto the column names used by the boosted-model results
results_ensemble_df = results_ensemble_df.rename(columns={
    'accuracy': 'best_cv_accuracy',
    'std_error': 'std_cv_accuracy'
})
mean_std_names = {'mean': 'best_cv_accuracy', 'std': 'std_cv_accuracy'}
results_rf_df = results_rf_df.rename(columns=mean_std_names)
results_lr_df = results_lr_df.rename(columns=mean_std_names)

# 4. Combine all results
all_results_df = pd.concat(
    [
        results_ensemble_df,
        results_cat_df,
        results_lgbm_df,
        results_xgb_df,
        results_rf_df,
        results_lr_df,
    ],
    ignore_index=True,
)

# 5. Reorder the columns and rank by accuracy, best first
pretty_results = (
    all_results_df[['model', 'window', 'best_cv_accuracy', 'std_cv_accuracy', 'best_params']]
    .sort_values(by=['best_cv_accuracy'], ascending=False)
)

# Display without truncating the best_params column
pd.set_option("display.max_colwidth", None)
print(pretty_results)


Surprisingly, LogisticRegression is the best-performing single model, achieving accuracy similar to that of the ensemble of XGBoost, CatBoost and LightGBM.