In [31]:
import pandas as pd
import numpy as np
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"


def find_scoring_runs(df, threshold=6):
    runs = []
    df = df.sort_values(['game_id','play_id']).reset_index(drop=True)
    for game_id, g in df.groupby('game_id'):
        home, away = g.iloc[0]['home'], g.iloc[0]['away']
        home_score = away_score = 0
        run_points = {home:0, away:0}
        run_start_idx = None
        for idx, row in g.iterrows():
            h_new, a_new = row['home_score'], row['away_score']
            if h_new != home_score or a_new != away_score:
                team_scored = home if h_new != home_score else away
                pts = (h_new - home_score) if team_scored == home else (a_new - away_score)
                if run_points[team_scored] == 0:
                    run_start_idx = idx
                run_points[team_scored] += pts
                opp = away if team_scored == home else home
                run_points[opp] = 0
                if run_points[team_scored] >= threshold:
                    runs.append({
                        'game_id': game_id,
                        'team': team_scored,
                        'start_play_id': int(g.loc[run_start_idx, 'play_id']),
                        'end_play_id': int(row['play_id'])
                    })
                    run_points[team_scored] = 0
                    run_start_idx = None
            home_score, away_score = h_new, a_new
    return pd.DataFrame(runs)



def label_runs(df, runs_df):
    df = df.sort_values(['game_id','play_id']).reset_index(drop=True)
    df['run_start'] = 0
    df['run_in_process'] = 0
    for _, run in runs_df.iterrows():
        mask = (
            (df['game_id'] == run.game_id) &
            (df['play_id'] >= run.start_play_id) &
            (df['play_id'] <= run.end_play_id)
        )
        df.loc[mask, 'run_in_process'] = 1
        df.loc[
            (df['game_id'] == run.game_id) &
            (df['play_id'] == run.start_play_id),
            'run_start'
        ] = 1
    return df





def prepare_pbp_df(
    csv_path,
    score_threshold=6,
    prob_threshold=0.10,
    drop_cols=None
):
    df = pd.read_csv(csv_path)
    # Label pure scoring runs
    runs_df = find_scoring_runs(df, score_threshold)
    df = label_runs(df, runs_df)
    # Compute team-specific win probability and delta
    df['team_win_prob'] = np.where(
        df['action_team'] == df['home'],
        df['win_prob'],
        1 - df['win_prob']
    )
    df['team_win_prob_delta'] = df['team_win_prob'].diff().fillna(0)
    # Composite run_start including big win-prob swings
    df['run_start'] = (
        (df['run_start'] == 1) |
        (df['team_win_prob_delta'] >= prob_threshold)
    ).astype(int)
    # Additional features
    df['time_seconds'] = df['time_remaining_half'].apply(
        lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1])
    )
    df['points_scored'] = df['home_score'].diff().fillna(0).astype(int)
    df['score_diff'] = (df['home_score'] - df['away_score']).fillna(0).astype(int)
    df['foul_flag'] = df['foul'].fillna(False).astype(int)
    df['turnover_flag'] = df['description'].str.contains('Turnover', na=False).astype(int)
    # Drop unwanted columns
    if drop_cols is None:
        drop_cols = [
            'date','home','away','referees','arena_location','arena',
            'attendance','naive_win_prob',
            'home_time_outs_remaining','away_time_outs_remaining',
            'home_favored_by','total_line'
        ]
    df = df.drop(columns=[c for c in drop_cols if c in df.columns])
    return df

In [32]:

cleaned_df = prepare_pbp_df('Alabama_pbp.csv')

print(cleaned_df.shape)
cleaned_df.head(60)

(11472, 32)


Unnamed: 0,game_id,play_id,half,time_remaining_half,secs_remaining,secs_remaining_absolute,description,action_team,home_score,away_score,...,possession_before,possession_after,run_start,run_in_process,team_win_prob,team_win_prob_delta,time_seconds,points_scored,foul_flag,turnover_flag
0,401574551,1,1,19:36,2376,2376,PJ Hall made Jumper.,away,0,2,...,Clemson,Alabama,0,0,0.234735,0.0,1176,0,0,0
1,401574551,2,1,19:27,2367,2367,Mohamed Wague made Layup. Assisted by Mark Sears.,home,2,2,...,Alabama,Clemson,0,0,0.204385,-0.03035,1167,2,0,0
2,401574551,3,1,19:27,2367,2367,Foul on PJ Hall.,away,2,2,...,Alabama,Clemson,0,0,0.204385,0.0,1167,0,1,0
3,401574551,4,1,19:27,2367,2367,Mohamed Wague missed Free Throw.,home,2,2,...,Alabama,Clemson,0,0,0.204385,0.0,1167,0,0,0
4,401574551,5,1,19:27,2367,2367,PJ Hall Defensive Rebound.,away,2,2,...,Alabama,Clemson,0,0,0.204385,0.0,1167,0,0,0
5,401574551,6,1,18:58,2338,2338,Ian Schieffelin Turnover.,away,2,2,...,Clemson,Alabama,0,0,0.205061,0.000676,1138,0,0,1
6,401574551,7,1,18:58,2338,2338,Rylan Griffen Steal.,home,2,2,...,Clemson,Alabama,0,0,0.205061,0.0,1138,0,0,0
7,401574551,8,1,18:51,2331,2331,Grant Nelson missed Three Point Jumper.,home,2,2,...,Alabama,Alabama,0,0,0.205228,0.000167,1131,0,0,0
8,401574551,9,1,18:46,2326,2326,Rylan Griffen Offensive Rebound.,home,2,2,...,Alabama,Alabama,0,0,0.205348,0.00012,1126,0,0,0
9,401574551,10,1,18:34,2314,2314,Aaron Estrada missed Three Point Jumper.,home,2,2,...,Alabama,Alabama,0,0,0.20564,0.000292,1114,0,0,0


In [34]:

from sklearn.preprocessing import LabelEncoder

df = cleaned_df.copy()
df['event_type'] = df['description'].apply(
    lambda x: x.split()[1] if isinstance(x, str) and any(w in x for w in ['made','missed']) else 'other'
)

le_team  = LabelEncoder().fit(df['action_team'])
le_event = LabelEncoder().fit(df['event_type'])
df['team_encoded']  = le_team.transform(df['action_team'])
df['event_encoded'] = le_event.transform(df['event_type'])

window_size = 3
feature_cols = [
    'team_encoded','event_encoded','time_seconds','points_scored',
    'score_diff','team_win_prob_delta','foul_flag','turnover_flag'
]

X, y = [], []
for i in range(window_size, len(df)):
    block = df.iloc[i-window_size:i]
    if block['game_id'].nunique() > 1:
        continue
    X.append(block[feature_cols].values)
    y.append(df.iloc[i]['run_start'])

X = np.array(X)
y = np.array(y)

# -----------------------------
# 5) Train/Test split & oversample
# -----------------------------
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

n_samples, w, f = X_train.shape
X_flat = X_train.reshape(n_samples, w * f)
ros = RandomOverSampler(random_state=42)
X_res_flat, y_res = ros.fit_resample(X_flat, y_train)
X_res = X_res_flat.reshape(-1, w, f)

print("Post-oversample counts:", np.bincount(y_res), "-> X_res shape:", X_res.shape)

# -----------------------------
# 6) Custom focal loss
# -----------------------------
import tensorflow as tf
from tensorflow.keras import backend as K

def focal_loss(alpha=0.25, gamma=2.0):
    def loss_fn(y_true, y_pred):
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1 - eps)
        ce = -y_true * K.log(y_pred) - (1 - y_true) * K.log(1 - y_pred)
        w = alpha * y_true * K.pow(1 - y_pred, gamma) + \
            (1 - alpha) * (1 - y_true) * K.pow(y_pred, gamma)
        return K.mean(w * ce)
    return loss_fn

# ---------------------------------
# 7) Build, compile & train LSTM
# ---------------------------------
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense

model = Sequential([
    LSTM(32, input_shape=(window_size, f)),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss=focal_loss(alpha=0.25, gamma=2.0),
    metrics=['accuracy']
)
model.summary()

history = model.fit(
    X_res, y_res,
    validation_split=0.2,
    epochs=10,
    batch_size=64
)

# -----------------------------
# 8) Evaluate & threshold-tune
# -----------------------------
from sklearn.metrics import classification_report

probs = model.predict(X_test).flatten()
for thresh in [0.5, 0.4, 0.3, 0.2]:
    preds = (probs > thresh).astype(int)
    print(f"\n--- Threshold = {thresh} ---")
    print(classification_report(y_test, preds, zero_division=0))

Post-oversample counts: [8879 8879] -> X_res shape: (17758, 3, 8)


  super().__init__(**kwargs)


Epoch 1/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6098 - loss: 0.0845 - val_accuracy: 0.0000e+00 - val_loss: 0.0888
Epoch 2/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6205 - loss: 0.0673 - val_accuracy: 0.0099 - val_loss: 0.0841
Epoch 3/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6294 - loss: 0.0659 - val_accuracy: 0.0042 - val_loss: 0.0803
Epoch 4/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6260 - loss: 0.0661 - val_accuracy: 0.0000e+00 - val_loss: 0.0896
Epoch 5/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6324 - loss: 0.0653 - val_accuracy: 0.0349 - val_loss: 0.0799
Epoch 6/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6344 - loss: 0.0648 - val_accuracy: 0.0504 - val_loss: 0.0876
Epoch 7/10
[1m222/2