In [6]:
import numpy as np, jupyter, matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [8]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("robbypeery/college-basketball-pbp-23-24")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/axelvandenheuvel/.cache/kagglehub/datasets/robbypeery/college-basketball-pbp-23-24/versions/18


In [12]:

df = pd.read_csv(path+'/Colorado_pbp.csv')

In [13]:
print(df.shape)
df.head(6)
# df.score_diff
# df["game_id"].unique().shape

(10858, 34)


Unnamed: 0,game_id,date,home,away,play_id,half,time_remaining_half,secs_remaining,secs_remaining_absolute,description,...,arena_location,arena,attendance,shot_team,shot_outcome,shooter,three_pt,free_throw,possession_before,possession_after
0,401587141,2023-11-06,Colorado,Towson,1,1,19:42,2382,2382,KJ Simpson made Layup. Assisted by J'Vonne Had...,...,"Boulder, CO",CU Events Center,6078,Colorado,made,KJ Simpson,False,False,Colorado,Towson
1,401587141,2023-11-06,Colorado,Towson,2,1,19:20,2360,2360,Charles Thompson missed Jumper.,...,"Boulder, CO",CU Events Center,6078,Towson,missed,Charles Thompson,False,False,Towson,Colorado
2,401587141,2023-11-06,Colorado,Towson,3,1,19:17,2357,2357,KJ Simpson Defensive Rebound.,...,"Boulder, CO",CU Events Center,6078,,,,,,Towson,Colorado
3,401587141,2023-11-06,Colorado,Towson,4,1,19:13,2353,2353,KJ Simpson missed Three Point Jumper.,...,"Boulder, CO",CU Events Center,6078,Colorado,missed,KJ Simpson,True,False,Colorado,Towson
4,401587141,2023-11-06,Colorado,Towson,5,1,19:10,2350,2350,Tyler Tejada Defensive Rebound.,...,"Boulder, CO",CU Events Center,6078,,,,,,Colorado,Towson
5,401587141,2023-11-06,Colorado,Towson,6,1,18:52,2332,2332,Tyler Tejada Turnover.,...,"Boulder, CO",CU Events Center,6078,,,,,,Towson,Colorado


In [14]:
df = df.sort_values(['game_id', 'play_id'], ascending=[True,True]).reset_index(drop=True)
runs=[]

for game_id, game_df in df.groupby('game_id'):
    home = game_df.iloc[0]['home']
    away = game_df.iloc[0]['away']
    
    # We track cumulative points scored
    home_score = 0
    away_score = 0
    
    # We also track potential runs
    run_points = {home: 0, away: 0}
    run_start_index = None
    run_start_time = None
    
    for idx, row in game_df.iterrows():
        new_home_score = row['home_score']
        new_away_score = row['away_score']
        
        # Determine who scored (if anyone)
        if new_home_score != home_score or new_away_score != away_score:
            # Someone scored
            scoring_team = home if new_home_score != home_score else away
            points_scored = (new_home_score - home_score) if scoring_team == home else (new_away_score - away_score)
            
            if run_points[scoring_team] == 0:
                # First points in the run
                run_start_index = idx
                run_start_time = row['time_remaining_half']
            
            run_points[scoring_team] += points_scored
            # Reset the opponent's run points if they score
            opponent = away if scoring_team == home else home
            run_points[opponent] = 0
            
            # Check if run is 6-0
            if run_points[scoring_team] >= 6 and run_points[opponent] == 0:
                runs.append({
                    'game_id': game_id,
                    'team': scoring_team,
                    'start_play_id': game_df.loc[run_start_index]['play_id'],
                    'start_time': run_start_time,
                    'end_play_id': row['play_id'],
                    'end_time': row['time_remaining_half'],
                    'points_scored': run_points[scoring_team]
                })
                
                # After finding a run, reset
                run_points[scoring_team] = 0
                run_start_index = None
                run_start_time = None
        
        home_score = new_home_score
        away_score = new_away_score

# Turn the runs list into a DataFrame
runs_df = pd.DataFrame(runs)

In [15]:
runs_df

Unnamed: 0,game_id,team,start_play_id,start_time,end_play_id,end_time,points_scored
0,401576684,Colorado,44,13:43,55,12:29,7
1,401576684,Colorado,87,8:34,95,8:02,6
2,401576684,Colorado,115,6:24,137,3:59,6
3,401576684,Colorado,166,19:54,169,19:34,6
4,401576684,Colorado,280,6:53,293,5:46,6
...,...,...,...,...,...,...,...
196,401625480,Colorado,257,4:54,266,3:48,7
197,401625482,Washington St,178,12:22,190,10:53,7
198,401625483,Colorado,41,13:34,50,12:28,6
199,401625483,Colorado,51,12:28,82,8:52,7


In [16]:
df[(df['game_id']==401576684) &
    (df['play_id'] >= 43) &
    (df['play_id'] <= 56)]

Unnamed: 0,game_id,date,home,away,play_id,half,time_remaining_half,secs_remaining,secs_remaining_absolute,description,...,arena_location,arena,attendance,shot_team,shot_outcome,shooter,three_pt,free_throw,possession_before,possession_after
42,401576684,2023-12-03,Colorado,Pepperdine,43,1,13:59,2039,2039,Cord Stansberry made Three Point Jumper. Assis...,...,"Boulder, CO",CU Events Center,7231,Pepperdine,made,Cord Stansberry,True,False,Pepperdine,Colorado
43,401576684,2023-12-03,Colorado,Pepperdine,44,1,13:43,2023,2023,Julian Hammond III made Three Point Jumper. As...,...,"Boulder, CO",CU Events Center,7231,Colorado,made,Julian Hammond III,True,False,Colorado,Pepperdine
44,401576684,2023-12-03,Colorado,Pepperdine,45,1,13:26,2006,2006,Cord Stansberry Turnover.,...,"Boulder, CO",CU Events Center,7231,,,,,,Pepperdine,Colorado
45,401576684,2023-12-03,Colorado,Pepperdine,46,1,13:26,2006,2006,J'Vonne Hadley Steal.,...,"Boulder, CO",CU Events Center,7231,,,,,,Pepperdine,Colorado
46,401576684,2023-12-03,Colorado,Pepperdine,47,1,13:22,2002,2002,Julian Hammond III made Layup. Assisted by J'V...,...,"Boulder, CO",CU Events Center,7231,Colorado,made,Julian Hammond III,False,False,Colorado,Pepperdine
47,401576684,2023-12-03,Colorado,Pepperdine,48,1,13:22,2002,2002,Pepperdine Timeout,...,"Boulder, CO",CU Events Center,7231,,,,,,Colorado,Pepperdine
48,401576684,2023-12-03,Colorado,Pepperdine,49,1,13:01,1981,1981,Michael Ajayi missed Jumper.,...,"Boulder, CO",CU Events Center,7231,Pepperdine,missed,Michael Ajayi,False,False,Pepperdine,Colorado
49,401576684,2023-12-03,Colorado,Pepperdine,50,1,12:57,1977,1977,Julian Hammond III Defensive Rebound.,...,"Boulder, CO",CU Events Center,7231,,,,,,Pepperdine,Colorado
50,401576684,2023-12-03,Colorado,Pepperdine,51,1,12:45,1965,1965,Eddie Lampkin Jr. missed Layup.,...,"Boulder, CO",CU Events Center,7231,Colorado,missed,Eddie Lampkin Jr.,False,False,Colorado,Pepperdine
51,401576684,2023-12-03,Colorado,Pepperdine,52,1,12:44,1964,1964,Pepperdine Defensive Rebound.,...,"Boulder, CO",CU Events Center,7231,,,,,,Colorado,Pepperdine


In [17]:
df = df.sort_values(['game_id', 'play_id'], ascending=[True, True]).reset_index(drop=True)

df['run_label'] = 0  # Add the column properly
game_df = df         # Now game_df points to the full DataFrame

# Loop through each scoring run
for _, run in runs_df.iterrows():
    game_id = run['game_id']
    team = run['team']
    start_play_id = run['start_play_id']
    end_play_id = run['end_play_id']
    
    # Apply mask: match game_id and play_id in range
    mask = (
        (game_df['game_id'] == game_id) &
        (game_df['play_id'] >= start_play_id) &
        (game_df['play_id'] <= end_play_id)
    )
    
    # Apply the label
    game_df.loc[mask, 'run_label'] = 1
game_df.head(10)

Unnamed: 0,game_id,date,home,away,play_id,half,time_remaining_half,secs_remaining,secs_remaining_absolute,description,...,arena,attendance,shot_team,shot_outcome,shooter,three_pt,free_throw,possession_before,possession_after,run_label
0,401576684,2023-12-03,Colorado,Pepperdine,1,1,19:31,2371,2371,J'Vonne Hadley missed Layup.,...,CU Events Center,7231,Colorado,missed,J'Vonne Hadley,False,False,Colorado,Colorado,0
1,401576684,2023-12-03,Colorado,Pepperdine,2,1,19:31,2371,2371,Jalen Pitre Block.,...,CU Events Center,7231,,,,,,Colorado,Colorado,0
2,401576684,2023-12-03,Colorado,Pepperdine,3,1,19:22,2362,2362,Cody Williams Offensive Rebound.,...,CU Events Center,7231,,,,,,Colorado,Colorado,0
3,401576684,2023-12-03,Colorado,Pepperdine,4,1,19:22,2362,2362,Cody Williams made Layup.,...,CU Events Center,7231,Colorado,made,Cody Williams,False,False,Colorado,Pepperdine,0
4,401576684,2023-12-03,Colorado,Pepperdine,5,1,19:04,2344,2344,Houston Mallette made Jumper. Assisted by Jale...,...,CU Events Center,7231,Pepperdine,made,Houston Mallette,False,False,Pepperdine,Colorado,0
5,401576684,2023-12-03,Colorado,Pepperdine,6,1,18:45,2325,2325,KJ Simpson made Three Point Jumper. Assisted b...,...,CU Events Center,7231,Colorado,made,KJ Simpson,True,False,Colorado,Pepperdine,0
6,401576684,2023-12-03,Colorado,Pepperdine,7,1,18:32,2312,2312,Michael Ajayi made Jumper.,...,CU Events Center,7231,Pepperdine,made,Michael Ajayi,False,False,Pepperdine,Colorado,0
7,401576684,2023-12-03,Colorado,Pepperdine,8,1,18:17,2297,2297,Cody Williams missed Jumper.,...,CU Events Center,7231,Colorado,missed,Cody Williams,False,False,Colorado,Colorado,0
8,401576684,2023-12-03,Colorado,Pepperdine,9,1,18:11,2291,2291,Cody Williams Offensive Rebound.,...,CU Events Center,7231,,,,,,Colorado,Colorado,0
9,401576684,2023-12-03,Colorado,Pepperdine,10,1,18:11,2291,2291,Cody Williams missed Layup.,...,CU Events Center,7231,Colorado,missed,Cody Williams,False,False,Colorado,Pepperdine,0


In [18]:
df['run_label'] = 0  # Initialize all to 0

# Label only the *start* play of each run
for _, run in runs_df.iterrows():
    df.loc[
        (df['game_id'] == run['game_id']) & 
        (df['play_id'] == run['start_play_id']), 
        'run_label'
    ] = 1


In [19]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Preprocess
df['time_seconds'] = df['time_remaining_half'].apply(lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1]))
df['points_scored'] = df['home_score'].diff().fillna(0).astype(int)  # crude approximation

# Optional: Basic event encoding from description
df['event_type'] = df['description'].apply(lambda x: x.split()[1] if isinstance(x, str) and 'made' in x or 'missed' in x else 'other')

# Step 2: Mark start of run
df['run_start'] = ((df['run_label'] == 1) & (df['run_label'].shift(1) == 0)).astype(int)

# Encode categorical
le_team = LabelEncoder()
df['team_encoded'] = le_team.fit_transform(df['action_team'])

le_event = LabelEncoder()
df['event_encoded'] = le_event.fit_transform(df['event_type'])

# Step 3: Create sliding windows of 3 plays
X, y = [], []
for i in range(2, len(df)):
    window = df.iloc[i-2:i+1]
    
    # Only include windows within the same game
    if len(window['game_id'].unique()) > 1:
        continue
    
    # Flatten features from 3 plays
    features = []
    for j in range(3):
        features += [
            window.iloc[j]['team_encoded'],
            window.iloc[j]['event_encoded'],
            window.iloc[j]['time_seconds'],
            window.iloc[j]['points_scored']
        ]
    
    X.append(features)
    y.append(df.iloc[i]['run_start'])

X = np.array(X)
y = np.array(y)

df.shape

(10858, 41)

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(solver='saga',class_weight='balanced', max_iter=1000)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.79      0.88      2116
           1       0.06      0.71      0.12        42

    accuracy                           0.79      2158
   macro avg       0.53      0.75      0.50      2158
weighted avg       0.97      0.79      0.87      2158

