In [None]:
from nba_api.stats.endpoints import playbyplayv3
import pandas as pd
from nba_api.stats.endpoints import leaguegamelog
import numpy as np
import scipy 
import matplotlib.pyplot as plt 
from tqdm import tqdm
import time 
import os 
from nba_api.stats.endpoints import playbyplayv2, boxscoretraditionalv2, leaguegamefinder
import requests



In [None]:
import re

# Word-bounded so that a surname which merely starts with "miss" or "mid"
# is not mistaken for a missed shot or a mid-range attempt.
_MISS_PATTERN = re.compile(r'\bmiss(?:es|ed)?\b')
_MIDRANGE_PATTERN = re.compile(r'\bmid(?:-?range)?\b')

# Non-shot events, checked in this order after the shot types; first hit wins.
_EVENT_KEYWORDS = (
    ('foul', 'Foul'),
    ('free throw', 'Freethrow'),
    ('jump ball', 'Jump ball'),
    ('rebound', 'Rebound'),
    ('substitution', 'Substitution'),
    ('sub:', 'Substitution'),  # NBA Stats feed writes substitutions as "SUB: X FOR Y"
    ('timeout', 'Timeout'),
    ('turnover', 'Turnover'),
    ('violation', 'Violation'),
)


def _empty_quarters():
    """Return the four empty DataFrames that signal "no usable data" to callers."""
    return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()


def _pick_description(row, home_team, away_team):
    """Return ``(text, team_abbreviation)`` for a play-by-play row.

    The home description is preferred when both sides have text. Missing cells
    may arrive as ``None`` or ``NaN``, so only non-empty strings are accepted;
    ``(None, None)`` is returned when neither column has text.
    """
    for column, team in (('HOMEDESCRIPTION', home_team), ('VISITORDESCRIPTION', away_team)):
        text = row[column]
        if isinstance(text, str) and text:
            return text, team
    return None, None


def _score_margin_pct(score_str, team_is_home):
    """Convert a ``"<a> - <b>"`` score string into a signed percentage margin.

    The margin is the tracked team's lead divided by the leading side's score,
    times 100 and truncated toward zero. Returns ``None`` when the score is
    missing or cannot be parsed, and 0 while nobody has scored.
    """
    if not (isinstance(score_str, str) and '-' in score_str):
        return None
    try:
        # NOTE(review): the first number is treated as the home score, as in the
        # original code - confirm against the endpoint's SCORE column ordering.
        home_score, away_score = map(int, score_str.split('-'))
    except ValueError:
        return None
    leader = max(home_score, away_score)
    if leader == 0:
        return 0
    diff = home_score - away_score if team_is_home else away_score - home_score
    return int(100 * diff / leader)


def _classify_action(action_lower):
    """Map a lower-cased event description to a coarse state label, or ``None``."""
    # NBA Stats play-by-play text flags misses with a leading "MISS" and has no
    # "makes" keyword, so a shot counts as made unless a miss marker is present.
    # This also handles "makes"/"misses" style feeds.
    outcome = 'Missed' if _MISS_PATTERN.search(action_lower) else 'Made'
    if '3pt' in action_lower or '3-point' in action_lower:
        return f'{outcome} 3PT'
    if 'layup' in action_lower:
        return f'{outcome} Layup'
    if (_MIDRANGE_PATTERN.search(action_lower)
            or 'pullup' in action_lower
            or 'jump shot' in action_lower):
        return f'{outcome} Midrange'
    for keyword, label in _EVENT_KEYWORDS:
        if keyword in action_lower:
            return label
    return None


def extract_game_states(game_id, team_abbr='GSW'):
    """Fetch one game's play-by-play and reduce it to per-quarter state tables.

    Parameters
    ----------
    game_id : str
        Game identifier passed to the PlayByPlayV2 and BoxScoreTraditionalV2
        endpoints.
    team_abbr : str, default 'GSW'
        Abbreviation of the team whose perspective is used: its events are
        tagged with this abbreviation, everything else with ``'OPP'``, and
        ``ScoreMargin`` is positive when this team leads.

    Returns
    -------
    tuple of four pandas.DataFrame
        ``(q1, q2, q3, q4_and_overtime)``, each with the columns ``Time``,
        ``Period``, ``Team``, ``State``, ``Possession`` and ``ScoreMargin``.
        Four empty DataFrames are returned when either endpoint fails, fewer
        than two team rows come back, a required column is missing, or no event
        could be classified. Failures are printed, not raised.
    """
    try:
        pbp = playbyplayv2.PlayByPlayV2(game_id=game_id)
        df = pbp.get_data_frames()[0]
    except Exception as e:
        print(f"[{game_id}] Failed to load play-by-play: {e}")
        return _empty_quarters()

    try:
        boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
        teams = boxscore.get_data_frames()[1]
    except Exception as e:
        print(f"[{game_id}] Failed to load boxscore: {e}")
        return _empty_quarters()

    if teams.empty or len(teams) < 2:
        print(f"[{game_id}] Skipping: not enough team data.")
        return _empty_quarters()

    # NOTE(review): row 0 is assumed to be the home team and row 1 the visitor;
    # the boxscore does not obviously guarantee that order - confirm.
    home_team = teams.iloc[0]['TEAM_ABBREVIATION']
    away_team = teams.iloc[1]['TEAM_ABBREVIATION']
    team_is_home = home_team == team_abbr

    required_columns = ['PERIOD', 'SCORE', 'PCTIMESTRING', 'HOMEDESCRIPTION', 'VISITORDESCRIPTION']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"[{game_id}] Missing column in play-by-play: {missing_columns[0]}")
        return _empty_quarters()

    state_data = []
    for _, row in df.iterrows():
        action, raw_team_abbr = _pick_description(row, home_team, away_team)
        period = row['PERIOD']
        if action is None or pd.isna(period):
            continue

        state = _classify_action(action.lower())
        if state is None:
            # Events outside the tracked vocabulary (e.g. blocks, steals) are dropped.
            continue

        team_side = team_abbr if raw_team_abbr == team_abbr else 'OPP'
        state_data.append({
            'Time': row['PCTIMESTRING'],
            'Period': period,
            'Team': team_side,
            'State': f"{state} ({team_side})",
            'Possession': raw_team_abbr,
            'ScoreMargin': _score_margin_pct(row['SCORE'], team_is_home),
        })

    df_states = pd.DataFrame(state_data)

    if df_states.empty or 'Period' not in df_states.columns:
        print(f"[{game_id}] Empty or malformed state data.")
        return _empty_quarters()

    # Split by quarters; every overtime period is folded into the fourth frame.
    df_q1 = df_states[df_states['Period'] == 1].reset_index(drop=True)
    df_q2 = df_states[df_states['Period'] == 2].reset_index(drop=True)
    df_q3 = df_states[df_states['Period'] == 3].reset_index(drop=True)
    df_q4_ot = df_states[df_states['Period'] >= 4].reset_index(drop=True)

    return df_q1, df_q2, df_q3, df_q4_ot


In [None]:

def safe_get_gamefinder(team_id, season, retries=3, delay=3):
    """Fetch a team's game log for a season, retrying on failure.

    Parameters
    ----------
    team_id : str
        Passed to ``LeagueGameFinder`` as ``team_id_nullable``.
    season : str
        Passed to ``LeagueGameFinder`` as ``season_nullable``.
    retries : int, default 3
        Maximum number of attempts.
    delay : int or float, default 3
        Seconds to wait between attempts.

    Returns
    -------
    pandas.DataFrame
        The first data frame returned by the endpoint.

    Raises
    ------
    RuntimeError
        When every attempt fails (or ``retries`` is 0). The last underlying
        exception, if any, is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(retries):
        try:
            print(f"Attempt {attempt+1} for gamefinder...")
            gamefinder = leaguegamefinder.LeagueGameFinder(
                team_id_nullable=team_id,
                season_nullable=season
            )
            return gamefinder.get_data_frames()[0]
        except requests.exceptions.ReadTimeout as e:
            last_error = e
            print("Read timeout. Retrying...")
        except Exception as e:
            last_error = e
            print(f"Other error: {e}")
        # Only wait when another attempt will actually follow.
        if attempt < retries - 1:
            time.sleep(delay)
    raise RuntimeError("Failed to fetch game list after retries.") from last_error

# Game log for team id 1610612744 in the 2024-25 season, then the distinct
# game ids in order of first appearance.
games = safe_get_gamefinder(season='2024-25', team_id='1610612744')
game_ids = pd.unique(games['GAME_ID']).tolist()


In [None]:

def build_transition_matrix(states):
    """Estimate a first-order transition matrix from an ordered state sequence.

    Parameters
    ----------
    states : iterable
        Hashable state labels in the order they occurred.

    Returns
    -------
    pandas.DataFrame
        Square frame indexed (rows and columns) by the distinct states in order
        of first appearance. Entry ``[a, b]`` is the fraction of transitions out
        of ``a`` that went to ``b``. States with no outgoing transition get an
        all-zero row.
    """
    sequence = list(states)
    # dict.fromkeys keeps first-seen order, so the layout is reproducible
    # (iterating a set of strings can change between interpreter runs).
    unique_states = list(dict.fromkeys(sequence))
    state_indices = {state: i for i, state in enumerate(unique_states)}
    counts = np.zeros((len(unique_states), len(unique_states)))

    for s1, s2 in zip(sequence[:-1], sequence[1:]):
        counts[state_indices[s1], state_indices[s2]] += 1

    row_sums = counts.sum(axis=1, keepdims=True)
    # `out` must be supplied: with `where=` alone, the skipped (zero-sum) rows
    # would be left as uninitialised memory.
    transition_matrix = np.divide(
        counts, row_sums, out=np.zeros_like(counts), where=row_sums != 0
    )

    return pd.DataFrame(transition_matrix, index=unique_states, columns=unique_states)


In [None]:
def build_game_transitions(full_df, opponent):
    """Turn one game's state table into a list of consecutive-state transitions.

    This cell used to read ``full_df`` and ``opponent`` directly, but neither
    exists until the fetch loop further down has run, so it failed on a fresh
    kernel. As a function it can be defined here and called once the inputs
    are available.

    Parameters
    ----------
    full_df : pandas.DataFrame
        State table with at least the columns ``State``, ``Period`` and
        ``ScoreMargin``. A completely empty frame is accepted.
    opponent : str
        Value stored under ``'opponent'`` in every transition record.

    Returns
    -------
    list of dict
        One record per pair of consecutive valid rows, with the keys
        ``current_state``, ``next_state``, ``quarter``, ``score_margin`` and
        ``opponent``. ``quarter`` and ``score_margin`` come from the first row
        of each pair.
    """
    if full_df.empty:
        return []

    # Filter out rows with missing or invalid state or score margin
    valid_rows = full_df[
        full_df['State'].notna() & full_df['ScoreMargin'].notna()
    ].reset_index(drop=True)

    states = valid_rows['State'].tolist()
    quarters = valid_rows['Period'].tolist()
    margins = valid_rows['ScoreMargin'].tolist()

    # zip stops at the shortest input (states[1:]), which drops the final row
    # because it has no successor.
    return [
        {
            'current_state': current_state,
            'next_state': next_state,
            'quarter': quarter,
            'score_margin': margin,
            'opponent': opponent,
        }
        for current_state, next_state, quarter, margin
        in zip(states, states[1:], quarters, margins)
    ]


In [None]:
import pickle

CACHE_DIR = "game_pickles"
API_DELAY_SECONDS = 6  # pause before each uncached game (API rate limiting)
MAX_ATTEMPTS = 3       # per-game retry budget; the old `while` loop had none

os.makedirs(CACHE_DIR, exist_ok=True)


def _fetch_game_transitions(game_id):
    """Download one game and build its transition records.

    Returns a list of transition dicts (possibly empty), or ``None`` when the
    game has fewer than two state rows or fewer than two team rows and should
    be skipped without caching.
    """
    # Extract states
    df_q1, df_q2, df_q3, df_q4_ot = extract_game_states(game_id)
    full_df = pd.concat([df_q1, df_q2, df_q3, df_q4_ot], ignore_index=True)
    if len(full_df) < 2:
        return None

    # Get team info (extract_game_states fetches this too but does not return it).
    box = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
    teams = box.get_data_frames()[1]
    if len(teams) < 2:
        return None

    home = teams.iloc[0]['TEAM_ABBREVIATION']
    away = teams.iloc[1]['TEAM_ABBREVIATION']
    opponent = away if home == 'GSW' else home

    # Filter only rows with valid state and score
    valid_rows = full_df[
        full_df['State'].notna() & full_df['ScoreMargin'].notna()
    ].reset_index(drop=True)

    # Construct transitions between consecutive valid rows
    game_transitions = []
    for i in range(len(valid_rows) - 1):
        current_row = valid_rows.iloc[i]
        next_row = valid_rows.iloc[i + 1]
        game_transitions.append({
            'current_state': current_row['State'],
            'next_state': next_row['State'],
            'quarter': current_row['Period'],
            'score_margin': current_row['ScoreMargin'],
            'opponent': opponent
        })
    return game_transitions


transition_data = []

for game_id in tqdm(game_ids):
    pickle_path = f"{CACHE_DIR}/{game_id}.pkl"

    for attempt in range(MAX_ATTEMPTS):
        try:
            if os.path.exists(pickle_path):
                # NOTE: pickle can execute arbitrary code on load; only point
                # CACHE_DIR at files this notebook wrote itself.
                with open(pickle_path, "rb") as f:
                    game_transitions = pickle.load(f)
                print(f"Loaded transitions from {game_id} (cached)")
            else:
                time.sleep(API_DELAY_SECONDS)
                game_transitions = _fetch_game_transitions(game_id)
                if game_transitions is None:
                    break  # nothing usable for this game; move on

                # Cache transitions
                with open(pickle_path, "wb") as f:
                    pickle.dump(game_transitions, f)
        except Exception as e:
            print(f"Error processing game {game_id}: {e}")
            continue

        # Extend only after load/compute and caching all succeeded, so a retry
        # can never append the same game's rows twice.
        transition_data.extend(game_transitions)
        break
    else:
        print(f"Giving up on game {game_id} after {MAX_ATTEMPTS} attempts.")



In [None]:
# Combine the transition records collected for all games into one DataFrame
# (one row per transition dict in `transition_data`).
transition_df = pd.DataFrame(transition_data)


In [None]:
# Preview the first rows of the raw transition table.
transition_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Normalize score_margin to zero mean / unit variance (overwrites the column).
margin_mean = transition_df['score_margin'].mean()
margin_std = transition_df['score_margin'].std()
if pd.notna(margin_std) and margin_std != 0:
    transition_df['score_margin'] = (transition_df['score_margin'] - margin_mean) / margin_std
else:
    # Constant (or single-row) column: centre it instead of dividing by zero/NaN.
    transition_df['score_margin'] = transition_df['score_margin'] - margin_mean

# One encoder fitted on the union of both state columns, so a given state gets
# the same integer id whether it appears as `current_state` or `next_state`.
# Refitting per column could assign different ids when the label sets differ.
state_encoder = LabelEncoder()
state_encoder.fit(
    pd.concat([transition_df['current_state'], transition_df['next_state']], ignore_index=True)
)
transition_df['current_state_enc'] = state_encoder.transform(transition_df['current_state'])
transition_df['next_state_enc'] = state_encoder.transform(transition_df['next_state'])

# Opponents get their own encoder so `state_encoder` stays usable for decoding
# state ids (e.g. `state_encoder.inverse_transform`).
opponent_encoder = LabelEncoder()
transition_df['opponent_enc'] = opponent_encoder.fit_transform(transition_df['opponent'])


In [None]:
# Preview the table again, now with the normalized margin and encoded columns.
transition_df.head()

In [None]:
sequence_length = 20

# `transition_df` is the encoded table built above. The previous `df` is not
# defined at notebook scope; it only exists inside extract_game_states.
features = transition_df[['current_state_enc', 'score_margin']].values
labels = transition_df['next_state_enc'].values

# A window covers rows start .. start+sequence_length-1. The state that follows
# the window is the `next_state` of its LAST row, i.e. labels[start+sequence_length-1].
# Indexing labels[start+sequence_length] would skip one event ahead.
# NOTE(review): windows are cut from the concatenated table, so some of them
# span the boundary between two games.
n_windows = max(len(features) - sequence_length + 1, 0)

X = np.array([features[start:start + sequence_length] for start in range(n_windows)])
y = np.array([labels[start + sequence_length - 1] for start in range(n_windows)])

# Keep X three-dimensional even when there are no windows.
X = X.reshape(-1, sequence_length, features.shape[1])  # shape: (samples, timesteps, features)
# y has shape (samples,)


In [None]:
# Inspect the first input window: one row per timestep, columns are
# (current_state_enc, score_margin).
X[0]

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Size the softmax by the largest label id, not by the number of distinct
# labels: sparse_categorical_crossentropy needs every id to be < num_classes,
# which nunique() does not guarantee if some id never occurs in `next_state_enc`.
# (`transition_df` replaces the previously undefined `df`.)
num_classes = int(transition_df['next_state_enc'].max()) + 1

model = models.Sequential([
    layers.Input(shape=(sequence_length, X.shape[2])),  # (timesteps, features)
    # In case you use padding later.
    # NOTE(review): this also masks any real timestep whose features are all
    # exactly 0.0 (state id 0 with a normalized margin of 0).
    layers.Masking(mask_value=0.0),
    layers.LSTM(64, return_sequences=False),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [None]:
# Train the LSTM for 10 epochs in batches of 32, holding out 20% of the
# samples for validation.
model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)


In [None]:
# Smoke test: run the first window through the trained model (kept as a batch
# of one) and report the most probable class id.
test_seq = X[:1]
pred = model.predict(test_seq)
predicted_class = pred.argmax(axis=-1)

print("Predicted class ID:", predicted_class[0])


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Random-forest baseline on single transitions (no sequence context).
# transition_df: current_state, next_state, quarter, score_margin, opponent

le_current = LabelEncoder()
le_next = LabelEncoder()
le_opp = LabelEncoder()

# Note: these assignments overwrite the *_enc columns created earlier.
transition_df['current_state_enc'] = le_current.fit_transform(transition_df['current_state'])
transition_df['next_state_enc'] = le_next.fit_transform(transition_df['next_state'])
transition_df['opponent_enc'] = le_opp.fit_transform(transition_df['opponent'])

X = transition_df[['current_state_enc', 'quarter', 'score_margin', 'opponent_enc']]
y = transition_df['next_state_enc']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Pass the full label list explicitly: without it, a class absent from the test
# split makes the number of reported classes differ from len(target_names) and
# classification_report raises. zero_division=0 reports 0 for such classes
# instead of warning.
all_labels = np.arange(len(le_next.classes_))
print(classification_report(
    y_test,
    y_pred,
    labels=all_labels,
    target_names=le_next.classes_,
    zero_division=0,
))


In [None]:
# List the distinct state labels that occur as `current_state`.
transition_df['current_state'].unique()