In [None]:
import os
import time
import pickle
import pandas as pd
from tqdm import tqdm
from nba_api.stats.endpoints import playbyplayv2, boxscoretraditionalv2

# Whole-word markers of a missed attempt.
_MISS_WORDS = frozenset({'miss', 'misses', 'missed'})

# Substrings that mark a two-point jump shot. A bare 'mid' is deliberately not
# used: it also matches unrelated text such as a player name ("Middleton").
_MIDRANGE_MARKERS = ('mid-range', 'mid range', 'midrange', 'pullup', 'jump shot')

# Checked in order after the shot categories; the first matching keyword wins.
_KEYWORD_STATES = (
    ('foul', 'Foul'),
    ('free throw', 'Freethrow'),
    ('jump ball', 'Jump ball'),
    ('rebound', 'Rebound'),
    ('substitution', 'Substitution'),
    ('timeout', 'Timeout'),
    ('turnover', 'Turnover'),
    ('violation', 'Violation'),
)


def _empty_quarters():
    """Return the four empty frames that signal 'no usable data'."""
    return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()


def _pick_description(row):
    """Return ``(text, is_home)`` for the first non-blank description.

    The home column is preferred; ``(None, None)`` is returned when neither
    column holds a non-blank string (None, NaN and '' are all rejected).
    """
    for column, is_home in (('HOMEDESCRIPTION', True), ('VISITORDESCRIPTION', False)):
        text = row[column]
        if isinstance(text, str) and text.strip():
            return text, is_home
    return None, None


def _score_margin_pct(score_str, is_home):
    """Return the acting team's lead as a truncated percentage of the higher score.

    Returns None when the score string is missing or cannot be parsed, and 0
    when both scores are 0.
    """
    if not isinstance(score_str, str) or '-' not in score_str:
        return None
    try:
        # NOTE(review): the first number is treated as the home score; confirm
        # the feed's ordering (it may be visitor-first).
        home_score, away_score = map(int, score_str.split('-'))
    except ValueError:
        # Non-numeric parts or more than one '-' in the string.
        return None

    leader = max(home_score, away_score)
    if leader == 0:
        return 0
    diff = home_score - away_score if is_home else away_score - home_score
    return int(100 * diff / leader)


def _classify_event(description):
    """Map a play description to a state label, or None if it matches nothing."""
    text = description.lower()

    def outcome():
        # 'makes' always means a made shot. Otherwise the shot is missed only
        # when a miss marker appears as its own word; whole-word matching keeps
        # names that merely start with "miss" from counting.
        # NOTE(review): assumes the feed flags misses with a leading "MISS" and
        # leaves made shots unmarked - confirm against a sample game.
        if 'makes' in text:
            return 'Made'
        return 'Made' if _MISS_WORDS.isdisjoint(text.split()) else 'Missed'

    if '3pt' in text or '3-point' in text:
        return f'{outcome()} 3PT'
    if 'layup' in text:
        return f'{outcome()} Layup'
    if any(marker in text for marker in _MIDRANGE_MARKERS):
        return f'{outcome()} Midrange'

    for keyword, state in _KEYWORD_STATES:
        if keyword in text:
            return state
    return None


def extract_game_states(game_id):
    """Download one game's play-by-play and reduce it to per-quarter state tables.

    Args:
        game_id: Game identifier passed to the play-by-play and box-score
            endpoints.

    Returns:
        A 4-tuple of DataFrames ``(q1, q2, q3, q4_and_overtime)``. Each has the
        columns ``Time``, ``Period``, ``Team``, ``State`` and ``ScoreMargin``.
        Four empty DataFrames are returned when either request fails, fewer
        than two teams are listed, a required column is missing, or no event
        could be classified.
    """
    try:
        pbp = playbyplayv2.PlayByPlayV2(game_id=game_id)
        df = pbp.get_data_frames()[0]
    except Exception as e:
        print(f"[{game_id}] Failed to load play-by-play: {e}")
        return _empty_quarters()

    try:
        boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
        teams = boxscore.get_data_frames()[1]
    except Exception as e:
        print(f"[{game_id}] Failed to load boxscore: {e}")
        return _empty_quarters()

    if teams.empty or len(teams) < 2:
        print(f"[{game_id}] Skipping: not enough team data.")
        return _empty_quarters()

    # NOTE(review): row 0 is taken to be the home team and row 1 the visitor;
    # verify that the endpoint guarantees this order.
    home_team = teams.iloc[0]['TEAM_ABBREVIATION']
    away_team = teams.iloc[1]['TEAM_ABBREVIATION']

    required_columns = ['PERIOD', 'SCORE', 'PCTIMESTRING', 'HOMEDESCRIPTION', 'VISITORDESCRIPTION']
    for col in required_columns:
        if col not in df.columns:
            print(f"[{game_id}] Missing column in play-by-play: {col}")
            return _empty_quarters()

    state_data = []
    for _, row in df.iterrows():
        desc, is_home = _pick_description(row)
        period = row.get('PERIOD')
        if desc is None or pd.isna(period):
            continue

        state = _classify_event(desc)
        if state is None:
            continue

        state_data.append({
            'Time': row.get('PCTIMESTRING'),
            'Period': period,
            # The team is tied to the description actually used.
            'Team': home_team if is_home else away_team,
            'State': state,
            'ScoreMargin': _score_margin_pct(row.get('SCORE'), is_home),
        })

    df_states = pd.DataFrame(state_data)

    if df_states.empty or 'Period' not in df_states.columns:
        print(f"[{game_id}] Empty or malformed state data.")
        return _empty_quarters()

    df_q1 = df_states[df_states['Period'] == 1].reset_index(drop=True)
    df_q2 = df_states[df_states['Period'] == 2].reset_index(drop=True)
    df_q3 = df_states[df_states['Period'] == 3].reset_index(drop=True)
    # Overtime periods (5 and up) are grouped with the fourth quarter.
    df_q4_ot = df_states[df_states['Period'] >= 4].reset_index(drop=True)

    return df_q1, df_q2, df_q3, df_q4_ot


# === MAIN DATA PREP PHASE ===
def _build_game_transitions(game_id):
    """Fetch one game and return its consecutive-event transitions.

    Returns None when the game yields fewer than two classified events or the
    box score lists fewer than two teams. Errors raised by the box-score
    request propagate to the caller.
    """
    quarters = extract_game_states(game_id)
    full_df = pd.concat(list(quarters), ignore_index=True)
    if full_df.empty or len(full_df) < 2:
        return None

    box = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
    teams = box.get_data_frames()[1]
    if teams.empty or len(teams) < 2:
        return None

    home = teams.iloc[0]['TEAM_ABBREVIATION']
    away = teams.iloc[1]['TEAM_ABBREVIATION']

    # Only events with both a state and a parsable score take part.
    usable = full_df[full_df['State'].notna() & full_df['ScoreMargin'].notna()]
    events = usable.to_dict('records')

    return [
        {
            'current_state': current['State'],
            'next_state': following['State'],
            'quarter': current['Period'],
            'score_margin': current['ScoreMargin'],
            'opponent': away if current['Team'] == home else home,
            'team': current['Team'],
        }
        for current, following in zip(events, events[1:])
    ]


def prepare_transition_dataframe(game_ids, max_retries=3):
    """Build a table of state-to-state transitions for the given games.

    Each game's transitions are cached in ``game_pickles/<game_id>.pkl`` and
    reused on later calls instead of being downloaded again.

    Args:
        game_ids: Iterable of game identifiers.
        max_retries: Maximum number of attempts per game before it is skipped.
            Values below 1 are treated as 1.

    Returns:
        A DataFrame with one row per transition and the columns
        ``current_state``, ``next_state``, ``quarter``, ``score_margin``,
        ``opponent`` and ``team``. It is empty when no game produced data.
    """
    os.makedirs("game_pickles", exist_ok=True)
    transition_data = []
    attempts = max(1, max_retries)

    for game_id in tqdm(game_ids):
        pickle_path = f"game_pickles/{game_id}.pkl"

        if os.path.exists(pickle_path):
            try:
                # The cache is written by this function; never point it at
                # pickle files from an untrusted source.
                with open(pickle_path, "rb") as f:
                    game_transitions = pickle.load(f)
            except Exception as e:
                # A truncated or corrupt cache must not block the game forever;
                # fall through and rebuild it (the write below overwrites it).
                print(f"Ignoring unreadable cache for game {game_id}: {e}")
            else:
                transition_data.extend(game_transitions)
                print(f"Loaded transitions from {game_id} (cached)")
                continue

        for attempt in range(1, attempts + 1):
            try:
                # Pause before every request; wait longer after each failure.
                time.sleep(0.6 * attempt)

                game_transitions = _build_game_transitions(game_id)
                if game_transitions is None:
                    break  # nothing usable for this game; do not cache

                with open(pickle_path, "wb") as f:
                    pickle.dump(game_transitions, f)
            except Exception as e:
                print(f"Error processing game {game_id}: {e}")
                continue

            # Extend only after the cache write succeeded, so a retry can
            # never add the same game's rows twice.
            transition_data.extend(game_transitions)
            break
        else:
            print(f"Giving up on game {game_id} after {attempts} attempts")

    return pd.DataFrame(transition_data)


In [None]:
import os
import time
import pickle
import pandas as pd
from tqdm import tqdm
from nba_api.stats.endpoints import playbyplayv2, boxscoretraditionalv2

# Whole-word markers of a missed attempt.
_MISS_WORDS = frozenset({'miss', 'misses', 'missed'})

# Substrings that mark a two-point jump shot. A bare 'mid' is deliberately not
# used: it also matches unrelated text such as a player name ("Middleton").
_MIDRANGE_MARKERS = ('mid-range', 'mid range', 'midrange', 'pullup', 'jump shot')

# Checked in order after the shot categories; the first matching keyword wins.
_KEYWORD_STATES = (
    ('foul', 'Foul'),
    ('free throw', 'Freethrow'),
    ('jump ball', 'Jump ball'),
    ('rebound', 'Rebound'),
    ('substitution', 'Substitution'),
    ('timeout', 'Timeout'),
    ('turnover', 'Turnover'),
    ('violation', 'Violation'),
)


def _empty_quarters():
    """Return the four empty frames that signal 'no usable data'."""
    return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()


def _pick_description(row):
    """Return ``(text, is_home)`` for the first non-blank description.

    The home column is preferred; ``(None, None)`` is returned when neither
    column holds a non-blank string (None, NaN and '' are all rejected).
    """
    for column, is_home in (('HOMEDESCRIPTION', True), ('VISITORDESCRIPTION', False)):
        text = row[column]
        if isinstance(text, str) and text.strip():
            return text, is_home
    return None, None


def _score_margin_pct(score_str, is_home):
    """Return the acting team's lead as a truncated percentage of the higher score.

    Returns None when the score string is missing or cannot be parsed, and 0
    when both scores are 0.
    """
    if not isinstance(score_str, str) or '-' not in score_str:
        return None
    try:
        # NOTE(review): the first number is treated as the home score; confirm
        # the feed's ordering (it may be visitor-first).
        home_score, away_score = map(int, score_str.split('-'))
    except ValueError:
        # Non-numeric parts or more than one '-' in the string.
        return None

    leader = max(home_score, away_score)
    if leader == 0:
        return 0
    diff = home_score - away_score if is_home else away_score - home_score
    # Scale before truncating: without the factor of 100, int() collapses
    # every ratio strictly between -1 and 1 to 0.
    return int(100 * diff / leader)


def _classify_event(description):
    """Map a play description to a state label, or None if it matches nothing."""
    text = description.lower()

    def outcome():
        # 'makes' always means a made shot. Otherwise the shot is missed only
        # when a miss marker appears as its own word; whole-word matching keeps
        # names that merely start with "miss" from counting.
        # NOTE(review): assumes the feed flags misses with a leading "MISS" and
        # leaves made shots unmarked - confirm against a sample game.
        if 'makes' in text:
            return 'Made'
        return 'Made' if _MISS_WORDS.isdisjoint(text.split()) else 'Missed'

    if '3pt' in text or '3-point' in text:
        return f'{outcome()} 3PT'
    if 'layup' in text:
        return f'{outcome()} Layup'
    if any(marker in text for marker in _MIDRANGE_MARKERS):
        return f'{outcome()} Midrange'

    for keyword, state in _KEYWORD_STATES:
        if keyword in text:
            return state
    return None


def extract_game_states(game_id):
    """Download one game's play-by-play and reduce it to per-quarter state tables.

    Args:
        game_id: Game identifier passed to the play-by-play and box-score
            endpoints.

    Returns:
        A 4-tuple of DataFrames ``(q1, q2, q3, q4_and_overtime)``. Each has the
        columns ``Time``, ``Period``, ``Team``, ``State`` (label suffixed with
        ``(Home)`` or ``(Away)``), ``ScoreMargin``, ``home_team``,
        ``away_team`` and ``is_home_team``. Four empty DataFrames are returned
        when either request fails, fewer than two teams are listed, a required
        column is missing, or no event could be classified.
    """
    try:
        pbp = playbyplayv2.PlayByPlayV2(game_id=game_id)
        df = pbp.get_data_frames()[0]
    except Exception as e:
        print(f"[{game_id}] Failed to load play-by-play: {e}")
        return _empty_quarters()

    try:
        boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
        teams = boxscore.get_data_frames()[1]
    except Exception as e:
        print(f"[{game_id}] Failed to load boxscore: {e}")
        return _empty_quarters()

    if teams.empty or len(teams) < 2:
        print(f"[{game_id}] Skipping: not enough team data.")
        return _empty_quarters()

    # NOTE(review): row 0 is taken to be the home team and row 1 the visitor;
    # verify that the endpoint guarantees this order.
    home_team = teams.iloc[0]['TEAM_ABBREVIATION']
    away_team = teams.iloc[1]['TEAM_ABBREVIATION']

    required_columns = ['PERIOD', 'SCORE', 'PCTIMESTRING', 'HOMEDESCRIPTION', 'VISITORDESCRIPTION']
    for col in required_columns:
        if col not in df.columns:
            print(f"[{game_id}] Missing column in play-by-play: {col}")
            return _empty_quarters()

    state_data = []
    for _, row in df.iterrows():
        desc, is_home = _pick_description(row)
        period = row.get('PERIOD')
        if desc is None or pd.isna(period):
            continue

        state = _classify_event(desc)
        if state is None:
            continue

        side = 'Home' if is_home else 'Away'
        state_data.append({
            'Time': row.get('PCTIMESTRING'),
            'Period': period,
            # The team is tied to the description actually used.
            'Team': home_team if is_home else away_team,
            'State': f"{state} ({side})",
            'ScoreMargin': _score_margin_pct(row.get('SCORE'), is_home),
            'home_team': home_team,
            'away_team': away_team,
            'is_home_team': is_home,
        })

    df_states = pd.DataFrame(state_data)

    if df_states.empty or 'Period' not in df_states.columns:
        print(f"[{game_id}] Empty or malformed state data.")
        return _empty_quarters()

    df_q1 = df_states[df_states['Period'] == 1].reset_index(drop=True)
    df_q2 = df_states[df_states['Period'] == 2].reset_index(drop=True)
    df_q3 = df_states[df_states['Period'] == 3].reset_index(drop=True)
    # Overtime periods (5 and up) are grouped with the fourth quarter.
    df_q4_ot = df_states[df_states['Period'] >= 4].reset_index(drop=True)

    return df_q1, df_q2, df_q3, df_q4_ot


# === MAIN DATA PREP PHASE ===
def _build_game_transitions(game_id):
    """Fetch one game and return its consecutive-event transitions.

    Returns None when the game yields fewer than two classified events.
    """
    quarters = extract_game_states(game_id)
    full_df = pd.concat(list(quarters), ignore_index=True)
    if full_df.empty or len(full_df) < 2:
        return None

    # Only events with both a state and a parsable score take part.
    usable = full_df[full_df['State'].notna() & full_df['ScoreMargin'].notna()]
    events = usable.to_dict('records')

    transitions = []
    for current, following in zip(events, events[1:]):
        # Home and away abbreviations travel with every event row, so no
        # second box-score request is needed to name the opponent.
        home = current['home_team']
        away = current['away_team']
        acting_team = current['Team']
        transitions.append({
            'current_state': current['State'],
            'next_state': following['State'],
            'quarter': current['Period'],
            'score_margin': current['ScoreMargin'],
            'opponent': away if acting_team == home else home,
            'team': acting_team,
            'is_home_team': current['is_home_team'],
            'home_team': home,
            'away_team': away,
        })
    return transitions


def prepare_transition_dataframe(game_ids, max_retries=3):
    """Build a table of state-to-state transitions for the given games.

    Each game's transitions are cached in ``game_pickles/<game_id>.pkl`` and
    reused on later calls instead of being downloaded again.

    Args:
        game_ids: Iterable of game identifiers.
        max_retries: Maximum number of attempts per game before it is skipped.
            Values below 1 are treated as 1.

    Returns:
        A DataFrame with one row per transition and the columns
        ``current_state``, ``next_state``, ``quarter``, ``score_margin``,
        ``opponent``, ``team``, ``is_home_team``, ``home_team`` and
        ``away_team``. It is empty when no game produced data.
    """
    os.makedirs("game_pickles", exist_ok=True)
    transition_data = []
    attempts = max(1, max_retries)

    for game_id in tqdm(game_ids):
        pickle_path = f"game_pickles/{game_id}.pkl"

        if os.path.exists(pickle_path):
            try:
                # The cache is written by this function; never point it at
                # pickle files from an untrusted source.
                with open(pickle_path, "rb") as f:
                    game_transitions = pickle.load(f)
            except Exception as e:
                # A truncated or corrupt cache must not block the game forever;
                # fall through and rebuild it (the write below overwrites it).
                print(f"Ignoring unreadable cache for game {game_id}: {e}")
            else:
                transition_data.extend(game_transitions)
                print(f"Loaded transitions from {game_id} (cached)")
                continue

        for attempt in range(1, attempts + 1):
            try:
                # Pause before every request; wait longer after each failure.
                time.sleep(0.6 * attempt)

                game_transitions = _build_game_transitions(game_id)
                if game_transitions is None:
                    break  # nothing usable for this game; do not cache

                with open(pickle_path, "wb") as f:
                    pickle.dump(game_transitions, f)
            except Exception as e:
                print(f"Error processing game {game_id}: {e}")
                continue

            # Extend only after the cache write succeeded, so a retry can
            # never add the same game's rows twice.
            transition_data.extend(game_transitions)
            break
        else:
            print(f"Giving up on game {game_id} after {attempts} attempts")

    return pd.DataFrame(transition_data)


In [None]:
from nba_api.stats.endpoints import leaguegamelog

# Request the 2023-24 regular-season game log and collect its distinct game IDs.
log = leaguegamelog.LeagueGameLog(
    season='2023-24',
    season_type_all_star='Regular Season',
)
season_games = log.get_data_frames()[0]
game_ids = season_games['GAME_ID'].unique()

# Build transitions from only the first 50 IDs (a test-sized sample).
sample_game_ids = game_ids[:50]
transition_df = prepare_transition_dataframe(sample_game_ids)


In [None]:
# Preview the first rows of the transition table.
transition_df.head()