In [1]:
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd
import time
import pickle

# Define NBA seasons from 1976–77 to 2023–24
seasons = [f"{year}-{str(year + 1)[-2:]}" for year in range(1976, 2024)]

# Store separate game ID sets
regular_season_ids = []
playoff_ids = []

for season in seasons:
    print(f"\n📅 Season: {season}")
    
    time.sleep(1)
    # Regular Season
    reg_finder = leaguegamefinder.LeagueGameFinder(
        season_nullable=season,
        season_type_nullable="Regular Season"
    )
    reg_df = reg_finder.get_data_frames()[0]
    reg_ids = reg_df['GAME_ID'].unique().tolist()
    regular_season_ids.extend(reg_ids)
    print(f"  🏀 Regular Season games: {len(reg_ids)}")
    
    time.sleep(1)
    # Playoffs
    playoff_finder = leaguegamefinder.LeagueGameFinder(
        season_nullable=season,
        season_type_nullable="Playoffs"
    )
    playoff_df = playoff_finder.get_data_frames()[0]
    po_ids = playoff_df['GAME_ID'].unique().tolist()
    playoff_ids.extend(po_ids)
    print(f"  🔥 Playoff games: {len(po_ids)}")

# Remove duplicates (just in case)
regular_season_ids = list(set(regular_season_ids))
playoff_ids = list(set(playoff_ids))

# Total report
print(f"\n✅ Total Regular Season GAME_IDs: {len(regular_season_ids)}")
print(f"✅ Total Playoff GAME_IDs: {len(playoff_ids)}")

# Save both sets
with open("data/raw/regular_season_ids.pkl", "wb") as f:
    pickle.dump(regular_season_ids, f)

with open("data/raw/playoff_ids.pkl", "wb") as f:
    pickle.dump(playoff_ids, f)



📅 Season: 1976-77
  🏀 Regular Season games: 0
  🔥 Playoff games: 0

📅 Season: 1977-78
  🏀 Regular Season games: 0
  🔥 Playoff games: 0

📅 Season: 1978-79
  🏀 Regular Season games: 0
  🔥 Playoff games: 0

📅 Season: 1979-80
  🏀 Regular Season games: 0
  🔥 Playoff games: 0

📅 Season: 1980-81
  🏀 Regular Season games: 0
  🔥 Playoff games: 0

📅 Season: 1981-82
  🏀 Regular Season games: 0
  🔥 Playoff games: 0

📅 Season: 1982-83
  🏀 Regular Season games: 0
  🔥 Playoff games: 0

📅 Season: 1983-84
  🏀 Regular Season games: 943
  🔥 Playoff games: 79

📅 Season: 1984-85
  🏀 Regular Season games: 943
  🔥 Playoff games: 68

📅 Season: 1985-86
  🏀 Regular Season games: 943
  🔥 Playoff games: 68

📅 Season: 1986-87
  🏀 Regular Season games: 943
  🔥 Playoff games: 71

📅 Season: 1987-88
  🏀 Regular Season games: 943
  🔥 Playoff games: 80

📅 Season: 1988-89
  🏀 Regular Season games: 1025
  🔥 Playoff games: 62

📅 Season: 1989-90
  🏀 Regular Season games: 1107
  🔥 Playoff games: 72

📅 Season: 1990-91
  🏀 Re

In [2]:
import pickle
import time
import os
from nba_api.stats.endpoints import boxscoretraditionalv2
import pandas as pd
import glob

# === Load Regular Season GAME_IDs ===
with open('../data/raw/regular_season_ids.pkl', 'rb') as f:
    all_game_ids = pickle.load(f)

# === Check existing batches ===
batch_files = glob.glob('../data/raw/boxscores_batch_*.pkl')
batch_files.sort()

games_done = len(batch_files) * 200
remaining_game_ids = all_game_ids[games_done:]

print(f"✅ Found {len(batch_files)} batches already saved.")
print(f"🔁 Resuming from game {games_done + 1} of {len(all_game_ids)}")
print(f"🗂️  Remaining GAME_IDs to process: {len(remaining_game_ids)}")

# === Setup ===
batch_size = 200
batch_data = []
failed_ids = []

# === Start loop ===
for i, game_id in enumerate(remaining_game_ids, start=games_done + 1):
    try:
        print(f"📥 Pulling game {i}: {game_id}")
        time.sleep(1)  # avoid API rate limits

        boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)
        df = boxscore.get_data_frames()[0]

        # Filter bench players (START_POSITION == '')
        bench_df = df[df['START_POSITION'] == '']

        # Add metadata
        bench_df['GAME_ID'] = game_id
        batch_data.append(bench_df)

        # Save batch every 200 games
        if i % batch_size == 0:
            batch_num = i // batch_size
            out_path = f'../data/raw/boxscores_batch_{batch_num}.pkl'
            pd.concat(batch_data).to_pickle(out_path)
            print(f"✅ Saved batch {batch_num} to: {out_path}")
            batch_data = []  # reset for next batch

    except Exception as e:
        print(f"❌ Failed to pull game {game_id}: {e}")
        failed_ids.append(game_id)

# Save any leftover data
if batch_data:
    batch_num = (games_done + len(remaining_game_ids)) // batch_size + 1
    out_path = f'../data/raw/boxscores_batch_{batch_num}.pkl'
    pd.concat(batch_data).to_pickle(out_path)
    print(f"✅ Saved final (partial) batch {batch_num}")

# Save failed IDs to retry later
if failed_ids:
    with open('../logs/failed_game_ids.txt', 'a') as f:
        for fail_id in failed_ids:
            f.write(f"{fail_id}\n")
    print(f"⚠️  Logged {len(failed_ids)} failed game IDs to logs/failed_game_ids.txt")


FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/regular_season_ids.pkl'