# NBA Game Data Collection - Year by Year Processing
## Optimized for 7,500 Requests/Day

This notebook:
1. Processes ONE season at a time
2. Saves each season to a separate CSV
3. Calculates all team-level features
4. Includes proper target variable (team_won) for XGBoost
5. Handles 7,500 requests/day efficiently

## Configuration

In [12]:
import requests
import pandas as pd
import numpy as np
import os
import json
import time
from datetime import datetime
from dotenv import load_dotenv
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [13]:
# Select which season to process
SEASON_TO_PROCESS = 2020  # Change this to process different seasons: 2018, 2019, 2020, 2021, 2022, 2023, or 2024

# API Configuration
load_dotenv()
API_KEY = os.getenv("RAPIDAPI_KEY")
API_HOST = "v2.nba.api-sports.io"
BASE_URL = "https://v2.nba.api-sports.io"

HEADERS = {
    "X-RapidAPI-Key": API_KEY,
    "X-RapidAPI-Host": API_HOST
}

# Output directory for year-specific CSVs
OUTPUT_DIR = "nba_features_by_year"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Cache directory
CACHE_DIR = f"nba_cache_{SEASON_TO_PROCESS}"
os.makedirs(CACHE_DIR, exist_ok=True)

print(f"{'='*70}")
print(f"NBA DATA COLLECTION - SEASON {SEASON_TO_PROCESS}")
print(f"{'='*70}")
print(f"API Rate Limit: 7,500 requests/day")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Cache directory: {CACHE_DIR}")
print(f"\nProcessing Season: {SEASON_TO_PROCESS}")

NBA DATA COLLECTION - SEASON 2020
API Rate Limit: 7,500 requests/day
Output directory: nba_features_by_year
Cache directory: nba_cache_2020

Processing Season: 2020


## API Helper Functions

In [14]:
def get_cache_path(endpoint, params):
    """Generate cache file path"""
    param_str = '_'.join([f"{k}-{v}" for k, v in sorted(params.items())])
    filename = f"{endpoint.replace('/', '_')}_{param_str}.json"
    return os.path.join(CACHE_DIR, filename)

def load_from_cache(cache_path):
    """Load from cache if exists"""
    if os.path.exists(cache_path):
        with open(cache_path, 'r') as f:
            return json.load(f)
    return None

def save_to_cache(cache_path, data):
    """Save to cache"""
    with open(cache_path, 'w') as f:
        json.dump(data, f, indent=2)

def make_api_request(endpoint, params=None, use_cache=True):
    """
    Make API request with caching.
    With 7,500 requests/day, we have ~5 requests/second available.
    """
    if params is None:
        params = {}
    
    # Check cache
    cache_path = get_cache_path(endpoint, params)
    if use_cache:
        cached_data = load_from_cache(cache_path)
        if cached_data:
            logging.info(f"‚úì Cache hit: {endpoint}")
            return cached_data
    
    # Make request
    url = f"{BASE_URL}/{endpoint}"
    
    try:
        response = requests.get(url, headers=HEADERS, params=params)
        response.raise_for_status()
        data = response.json()
        
        # Save to cache
        save_to_cache(cache_path, data)
        
        # Rate limiting: 0.2 seconds between requests (5 req/sec)
        time.sleep(0.2)
        
        remaining = response.headers.get('x-ratelimit-requests-remaining', 'N/A')
        logging.info(f"‚úì API call successful. Remaining today: {remaining}")
        
        return data
        
    except requests.exceptions.RequestException as e:
        logging.error(f"‚úó Error: {e}")
        return None

print("‚úì API functions ready")

‚úì API functions ready


## Step 1: Collect All Games for Selected Season

In [15]:
def collect_games_for_season(season):
    """
    Collect all games for a specific season.
    Returns DataFrame with game metadata including winner.
    """
    logging.info(f"\n{'='*70}")
    logging.info(f"Collecting games for season {season}")
    logging.info(f"{'='*70}")
    
    games_data = make_api_request("games", params={
        "season": season,
        "league": "standard"
    })
    
    if not games_data or 'response' not in games_data:
        logging.error(f"Failed to fetch games for season {season}")
        return pd.DataFrame()
    
    games_list = games_data['response']
    
    # Extract game information with winner
    completed_games = []
    for game in games_list:
        home_score = game.get('scores', {}).get('home', {}).get('points')
        away_score = game.get('scores', {}).get('visitors', {}).get('points')
        
        # Only include completed games with scores
        if home_score is not None and away_score is not None:
            home_team_id = game.get('teams', {}).get('home', {}).get('id')
            away_team_id = game.get('teams', {}).get('visitors', {}).get('id')
            
            # Determine winner
            winning_team_id = home_team_id if home_score > away_score else away_team_id
            
            completed_games.append({
                'game_id': game['id'],
                'season': season,
                'date': game.get('date', {}).get('start', ''),
                'home_team_id': home_team_id,
                'away_team_id': away_team_id,
                'home_score': home_score,
                'away_score': away_score,
                'winning_team_id': winning_team_id
            })
    
    games_df = pd.DataFrame(completed_games)
    
    logging.info(f"‚úì Found {len(games_df)} completed games for season {season}")
    logging.info(f"  Date range: {games_df['date'].min()} to {games_df['date'].max()}")
    
    return games_df

# Collect games for selected season
print(f"\nüèÄ Collecting games for season {SEASON_TO_PROCESS}...\n")
games_df = collect_games_for_season(SEASON_TO_PROCESS)

print(f"\n‚úì Game collection complete!")
print(f"Total games: {len(games_df)}")
print(f"\nSample games:")
games_df.head()

2025-11-23 18:25:40,595 - INFO - 
2025-11-23 18:25:40,596 - INFO - Collecting games for season 2020



üèÄ Collecting games for season 2020...



2025-11-23 18:25:41,864 - INFO - ‚úì API call successful. Remaining today: 7499
2025-11-23 18:25:41,887 - INFO - ‚úì Found 1221 completed games for season 2020
2025-11-23 18:25:41,898 - INFO -   Date range: 2020-12-12T00:00:00.000Z to 2021-07-21T01:00:00.000Z



‚úì Game collection complete!
Total games: 1221

Sample games:


Unnamed: 0,game_id,season,date,home_team_id,away_team_id,home_score,away_score,winning_team_id
0,7501,2020,2020-12-19T01:00:00.000Z,2,4,89,113,4
1,7502,2020,2020-12-19T01:00:00.000Z,23,21,127,113,23
2,7503,2020,2020-12-19T01:00:00.000Z,25,6,103,105,6
3,7504,2020,2020-12-19T02:00:00.000Z,9,29,129,96,9
4,7505,2020,2020-12-19T03:30:00.000Z,28,17,113,114,17


## Step 2: Feature Engineering Functions

These match your original notebook exactly

In [16]:
def convert_minutes_to_numeric(minutes_str):
    """
    Convert minutes from 'MM:SS' format to decimal.
    Example: '25:30' -> 25.5
    """
    if pd.isna(minutes_str) or minutes_str == '0' or minutes_str == 0:
        return 0.0
    
    try:
        if isinstance(minutes_str, str) and ':' in minutes_str:
            parts = minutes_str.split(':')
            mins = float(parts[0])
            secs = float(parts[1]) if len(parts) > 1 else 0
            return mins + (secs / 60.0)
        else:
            return float(minutes_str)
    except:
        return 0.0

def calculate_all_team_features(player_stats_df):
    """
    Calculate ALL team-level features from player statistics.
    Includes features from: Sanika, Aditya, Dev, and Chinmayee.
    """
    df = player_stats_df.copy()
    
    # Convert percentage columns to float
    percentage_cols = ['fg_pct', 'three_pct', 'ft_pct']
    for col in percentage_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    
    # Convert plus_minus to numeric
    df['plus_minus'] = pd.to_numeric(df['plus_minus'], errors='coerce').fillna(0)
    
    # Convert minutes
    df['minutes_numeric'] = df['minutes'].apply(convert_minutes_to_numeric)
    
    # Convert other numeric fields
    numeric_cols = ['points', 'total_rebounds', 'assists', 'steals', 'blocks', 
                   'turnovers', 'personal_fouls', 'fg_made', 'fg_attempted',
                   'three_made', 'three_attempted', 'ft_made', 'ft_attempted',
                   'offensive_reb', 'defensive_reb']
    
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    
    # Filter to only players who played (minutes > 0)
    df_played = df[df['minutes_numeric'] > 0].copy()
    
    # Define aggregation dictionary - ALL features
    agg_dict = {
        # Sanika's features - shooting metrics
        'ft_pct': 'mean',
        'fg_made': ['sum', 'mean'],
        'fg_attempted': ['sum', 'mean'],
        'three_made': ['sum', 'mean'],
        'three_attempted': ['sum', 'mean'],
        'ft_made': ['sum', 'mean'],
        
        # Aditya's features
        'ft_attempted': ['sum', 'mean'],
        'minutes_numeric': ['sum', 'mean', 'max'],
        
        # Dev's features
        'points': ['sum', 'mean', 'std', 'max'],
        'assists': ['sum', 'mean', 'std', 'max'],
        'total_rebounds': ['sum', 'mean', 'std', 'max'],
        'offensive_reb': ['sum', 'mean'],
        'defensive_reb': ['sum', 'mean'],
        'steals': ['sum', 'mean'],
        
        # Chinmayee's features
        'blocks': ['sum', 'mean'],
        'turnovers': ['sum', 'mean'],
        'personal_fouls': ['sum', 'mean'],
        'plus_minus': ['sum', 'mean'],
        'fg_pct': 'mean',
        'three_pct': 'mean',
        
        # Identifiers
        'team_name': 'first',
        'season': 'first'
    }
    
    # Group by game and team
    team_features = df_played.groupby(['game_id', 'team_id']).agg(agg_dict).reset_index()
    
    # Flatten column names
    team_features.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col 
                             for col in team_features.columns.values]
    
    # Clean up names
    team_features.columns = [col.replace('_first', '') for col in team_features.columns]
    
    # Rename minutes columns
    rename_map = {
        'minutes_numeric_sum': 'minutes_sum',
        'minutes_numeric_mean': 'minutes_mean',
        'minutes_numeric_max': 'minutes_max'
    }
    team_features = team_features.rename(columns=rename_map)
    
    # Calculate Aditya's engineered features
    team_features['assist_to_turnover_ratio'] = team_features.apply(
        lambda row: row['assists_sum'] / row['turnovers_sum'] 
        if row['turnovers_sum'] > 0 else row['assists_sum'],
        axis=1
    )
    
    team_features['three_point_rate'] = team_features.apply(
        lambda row: row['three_made_sum'] / row['fg_made_sum']
        if row['fg_made_sum'] > 0 else 0.0,
        axis=1
    )
    
    team_features['offensive_reb_rate'] = team_features.apply(
        lambda row: row['offensive_reb_sum'] / row['total_rebounds_sum']
        if row['total_rebounds_sum'] > 0 else 0.0,
        axis=1
    )
    
    team_features['true_shooting_pct'] = team_features.apply(
        lambda row: (row['points_sum'] / 
                    (2 * (row['fg_attempted_sum'] + 0.44 * row['ft_attempted_sum'])))
        if (row['fg_attempted_sum'] + 0.44 * row['ft_attempted_sum']) > 0 else 0.0,
        axis=1
    )
    
    return team_features

print("‚úì Feature engineering functions ready")

‚úì Feature engineering functions ready


## Step 3: Fetch Player Statistics for All Games

With 7,500 requests/day, a typical season (~1,230 games) takes about 3-4 hours

In [17]:
def fetch_player_stats_for_game(game_id):
    """
    Fetch player statistics for a single game.
    """
    player_stats_data = make_api_request("players/statistics", params={"game": game_id})
    
    if not player_stats_data or 'response' not in player_stats_data:
        return []
    
    records = []
    for stat in player_stats_data['response']:
        record = {
            'game_id': game_id,
            'season': SEASON_TO_PROCESS,
            'player_id': stat['player']['id'],
            'player_name': stat['player']['firstname'] + ' ' + stat['player']['lastname'],
            'team_id': stat['team']['id'],
            'team_name': stat['team']['name'],
            'position': stat.get('pos', 'N/A'),
            'minutes': stat.get('min', '0'),
            'points': stat.get('points', 0),
            'total_rebounds': stat.get('totReb', 0),
            'assists': stat.get('assists', 0),
            'steals': stat.get('steals', 0),
            'blocks': stat.get('blocks', 0),
            'turnovers': stat.get('turnovers', 0),
            'personal_fouls': stat.get('pFouls', 0),
            'fg_made': stat.get('fgm', 0),
            'fg_attempted': stat.get('fga', 0),
            'fg_pct': stat.get('fgp', '0'),
            'three_made': stat.get('tpm', 0),
            'three_attempted': stat.get('tpa', 0),
            'three_pct': stat.get('tpp', '0'),
            'ft_made': stat.get('ftm', 0),
            'ft_attempted': stat.get('fta', 0),
            'ft_pct': stat.get('ftp', '0'),
            'offensive_reb': stat.get('offReb', 0),
            'defensive_reb': stat.get('defReb', 0),
            'plus_minus': stat.get('plusMinus', '0'),
        }
        records.append(record)
    
    return records

def process_all_games(games_df):
    """
    Process all games: fetch stats and calculate features.
    """
    total_games = len(games_df)
    game_ids = games_df['game_id'].tolist()
    
    logging.info(f"\n{'='*70}")
    logging.info(f"Processing {total_games} games")
    logging.info(f"Estimated time: {total_games * 0.2 / 60:.1f} minutes")
    logging.info(f"{'='*70}\n")
    
    all_player_stats = []
    start_time = time.time()
    
    for idx, game_id in enumerate(game_ids, 1):
        player_stats = fetch_player_stats_for_game(game_id)
        all_player_stats.extend(player_stats)
        
        # Progress updates
        if idx % 50 == 0 or idx == total_games:
            elapsed = time.time() - start_time
            games_per_sec = idx / elapsed if elapsed > 0 else 0
            remaining_games = total_games - idx
            eta_seconds = remaining_games / games_per_sec if games_per_sec > 0 else 0
            
            logging.info(f"Progress: {idx}/{total_games} games ({idx/total_games*100:.1f}%) | "
                        f"Speed: {games_per_sec:.1f} games/sec | "
                        f"ETA: {eta_seconds/60:.1f} min")
    
    if not all_player_stats:
        logging.error("No player stats collected!")
        return pd.DataFrame()
    
    # Convert to DataFrame
    player_stats_df = pd.DataFrame(all_player_stats)
    
    logging.info(f"\n‚úì Player stats collected: {len(player_stats_df)} records")
    logging.info(f"  Total time: {(time.time() - start_time)/60:.1f} minutes")
    
    # Calculate team features
    logging.info(f"\nCalculating team features...")
    team_features_df = calculate_all_team_features(player_stats_df)
    
    logging.info(f"‚úì Team features calculated: {len(team_features_df)} teams")
    
    return team_features_df

# Process all games for this season
print(f"\nüéØ Processing all games for season {SEASON_TO_PROCESS}...\n")
team_features_df = process_all_games(games_df)

print(f"\n‚úì Feature calculation complete!")
print(f"Total team records: {len(team_features_df)}")
print(f"Features calculated: {len(team_features_df.columns) - 3}")  # Exclude game_id, team_id, season

2025-11-23 18:25:41,930 - INFO - 
2025-11-23 18:25:41,931 - INFO - Processing 1221 games
2025-11-23 18:25:41,932 - INFO - Estimated time: 4.1 minutes




üéØ Processing all games for season 2020...



2025-11-23 18:25:42,440 - INFO - ‚úì API call successful. Remaining today: 7498
2025-11-23 18:25:42,984 - INFO - ‚úì API call successful. Remaining today: 7497
2025-11-23 18:25:43,493 - INFO - ‚úì API call successful. Remaining today: 7496
2025-11-23 18:25:44,018 - INFO - ‚úì API call successful. Remaining today: 7495
2025-11-23 18:25:44,571 - INFO - ‚úì API call successful. Remaining today: 7494
2025-11-23 18:25:45,099 - INFO - ‚úì API call successful. Remaining today: 7493
2025-11-23 18:25:45,609 - INFO - ‚úì API call successful. Remaining today: 7492
2025-11-23 18:25:46,336 - INFO - ‚úì API call successful. Remaining today: 7491
2025-11-23 18:25:46,835 - INFO - ‚úì API call successful. Remaining today: 7490
2025-11-23 18:25:47,348 - INFO - ‚úì API call successful. Remaining today: 7489
2025-11-23 18:25:47,863 - INFO - ‚úì API call successful. Remaining today: 7488
2025-11-23 18:25:48,369 - INFO - ‚úì API call successful. Remaining today: 7487
2025-11-23 18:25:48,867 - INFO - ‚úì API


‚úì Feature calculation complete!
Total team records: 2442
Features calculated: 49


## Step 4: Add Target Variable (team_won)

**For XGBoost Binary Classification:**
- `team_won = 1` if this team won the game
- `team_won = 0` if this team lost the game

Each game has 2 rows (one for each team), so the classes are naturally balanced 50/50.

In [18]:
# Merge with game metadata to get winning team info
final_df = team_features_df.merge(
    games_df[['game_id', 'season', 'winning_team_id', 'date']],
    on='game_id',
    how='left',
    suffixes=('', '_meta')
)

# Create binary target variable for XGBoost
# 1 = this team won, 0 = this team lost
final_df['team_won'] = (final_df['team_id'] == final_df['winning_team_id']).astype(int)

print(f"\n{'='*70}")
print(f"TARGET VARIABLE CREATED")
print(f"{'='*70}")
print(f"\nTarget variable: team_won")
print(f"  1 = Team won this game")
print(f"  0 = Team lost this game")
print(f"\nDistribution:")
print(final_df['team_won'].value_counts().to_dict())
print(f"\nBalance: {final_df['team_won'].value_counts()[1] / len(final_df) * 100:.1f}% wins")
print(f"‚úì Perfectly balanced for binary classification!")


TARGET VARIABLE CREATED

Target variable: team_won
  1 = Team won this game
  0 = Team lost this game

Distribution:
{0: 1221, 1: 1221}

Balance: 50.0% wins
‚úì Perfectly balanced for binary classification!


## Step 5: Organize and Save Dataset

In [19]:
# Select columns for final dataset
# Order: identifiers, features, target

# Identify feature columns (exclude metadata)
exclude_cols = ['winning_team_id', 'season_meta']
feature_cols = [col for col in final_df.columns 
                if col not in ['game_id', 'team_id', 'season', 'team_name', 'date', 'team_won'] + exclude_cols]

# Create final column order
final_columns = [
    # Identifiers
    'season',
    'game_id', 
    'team_id',
    'team_name',
    'date'
] + feature_cols + [
    # Target variable (last column)
    'team_won'
]

ml_dataset = final_df[final_columns].copy()

print(f"\n{'='*70}")
print(f"FINAL DATASET STRUCTURE")
print(f"{'='*70}")
print(f"\nDataset shape: {ml_dataset.shape}")
print(f"  Rows (team-games): {len(ml_dataset)}")
print(f"  Columns: {len(ml_dataset.columns)}")
print(f"\nColumn breakdown:")
print(f"  Identifiers: 5 (season, game_id, team_id, team_name, date)")
print(f"  Features: {len(feature_cols)}")
print(f"  Target: 1 (team_won)")
print(f"\nFeatures included:")
for i, col in enumerate(feature_cols, 1):
    print(f"  {i:2d}. {col}")

print(f"\n{'='*70}")


FINAL DATASET STRUCTURE

Dataset shape: (2442, 54)
  Rows (team-games): 2442
  Columns: 54

Column breakdown:
  Identifiers: 5 (season, game_id, team_id, team_name, date)
  Features: 48
  Target: 1 (team_won)

Features included:
   1. ft_pct_mean
   2. fg_made_sum
   3. fg_made_mean
   4. fg_attempted_sum
   5. fg_attempted_mean
   6. three_made_sum
   7. three_made_mean
   8. three_attempted_sum
   9. three_attempted_mean
  10. ft_made_sum
  11. ft_made_mean
  12. ft_attempted_sum
  13. ft_attempted_mean
  14. minutes_sum
  15. minutes_mean
  16. minutes_max
  17. points_sum
  18. points_mean
  19. points_std
  20. points_max
  21. assists_sum
  22. assists_mean
  23. assists_std
  24. assists_max
  25. total_rebounds_sum
  26. total_rebounds_mean
  27. total_rebounds_std
  28. total_rebounds_max
  29. offensive_reb_sum
  30. offensive_reb_mean
  31. defensive_reb_sum
  32. defensive_reb_mean
  33. steals_sum
  34. steals_mean
  35. blocks_sum
  36. blocks_mean
  37. turnovers_sum
  

## Step 6: Data Quality Checks

In [20]:
print(f"\n{'='*70}")
print(f"DATA QUALITY CHECKS")
print(f"{'='*70}")

# 1. Missing values
print("\n1. Missing Values:")
missing = ml_dataset.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("   ‚úì No missing values!")

# 2. Games with 2 teams
print("\n2. Teams per Game:")
teams_per_game = ml_dataset.groupby('game_id').size()
if (teams_per_game == 2).all():
    print("   ‚úì All games have exactly 2 teams!")
else:
    weird_games = teams_per_game[teams_per_game != 2]
    print(f"   ‚ö†Ô∏è  {len(weird_games)} games with != 2 teams")
    print(weird_games)

# 3. Target balance
print("\n3. Target Variable Balance:")
wins = (ml_dataset['team_won'] == 1).sum()
losses = (ml_dataset['team_won'] == 0).sum()
print(f"   Wins: {wins}")
print(f"   Losses: {losses}")
print(f"   Difference: {abs(wins - losses)}")
if abs(wins - losses) <= 1:
    print("   ‚úì Perfectly balanced!")
else:
    print(f"   ‚ö†Ô∏è  Imbalance detected")

# 4. Feature statistics
print("\n4. Feature Value Ranges:")
print(ml_dataset[feature_cols].describe().T[['mean', 'std', 'min', 'max']].head(10))

# 5. Season verification
print(f"\n5. Season Verification:")
print(f"   Season: {ml_dataset['season'].unique()}")
print(f"   Games: {ml_dataset['game_id'].nunique()}")
print(f"   Date range: {ml_dataset['date'].min()} to {ml_dataset['date'].max()}")

print(f"\n{'='*70}")


DATA QUALITY CHECKS

1. Missing Values:
   ‚úì No missing values!

2. Teams per Game:
   ‚úì All games have exactly 2 teams!

3. Target Variable Balance:
   Wins: 1221
   Losses: 1221
   Difference: 0
   ‚úì Perfectly balanced!

4. Feature Value Ranges:
                           mean        std        min         max
ft_pct_mean           42.599983  12.647668   6.000000   87.500000
fg_made_sum           41.039722   5.262279  23.000000   68.000000
fg_made_mean           3.893948   0.782006   1.666667    7.000000
fg_attempted_sum      88.319410   6.824086  62.000000  115.000000
fg_attempted_mean      8.380772   1.470407   4.411765   16.166667
three_made_sum        12.714169   4.072595   2.000000   31.000000
three_made_mean        1.203495   0.424274   0.142857    3.125000
three_attempted_sum   34.748976   7.138850  10.000000   72.000000
three_attempted_mean   3.292783   0.849300   1.000000    7.875000
ft_made_sum           17.063882   5.812376   3.000000   38.000000

5. Season Verifica

## Step 7: Save to CSV

In [21]:
# Create filename with season
output_filename = f"nba_features_{SEASON_TO_PROCESS}.csv"
output_path = os.path.join(OUTPUT_DIR, output_filename)

# Save to CSV
ml_dataset.to_csv(output_path, index=False)

print(f"\n{'='*70}")
print(f"‚úÖ DATASET SAVED SUCCESSFULLY!")
print(f"{'='*70}")
print(f"\nFile: {output_path}")
print(f"Size: {os.path.getsize(output_path) / 1024:.1f} KB")
print(f"\nDataset summary:")
print(f"  Season: {SEASON_TO_PROCESS}")
print(f"  Games: {ml_dataset['game_id'].nunique()}")
print(f"  Team-game records: {len(ml_dataset)}")
print(f"  Features: {len(feature_cols)}")
print(f"  Target: team_won (binary: 0=loss, 1=win)")

print(f"\n{'='*70}")
print(f"NEXT STEPS")
print(f"{'='*70}")
print(f"\n1. To process another season:")
print(f"   - Change SEASON_TO_PROCESS variable")
print(f"   - Re-run all cells")
print(f"\n2. After collecting all seasons (2018-2024):")
print(f"   - Combine CSVs: pd.concat([df_2018, df_2019, ..., df_2024])")
print(f"   - Split: train = seasons 2018-2023, test = season 2024")
print(f"   - Train XGBoost model")
print(f"\n3. For ML training:")
print(f"   X = df.drop(['game_id', 'team_id', 'team_name', 'date', 'team_won'], axis=1)")
print(f"   y = df['team_won']")
print(f"   # Keep 'season' column for train/test split!")


‚úÖ DATASET SAVED SUCCESSFULLY!

File: nba_features_by_year/nba_features_2020.csv
Size: 1340.4 KB

Dataset summary:
  Season: 2020
  Games: 1221
  Team-game records: 2442
  Features: 48
  Target: team_won (binary: 0=loss, 1=win)

NEXT STEPS

1. To process another season:
   - Change SEASON_TO_PROCESS variable
   - Re-run all cells

2. After collecting all seasons (2018-2024):
   - Combine CSVs: pd.concat([df_2018, df_2019, ..., df_2024])
   - Split: train = seasons 2018-2023, test = season 2024
   - Train XGBoost model

3. For ML training:
   X = df.drop(['game_id', 'team_id', 'team_name', 'date', 'team_won'], axis=1)
   y = df['team_won']
   # Keep 'season' column for train/test split!


## Sample of Final Dataset

In [22]:
print(f"\n{'='*70}")
print(f"SAMPLE DATA (First 5 Records)")
print(f"{'='*70}\n")

# Show first game (2 teams)
first_game_id = ml_dataset['game_id'].iloc[0]
sample = ml_dataset[ml_dataset['game_id'] == first_game_id]

print(f"Game ID: {first_game_id}")
print(f"\nTeam 1: {sample.iloc[0]['team_name']}")
print(f"  team_won: {sample.iloc[0]['team_won']} ({'WON' if sample.iloc[0]['team_won'] == 1 else 'LOST'})")
print(f"  points_sum: {sample.iloc[0]['points_sum']}")
print(f"  assists_sum: {sample.iloc[0]['assists_sum']}")
print(f"\nTeam 2: {sample.iloc[1]['team_name']}")
print(f"  team_won: {sample.iloc[1]['team_won']} ({'WON' if sample.iloc[1]['team_won'] == 1 else 'LOST'})")
print(f"  points_sum: {sample.iloc[1]['points_sum']}")
print(f"  assists_sum: {sample.iloc[1]['assists_sum']}")

print(f"\nFull dataframe preview:")
ml_dataset.head(4)


SAMPLE DATA (First 5 Records)

Game ID: 7501

Team 1: Boston Celtics
  team_won: 0 (LOST)
  points_sum: 89.0
  assists_sum: 18.0

Team 2: Brooklyn Nets
  team_won: 1 (WON)
  points_sum: 113.0
  assists_sum: 26.0

Full dataframe preview:


Unnamed: 0,season,game_id,team_id,team_name,date,ft_pct_mean,fg_made_sum,fg_made_mean,fg_attempted_sum,fg_attempted_mean,...,personal_fouls_mean,plus_minus_sum,plus_minus_mean,fg_pct_mean,three_pct_mean,assist_to_turnover_ratio,three_point_rate,offensive_reb_rate,true_shooting_pct,team_won
0,2020,7501,2,Boston Celtics,2020-12-19T01:00:00.000Z,38.392857,32.0,2.285714,92.0,6.571429,...,1.642857,-120.0,-8.571429,31.585714,18.085714,0.818182,0.25,0.229167,0.43955,0
1,2020,7501,4,Brooklyn Nets,2020-12-19T01:00:00.000Z,40.471429,38.0,2.714286,88.0,6.285714,...,1.357143,120.0,8.571429,30.478571,28.364286,1.3,0.421053,0.150943,0.565679,1
2,2020,7502,21,Milwaukee Bucks,2020-12-19T01:00:00.000Z,56.875,38.0,2.375,90.0,5.625,...,2.0,-70.0,-4.375,36.65,16.04375,1.235294,0.263158,0.272727,0.547481,0
3,2020,7502,23,New Orleans Pelicans,2020-12-19T01:00:00.000Z,49.509091,44.0,4.0,86.0,7.818182,...,2.272727,70.0,6.363636,53.663636,22.727273,1.923077,0.272727,0.294118,0.602696,1


## Summary: How to Process All Seasons

### Step-by-Step Process:

1. **Process Season 2018**
   ```python
   SEASON_TO_PROCESS = 2018
   # Run all cells
   # Output: nba_features_by_year/nba_features_2018.csv
   ```

2. **Process Season 2019**
   ```python
   SEASON_TO_PROCESS = 2019
   # Run all cells
   # Output: nba_features_by_year/nba_features_2019.csv
   ```

3. **Repeat for 2020, 2021, 2022, 2023, 2024**

4. **Combine All Seasons** (After collecting all)
   ```python
   import pandas as pd
   import glob
   
   # Load all CSV files
   all_files = glob.glob('nba_features_by_year/*.csv')
   dfs = [pd.read_csv(f) for f in all_files]
   
   # Combine
   complete_df = pd.concat(dfs, ignore_index=True)
   
   # Sort by season and game_id
   complete_df = complete_df.sort_values(['season', 'game_id'])
   
   # Save combined dataset
   complete_df.to_csv('nba_features_2018_2024_complete.csv', index=False)
   
   print(f"Complete dataset: {len(complete_df)} records")
   print(f"Seasons: {sorted(complete_df['season'].unique())}")
   print(f"Games: {complete_df['game_id'].nunique()}")
   ```

### Timeline with 7,500 Requests/Day:
- **Per season**: ~1,230 games = ~1,230 requests = 3-4 hours
- **All 7 seasons**: Can be done in 1 day!
- **Recommendation**: Process 2-3 seasons per day to be safe

### Target Variable Format:
- **Column name**: `team_won`
- **Data type**: Integer (0 or 1)
- **Meaning**: 
  - `1` = This team WON the game
  - `0` = This team LOST the game
- **Perfect for XGBoost binary classification!**