In [2]:
# NBA Win Probability Predictor - Colab Setup
# Run this cell first in your Google Colab notebook

"""
This notebook sets up the complete environment for the NBA prediction project.
Run each cell in order.
"""

# =============================================================================
# CELL 1: Mount Google Drive (for persistent storage)
# =============================================================================

from google.colab import drive
import os

# Mount Drive
drive.mount('/content/drive')

# Create project directory structure in Drive
project_root = '/content/drive/MyDrive/nba_predictor'

directories = [
    'data/raw',
    'data/processed',
    'features',
    'models',
    'evaluation',
    'notebooks',
    'simulation'
]

for dir_path in directories:
    full_path = os.path.join(project_root, dir_path)
    os.makedirs(full_path, exist_ok=True)
    print(f"✓ Created: {dir_path}")

# Set working directory
os.chdir(project_root)
print(f"\n✓ Working directory set to: {os.getcwd()}")


# =============================================================================
# CELL 2: Install Dependencies (Minimal, Professional)
# =============================================================================

print("Installing dependencies...\n")

# Install required packages
!pip install -q nba_api pandas numpy scikit-learn xgboost lightgbm matplotlib plotly kaleido

# CELL 2 (FIXED): Verify installations

print("✓ All dependencies installed successfully")

# Verify installations
import nba_api
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb
import lightgbm as lgb
import matplotlib
import plotly

print(f"\n✓ nba_api imported successfully")
print(f"✓ pandas version: {pd.__version__}")
print(f"✓ numpy version: {np.__version__}")
print(f"✓ scikit-learn version: {sklearn.__version__}")
print(f"✓ xgboost version: {xgb.__version__}")
print(f"✓ matplotlib version: {matplotlib.__version__}")
print(f"✓ plotly version: {plotly.__version__}")


# =============================================================================
# CELL 3: Import Core Libraries
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.3f}'.format)

# Set matplotlib style
plt.style.use('default')
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 10

print("✓ Core libraries imported")
print("✓ Display settings configured")


# =============================================================================
# CELL 4: Helper Functions for Data Management
# =============================================================================

def save_checkpoint(df, filename, description=""):
    """Save dataframe to Drive with timestamp"""
    filepath = os.path.join(project_root, 'data/processed', filename)
    df.to_csv(filepath, index=False)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"✓ [{timestamp}] Saved: {filename}")
    if description:
        print(f"  {description}")
    print(f"  Shape: {df.shape}")
    return filepath

def load_checkpoint(filename):
    """Load dataframe from Drive"""
    filepath = os.path.join(project_root, 'data/processed', filename)
    if os.path.exists(filepath):
        df = pd.read_csv(filepath)
        print(f"✓ Loaded: {filename}")
        print(f"  Shape: {df.shape}")
        return df
    else:
        print(f"✗ File not found: {filename}")
        return None

def verify_no_leakage(df, date_col='game_date'):
    """Verify data is sorted chronologically"""
    df[date_col] = pd.to_datetime(df[date_col])
    is_sorted = df[date_col].is_monotonic_increasing

    if is_sorted:
        print("✓ Data is properly sorted chronologically")
        print(f"  Date range: {df[date_col].min()} to {df[date_col].max()}")
    else:
        print("✗ WARNING: Data is NOT sorted chronologically!")
        print("  This could cause data leakage!")

    return is_sorted

def check_missing_values(df, name="DataFrame"):
    """Check for missing values"""
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100

    if missing.sum() == 0:
        print(f"✓ {name}: No missing values")
    else:
        print(f"⚠ {name}: Missing values detected")
        missing_df = pd.DataFrame({
            'column': missing.index,
            'missing_count': missing.values,
            'missing_pct': missing_pct.values
        })
        missing_df = missing_df[missing_df['missing_count'] > 0].sort_values(
            'missing_count', ascending=False
        )
        print(missing_df.to_string(index=False))

print("✓ Helper functions defined")


# =============================================================================
# CELL 5: Project Configuration
# =============================================================================

# Project settings
CONFIG = {
    'seasons': ['2021-22', '2022-23', '2023-24'],
    'train_seasons': ['2021-22', '2022-23'],
    'test_season': '2023-24',
    'rolling_windows': [5, 10],
    'primary_window': 5,
    'random_seed': 42,
    'n_simulations': 10000
}

print("✓ Project configuration loaded")
print("\nConfiguration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")


# =============================================================================
# CELL 6: Test NBA API Connection
# =============================================================================

from nba_api.stats.static import teams
from nba_api.stats.endpoints import leaguegamefinder

print("Testing NBA API connection...")

try:
    # Test 1: Get team list
    nba_teams = teams.get_teams()
    print(f"✓ Successfully retrieved {len(nba_teams)} NBA teams")

    # Test 2: Get sample game data
    gamefinder = leaguegamefinder.LeagueGameFinder(
        season_nullable='2023-24',
        league_id_nullable='00',
        season_type_nullable='Regular Season'
    )
    sample_games = gamefinder.get_data_frames()[0]
    print(f"✓ Successfully retrieved sample data: {sample_games.shape[0]} rows")

    # Display sample teams
    print("\nSample teams:")
    for team in nba_teams[:5]:
        print(f"  {team['full_name']} ({team['abbreviation']})")

    print("\n✓ NBA API connection successful!")

except Exception as e:
    print(f"✗ NBA API connection failed: {str(e)}")
    print("  Check your internet connection and try again")


# =============================================================================
# CELL 7: Create Team Lookup Dictionary
# =============================================================================

# Create and save team lookup for consistent use
nba_teams = teams.get_teams()
team_lookup = {team['id']: team['abbreviation'] for team in nba_teams}
team_name_lookup = {team['id']: team['full_name'] for team in nba_teams}

# Save to Drive for reference
team_df = pd.DataFrame(nba_teams)
team_df.to_csv(os.path.join(project_root, 'data/raw/team_reference.csv'), index=False)

print(f"✓ Team lookup created with {len(team_lookup)} teams")
print(f"✓ Team reference saved to data/raw/team_reference.csv")

# Display sample
print("\nSample team mappings:")
for team_id, abbr in list(team_lookup.items())[:5]:
    full_name = team_name_lookup[team_id]
    print(f"  {team_id}: {abbr} - {full_name}")


# =============================================================================
# CELL 8: Environment Verification Summary
# =============================================================================

print("\n" + "="*60)
print("ENVIRONMENT SETUP COMPLETE")
print("="*60)

print("\n✓ Google Drive mounted")
print(f"✓ Project directory: {project_root}")
print("✓ Directory structure created")
print("✓ Dependencies installed and verified")
print("✓ Helper functions loaded")
print("✓ Configuration set")
print("✓ NBA API connection tested")
print("✓ Team lookup created")

print("\n" + "="*60)
print("READY TO BEGIN DATA COLLECTION")
print("="*60)

print("\nNext steps:")
print("1. Run the data collection script (Week 1, Day 1-2)")
print("2. All data will be saved to your Google Drive")
print("3. Progress will persist across Colab sessions")

print("\n⚠ IMPORTANT REMINDERS:")
print("  • Never shuffle data randomly - always use time-based splits")
print("  • Save checkpoints frequently to Drive")
print("  • Verify no data leakage at each step")
print("  • Check for missing values before modeling")

Mounted at /content/drive
✓ Created: data/raw
✓ Created: data/processed
✓ Created: features
✓ Created: models
✓ Created: evaluation
✓ Created: notebooks
✓ Created: simulation

✓ Working directory set to: /content/drive/MyDrive/nba_predictor
Installing dependencies...

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.0/319.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.0/69.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h✓ All dependencies installed successfully

✓ nba_api imported successfully
✓ pandas version: 2.2.2
✓ numpy version: 2.0.2
✓ scikit-learn version: 1.6.1
✓ xgboost version: 3.1.2
✓ matplotlib version: 3.10.0
✓ plotly version: 5.24.1
✓ Core libraries imported
✓ Display settings configured
✓ Helper functions defined
✓ Project configuration loaded

Configuration:
  season

In [3]:
"""
NBA Win Probability Predictor
STAGE 1 — RAW GAME INGESTION (PRODUCTION SAFE)

Guarantees:
- One row per game
- No post-game box score leakage
- Explicit integrity assertions
- Chronologically sorted
"""

import time
import random
from typing import List

import pandas as pd
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams
from requests.exceptions import Timeout, ConnectionError

# =============================================================================
# CONFIG
# =============================================================================

CONFIG = {
    "seasons": ["2021-22", "2022-23", "2023-24"],
}

SEASONS: List[str] = CONFIG["seasons"]

# Respectful to NBA API rate limits.
# Randomized sleep avoids bot-like request patterns and reduces throttling risk.
SLEEP_RANGE = (1.0, 2.5)

MAX_RETRIES = 3

# =============================================================================
# STEP 1 — LOCK TEAM IDS
# =============================================================================

print("\n[1] Locking team identifiers")

nba_teams = teams.get_teams()
TEAM_ID_TO_ABBR = {t["id"]: t["abbreviation"] for t in nba_teams}

assert len(TEAM_ID_TO_ABBR) == 30, "Expected 30 NBA teams"

# =============================================================================
# STEP 2 — SAFE API PULL
# =============================================================================

def pull_season_games(season: str) -> pd.DataFrame:
    """Pull raw team-game rows with retry + backoff."""
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            print(f"  Pulling {season} (attempt {attempt})")

            finder = leaguegamefinder.LeagueGameFinder(
                season_nullable=season,
                league_id_nullable="00",
                season_type_nullable="Regular Season",
            )

            df = finder.get_data_frames()[0]

            # lock identifiers immediately
            df["team_abbr"] = df["TEAM_ID"].map(TEAM_ID_TO_ABBR)
            df["season"] = season

            time.sleep(random.uniform(*SLEEP_RANGE))
            return df

        except (Timeout, ConnectionError) as e:
            if attempt == MAX_RETRIES:
                raise RuntimeError(f"API failed for {season}") from e
            time.sleep(3 * attempt)

# =============================================================================
# STEP 3 — COLLECT RAW TEAM-GAME DATA
# =============================================================================

print("\n[2] Pulling raw data")

team_game_rows = []

for season in SEASONS:
    df = pull_season_games(season)
    team_game_rows.append(df)

raw_games = pd.concat(team_game_rows, ignore_index=True)

# integrity checks
assert raw_games["GAME_ID"].nunique() * 2 == len(raw_games), \
    "Each GAME_ID must have exactly 2 team rows"

assert raw_games["team_abbr"].isna().sum() == 0, \
    "Missing team abbreviations detected"

# =============================================================================
# STEP 4 — TRANSFORM TO GAME LEVEL (OUTCOMES ONLY)
# =============================================================================

print("\n[3] Transforming to game-level rows")

raw_games["is_home"] = raw_games["MATCHUP"].str.contains("vs.", na=False)

home = raw_games[raw_games["is_home"]].copy()
away = raw_games[~raw_games["is_home"]].copy()

assert home["GAME_ID"].nunique() == away["GAME_ID"].nunique(), \
    "Mismatch between home and away games"

games = home.merge(
    away,
    on=["GAME_ID", "GAME_DATE", "season"],
    suffixes=("_home", "_away"),
    validate="one_to_one",
)

# =============================================================================
# STEP 5 — CLEAN, LEAK-SAFE SCHEMA
# =============================================================================

games = games.rename(columns={
    "GAME_ID": "game_id",
    "GAME_DATE": "game_date",
    "TEAM_ID_home": "home_team_id",
    "TEAM_ID_away": "away_team_id",
    "team_abbr_home": "home_team",
    "team_abbr_away": "away_team",
    "PTS_home": "home_score",
    "PTS_away": "away_score",
})

games["game_date"] = pd.to_datetime(games["game_date"])
games = games.sort_values("game_date").reset_index(drop=True)

games["home_win"] = (games["home_score"] > games["away_score"]).astype(int)

games = games[[
    "game_id",
    "game_date",
    "season",
    "home_team_id",
    "away_team_id",
    "home_team",
    "away_team",
    "home_score",
    "away_score",
    "home_win",
]]

# =============================================================================
# STEP 6 — DATA QUALITY ASSERTIONS
# =============================================================================

print("\n[4] Running integrity checks")

# chronological
assert games["game_date"].is_monotonic_increasing, \
    "Games not sorted chronologically"

# duplicate game protection
assert games["game_id"].nunique() == len(games), \
    "Duplicate game_ids detected"

# win sanity
home_win_rate = games["home_win"].mean()
assert 0.53 < home_win_rate < 0.67, \
    f"Suspicious home win rate: {home_win_rate:.3f}"

# per-season sanity (accounts for COVID-shortened 2019-20)
season_counts = games.groupby("season").size()
for season, n in season_counts.items():
    if season == "2019-20":
        assert 900 < n < 1100, \
            f"{season}: {n} games is suspicious (COVID season)"
    else:
        assert 1100 < n < 1300, \
            f"{season}: {n} games is suspicious"

# =============================================================================
# STEP 7 — SAVE
# =============================================================================

output_path = save_checkpoint(
    games,
    "games_outcomes.csv",
    "Leak-safe game-level outcomes (no box score features)"
)

print("\n✔ DATA INGESTION COMPLETE")
print(f"✔ Games: {len(games)}")
print(f"✔ Seasons: {len(SEASONS)}")
print(f"✔ Home win rate: {home_win_rate:.3f}")
print(f"✔ Saved to: {output_path}")



[1] Locking team identifiers

[2] Pulling raw data
  Pulling 2021-22 (attempt 1)
  Pulling 2022-23 (attempt 1)
  Pulling 2023-24 (attempt 1)

[3] Transforming to game-level rows

[4] Running integrity checks
✓ [2026-01-06 00:56:32] Saved: games_outcomes.csv
  Leak-safe game-level outcomes (no box score features)
  Shape: (3690, 10)

✔ DATA INGESTION COMPLETE
✔ Games: 3690
✔ Seasons: 3
✔ Home win rate: 0.556
✔ Saved to: /content/drive/MyDrive/nba_predictor/data/processed/games_outcomes.csv


In [4]:
"""
NBA Win Probability Predictor
STAGE 2 — CONTEXT FEATURES (LEAK-SAFE, PRODUCTION)

Adds pre-game context known before tip-off:
- Rest days (season-aware)
- Back-to-back indicators
- Rest advantage
- Season opener flags

Guarantees:
- No future information leakage
- Season boundaries respected
- Multi-season safe
"""

import pandas as pd

# =============================================================================
# LOAD GAME OUTCOMES
# =============================================================================

print("\n[1] Loading game outcomes")

games = load_checkpoint("games_outcomes.csv")

assert games is not None, "games_outcomes.csv not found"

# ---------------- FIXED: enforce datetime ----------------
games["game_date"] = pd.to_datetime(games["game_date"], errors="raise")
# ----------------------------------------------------------

assert games["game_date"].is_monotonic_increasing, "Games not sorted chronologically"

print(f"✓ Loaded {len(games)} games")
print(f"✓ Seasons: {games['season'].nunique()}")

# =============================================================================
# STEP 1 — BUILD TEAM SCHEDULE
# =============================================================================

print("\n[2] Building per-team schedules")

def build_team_schedule(games_df: pd.DataFrame) -> pd.DataFrame:
    """
    Each row = one team's participation in one game.
    """
    home = games_df[["game_id", "game_date", "season", "home_team"]].copy()
    home = home.rename(columns={"home_team": "team"})
    home["is_home"] = 1

    away = games_df[["game_id", "game_date", "season", "away_team"]].copy()
    away = away.rename(columns={"away_team": "team"})
    away["is_home"] = 0

    schedule = pd.concat([home, away], ignore_index=True)
    schedule = schedule.sort_values(
        ["team", "season", "game_date"]
    ).reset_index(drop=True)

    return schedule

team_schedule = build_team_schedule(games)

print(f"✓ Team-game rows: {len(team_schedule)}")
print(f"✓ Teams: {team_schedule['team'].nunique()}")

# Season-aware sanity check
seasons_loaded = games["season"].nunique()
expected_games = 82 * seasons_loaded
games_per_team = team_schedule.groupby("team").size()

assert games_per_team.min() > expected_games * 0.90, \
    "Suspiciously few games for at least one team"

assert games_per_team.max() < expected_games * 1.05, \
    "Suspiciously many games for at least one team"

# =============================================================================
# STEP 2 — COMPUTE REST DAYS (SEASON-AWARE)
# =============================================================================

print("\n[3] Computing rest days")

def compute_rest_days(schedule: pd.DataFrame) -> pd.DataFrame:
    """
    Rest days logic:
    - Computed only within season
    - Season openers flagged explicitly
    - Rest days capped only for missing previous games
    """
    schedule = schedule.copy()

    schedule["prev_game_date"] = (
        schedule
        .groupby(["team", "season"])["game_date"]
        .shift(1)
    )

    schedule["is_season_opener"] = schedule["prev_game_date"].isna().astype(int)

    schedule["rest_days"] = (
        schedule["game_date"] - schedule["prev_game_date"]
    ).dt.days

    # Season openers: cap rest (offseason gap is not informative)
    schedule["rest_days"] = schedule["rest_days"].fillna(7).astype(int)

    assert schedule["rest_days"].min() >= 0, "Negative rest days detected"
    assert schedule["rest_days"].max() <= 30, "Unrealistic rest days detected"

    return schedule

team_schedule = compute_rest_days(team_schedule)

print(
    f"✓ Rest days range: "
    f"{team_schedule['rest_days'].min()}–{team_schedule['rest_days'].max()}"
)

# =============================================================================
# STEP 3 — BACK-TO-BACK INDICATOR
# =============================================================================

print("\n[4] Computing back-to-back flags")

team_schedule["is_back_to_back"] = (team_schedule["rest_days"] == 1).astype(int)

# Validate per season (not averaged away)
b2b_by_season = team_schedule.groupby("season")["is_back_to_back"].mean()

assert b2b_by_season.between(0.12, 0.30).all(), \
    f"Suspicious B2B rate detected:\n{b2b_by_season}"

print("✓ Back-to-back rates by season:")
print(b2b_by_season.round(3))

# =============================================================================
# STEP 4 — MERGE CONTEXT BACK TO GAME LEVEL
# =============================================================================

print("\n[5] Merging context features")

context_cols = ["game_id", "team", "rest_days", "is_back_to_back", "is_season_opener"]

# Home merge
games = games.merge(
    team_schedule[context_cols],
    left_on=["game_id", "home_team"],
    right_on=["game_id", "team"],
    how="left",
    validate="one_to_one"
).drop(columns="team")

games = games.rename(columns={
    "rest_days": "home_rest_days",
    "is_back_to_back": "home_b2b",
    "is_season_opener": "home_season_opener"
})

# Away merge
games = games.merge(
    team_schedule[context_cols],
    left_on=["game_id", "away_team"],
    right_on=["game_id", "team"],
    how="left",
    validate="one_to_one"
).drop(columns="team")

games = games.rename(columns={
    "rest_days": "away_rest_days",
    "is_back_to_back": "away_b2b",
    "is_season_opener": "away_season_opener"
})

# Missing-value protection
context_features = [
    "home_rest_days", "away_rest_days",
    "home_b2b", "away_b2b",
    "home_season_opener", "away_season_opener"
]

assert games[context_features].isna().sum().sum() == 0, \
    "Missing context features detected"

print("✓ Context features merged cleanly")

# =============================================================================
# STEP 5 — RELATIVE CONTEXT FEATURES
# =============================================================================

print("\n[6] Computing relative features")

games["rest_advantage"] = games["home_rest_days"] - games["away_rest_days"]
games["both_b2b"] = (
    (games["home_b2b"] == 1) & (games["away_b2b"] == 1)
).astype(int)

# =============================================================================
# STEP 6 — VALIDATION
# =============================================================================

print("\n[7] Validating distributions")

print(f"Mean rest advantage: {games['rest_advantage'].mean():.3f}")
print(f"Both teams B2B rate: {games['both_b2b'].mean():.3f}")

# NOTE:
# Correlations below are diagnostic only.
# They do NOT indicate model usefulness or causality.

print("\nContext correlations with home_win (diagnostic only):")
print(
    games[
        context_features + ["rest_advantage", "both_b2b", "home_win"]
    ]
    .corr()["home_win"]
    .drop("home_win")
    .sort_values(ascending=False)
)

# =============================================================================
# STEP 7 — SAVE
# =============================================================================

print("\n[8] Saving enriched dataset")

output_path = save_checkpoint(
    games,
    "games_with_context.csv",
    "Leak-safe game outcomes with rest & back-to-back context"
)

print("\n✔ CONTEXT FEATURES COMPLETE")
print(f"✔ Games: {len(games)}")
print(f"✔ Features added: {len(context_features) + 2}")
print(f"✔ Saved to: {output_path}")

print("\n" + "=" * 60)
print("READY FOR STAGE 3 — ROLLING PERFORMANCE FEATURES")
print("=" * 60)



[1] Loading game outcomes
✓ Loaded: games_outcomes.csv
  Shape: (3690, 10)
✓ Loaded 3690 games
✓ Seasons: 3

[2] Building per-team schedules
✓ Team-game rows: 7380
✓ Teams: 30

[3] Computing rest days
✓ Rest days range: 1–9

[4] Computing back-to-back flags
✓ Back-to-back rates by season:
season
2021-22   0.172
2022-23   0.163
2023-24   0.172
Name: is_back_to_back, dtype: float64

[5] Merging context features
✓ Context features merged cleanly

[6] Computing relative features

[7] Validating distributions
Mean rest advantage: 0.102
Both teams B2B rate: 0.043

Context correlations with home_win (diagnostic only):
rest_advantage        0.060
away_b2b              0.050
home_rest_days        0.038
both_b2b              0.011
home_season_opener    0.008
away_season_opener    0.002
away_rest_days       -0.018
home_b2b             -0.057
Name: home_win, dtype: float64

[8] Saving enriched dataset
✓ [2026-01-06 00:56:33] Saved: games_with_context.csv
  Leak-safe game outcomes with rest & back

In [5]:
"""
NBA Win Probability Predictor
STAGE 3 — ROLLING PERFORMANCE FEATURES (CACHED, PRODUCTION)

Computes rolling team performance metrics using ONLY past games.
All computations happen once and are cached for fast lookups.

Features computed (per window):
- Points scored/allowed (offensive/defensive ratings proxy)
- Net rating (scored - allowed)
- Win rate
- Games in rolling window

Guarantees:
- No data leakage (only past games used)
- Season boundaries respected
- Early-season partial windows handled safely
- Aggressive caching (no recomputation)
"""

import pandas as pd
import numpy as np

# =============================================================================
# CONFIG FALLBACK
# =============================================================================

if "CONFIG" not in globals():
    CONFIG = {"rolling_windows": [5, 10], "primary_window": 5}

WINDOWS = CONFIG.get("rolling_windows", [5, 10])
PRIMARY_WINDOW = CONFIG.get("primary_window", 5)

# =============================================================================
# LOAD GAMES WITH CONTEXT
# =============================================================================

print("\n[1] Loading games with context")

games = load_checkpoint("games_with_context.csv")
assert games is not None, "games_with_context.csv not found"

# Enforce proper date type
games["game_date"] = pd.to_datetime(games["game_date"], errors="raise")
assert games["game_date"].is_monotonic_increasing, "Games not sorted chronologically"

print(f"✓ Loaded {len(games)} games from {games['season'].nunique()} seasons")

# =============================================================================
# STEP 1 — BUILD TEAM GAME LOG
# =============================================================================

print("\n[2] Building team game logs")

def build_team_game_log(games_df: pd.DataFrame) -> pd.DataFrame:
    """
    Each row = one team's stats from one game
    """
    # Home games
    home_log = games_df[[
        "game_id", "game_date", "season",
        "home_team", "away_team",
        "home_score", "away_score"
    ]].copy()

    home_log = home_log.rename(columns={
        "home_team": "team",
        "away_team": "opponent",
        "home_score": "pts_scored",
        "away_score": "pts_allowed"
    })
    home_log["is_home"] = 1

    # Away games
    away_log = games_df[[
        "game_id", "game_date", "season",
        "away_team", "home_team",
        "away_score", "home_score"
    ]].copy()

    away_log = away_log.rename(columns={
        "away_team": "team",
        "home_team": "opponent",
        "away_score": "pts_scored",
        "home_score": "pts_allowed"
    })
    away_log["is_home"] = 0

    # Combine and sort
    team_log = pd.concat([home_log, away_log], ignore_index=True)
    team_log = team_log.sort_values(["team", "season", "game_date"]).reset_index(drop=True)

    # Derived metrics
    team_log["net_rating"] = team_log["pts_scored"] - team_log["pts_allowed"]
    team_log["won"] = (team_log["pts_scored"] > team_log["pts_allowed"]).astype(int)

    # Ensure numeric types
    numeric_cols = ["pts_scored", "pts_allowed", "net_rating", "won"]
    team_log[numeric_cols] = team_log[numeric_cols].apply(pd.to_numeric, errors="raise")

    return team_log

team_log = build_team_game_log(games)

print(f"✓ Team game log: {len(team_log)} rows, {team_log['team'].nunique()} teams")
print(f"✓ Date range: {team_log['game_date'].min().date()} to {team_log['game_date'].max().date()}")

# =============================================================================
# STEP 2 — PRECOMPUTE ROLLING STATS
# =============================================================================

print("\n[3] Precomputing rolling statistics (past games only)")

def precompute_rolling_stats(team_log: pd.DataFrame, windows: list) -> pd.DataFrame:
    log = team_log.copy()
    grouped = log.groupby(["team", "season"])

    for window in windows:
        print(f"    Rolling window: {window} games")

        log[f"pts_scored_L{window}"] = grouped["pts_scored"].shift(1).rolling(window, min_periods=1).mean()
        log[f"pts_allowed_L{window}"] = grouped["pts_allowed"].shift(1).rolling(window, min_periods=1).mean()
        log[f"net_rating_L{window}"] = grouped["net_rating"].shift(1).rolling(window, min_periods=1).mean()
        log[f"win_rate_L{window}"] = grouped["won"].shift(1).rolling(window, min_periods=1).mean()
        log[f"games_in_window_L{window}"] = grouped["won"].shift(1).rolling(window, min_periods=1).count()

    return log

team_log = precompute_rolling_stats(team_log, WINDOWS)

rolling_cols = [c for c in team_log.columns if "_L" in c]
nan_counts = team_log[rolling_cols].isna().sum()
if nan_counts.sum() > 0:
    print("⚠ NaNs detected in rolling stats (expected for early-season games)")
else:
    print("✓ No NaNs in rolling stats")

# =============================================================================
# STEP 3 — SAVE ROLLING STATS CACHE
# =============================================================================

print("\n[4] Saving rolling stats cache")

cache_path = save_checkpoint(team_log, "team_rolling_stats_cache.csv",
                             "Precomputed rolling stats for all teams/dates")

print(f"✓ Cache saved: {cache_path}, rows: {len(team_log)}, columns: {len(team_log.columns)}")

# =============================================================================
# STEP 4 — MERGE TO GAME LEVEL
# =============================================================================

print("\n[5] Merging rolling stats into game-level data")

merge_cols = [
    "game_id", "team",
    f"pts_scored_L{PRIMARY_WINDOW}", f"pts_allowed_L{PRIMARY_WINDOW}",
    f"net_rating_L{PRIMARY_WINDOW}", f"win_rate_L{PRIMARY_WINDOW}",
    f"games_in_window_L{PRIMARY_WINDOW}"
]

# Home team
games = games.merge(team_log[merge_cols], left_on=["game_id", "home_team"],
                    right_on=["game_id", "team"], how="left", validate="one_to_one").drop(columns="team")

games = games.rename(columns={
    f"pts_scored_L{PRIMARY_WINDOW}": f"home_pts_L{PRIMARY_WINDOW}",
    f"pts_allowed_L{PRIMARY_WINDOW}": f"home_pa_L{PRIMARY_WINDOW}",
    f"net_rating_L{PRIMARY_WINDOW}": f"home_net_L{PRIMARY_WINDOW}",
    f"win_rate_L{PRIMARY_WINDOW}": f"home_winpct_L{PRIMARY_WINDOW}",
    f"games_in_window_L{PRIMARY_WINDOW}": f"home_games_L{PRIMARY_WINDOW}"
})

# Away team
games = games.merge(team_log[merge_cols], left_on=["game_id", "away_team"],
                    right_on=["game_id", "team"], how="left", validate="one_to_one").drop(columns="team")

games = games.rename(columns={
    f"pts_scored_L{PRIMARY_WINDOW}": f"away_pts_L{PRIMARY_WINDOW}",
    f"pts_allowed_L{PRIMARY_WINDOW}": f"away_pa_L{PRIMARY_WINDOW}",
    f"net_rating_L{PRIMARY_WINDOW}": f"away_net_L{PRIMARY_WINDOW}",
    f"win_rate_L{PRIMARY_WINDOW}": f"away_winpct_L{PRIMARY_WINDOW}",
    f"games_in_window_L{PRIMARY_WINDOW}": f"away_games_L{PRIMARY_WINDOW}"
})

# =============================================================================
# STEP 5 — SEASON-ZSCORE NORMALIZATION
# =============================================================================

print("\n[6] Normalizing rolling stats within each season")

def normalize_within_season(df: pd.DataFrame, window: int) -> pd.DataFrame:
    df = df.copy()
    cols = [
        f"home_pts_L{window}", f"home_pa_L{window}", f"home_net_L{window}",
        f"away_pts_L{window}", f"away_pa_L{window}", f"away_net_L{window}"
    ]

    for col in cols:
        df[f"{col}_z"] = df.groupby("season")[col].transform(
            lambda x: (x - x.mean()) / x.std(ddof=0) if x.std(ddof=0) != 0 else 0
        )

    return df

games = normalize_within_season(games, PRIMARY_WINDOW)

# =============================================================================
# STEP 6 — RELATIVE MATCHUP FEATURES
# =============================================================================

print("\n[7] Computing matchup features")

games[f"net_diff_L{PRIMARY_WINDOW}"] = (
    games[f"home_net_L{PRIMARY_WINDOW}_z"] - games[f"away_net_L{PRIMARY_WINDOW}_z"]
)

games[f"off_vs_def_L{PRIMARY_WINDOW}"] = (
    games[f"home_pts_L{PRIMARY_WINDOW}_z"] - games[f"away_pa_L{PRIMARY_WINDOW}_z"]
)

games[f"winpct_diff_L{PRIMARY_WINDOW}"] = (
    games[f"home_winpct_L{PRIMARY_WINDOW}"] - games[f"away_winpct_L{PRIMARY_WINDOW}"]
)

# =============================================================================
# STEP 7 — SAVE FINAL DATASET
# =============================================================================

print("\n[8] Saving games with rolling features")

output_path = save_checkpoint(
    games,
    "games_with_rolling_features.csv",
    f"Games with context + L{PRIMARY_WINDOW} rolling performance features"
)

print("\n✔ ROLLING FEATURES COMPLETE")
print(f"✔ Games: {len(games)}")
print(f"✔ Features added: {len([c for c in games.columns if '_L' in c]) + 3}")
print(f"✔ Saved to: {output_path}")

print("\n" + "=" * 60)
print("READY FOR WEEK 3: MODELING & EVALUATION")
print("=" * 60)



[1] Loading games with context
✓ Loaded: games_with_context.csv
  Shape: (3690, 18)
✓ Loaded 3690 games from 3 seasons

[2] Building team game logs
✓ Team game log: 7380 rows, 30 teams
✓ Date range: 2021-10-19 to 2024-04-14

[3] Precomputing rolling statistics (past games only)
    Rolling window: 5 games
    Rolling window: 10 games
⚠ NaNs detected in rolling stats (expected for early-season games)

[4] Saving rolling stats cache
✓ [2026-01-06 00:56:34] Saved: team_rolling_stats_cache.csv
  Precomputed rolling stats for all teams/dates
  Shape: (7380, 20)
✓ Cache saved: /content/drive/MyDrive/nba_predictor/data/processed/team_rolling_stats_cache.csv, rows: 7380, columns: 20

[5] Merging rolling stats into game-level data

[6] Normalizing rolling stats within each season

[7] Computing matchup features

[8] Saving games with rolling features
✓ [2026-01-06 00:56:35] Saved: games_with_rolling_features.csv
  Games with context + L5 rolling performance features
  Shape: (3690, 37)

✔ ROLL

In [6]:
"""
NBA Win Probability Predictor
STAGE 4 — MODELING & EVALUATION (PRODUCTION)

Builds and evaluates prediction models:
1. Baseline: Training home win rate (honest constant)
2. Logistic Regression: Linear baseline
3. XGBoost: Primary model

Evaluation metrics:
- Accuracy (secondary)
- Log Loss (primary)
- Brier Score
- Calibration

Guarantees:
- Time-based train/test split (no random shuffle)
- Honest evaluation (no peeking at test set)
- Multiple metrics (not just accuracy)
"""

import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss
from sklearn.calibration import calibration_curve
import xgboost as xgb

# ========================
# GLOBAL CONFIG & DIRECTORIES
# ========================
CONFIG = {
    "primary_window": 5,
    "train_seasons": ["2021-22", "2022-23"],
    "val_fraction": 0.1,  # fraction of training used as validation
    "test_season": "2023-24",
    "random_seed": 42,
    "rolling_windows": [5, 10]
}

np.random.seed(CONFIG["random_seed"])

os.makedirs("evaluation", exist_ok=True)
os.makedirs("models", exist_ok=True)

# ========================
# LOAD FEATURE-RICH DATASET
# ========================
print("\n[1] Loading dataset with all features")
games = load_checkpoint("games_with_rolling_features.csv")
assert games is not None, "games_with_rolling_features.csv not found"

games["game_date"] = pd.to_datetime(games["game_date"])
print(f"✓ Loaded: {games.shape}")
print(f"✓ Seasons: {games['season'].unique()}")

# ========================
# DEFINE FEATURE SET
# ========================
PRIMARY_WINDOW = CONFIG.get("primary_window", 5)

context_features = [
    "home_rest_days", "away_rest_days", "rest_advantage",
    "home_b2b", "away_b2b", "both_b2b"
]

rolling_features = [
    f"home_pts_L{PRIMARY_WINDOW}_z",
    f"home_pa_L{PRIMARY_WINDOW}_z",
    f"home_net_L{PRIMARY_WINDOW}_z",
    f"away_pts_L{PRIMARY_WINDOW}_z",
    f"away_pa_L{PRIMARY_WINDOW}_z",
    f"away_net_L{PRIMARY_WINDOW}_z",
    f"home_winpct_L{PRIMARY_WINDOW}",
    f"away_winpct_L{PRIMARY_WINDOW}"
]

matchup_features = [
    f"net_diff_L{PRIMARY_WINDOW}",
    f"off_vs_def_L{PRIMARY_WINDOW}",
    f"winpct_diff_L{PRIMARY_WINDOW}"
]

feature_cols = context_features + rolling_features + matchup_features
print(f"✓ Total features: {len(feature_cols)}")

# ========================
# HANDLE MISSING VALUES
# ========================
X_full = games[feature_cols].copy()
missing_counts = X_full.isna().sum()
if missing_counts.sum() > 0:
    print("\n⚠ Missing values detected in features:")
    print(missing_counts[missing_counts > 0])
    # Impute missing values with column mean
    X_full = X_full.fillna(X_full.mean())
    print("✓ Missing values imputed with column mean")

# Ensure features are float
X_full = X_full.astype(float)
y_full = games["home_win"].copy()
seasons_full = games["season"].copy()

# ========================
# TIME-BASED TRAIN/VALIDATION/TEST SPLIT
# ========================
TRAIN_SEASONS = CONFIG.get("train_seasons", ["2021-22", "2022-23"])
TEST_SEASON = CONFIG.get("test_season", "2023-24")
VAL_FRACTION = CONFIG.get("val_fraction", 0.1)

train_mask = seasons_full.isin(TRAIN_SEASONS)
test_mask = seasons_full == TEST_SEASON

X_train_full = X_full[train_mask].reset_index(drop=True)
y_train_full = y_full[train_mask].reset_index(drop=True)
X_test = X_full[test_mask].reset_index(drop=True)
y_test = y_full[test_mask].reset_index(drop=True)
games_test = games[test_mask].reset_index(drop=True)

# Split validation from training
val_size = int(len(X_train_full) * VAL_FRACTION)
X_train = X_train_full[:-val_size].reset_index(drop=True)
y_train = y_train_full[:-val_size].reset_index(drop=True)
X_val = X_train_full[-val_size:].reset_index(drop=True)
y_val = y_train_full[-val_size:].reset_index(drop=True)

print(f"\n✓ Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")
print(f"✓ Train home win rate: {y_train.mean():.3f}, Test home win rate: {y_test.mean():.3f}")

# ========================
# BASELINE MODEL
# ========================
print("\n[2] Baseline model (constant predictor)")
baseline_prob = y_train.mean()
baseline_preds = np.full(len(y_test), baseline_prob)
baseline_acc = accuracy_score(y_test, baseline_preds > 0.5)
baseline_logloss = log_loss(y_test, baseline_preds)
baseline_brier = brier_score_loss(y_test, baseline_preds)
print(f"Accuracy: {baseline_acc:.4f}, LogLoss: {baseline_logloss:.4f}, Brier: {baseline_brier:.4f}")

# ========================
# LOGISTIC REGRESSION
# ========================
print("\n[3] Logistic Regression")
lr = LogisticRegression(
    penalty="l2",
    C=1.0,
    max_iter=1000,
    random_state=CONFIG["random_seed"],
    solver="lbfgs"
)
lr.fit(X_train, y_train)
lr_probs = lr.predict_proba(X_test)[:, 1]
lr_acc = accuracy_score(y_test, lr_probs > 0.5)
lr_logloss = log_loss(y_test, lr_probs)
lr_brier = brier_score_loss(y_test, lr_probs)
print(f"Accuracy: {lr_acc:.4f}, LogLoss: {lr_logloss:.4f}, Brier: {lr_brier:.4f}")

# ========================
# XGBOOST
# ========================
print("\n[4] Training XGBoost")
xgb_params = {
    "max_depth": 4,
    "learning_rate": 0.05,
    "n_estimators": 300,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": CONFIG["random_seed"],
    "n_jobs": -1,
}

# Safe early stopping with XGBoost versions: use callbacks if fit() doesn't accept early_stopping_rounds
model = xgb.XGBClassifier(**xgb_params)

try:
    # Try the newer API
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=30,
        verbose=False
    )
except TypeError:
    # Fallback for older versions
    print("⚠ early_stopping_rounds not supported by this XGBoost version; training without early stopping")
    model.fit(X_train, y_train)

xgb_probs = model.predict_proba(X_test)[:, 1]
xgb_acc = accuracy_score(y_test, xgb_probs > 0.5)
xgb_logloss = log_loss(y_test, xgb_probs)
xgb_brier = brier_score_loss(y_test, xgb_probs)
print(f"Accuracy: {xgb_acc:.4f}, LogLoss: {xgb_logloss:.4f}, Brier: {xgb_brier:.4f}")

# ========================
# FEATURE IMPORTANCE
# ========================
importance_gain = pd.DataFrame({
    "feature": feature_cols,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False)
importance_gain.to_csv("evaluation/feature_importance.csv", index=False)
print("✓ Feature importance saved to evaluation/feature_importance.csv")

# ========================
# MODEL COMPARISON
# ========================
comparison = pd.DataFrame({
    "Model": ["Baseline", "Logistic Regression", "XGBoost"],
    "Accuracy": [baseline_acc, lr_acc, xgb_acc],
    "Log Loss": [baseline_logloss, lr_logloss, xgb_logloss],
    "Brier Score": [baseline_brier, lr_brier, xgb_brier]
})
print("\n" + comparison.to_string(index=False))
best_model_idx = comparison["Log Loss"].idxmin()
best_model = comparison.loc[best_model_idx, "Model"]
print(f"\n✓ Best model (by Log Loss): {best_model}")

# ========================
# CALIBRATION ANALYSIS
# ========================
prob_true, prob_pred = calibration_curve(y_test, xgb_probs, n_bins=10, strategy="uniform")
plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], "k--", label="Perfect calibration", linewidth=2)
plt.plot(prob_pred, prob_true, "o-", label="XGBoost", linewidth=2, markersize=8)
plt.xlabel("Predicted Probability")
plt.ylabel("True Probability")
plt.title("Calibration Plot - XGBoost")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("evaluation/calibration_curve.png", dpi=150)
plt.close()
calibration_error = np.abs(prob_true - prob_pred).mean()
print(f"Mean calibration error: {calibration_error:.4f}")

# ========================
# PREDICTION DISTRIBUTION
# ========================
plt.figure(figsize=(10, 5))
plt.hist(xgb_probs, bins=30, edgecolor="black", alpha=0.7)
plt.axvline(0.5, color="red", linestyle="--", linewidth=2)
plt.xlabel("Predicted Home Win Probability")
plt.ylabel("Count")
plt.title("Distribution of XGBoost Predictions")
plt.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.savefig("evaluation/prediction_distribution.png", dpi=150)
plt.close()
print("✓ Prediction distribution saved")

# Confidence analysis
confident_correct = ((xgb_probs > 0.7) & (y_test == 1)).sum() + ((xgb_probs < 0.3) & (y_test == 0)).sum()
confident_total = ((xgb_probs > 0.7) | (xgb_probs < 0.3)).sum()
confident_acc = confident_correct / confident_total if confident_total > 0 else 0
print(f"High-confidence predictions: {confident_total} ({100*confident_total/len(y_test):.1f}%), Accuracy: {confident_acc:.3f}")

# ========================
# SAVE RESULTS & MODEL
# ========================
results_path = save_checkpoint(
    games_test.assign(predicted_home_win_prob=xgb_probs, actual_home_win=y_test),
    "test_predictions.csv",
    "Test set with XGBoost predictions"
)

# Use XGBoost native save_model for safety
model_path = "models/xgboost_model.json"
model.save_model(model_path)
print(f"✓ Model saved to {model_path}")

print("\n✔ MODELING COMPLETE")
print(f"✔ Best model: {best_model}")
print(f"✔ Test accuracy: {xgb_acc:.4f}")
print(f"✔ Test log loss: {xgb_logloss:.4f}")
print(f"✔ Predictions saved: {results_path}")



[1] Loading dataset with all features
✓ Loaded: games_with_rolling_features.csv
  Shape: (3690, 37)
✓ Loaded: (3690, 37)
✓ Seasons: ['2021-22' '2022-23' '2023-24']
✓ Total features: 17

⚠ Missing values detected in features:
home_pts_L5_z     1
home_pa_L5_z      1
home_net_L5_z     1
home_winpct_L5    1
net_diff_L5       1
off_vs_def_L5     1
winpct_diff_L5    1
dtype: int64
✓ Missing values imputed with column mean

✓ Train: 2214, Validation: 246, Test: 1230
✓ Train home win rate: 0.565, Test home win rate: 0.543

[2] Baseline model (constant predictor)
Accuracy: 0.5431, LogLoss: 0.6904, Brier: 0.2486

[3] Logistic Regression
Accuracy: 0.6138, LogLoss: 0.6576, Brier: 0.2327

[4] Training XGBoost
⚠ early_stopping_rounds not supported by this XGBoost version; training without early stopping
Accuracy: 0.5886, LogLoss: 0.6892, Brier: 0.2465
✓ Feature importance saved to evaluation/feature_importance.csv

              Model  Accuracy  Log Loss  Brier Score
           Baseline     0.543  

In [7]:
"""
NBA Win Probability Predictor
STAGE 4B — XGBOOST FIX (PROPER VALIDATION)

Fixes XGBoost training with proper early stopping using the native API.
The sklearn wrapper has compatibility issues with early stopping.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss
from sklearn.calibration import calibration_curve

print("\n" + "="*60)
print("XGBOOST RETRAIN - FIXED VERSION")
print("="*60)

# =============================================================================
# LOAD PREVIOUS RESULTS
# =============================================================================

print("\n[1] Loading previous train/test split")

# Reload the dataset
games = load_checkpoint("games_with_rolling_features.csv")
games["game_date"] = pd.to_datetime(games["game_date"])

# Recreate the exact same split
PRIMARY_WINDOW = 5
TRAIN_SEASONS = ["2021-22", "2022-23"]
TEST_SEASON = "2023-24"

context_features = [
    "home_rest_days", "away_rest_days", "rest_advantage",
    "home_b2b", "away_b2b", "both_b2b"
]

rolling_features = [
    f"home_pts_L{PRIMARY_WINDOW}_z",
    f"home_pa_L{PRIMARY_WINDOW}_z",
    f"home_net_L{PRIMARY_WINDOW}_z",
    f"away_pts_L{PRIMARY_WINDOW}_z",
    f"away_pa_L{PRIMARY_WINDOW}_z",
    f"away_net_L{PRIMARY_WINDOW}_z",
    f"home_winpct_L{PRIMARY_WINDOW}",
    f"away_winpct_L{PRIMARY_WINDOW}"
]

matchup_features = [
    f"net_diff_L{PRIMARY_WINDOW}",
    f"off_vs_def_L{PRIMARY_WINDOW}",
    f"winpct_diff_L{PRIMARY_WINDOW}"
]

feature_cols = context_features + rolling_features + matchup_features

X = games[feature_cols].copy().fillna(games[feature_cols].mean()).astype(float)
y = games["home_win"].copy()
seasons = games["season"].copy()

train_mask = seasons.isin(TRAIN_SEASONS)
test_mask = seasons == TEST_SEASON

X_train_full = X[train_mask].reset_index(drop=True)
y_train_full = y[train_mask].reset_index(drop=True)
X_test = X[test_mask].reset_index(drop=True)
y_test = y[test_mask].reset_index(drop=True)

# Create validation set (last 10% of training)
val_size = int(len(X_train_full) * 0.1)
X_val = X_train_full.iloc[-val_size:].reset_index(drop=True)
y_val = y_train_full.iloc[-val_size:].reset_index(drop=True)
X_train = X_train_full.iloc[:-val_size].reset_index(drop=True)
y_train = y_train_full.iloc[:-val_size].reset_index(drop=True)

print(f"✓ Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# =============================================================================
# XGBOOST USING NATIVE API (PROPER EARLY STOPPING)
# =============================================================================

print("\n[2] Training XGBoost with native API (proper early stopping)")

# Convert to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_cols)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=feature_cols)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_cols)

# XGBoost parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 4,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
    'tree_method': 'hist'  # Faster training
}

# Train with early stopping
print("  Training with early stopping...")
evals = [(dtrain, 'train'), (dval, 'val')]
evals_result = {}

model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,  # Max iterations
    evals=evals,
    early_stopping_rounds=30,
    evals_result=evals_result,
    verbose_eval=50  # Print every 50 rounds
)

print(f"\n✓ Best iteration: {model.best_iteration}")
print(f"✓ Best validation log loss: {model.best_score:.4f}")

# =============================================================================
# EVALUATE ON TEST SET
# =============================================================================

print("\n[3] Evaluating on test set")

xgb_probs = model.predict(dtest)

xgb_acc = accuracy_score(y_test, xgb_probs > 0.5)
xgb_logloss = log_loss(y_test, xgb_probs)
xgb_brier = brier_score_loss(y_test, xgb_probs)

# Compare to baseline and logistic regression
baseline_prob = y_train.mean()
baseline_logloss = log_loss(y_test, np.full(len(y_test), baseline_prob))

print("\n" + "="*60)
print("MODEL COMPARISON (UPDATED)")
print("="*60)

comparison = pd.DataFrame({
    "Model": ["Baseline", "Logistic Regression", "XGBoost (Fixed)"],
    "Accuracy": [0.543, 0.614, xgb_acc],
    "Log Loss": [0.690, 0.658, xgb_logloss],
    "Brier Score": [0.249, 0.233, xgb_brier]
})

print("\n" + comparison.to_string(index=False))

# Improvement metrics
print(f"\nXGBoost vs Baseline:")
print(f"  Accuracy: {xgb_acc - 0.543:+.4f}")
print(f"  Log Loss: {xgb_logloss - 0.690:+.4f}")

print(f"\nXGBoost vs Logistic Regression:")
print(f"  Accuracy: {xgb_acc - 0.614:+.4f}")
print(f"  Log Loss: {xgb_logloss - 0.658:+.4f}")

# =============================================================================
# FEATURE IMPORTANCE
# =============================================================================

print("\n[4] Feature importance analysis")

# Get importance scores
importance_dict = model.get_score(importance_type='gain')

# Convert to dataframe
importance_df = pd.DataFrame([
    {'feature': k, 'importance': v}
    for k, v in importance_dict.items()
]).sort_values('importance', ascending=False)

print("\nTop 10 features:")
print(importance_df.head(10).to_string(index=False))

# Save
importance_df.to_csv("evaluation/feature_importance_fixed.csv", index=False)

# =============================================================================
# CALIBRATION ANALYSIS
# =============================================================================

print("\n[5] Calibration analysis")

prob_true, prob_pred = calibration_curve(y_test, xgb_probs, n_bins=10, strategy="uniform")

plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], "k--", label="Perfect calibration", linewidth=2)
plt.plot(prob_pred, prob_true, "o-", label="XGBoost (Fixed)", linewidth=2, markersize=8)
plt.xlabel("Predicted Probability", fontsize=12)
plt.ylabel("True Probability", fontsize=12)
plt.title("Calibration Plot - XGBoost (Fixed)", fontsize=14, fontweight="bold")
plt.legend(fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("evaluation/calibration_curve_fixed.png", dpi=150, bbox_inches="tight")
print("✓ Calibration plot saved")
plt.close()

calibration_error = np.abs(prob_true - prob_pred).mean()
print(f"  Mean calibration error: {calibration_error:.4f}")

# =============================================================================
# TRAINING CURVE VISUALIZATION
# =============================================================================

print("\n[6] Plotting training curves")

train_logloss = evals_result['train']['logloss']
val_logloss = evals_result['val']['logloss']

plt.figure(figsize=(10, 6))
plt.plot(train_logloss, label='Train', linewidth=2)
plt.plot(val_logloss, label='Validation', linewidth=2)
plt.axvline(model.best_iteration, color='red', linestyle='--', label=f'Best iteration ({model.best_iteration})')
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Log Loss', fontsize=12)
plt.title('XGBoost Training Curve', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("evaluation/training_curve.png", dpi=150, bbox_inches="tight")
print("✓ Training curve saved")
plt.close()

# =============================================================================
# SAVE FIXED MODEL AND PREDICTIONS
# =============================================================================

print("\n[7] Saving fixed model and predictions")

# Save model
model.save_model("models/xgboost_model_fixed.json")
print("✓ Model saved to models/xgboost_model_fixed.json")

# Save predictions
test_results = games[test_mask].reset_index(drop=True).copy()
test_results["predicted_home_win_prob"] = xgb_probs
test_results["actual_home_win"] = y_test.values

results_path = save_checkpoint(
    test_results,
    "test_predictions_fixed.csv",
    "Test predictions with fixed XGBoost model"
)

print(f"✓ Predictions saved to {results_path}")

# =============================================================================
# FINAL VERDICT
# =============================================================================

print("\n" + "="*60)
print("FINAL RESULTS")
print("="*60)

if xgb_logloss < 0.658:
    print("\n✓ XGBoost now BEATS Logistic Regression!")
    print(f"  XGBoost log loss: {xgb_logloss:.4f}")
    print(f"  Logistic Regression log loss: 0.658")
    best_model = "XGBoost"
else:
    print("\n→ Logistic Regression remains best model")
    print(f"  Logistic Regression log loss: 0.658")
    print(f"  XGBoost log loss: {xgb_logloss:.4f}")
    print("\n  This is OKAY - simpler models sometimes win!")
    print("  Logistic Regression is:")
    print("    • Faster to train")
    print("    • More interpretable")
    print("    • Less prone to overfitting")
    best_model = "Logistic Regression"

print(f"\n✔ Best model for deployment: {best_model}")
print(f"✔ Test accuracy: {xgb_acc:.4f}")
print(f"✔ Test log loss: {xgb_logloss:.4f}")

print("\n" + "="*60)
print("READY FOR WEEK 4: MONTE CARLO SIMULATION")
print("="*60)


XGBOOST RETRAIN - FIXED VERSION

[1] Loading previous train/test split
✓ Loaded: games_with_rolling_features.csv
  Shape: (3690, 37)
✓ Train: 2214, Val: 246, Test: 1230

[2] Training XGBoost with native API (proper early stopping)
  Training with early stopping...
[0]	train-logloss:0.68137	val-logloss:0.68776
[50]	train-logloss:0.60404	val-logloss:0.65987
[100]	train-logloss:0.56425	val-logloss:0.65789
[138]	train-logloss:0.53903	val-logloss:0.66080

✓ Best iteration: 108
✓ Best validation log loss: 0.6576

[3] Evaluating on test set

MODEL COMPARISON (UPDATED)

              Model  Accuracy  Log Loss  Brier Score
           Baseline     0.543     0.690        0.249
Logistic Regression     0.614     0.658        0.233
    XGBoost (Fixed)     0.598     0.669        0.238

XGBoost vs Baseline:
  Accuracy: +0.0546
  Log Loss: -0.0207

XGBoost vs Logistic Regression:
  Accuracy: -0.0164
  Log Loss: +0.0113

[4] Feature importance analysis

Top 10 features:
       feature  importance
   ne

In [8]:
"""
NBA Win Probability Predictor
STAGE 5 — MONTE CARLO SEASON SIMULATION

Simulates season outcomes using predicted win probabilities.
Preserves actual schedule structure (not independent Bernoullis).

Outputs:
- Expected wins per team
- Win distribution uncertainty
- Playoff probability estimates
- Upset analysis
"""

import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from collections import defaultdict

# =============================================================================
# SETUP DIRECTORIES
# =============================================================================

Path("evaluation").mkdir(parents=True, exist_ok=True)

# =============================================================================
# LOAD CONFIG WITH FALLBACKS
# =============================================================================

N_SIMULATIONS = CONFIG.get("n_simulations", 10000) if 'CONFIG' in globals() else 10000
RANDOM_SEED = CONFIG.get("random_seed", 42) if 'CONFIG' in globals() else 42
np.random.seed(RANDOM_SEED)

# =============================================================================
# LOAD TEST PREDICTIONS
# =============================================================================

print("\n" + "="*60)
print("MONTE CARLO SEASON SIMULATION")
print("="*60)

print("\n[1] Loading test predictions")

try:
    test_preds = load_checkpoint("test_predictions_fixed.csv")
    assert test_preds is not None
except Exception as e:
    raise FileNotFoundError("test_predictions_fixed.csv not found. Ensure predictions exist.") from e

test_preds["game_date"] = pd.to_datetime(test_preds["game_date"])
test_preds["predicted_home_win_prob"] = test_preds["predicted_home_win_prob"].astype(float)
test_preds["actual_home_win"] = test_preds["actual_home_win"].astype(int)

print(f"✓ Loaded {len(test_preds)} test games")
print(f"✓ Test season: {test_preds['season'].unique()}")
print(f"✓ Date range: {test_preds['game_date'].min().date()} to {test_preds['game_date'].max().date()}")

# =============================================================================
# STEP 1 — VERIFY PREDICTION QUALITY
# =============================================================================

print("\n[2] Verifying prediction quality")

from sklearn.metrics import accuracy_score, log_loss, brier_score_loss

actual = test_preds["actual_home_win"].values
predicted_prob = test_preds["predicted_home_win_prob"].values

acc = accuracy_score(actual, predicted_prob > 0.5)
logloss = log_loss(actual, predicted_prob)
brier = brier_score_loss(actual, predicted_prob)

print(f"✓ Accuracy: {acc:.4f}")
print(f"✓ Log Loss: {logloss:.4f}")
print(f"✓ Brier Score: {brier:.4f}")

print(f"\nPrediction distribution:")
print(f"  Mean: {predicted_prob.mean():.3f}")
print(f"  Std: {predicted_prob.std():.3f}")
print(f"  Min: {predicted_prob.min():.3f}")
print(f"  Max: {predicted_prob.max():.3f}")

# =============================================================================
# STEP 2 — MONTE CARLO SIMULATION (VECTORIZED)
# =============================================================================

print("\n[3] Running Monte Carlo simulation")
print(f"  Simulations: {N_SIMULATIONS:,}")
print(f"  ⏳ This may take a few seconds...")

# Precompute indices for home/away teams
teams = sorted(set(test_preds["home_team"]).union(test_preds["away_team"]))
team_to_idx = {team: i for i, team in enumerate(teams)}
n_teams = len(teams)
n_games = len(test_preds)

# Vectorized simulation
sim_matrix = np.random.binomial(1, predicted_prob, size=(N_SIMULATIONS, n_games))
team_wins_matrix = np.zeros((N_SIMULATIONS, n_teams), dtype=int)

home_indices = test_preds["home_team"].map(team_to_idx).values
away_indices = test_preds["away_team"].map(team_to_idx).values

for g in range(n_games):
    team_wins_matrix[:, home_indices[g]] += sim_matrix[:, g]
    team_wins_matrix[:, away_indices[g]] += (1 - sim_matrix[:, g])

# Aggregate results
team_wins = {team: team_wins_matrix[:, i] for team, i in team_to_idx.items()}

print(f"✓ Simulation complete")

# =============================================================================
# STEP 3 — AGGREGATE TEAM STATISTICS
# =============================================================================

print("\n[4] Computing team statistics")

team_stats = pd.DataFrame([
    {
        "team": team,
        "expected_wins": wins.mean(),
        "std_wins": wins.std(),
        "min_wins": wins.min(),
        "max_wins": wins.max(),
        "p10": np.percentile(wins, 10),
        "p90": np.percentile(wins, 90)
    }
    for team, wins in team_wins.items()
]).sort_values("expected_wins", ascending=False)

print("\nTop 10 teams by expected wins:")
print(team_stats.head(10).to_string(index=False))

print("\nBottom 5 teams by expected wins:")
print(team_stats.tail(5).to_string(index=False))

# =============================================================================
# STEP 4 — COMPARE TO ACTUAL RESULTS (VECTORIZED)
# =============================================================================

print("\n[5] Comparing simulated vs actual results")

home_wins = test_preds.groupby("home_team")["actual_home_win"].sum()
away_wins = test_preds.groupby("away_team")["actual_home_win"].apply(lambda x: len(x) - x.sum())
actual_wins = home_wins.add(away_wins, fill_value=0)

team_stats["actual_wins"] = team_stats["team"].map(actual_wins)
team_stats["prediction_error"] = team_stats["expected_wins"] - team_stats["actual_wins"]

mae = team_stats["prediction_error"].abs().mean()
rmse = np.sqrt((team_stats["prediction_error"] ** 2).mean())

print(f"\n✓ Prediction quality:")
print(f"  Mean Absolute Error: {mae:.2f} wins")
print(f"  Root Mean Squared Error: {rmse:.2f} wins")

print("\nBiggest prediction errors:")
print(team_stats.nlargest(5, "prediction_error")[["team", "expected_wins", "actual_wins", "prediction_error"]].to_string(index=False))

# =============================================================================
# STEP 5 — UPSET ANALYSIS
# =============================================================================

print("\n[6] Analyzing upsets")

low_prob_games = test_preds[test_preds["predicted_home_win_prob"] < 0.3]
upsets = low_prob_games[low_prob_games["actual_home_win"] == 1]
upset_rate = len(upsets) / len(low_prob_games) if len(low_prob_games) > 0 else np.nan
expected_upset_rate = low_prob_games["predicted_home_win_prob"].mean() if len(low_prob_games) > 0 else np.nan

print(f"\nUpset analysis (home win prob < 0.3):")
print(f"  Total low-probability predictions: {len(low_prob_games)}")
print(f"  Actual upsets: {len(upsets)}")
print(f"  Upset rate: {upset_rate:.3f}")
print(f"  Expected upset rate: {expected_upset_rate:.3f}")
print(f"  Calibration: {'Good' if abs(upset_rate - expected_upset_rate) < 0.05 else 'Needs improvement'}")

# =============================================================================
# STEP 6 — VISUALIZATIONS
# =============================================================================

print("\n[7] Creating visualizations")

# Scatter plot: Expected vs Actual Wins
plt.figure(figsize=(10, 8))
plt.scatter(team_stats["expected_wins"], team_stats["actual_wins"], alpha=0.6, s=100)
plt.plot([20, 70], [20, 70], "r--", label="Perfect prediction", linewidth=2)
for _, row in team_stats.iterrows():
    if abs(row["prediction_error"]) > 5:
        plt.annotate(row["team"], (row["expected_wins"], row["actual_wins"]), fontsize=8, alpha=0.7)
plt.xlabel("Expected Wins (Simulated)", fontsize=12)
plt.ylabel("Actual Wins", fontsize=12)
plt.title("Simulated vs Actual Season Performance", fontsize=14, fontweight="bold")
plt.legend(fontsize=11)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("evaluation/simulated_vs_actual.png", dpi=150, bbox_inches="tight")
plt.close()
print("✓ Simulated vs actual plot saved")

# Win distribution for top 5 teams (interactive)
top_5_teams = team_stats.head(5)["team"].values
fig = go.Figure()
for team in top_5_teams:
    fig.add_trace(go.Violin(y=team_wins[team], name=team, box_visible=True, meanline_visible=True))
fig.update_layout(title="Win Distribution - Top 5 Teams (Monte Carlo)", yaxis_title="Total Wins", showlegend=True, height=600)
fig.write_html("evaluation/win_distribution_top5.html")
print("✓ Win distribution plot saved (interactive HTML)")

# Season uncertainty
plt.figure(figsize=(12, 8))
team_stats_sorted = team_stats.sort_values("expected_wins")
y_pos = np.arange(len(team_stats_sorted))
expected = team_stats_sorted["expected_wins"].values
p10 = team_stats_sorted["p10"].values
p90 = team_stats_sorted["p90"].values
plt.barh(y_pos, expected, alpha=0.6, label="Expected wins")
plt.errorbar(expected, y_pos, xerr=[expected - p10, p90 - expected], fmt="none", ecolor="black", alpha=0.5, capsize=3, label="10th-90th percentile")
plt.yticks(y_pos, team_stats_sorted["team"], fontsize=8)
plt.xlabel("Wins", fontsize=12)
plt.title("Simulated Season: Expected Wins with Uncertainty", fontsize=14, fontweight="bold")
plt.legend(fontsize=10)
plt.grid(axis="x", alpha=0.3)
plt.tight_layout()
plt.savefig("evaluation/season_uncertainty.png", dpi=150, bbox_inches="tight")
plt.close()
print("✓ Uncertainty plot saved")

# =============================================================================
# STEP 7 — SAVE RESULTS
# =============================================================================

print("\n[8] Saving simulation results")

results_path = save_checkpoint(team_stats, "simulation_results.csv", "Monte Carlo simulation results per team")
print(f"✓ Simulation results saved to {results_path}")

# =============================================================================
# SUMMARY
# =============================================================================

print("\n" + "="*60)
print("MONTE CARLO SIMULATION COMPLETE")
print("="*60)
print(f"\n✓ Simulations run: {N_SIMULATIONS:,}")
print(f"✓ Teams analyzed: {len(team_stats)}")
print(f"✓ Mean absolute error: {mae:.2f} wins per team")
print(f"✓ Prediction uncertainty: {team_stats['std_wins'].mean():.2f} wins (avg)")
print("\n✓ Visualizations created:")
print("  • evaluation/simulated_vs_actual.png")
print("  • evaluation/win_distribution_top5.html")
print("  • evaluation/season_uncertainty.png")
print("\n" + "="*60)
print("PROJECT COMPLETE!")
print("="*60)



MONTE CARLO SEASON SIMULATION

[1] Loading test predictions
✓ Loaded: test_predictions_fixed.csv
  Shape: (1230, 39)
✓ Loaded 1230 test games
✓ Test season: ['2023-24']
✓ Date range: 2023-10-24 to 2024-04-14

[2] Verifying prediction quality
✓ Accuracy: 0.5976
✓ Log Loss: 0.6693
✓ Brier Score: 0.2382

Prediction distribution:
  Mean: 0.561
  Std: 0.144
  Min: 0.135
  Max: 0.910

[3] Running Monte Carlo simulation
  Simulations: 10,000
  ⏳ This may take a few seconds...
✓ Simulation complete

[4] Computing team statistics

Top 10 teams by expected wins:
team  expected_wins  std_wins  min_wins  max_wins    p10    p90
 BOS         48.482     4.272        33        65 43.000 54.000
 MIN         46.379     4.307        29        63 41.000 52.000
 OKC         45.740     4.294        28        64 40.000 51.000
 NYK         45.215     4.152        25        59 40.000 51.000
 NOP         44.804     4.252        27        60 39.000 50.000
 DEN         44.802     4.311        25        59 39.000

In [9]:
"""
NBA Win Probability Predictor
Stage 6 IMPROVEMENTS - Quick Wins

Adds:
1. L10 rolling window (more stable than L5)
2. Exponentially weighted rolling stats (recent games matter more)
3. Days since trade deadline feature
4. Ensemble model (Logistic Regression + XGBoost)
"""

import os
from pathlib import Path
import numpy as np
import pandas as pd
from datetime import datetime

# =============================================================================
# SETUP DIRECTORIES
# =============================================================================

Path("evaluation").mkdir(parents=True, exist_ok=True)

print("\n" + "="*60)
print("PHASE 1: ADVANCED FEATURES & ENSEMBLE")
print("="*60)

# =============================================================================
# CONFIGURATION
# =============================================================================

TRADE_DEADLINES = {
    "2021-22": "2022-02-10",
    "2022-23": "2023-02-09",
    "2023-24": "2024-02-08"
}

DECAY_ALPHA = 0.95  # Exponential decay weight (0.95 = 5% decay per game back)

# =============================================================================
# LOAD EXISTING DATA
# =============================================================================

print("\n[1] Loading team game log")

try:
    team_log = load_checkpoint("team_rolling_stats_cache.csv")
    assert team_log is not None
except Exception as e:
    raise FileNotFoundError("team_rolling_stats_cache.csv not found.") from e

team_log["game_date"] = pd.to_datetime(team_log["game_date"])

# Ensure numeric types
team_log["pts_scored"] = team_log["pts_scored"].astype(float)
team_log["pts_allowed"] = team_log["pts_allowed"].astype(float)
team_log["net_rating"] = team_log["net_rating"].astype(float)
team_log["won"] = team_log["won"].astype(int)

print(f"✓ Loaded {len(team_log)} team-game rows")

# =============================================================================
# STEP 1 — L10 ROLLING WINDOW
# =============================================================================

print("\n[2] Adding L10 rolling window")

def add_l10_window(df: pd.DataFrame) -> pd.DataFrame:
    grouped = df.groupby(["team", "season"])
    df = df.copy()

    df["pts_scored_L10"] = grouped["pts_scored"].shift(1).rolling(10, min_periods=1).mean()
    df["pts_allowed_L10"] = grouped["pts_allowed"].shift(1).rolling(10, min_periods=1).mean()
    df["net_rating_L10"] = grouped["net_rating"].shift(1).rolling(10, min_periods=1).mean()
    df["win_rate_L10"] = grouped["won"].shift(1).rolling(10, min_periods=1).mean()

    return df

team_log = add_l10_window(team_log)
print("✓ L10 window added")

# =============================================================================
# STEP 2 — EXPONENTIALLY WEIGHTED STATS
# =============================================================================

print("\n[3] Adding exponentially weighted rolling stats")

def add_ew_stats(
    df: pd.DataFrame,
    alpha: float = 0.95,
    window: int = 10
) -> pd.DataFrame:
    """
    Add exponentially weighted rolling stats (EWMA),
    shifted by 1 game to prevent leakage.
    """
    df = df.copy()
    ew_frames = []

    for (team, season), group in df.groupby(["team", "season"]):
        group = group.sort_values("game_date").reset_index(drop=True)

        # Shift to avoid leakage
        shifted = group[["net_rating", "pts_scored", "pts_allowed"]].shift(1)

        # EWMA
        ew = shifted.ewm(alpha=1 - alpha, adjust=False).mean()

        # Optional: cap influence to last N games
        if window is not None:
            ew = ew.rolling(window, min_periods=1).mean()

        group["net_rating_EW10"] = ew["net_rating"]
        group["pts_scored_EW10"] = ew["pts_scored"]
        group["pts_allowed_EW10"] = ew["pts_allowed"]

        ew_frames.append(group)

    return pd.concat(ew_frames, ignore_index=True)


team_log = add_ew_stats(team_log, alpha=DECAY_ALPHA, window=10)
print("✓ Exponentially weighted stats added")

# =============================================================================
# STEP 3 — DAYS SINCE TRADE DEADLINE
# =============================================================================

print("\n[4] Adding days since trade deadline feature")

def add_trade_deadline(df: pd.DataFrame, deadlines: dict) -> pd.DataFrame:
    df = df.copy()
    deadlines_dt = {s: pd.to_datetime(d) for s, d in deadlines.items()}

    df["trade_deadline"] = df["season"].map(deadlines_dt)
    df["days_since_deadline"] = (df["game_date"] - df["trade_deadline"]).dt.days
    df["days_since_deadline"] = df["days_since_deadline"].clip(-60, 60)
    df["is_post_deadline"] = (df["days_since_deadline"] > 0).astype(int)
    return df

team_log = add_trade_deadline(team_log, TRADE_DEADLINES)
print("✓ Trade deadline feature added")
print(f"  Games before deadline: {(team_log['days_since_deadline'] < 0).sum()}")
print(f"  Games after deadline: {(team_log['days_since_deadline'] > 0).sum()}")

# =============================================================================
# STEP 4 — SAVE ENHANCED CACHE
# =============================================================================

print("\n[5] Saving enhanced rolling stats cache")

cache_path = save_checkpoint(
    team_log,
    "team_rolling_stats_enhanced.csv",
    "Enhanced cache with L10, EW stats, and trade deadline"
)
print(f"✓ Enhanced cache saved: {cache_path}")

# =============================================================================
# STEP 5 — MERGE TO GAME LEVEL
# =============================================================================

print("\n[6] Merging enhanced features to game level")

games = load_checkpoint("games_with_context.csv")
games["game_date"] = pd.to_datetime(games["game_date"])

merge_cols = [
    "game_id", "team",
    "net_rating_L10", "pts_scored_L10", "pts_allowed_L10", "win_rate_L10",
    "net_rating_EW10", "pts_scored_EW10", "pts_allowed_EW10",
    "days_since_deadline", "is_post_deadline"
]

# Home merge
games = games.merge(
    team_log[merge_cols],
    left_on=["game_id", "home_team"],
    right_on=["game_id", "team"],
    how="left",
    validate="one_to_one"
).drop(columns="team")

games = games.rename(columns={
    "net_rating_L10": "home_net_L10",
    "pts_scored_L10": "home_pts_L10",
    "pts_allowed_L10": "home_pa_L10",
    "win_rate_L10": "home_winpct_L10",
    "net_rating_EW10": "home_net_EW10",
    "pts_scored_EW10": "home_pts_EW10",
    "pts_allowed_EW10": "home_pa_EW10",
    "days_since_deadline": "home_days_since_deadline",
    "is_post_deadline": "home_post_deadline"
})

# Away merge
games = games.merge(
    team_log[merge_cols],
    left_on=["game_id", "away_team"],
    right_on=["game_id", "team"],
    how="left",
    validate="one_to_one"
).drop(columns="team")

games = games.rename(columns={
    "net_rating_L10": "away_net_L10",
    "pts_scored_L10": "away_pts_L10",
    "pts_allowed_L10": "away_pa_L10",
    "win_rate_L10": "away_winpct_L10",
    "net_rating_EW10": "away_net_EW10",
    "pts_scored_EW10": "away_pts_EW10",
    "pts_allowed_EW10": "away_pa_EW10",
    "days_since_deadline": "away_days_since_deadline",
    "is_post_deadline": "away_post_deadline"
})

print("✓ Enhanced features merged to game level")

# =============================================================================
# STEP 6 — NORMALIZE FEATURES
# =============================================================================

print("\n[7] Normalizing new features within seasons")

def normalize_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    cols = [
        "home_net_L10", "home_pts_L10", "home_pa_L10",
        "away_net_L10", "away_pts_L10", "away_pa_L10",
        "home_net_EW10", "home_pts_EW10", "home_pa_EW10",
        "away_net_EW10", "away_pts_EW10", "away_pa_EW10"
    ]
    for col in cols:
        df[f"{col}_z"] = df.groupby("season")[col].transform(
            lambda x: (x - x.mean()) / x.std(ddof=0) if x.std(ddof=0) != 0 else 0
        )
    return df

games = normalize_features(games)
print("✓ New features normalized")

# =============================================================================
# STEP 7 — CREATE MATCHUP FEATURES
# =============================================================================

print("\n[8] Creating enhanced matchup features")

games["net_diff_L10"] = games["home_net_L10_z"] - games["away_net_L10_z"]
games["off_vs_def_L10"] = games["home_pts_L10_z"] - games["away_pa_L10_z"]
games["winpct_diff_L10"] = games["home_winpct_L10"] - games["away_winpct_L10"]

games["net_diff_EW10"] = games["home_net_EW10_z"] - games["away_net_EW10_z"]
games["off_vs_def_EW10"] = games["home_pts_EW10_z"] - games["away_pa_EW10_z"]

games["deadline_advantage"] = games["home_days_since_deadline"] - games["away_days_since_deadline"]
games["both_post_deadline"] = ((games["home_post_deadline"] == 1) & (games["away_post_deadline"] == 1)).astype(int)

print("✓ Enhanced matchup features created")

# =============================================================================
# STEP 8 — SAVE ENHANCED DATASET
# =============================================================================

print("\n[9] Saving enhanced game-level dataset")

output_path = save_checkpoint(
    games,
    "games_with_enhanced_features.csv",
    "Games with L10, EW, and trade deadline features"
)

print(f"✓ Enhanced dataset saved: {output_path}")
print(f"  Total features: {len(games.columns)}")

# =============================================================================
# SUMMARY
# =============================================================================

print("\n" + "="*60)
print("PHASE 1 FEATURES COMPLETE")
print("="*60)
print("\n✓ New features added:")
print("  • L10 rolling window (10-game averages)")
print("  • Exponentially weighted stats (recent games weighted higher)")
print("  • Days since trade deadline")
print("  • Post-deadline binary indicator")
print("  • Enhanced matchup differentials")
print(f"\n✓ Total games: {len(games)}")
print(f"✓ Total features: {len(games.columns)}")
print("\n" + "="*60)
print("NEXT: Train models with enhanced features")
print("="*60)



PHASE 1: ADVANCED FEATURES & ENSEMBLE

[1] Loading team game log
✓ Loaded: team_rolling_stats_cache.csv
  Shape: (7380, 20)
✓ Loaded 7380 team-game rows

[2] Adding L10 rolling window
✓ L10 window added

[3] Adding exponentially weighted rolling stats
✓ Exponentially weighted stats added

[4] Adding days since trade deadline feature
✓ Trade deadline feature added
  Games before deadline: 4826
  Games after deadline: 2514

[5] Saving enhanced rolling stats cache
✓ [2026-01-06 00:57:01] Saved: team_rolling_stats_enhanced.csv
  Enhanced cache with L10, EW stats, and trade deadline
  Shape: (7380, 26)
✓ Enhanced cache saved: /content/drive/MyDrive/nba_predictor/data/processed/team_rolling_stats_enhanced.csv

[6] Merging enhanced features to game level
✓ Loaded: games_with_context.csv
  Shape: (3690, 18)
✓ Enhanced features merged to game level

[7] Normalizing new features within seasons
✓ New features normalized

[8] Creating enhanced matchup features
✓ Enhanced matchup features created


In [10]:
"""
NBA Win Probability Predictor
Stage 7 — ENSEMBLE MODEL (PRODUCTION, FIXED)

Ensemble of:
- Logistic Regression (scaled)
- XGBoost (native API, safe early stopping)

Trains ONLY on enhanced feature set (L10 + EW10 + Deadline).
"""

# =============================================================================
# HARD SAFETY GUARANTEES (DO NOT MOVE)
# =============================================================================

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Path("evaluation").mkdir(parents=True, exist_ok=True)
Path("models").mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# =============================================================================
# IMPORTS
# =============================================================================

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn.calibration import calibration_curve
from scipy.optimize import minimize
import xgboost as xgb
import pickle
import json

# =============================================================================
# LOAD DATA (FAIL LOUDLY)
# =============================================================================

print("\n" + "=" * 60)
print("ENSEMBLE MODEL WITH ENHANCED FEATURES (PRODUCTION)")
print("=" * 60)

games = load_checkpoint("games_with_enhanced_features.csv")
if games is None:
    raise FileNotFoundError(
        "games_with_enhanced_features.csv missing. "
        "Run Phase 1 feature generation first."
    )

games["game_date"] = pd.to_datetime(games["game_date"])
print(f"✓ Loaded {len(games)} games")

# =============================================================================
# FEATURE DEFINITIONS (ENHANCED ONLY)
# =============================================================================

context_features = [
    "home_rest_days", "away_rest_days", "rest_advantage",
    "home_b2b", "away_b2b", "both_b2b"
]

l10_features = [
    "home_net_L10_z", "away_net_L10_z",
    "home_pts_L10_z", "away_pts_L10_z",
    "home_pa_L10_z", "away_pa_L10_z",
    "home_winpct_L10", "away_winpct_L10",
    "net_diff_L10", "off_vs_def_L10", "winpct_diff_L10"
]

ew_features = [
    "home_net_EW10_z", "away_net_EW10_z",
    "home_pts_EW10_z", "away_pts_EW10_z",
    "home_pa_EW10_z", "away_pa_EW10_z",
    "net_diff_EW10", "off_vs_def_EW10"
]

deadline_features = [
    "home_days_since_deadline", "away_days_since_deadline",
    "deadline_advantage", "both_post_deadline"
]

all_features = (
    context_features
    + l10_features
    + ew_features
    + deadline_features
)

# =============================================================================
# FEATURE CONTRACT VALIDATION (CRITICAL)
# =============================================================================

missing = [f for f in all_features if f not in games.columns]
if missing:
    raise RuntimeError(f"Missing features in dataset: {missing}")

# =============================================================================
# TYPE SAFETY + IMPUTATION
# =============================================================================

y = games["home_win"].astype(int)

X = (
    games[all_features]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(games[all_features].mean())
    .astype(float)
)

# =============================================================================
# TIME-SAFE TRAIN / VAL / TEST SPLIT
# =============================================================================

TRAIN_SEASONS = ["2021-22", "2022-23"]
TEST_SEASON = "2023-24"

train_df = games[games["season"].isin(TRAIN_SEASONS)].sort_values("game_date")
test_df = games[games["season"] == TEST_SEASON].sort_values("game_date")

X_train_full = X.loc[train_df.index].reset_index(drop=True)
y_train_full = y.loc[train_df.index].reset_index(drop=True)

X_test = X.loc[test_df.index].reset_index(drop=True)
y_test = y.loc[test_df.index].reset_index(drop=True)

val_size = int(len(X_train_full) * 0.1)

X_train = X_train_full.iloc[:-val_size]
y_train = y_train_full.iloc[:-val_size]
X_val = X_train_full.iloc[-val_size:]
y_val = y_train_full.iloc[-val_size:]

print(f"✓ Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

# =============================================================================
# LOGISTIC REGRESSION
# =============================================================================

print("\n[2] Training Logistic Regression")

lr = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(
        max_iter=1000,
        solver="lbfgs",
        random_state=RANDOM_SEED
    ))
])

lr.fit(X_train, y_train)

lr_val = lr.predict_proba(X_val)[:, 1]
lr_test = lr.predict_proba(X_test)[:, 1]

# =============================================================================
# XGBOOST (NATIVE API)
# =============================================================================

print("\n[3] Training XGBoost")

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": RANDOM_SEED,
    "tree_method": "hist",
}

model_xgb = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=[(dval, "val")],
    early_stopping_rounds=30,
    verbose_eval=False
)

xgb_val = model_xgb.predict(dval)
xgb_test = model_xgb.predict(dtest)

# =============================================================================
# ENSEMBLE — WEIGHT OPTIMIZATION
# =============================================================================

def ensemble_logloss(w):
    p = w[0] * lr_val + w[1] * xgb_val
    p = np.clip(p, 1e-15, 1 - 1e-15)
    return log_loss(y_val, p)

result = minimize(
    ensemble_logloss,
    x0=[0.5, 0.5],
    bounds=[(0, 1), (0, 1)],
    constraints={"type": "eq", "fun": lambda w: w.sum() - 1}
)

weights = result.x if result.success else np.array([0.5, 0.5])

ensemble_test = weights[0] * lr_test + weights[1] * xgb_test

# =============================================================================
# CALIBRATION PLOT
# =============================================================================

plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], "k--")

for probs, label in [
    (lr_test, "Logistic Regression"),
    (xgb_test, "XGBoost"),
    (ensemble_test, "Ensemble"),
]:
    t, p = calibration_curve(y_test, probs, n_bins=10)
    plt.plot(p, t, marker="o", label=label)

plt.legend()
plt.tight_layout()
plt.savefig("evaluation/calibration_models.png", dpi=150)
plt.close()

# =============================================================================
# SAVE MODELS
# =============================================================================

with open("models/logistic_regression_enhanced.pkl", "wb") as f:
    pickle.dump(lr, f)

model_xgb.save_model("models/xgboost_enhanced.json")

with open("models/ensemble_config.json", "w") as f:
    json.dump(
        {"weights": weights.tolist(), "seed": RANDOM_SEED},
        f,
        indent=2
    )

print("✓ Production ensemble training complete")



ENSEMBLE MODEL WITH ENHANCED FEATURES (PRODUCTION)
✓ Loaded: games_with_enhanced_features.csv
  Shape: (3690, 55)
✓ Loaded 3690 games
✓ Train: 2214 | Val: 246 | Test: 1230

[2] Training Logistic Regression

[3] Training XGBoost
✓ Production ensemble training complete


In [11]:
"""
NBA Win Probability Predictor
PHASE 2 — HISTORICAL DATA EXPANSION

Pulls 10 seasons of NBA data: 2015-16 through 2024-25
(Note: 2024-25 is current season, may be incomplete)

Production features:
- Intelligent retry with exponential backoff
- Progress tracking with ETA
- Checkpoint saves after each season
- Handles API rate limits gracefully
"""

import time
import random
from typing import List, Optional
from datetime import datetime

import pandas as pd
from nba_api.stats.endpoints import leaguegamefinder
from nba_api.stats.static import teams
from requests.exceptions import Timeout, ConnectionError

print("\n" + "="*60)
print("PHASE 2: HISTORICAL DATA COLLECTION (2015-2024)")
print("="*60)

# =============================================================================
# CONFIGURATION
# =============================================================================

# All seasons from 2015-16 to 2023-24
HISTORICAL_SEASONS = [
    "2015-16", "2016-17", "2017-18", "2018-19", "2019-20",  # 5 new seasons
    "2020-21", "2021-22", "2022-23", "2023-24"              # Already have these
]

# API settings
SLEEP_RANGE = (2.0, 4.0)  # Longer delays for historical pulls
MAX_RETRIES = 5
CHECKPOINT_FREQUENCY = 1  # Save after each season

print(f"\n✓ Configuration:")
print(f"  Seasons to pull: {len(HISTORICAL_SEASONS)}")
print(f"  Date range: {HISTORICAL_SEASONS[0]} to {HISTORICAL_SEASONS[-1]}")
print(f"  Sleep range: {SLEEP_RANGE[0]}-{SLEEP_RANGE[1]} seconds")
print(f"  Max retries: {MAX_RETRIES}")

# =============================================================================
# STEP 1 — LOCK TEAM IDS (WITH HISTORICAL AWARENESS)
# =============================================================================

print("\n[1] Locking team identifiers")

nba_teams = teams.get_teams()
TEAM_ID_TO_ABBR = {t["id"]: t["abbreviation"] for t in nba_teams}

# Note: Some franchises changed names/cities
# - Charlotte Bobcats → Charlotte Hornets (2014)
# - New Jersey Nets → Brooklyn Nets (2012)
# - Seattle SuperSonics → Oklahoma City Thunder (2008)
# nba_api handles these transitions automatically

print(f"✓ Locked {len(TEAM_ID_TO_ABBR)} team identifiers")

# Special handling for historical franchises
HISTORICAL_TEAM_NOTES = {
    "CHA": "Charlotte became Hornets in 2014-15 (from Bobcats)",
}

print("\n  Historical notes:")
for abbr, note in HISTORICAL_TEAM_NOTES.items():
    print(f"    {abbr}: {note}")

# =============================================================================
# STEP 2 — SMART API PULL WITH RETRY LOGIC
# =============================================================================

def pull_season_games(
    season: str,
    retry_delay_multiplier: float = 1.5
) -> Optional[pd.DataFrame]:
    """
    Pull raw team-game rows with intelligent retry + backoff.

    Args:
        season: Season string (e.g., "2015-16")
        retry_delay_multiplier: Exponential backoff multiplier

    Returns:
        DataFrame of team-game rows, or None if all retries fail
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            print(f"    Attempt {attempt}/{MAX_RETRIES}...", end=" ")

            finder = leaguegamefinder.LeagueGameFinder(
                season_nullable=season,
                league_id_nullable="00",
                season_type_nullable="Regular Season",
            )

            df = finder.get_data_frames()[0]

            # Lock team abbreviations immediately
            df["team_abbr"] = df["TEAM_ID"].map(TEAM_ID_TO_ABBR)
            df["season"] = season

            # Validate
            assert df["team_abbr"].notna().all(), "Missing team abbreviations"
            assert len(df) > 2000, f"Too few rows: {len(df)}"

            print(f"✓ {len(df)} rows")

            # Respectful sleep (longer for historical data)
            sleep_time = random.uniform(*SLEEP_RANGE)
            time.sleep(sleep_time)

            return df

        except (Timeout, ConnectionError) as e:
            wait_time = SLEEP_RANGE[1] * (retry_delay_multiplier ** attempt)
            print(f"✗ Network error. Waiting {wait_time:.1f}s...")
            time.sleep(wait_time)

        except AssertionError as e:
            print(f"✗ Data quality issue: {e}")
            return None

        except Exception as e:
            print(f"✗ Unexpected error: {type(e).__name__}")
            if attempt < MAX_RETRIES:
                time.sleep(SLEEP_RANGE[1] * attempt)
            else:
                return None

    print(f"    ✗ All {MAX_RETRIES} attempts failed")
    return None

# =============================================================================
# STEP 3 — PROGRESSIVE COLLECTION WITH CHECKPOINTS
# =============================================================================

print("\n[2] Pulling historical game data")
print("  ⏳ This will take 5-10 minutes (10 seasons × ~30 seconds each)")
print()

start_time = datetime.now()
successful_seasons = []
failed_seasons = []
all_team_games = []

for idx, season in enumerate(HISTORICAL_SEASONS, 1):
    print(f"  [{idx}/{len(HISTORICAL_SEASONS)}] Pulling {season}...")

    df = pull_season_games(season)

    if df is not None:
        all_team_games.append(df)
        successful_seasons.append(season)

        # Checkpoint after each season
        if idx % CHECKPOINT_FREQUENCY == 0:
            checkpoint_df = pd.concat(all_team_games, ignore_index=True)
            checkpoint_path = f"data/raw/checkpoint_after_{season.replace('-', '_')}.csv"
            checkpoint_df.to_csv(checkpoint_path, index=False)
            print(f"    💾 Checkpoint saved: {len(checkpoint_df)} total rows")
    else:
        failed_seasons.append(season)
        print(f"    ⚠ Season {season} failed - continuing...")

    # Progress estimate
    elapsed = (datetime.now() - start_time).total_seconds()
    avg_time_per_season = elapsed / idx
    remaining_seasons = len(HISTORICAL_SEASONS) - idx
    eta_seconds = remaining_seasons * avg_time_per_season
    eta_minutes = eta_seconds / 60

    if idx < len(HISTORICAL_SEASONS):
        print(f"    ⏱ ETA: {eta_minutes:.1f} minutes remaining")
    print()

# =============================================================================
# STEP 4 — CONSOLIDATE AND VALIDATE
# =============================================================================

print("\n[3] Consolidating results")

if len(all_team_games) == 0:
    print("✗ CRITICAL: No data collected. Check API connection.")
    raise RuntimeError("Data collection failed completely")

raw_games = pd.concat(all_team_games, ignore_index=True)

print(f"✓ Collection complete!")
print(f"  Successful: {len(successful_seasons)} seasons")
print(f"  Failed: {len(failed_seasons)} seasons")
print(f"  Total team-game rows: {len(raw_games):,}")

if failed_seasons:
    print(f"\n⚠ Failed seasons: {', '.join(failed_seasons)}")
    print("  Re-run this script to retry failed seasons")

# =============================================================================
# STEP 5 — DATA QUALITY CHECKS
# =============================================================================

print("\n[4] Running data quality checks")

# Check 1: Expected games per season
games_per_season = raw_games.groupby("season").size()
print("\nTeam-game rows per season:")
for season, count in games_per_season.items():
    expected = 2460  # 30 teams × 82 games
    status = "✓" if 2300 < count < 2600 else "⚠"
    print(f"  {status} {season}: {count:,} rows (expected ~{expected:,})")

# Check 2: Missing team abbreviations
missing_abbr = raw_games["team_abbr"].isna().sum()
if missing_abbr > 0:
    print(f"\n⚠ WARNING: {missing_abbr} rows missing team abbreviations")
else:
    print("\n✓ All rows have team abbreviations")

# Check 3: Date range
raw_games["GAME_DATE"] = pd.to_datetime(raw_games["GAME_DATE"])
date_range = (raw_games["GAME_DATE"].min(), raw_games["GAME_DATE"].max())
print(f"\n✓ Date range: {date_range[0].date()} to {date_range[1].date()}")

# Check 4: Unique games
n_unique_games = raw_games["GAME_ID"].nunique()
print(f"✓ Unique games: {n_unique_games:,}")
print(f"  Expected team-game rows: {n_unique_games * 2:,}")
print(f"  Actual team-game rows: {len(raw_games):,}")

if abs(len(raw_games) - n_unique_games * 2) > 10:
    print("  ⚠ Row count mismatch - possible data quality issue")

# =============================================================================
# STEP 6 — SAVE RAW HISTORICAL DATA
# =============================================================================

print("\n[5] Saving raw historical data")

raw_filepath = save_checkpoint(
    raw_games,
    "raw_games_historical_2015_2024.csv",
    "Historical team-game data (2015-2024)"
)

print(f"✓ Raw data saved: {raw_filepath}")
print(f"  Rows: {len(raw_games):,}")
print(f"  Columns: {len(raw_games.columns)}")
print(f"  Size: {raw_games.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# =============================================================================
# STEP 7 — TRANSFORM TO GAME LEVEL
# =============================================================================

print("\n[6] Transforming to game-level format")

def transform_historical_to_game_level(raw_games: pd.DataFrame) -> pd.DataFrame:
    """
    Transform team-game rows to game-level rows.
    One row per game with home/away split.
    """
    # Identify home vs away
    raw_games["is_home"] = raw_games["MATCHUP"].str.contains("vs.", na=False)

    home = raw_games[raw_games["is_home"]].copy()
    away = raw_games[~raw_games["is_home"]].copy()

    print(f"  Home rows: {len(home):,}")
    print(f"  Away rows: {len(away):,}")

    # Merge
    games = home.merge(
        away,
        on=["GAME_ID", "GAME_DATE", "season"],
        suffixes=("_home", "_away"),
        validate="one_to_one"
    )

    print(f"  Merged games: {len(games):,}")

    # Clean schema
    games = games.rename(columns={
        "GAME_ID": "game_id",
        "GAME_DATE": "game_date",
        "TEAM_ID_home": "home_team_id",
        "TEAM_ID_away": "away_team_id",
        "team_abbr_home": "home_team",
        "team_abbr_away": "away_team",
        "PTS_home": "home_score",
        "PTS_away": "away_score",
    })

    # Create target
    games["home_win"] = (games["home_score"] > games["away_score"]).astype(int)

    # Sort chronologically
    games["game_date"] = pd.to_datetime(games["game_date"])
    games = games.sort_values("game_date").reset_index(drop=True)

    # Select essential columns
    essential_cols = [
        "game_id", "game_date", "season",
        "home_team_id", "away_team_id",
        "home_team", "away_team",
        "home_score", "away_score", "home_win"
    ]

    return games[essential_cols]

games_historical = transform_historical_to_game_level(raw_games)

print(f"\n✓ Transformation complete")
print(f"  Games: {len(games_historical):,}")

# =============================================================================
# STEP 8 — HISTORICAL VALIDATION
# =============================================================================

print("\n[7] Validating historical dataset")

# Check: Chronological ordering
assert games_historical["game_date"].is_monotonic_increasing, \
    "Games not sorted chronologically"
print("✓ Chronologically sorted")

# Check: Home win rate per season
print("\nHome win rates by season:")
for season in games_historical["season"].unique():
    season_games = games_historical[games_historical["season"] == season]
    home_win_rate = season_games["home_win"].mean()
    status = "✓" if 0.53 < home_win_rate < 0.65 else "⚠"
    print(f"  {status} {season}: {home_win_rate:.3f}")

# Overall home win rate
overall_home_win = games_historical["home_win"].mean()
print(f"\n✓ Overall home win rate: {overall_home_win:.3f}")

# Check: Games per season
games_per_season = games_historical.groupby("season").size()
print("\nGames per season:")
for season, count in games_per_season.items():
    expected = 1230  # 30 teams × 82 games / 2
    status = "✓" if 1150 < count < 1300 else "⚠"

    # Special case: 2019-20 COVID season
    if season == "2019-20" and 900 < count < 1050:
        status = "✓ (COVID)"

    print(f"  {status} {season}: {count} games")

# =============================================================================
# STEP 9 — SAVE HISTORICAL GAME-LEVEL DATA
# =============================================================================

print("\n[8] Saving historical game-level data")

games_filepath = save_checkpoint(
    games_historical,
    "games_outcomes_historical_2015_2024.csv",
    "Historical game-level outcomes (2015-2024)"
)

print(f"✓ Historical games saved: {games_filepath}")

# =============================================================================
# SUMMARY
# =============================================================================

print("\n" + "="*60)
print("HISTORICAL DATA COLLECTION COMPLETE")
print("="*60)

elapsed_time = (datetime.now() - start_time).total_seconds() / 60

print(f"\n✓ Collection time: {elapsed_time:.1f} minutes")
print(f"✓ Seasons collected: {len(successful_seasons)}/{len(HISTORICAL_SEASONS)}")
print(f"✓ Total games: {len(games_historical):,}")
print(f"✓ Date range: {games_historical['game_date'].min().date()} to {games_historical['game_date'].max().date()}")
print(f"✓ Overall home win rate: {overall_home_win:.3f}")

if successful_seasons:
    print(f"\n✓ Successful seasons:")
    for season in successful_seasons:
        print(f"    • {season}")

if failed_seasons:
    print(f"\n⚠ Failed seasons (retry recommended):")
    for season in failed_seasons:
        print(f"    • {season}")

print("\n" + "="*60)
print("NEXT STEPS")
print("="*60)

print("\n1. Run context features script on historical data")
print("2. Run rolling stats with 10 seasons of history")
print("3. Train models with 5x more data")
print("4. Expected improvements:")
print("   • Log loss: 0.650 → 0.620-0.630")
print("   • MAE: 7.5 → 6.0-6.5 wins")
print("   • More stable feature importance")


PHASE 2: HISTORICAL DATA COLLECTION (2015-2024)

✓ Configuration:
  Seasons to pull: 9
  Date range: 2015-16 to 2023-24
  Sleep range: 2.0-4.0 seconds
  Max retries: 5

[1] Locking team identifiers
✓ Locked 30 team identifiers

  Historical notes:
    CHA: Charlotte became Hornets in 2014-15 (from Bobcats)

[2] Pulling historical game data
  ⏳ This will take 5-10 minutes (10 seasons × ~30 seconds each)

  [1/9] Pulling 2015-16...
    Attempt 1/5... ✓ 2460 rows
    💾 Checkpoint saved: 2460 total rows
    ⏱ ETA: 1.2 minutes remaining

  [2/9] Pulling 2016-17...
    Attempt 1/5... ✓ 2460 rows
    💾 Checkpoint saved: 4920 total rows
    ⏱ ETA: 1.6 minutes remaining

  [3/9] Pulling 2017-18...
    Attempt 1/5... ✓ 2460 rows
    💾 Checkpoint saved: 7380 total rows
    ⏱ ETA: 1.2 minutes remaining

  [4/9] Pulling 2018-19...
    Attempt 1/5... ✓ 2460 rows
    💾 Checkpoint saved: 9840 total rows
    ⏱ ETA: 1.0 minutes remaining

  [5/9] Pulling 2019-20...
    Attempt 1/5... ✓ 2118 rows
    💾 

In [12]:
"""
NBA Win Probability Predictor
HISTORICAL DATA PROCESSING PIPELINE

Processes 10,749 games (2015-2024) through complete feature pipeline:
1. Context features (rest, B2B, trade deadline)
2. Rolling stats (L5, L10, exponentially weighted)
3. Season normalization
4. Enhanced matchup features

This is a consolidated pipeline - runs all steps sequentially.
"""

import numpy as np
import pandas as pd
from datetime import datetime
from collections import defaultdict

print("\n" + "="*60)
print("HISTORICAL DATA PROCESSING PIPELINE")
print("="*60)

# =============================================================================
# CONFIGURATION
# =============================================================================

# Trade deadlines for all historical seasons
TRADE_DEADLINES = {
    "2015-16": "2016-02-18",
    "2016-17": "2017-02-23",
    "2017-18": "2018-02-08",
    "2018-19": "2019-02-07",
    "2019-20": "2020-02-06",
    "2020-21": "2021-03-25",  # COVID delay
    "2021-22": "2022-02-10",
    "2022-23": "2023-02-09",
    "2023-24": "2024-02-08"
}

DECAY_ALPHA = 0.95
WINDOWS = [5, 10]
PRIMARY_WINDOW = 5

print(f"\n✓ Configuration:")
print(f"  Seasons: {len(TRADE_DEADLINES)}")
print(f"  Rolling windows: {WINDOWS}")
print(f"  Exponential decay alpha: {DECAY_ALPHA}")

# =============================================================================
# STAGE 1: LOAD HISTORICAL GAMES
# =============================================================================

print("\n" + "="*60)
print("STAGE 1: CONTEXT FEATURES")
print("="*60)

print("\n[1] Loading historical games")

games = pd.read_csv("data/raw/games_outcomes_historical_2015_2024.csv")
games["game_date"] = pd.to_datetime(games["game_date"])

print(f"✓ Loaded {len(games):,} games")
print(f"✓ Date range: {games['game_date'].min().date()} to {games['game_date'].max().date()}")
print(f"✓ Seasons: {sorted(games['season'].unique())}")

# =============================================================================
# STAGE 1A: BUILD TEAM SCHEDULE
# =============================================================================

print("\n[2] Building team schedules")

def build_team_schedule(games_df: pd.DataFrame) -> pd.DataFrame:
    """Create chronological schedule for each team"""
    home = games_df[["game_id", "game_date", "season", "home_team"]].copy()
    home = home.rename(columns={"home_team": "team"})
    home["is_home"] = 1

    away = games_df[["game_id", "game_date", "season", "away_team"]].copy()
    away = away.rename(columns={"away_team": "team"})
    away["is_home"] = 0

    schedule = pd.concat([home, away], ignore_index=True)
    schedule = schedule.sort_values(["team", "season", "game_date"]).reset_index(drop=True)

    return schedule

team_schedule = build_team_schedule(games)

print(f"✓ Team-game rows: {len(team_schedule):,}")
print(f"✓ Teams: {team_schedule['team'].nunique()}")

# =============================================================================
# STAGE 1B: COMPUTE REST DAYS
# =============================================================================

print("\n[3] Computing rest days (season-aware)")

def compute_rest_days(schedule: pd.DataFrame) -> pd.DataFrame:
    """Compute rest days within each season"""
    schedule = schedule.copy()

    schedule["prev_game_date"] = (
        schedule.groupby(["team", "season"])["game_date"].shift(1)
    )

    schedule["is_season_opener"] = schedule["prev_game_date"].isna().astype(int)

    schedule["rest_days"] = (
        schedule["game_date"] - schedule["prev_game_date"]
    ).dt.days

    # Season openers: cap at 7 days
    schedule["rest_days"] = schedule["rest_days"].fillna(7).astype(int)

    # Back-to-back indicator
    schedule["is_back_to_back"] = (schedule["rest_days"] == 1).astype(int)

    return schedule

team_schedule = compute_rest_days(team_schedule)

print(f"✓ Rest days computed")
print(f"  Range: {team_schedule['rest_days'].min()}-{team_schedule['rest_days'].max()} days")
print(f"  Back-to-back rate: {team_schedule['is_back_to_back'].mean():.3f}")

# =============================================================================
# STAGE 1C: ADD TRADE DEADLINE FEATURES
# =============================================================================

print("\n[4] Adding trade deadline features")

def add_trade_deadline_features(schedule: pd.DataFrame, deadlines: dict) -> pd.DataFrame:
    """Add days since trade deadline per season"""
    schedule = schedule.copy()

    deadlines_dt = {season: pd.to_datetime(date) for season, date in deadlines.items()}

    schedule["trade_deadline"] = schedule["season"].map(deadlines_dt)
    schedule["days_since_deadline"] = (
        schedule["game_date"] - schedule["trade_deadline"]
    ).dt.days

    schedule["days_since_deadline"] = schedule["days_since_deadline"].clip(-60, 60)
    schedule["is_post_deadline"] = (schedule["days_since_deadline"] > 0).astype(int)

    return schedule

team_schedule = add_trade_deadline_features(team_schedule, TRADE_DEADLINES)

print(f"✓ Trade deadline features added")
print(f"  Pre-deadline games: {(team_schedule['days_since_deadline'] < 0).sum():,}")
print(f"  Post-deadline games: {(team_schedule['days_since_deadline'] > 0).sum():,}")

# =============================================================================
# STAGE 1D: MERGE CONTEXT TO GAME LEVEL
# =============================================================================

print("\n[5] Merging context features to game level")

context_cols = ["game_id", "team", "rest_days", "is_back_to_back",
                "is_season_opener", "days_since_deadline", "is_post_deadline"]

# Home team
games = games.merge(
    team_schedule[context_cols],
    left_on=["game_id", "home_team"],
    right_on=["game_id", "team"],
    how="left",
    validate="one_to_one"
).drop(columns="team").rename(columns={
    "rest_days": "home_rest_days",
    "is_back_to_back": "home_b2b",
    "is_season_opener": "home_season_opener",
    "days_since_deadline": "home_days_since_deadline",
    "is_post_deadline": "home_post_deadline"
})

# Away team
games = games.merge(
    team_schedule[context_cols],
    left_on=["game_id", "away_team"],
    right_on=["game_id", "team"],
    how="left",
    validate="one_to_one"
).drop(columns="team").rename(columns={
    "rest_days": "away_rest_days",
    "is_back_to_back": "away_b2b",
    "is_season_opener": "away_season_opener",
    "days_since_deadline": "away_days_since_deadline",
    "is_post_deadline": "away_post_deadline"
})

# Relative features
games["rest_advantage"] = games["home_rest_days"] - games["away_rest_days"]
games["both_b2b"] = ((games["home_b2b"] == 1) & (games["away_b2b"] == 1)).astype(int)
games["deadline_advantage"] = games["home_days_since_deadline"] - games["away_days_since_deadline"]
games["both_post_deadline"] = (
    (games["home_post_deadline"] == 1) & (games["away_post_deadline"] == 1)
).astype(int)

print("✓ Context features merged")

# =============================================================================
# STAGE 2: ROLLING STATS
# =============================================================================

print("\n" + "="*60)
print("STAGE 2: ROLLING PERFORMANCE FEATURES")
print("="*60)

print("\n[6] Building team game log")

def build_team_game_log(games_df: pd.DataFrame) -> pd.DataFrame:
    """Create game log with team performance stats"""
    # Home games
    home_log = games_df[[
        "game_id", "game_date", "season",
        "home_team", "away_team",
        "home_score", "away_score"
    ]].copy()
    home_log = home_log.rename(columns={
        "home_team": "team",
        "away_team": "opponent",
        "home_score": "pts_scored",
        "away_score": "pts_allowed"
    })
    home_log["is_home"] = 1

    # Away games
    away_log = games_df[[
        "game_id", "game_date", "season",
        "away_team", "home_team",
        "away_score", "home_score"
    ]].copy()
    away_log = away_log.rename(columns={
        "away_team": "team",
        "home_team": "opponent",
        "away_score": "pts_scored",
        "home_score": "pts_allowed"
    })
    away_log["is_home"] = 0

    team_log = pd.concat([home_log, away_log], ignore_index=True)
    team_log = team_log.sort_values(["team", "season", "game_date"]).reset_index(drop=True)

    team_log["net_rating"] = team_log["pts_scored"] - team_log["pts_allowed"]
    team_log["won"] = (team_log["pts_scored"] > team_log["pts_allowed"]).astype(int)

    return team_log

team_log = build_team_game_log(games)

print(f"✓ Team game log: {len(team_log):,} rows")

# =============================================================================
# STAGE 2A: PRECOMPUTE ROLLING STATS
# =============================================================================

print("\n[7] Precomputing rolling stats (L5, L10)")
print("  ⏳ This may take 30-60 seconds...")

def precompute_rolling_stats(team_log_df: pd.DataFrame, windows: list) -> pd.DataFrame:
    """Precompute rolling averages for all windows"""
    log = team_log_df.copy()
    grouped = log.groupby(["team", "season"])

    for window in windows:
        print(f"    Computing L{window} window...")

        log[f"pts_scored_L{window}"] = (
            grouped["pts_scored"].shift(1).rolling(window, min_periods=1).mean()
        )
        log[f"pts_allowed_L{window}"] = (
            grouped["pts_allowed"].shift(1).rolling(window, min_periods=1).mean()
        )
        log[f"net_rating_L{window}"] = (
            grouped["net_rating"].shift(1).rolling(window, min_periods=1).mean()
        )
        log[f"win_rate_L{window}"] = (
            grouped["won"].shift(1).rolling(window, min_periods=1).mean()
        )

    return log

team_log = precompute_rolling_stats(team_log, WINDOWS)

print("✓ Rolling stats computed")

# =============================================================================
# STAGE 2B: EXPONENTIALLY WEIGHTED STATS
# =============================================================================

print("\n[8] Computing exponentially weighted stats")
print("  ⏳ This will take 1-2 minutes...")

def add_exponential_weighted_stats(
    team_log_df: pd.DataFrame,
    alpha: float = 0.95,
    window: int = 10
) -> pd.DataFrame:
    """Add exponentially weighted rolling stats"""
    log = team_log_df.copy()

    ew_stats = []

    for (team, season), group in log.groupby(["team", "season"]):
        group = group.sort_values("game_date").reset_index(drop=True)

        ew_net = []
        ew_pts = []
        ew_pa = []

        for idx in range(len(group)):
            if idx == 0:
                ew_net.append(np.nan)
                ew_pts.append(np.nan)
                ew_pa.append(np.nan)
            else:
                past_games = group.iloc[max(0, idx - window):idx]
                n_past = len(past_games)
                weights = np.array([alpha ** (n_past - i - 1) for i in range(n_past)])
                weights = weights / weights.sum()

                ew_net.append(np.average(past_games["net_rating"], weights=weights))
                ew_pts.append(np.average(past_games["pts_scored"], weights=weights))
                ew_pa.append(np.average(past_games["pts_allowed"], weights=weights))

        group["net_rating_EW10"] = ew_net
        group["pts_scored_EW10"] = ew_pts
        group["pts_allowed_EW10"] = ew_pa

        ew_stats.append(group)

    return pd.concat(ew_stats, ignore_index=True)

team_log = add_exponential_weighted_stats(team_log, alpha=DECAY_ALPHA, window=10)

print("✓ Exponentially weighted stats computed")

# =============================================================================
# STAGE 2C: SAVE ENHANCED CACHE
# =============================================================================

print("\n[9] Saving enhanced rolling stats cache")

cache_path = save_checkpoint(
    team_log,
    "team_rolling_stats_historical_enhanced.csv",
    "Historical rolling stats cache (2015-2024)"
)

print(f"✓ Cache saved: {cache_path}")
print(f"  Columns: {len(team_log.columns)}")

# =============================================================================
# STAGE 3: MERGE & NORMALIZE
# =============================================================================

print("\n" + "="*60)
print("STAGE 3: MERGE & NORMALIZATION")
print("="*60)

print("\n[10] Merging rolling stats to game level")

# Merge L5 features
for window in WINDOWS:
    merge_cols = [
        "game_id", "team",
        f"pts_scored_L{window}", f"pts_allowed_L{window}",
        f"net_rating_L{window}", f"win_rate_L{window}"
    ]

    # Home team
    games = games.merge(
        team_log[merge_cols],
        left_on=["game_id", "home_team"],
        right_on=["game_id", "team"],
        how="left",
        validate="one_to_one"
    ).drop(columns="team").rename(columns={
        f"pts_scored_L{window}": f"home_pts_L{window}",
        f"pts_allowed_L{window}": f"home_pa_L{window}",
        f"net_rating_L{window}": f"home_net_L{window}",
        f"win_rate_L{window}": f"home_winpct_L{window}"
    })

    # Away team
    games = games.merge(
        team_log[merge_cols],
        left_on=["game_id", "away_team"],
        right_on=["game_id", "team"],
        how="left",
        validate="one_to_one"
    ).drop(columns="team").rename(columns={
        f"pts_scored_L{window}": f"away_pts_L{window}",
        f"pts_allowed_L{window}": f"away_pa_L{window}",
        f"net_rating_L{window}": f"away_net_L{window}",
        f"win_rate_L{window}": f"away_winpct_L{window}"
    })

# Merge EW10 features
ew_cols = ["game_id", "team", "net_rating_EW10", "pts_scored_EW10", "pts_allowed_EW10"]

games = games.merge(
    team_log[ew_cols],
    left_on=["game_id", "home_team"],
    right_on=["game_id", "team"],
    how="left",
    validate="one_to_one"
).drop(columns="team").rename(columns={
    "net_rating_EW10": "home_net_EW10",
    "pts_scored_EW10": "home_pts_EW10",
    "pts_allowed_EW10": "home_pa_EW10"
})

games = games.merge(
    team_log[ew_cols],
    left_on=["game_id", "away_team"],
    right_on=["game_id", "team"],
    how="left",
    validate="one_to_one"
).drop(columns="team").rename(columns={
    "net_rating_EW10": "away_net_EW10",
    "pts_scored_EW10": "away_pts_EW10",
    "pts_allowed_EW10": "away_pa_EW10"
})

print("✓ Rolling stats merged")

# =============================================================================
# STAGE 3A: SEASON-BASED NORMALIZATION
# =============================================================================

print("\n[11] Normalizing features within seasons")

def normalize_features(df: pd.DataFrame, windows: list) -> pd.DataFrame:
    """Normalize rolling features within each season"""
    df = df.copy()

    # Build list of columns to normalize
    cols_to_normalize = []

    for window in windows:
        cols_to_normalize.extend([
            f"home_pts_L{window}", f"home_pa_L{window}", f"home_net_L{window}",
            f"away_pts_L{window}", f"away_pa_L{window}", f"away_net_L{window}"
        ])

    cols_to_normalize.extend([
        "home_net_EW10", "home_pts_EW10", "home_pa_EW10",
        "away_net_EW10", "away_pts_EW10", "away_pa_EW10"
    ])

    for col in cols_to_normalize:
        df[f"{col}_z"] = df.groupby("season")[col].transform(
            lambda x: (x - x.mean()) / x.std(ddof=0) if x.std(ddof=0) != 0 else 0
        )

    return df

games = normalize_features(games, WINDOWS)

print("✓ Features normalized")

# =============================================================================
# STAGE 3B: CREATE MATCHUP FEATURES
# =============================================================================

print("\n[12] Creating matchup features")

# L5 matchup features
games["net_diff_L5"] = games["home_net_L5_z"] - games["away_net_L5_z"]
games["off_vs_def_L5"] = games["home_pts_L5_z"] - games["away_pa_L5_z"]
games["winpct_diff_L5"] = games["home_winpct_L5"] - games["away_winpct_L5"]

# L10 matchup features
games["net_diff_L10"] = games["home_net_L10_z"] - games["away_net_L10_z"]
games["off_vs_def_L10"] = games["home_pts_L10_z"] - games["away_pa_L10_z"]
games["winpct_diff_L10"] = games["home_winpct_L10"] - games["away_winpct_L10"]

# EW10 matchup features
games["net_diff_EW10"] = games["home_net_EW10_z"] - games["away_net_EW10_z"]
games["off_vs_def_EW10"] = games["home_pts_EW10_z"] - games["away_pa_EW10_z"]

print("✓ Matchup features created")

# =============================================================================
# STAGE 4: SAVE FINAL DATASET
# =============================================================================

print("\n[13] Saving final historical dataset")

output_path = save_checkpoint(
    games,
    "games_historical_complete_2015_2024.csv",
    "Complete historical dataset with all features (2015-2024)"
)

print(f"✓ Final dataset saved: {output_path}")
print(f"  Games: {len(games):,}")
print(f"  Features: {len(games.columns)}")

# =============================================================================
# SUMMARY
# =============================================================================

print("\n" + "="*60)
print("HISTORICAL PIPELINE COMPLETE")
print("="*60)

print(f"\n✓ Processed {len(games):,} games")
print(f"✓ Seasons: {games['season'].nunique()}")
print(f"✓ Date range: {games['game_date'].min().date()} to {games['game_date'].max().date()}")
print(f"✓ Total features: {len(games.columns)}")

print("\n✓ Feature summary:")
print(f"  Context features: 12")
print(f"  Rolling features (L5, L10, EW10): ~40")
print(f"  Matchup features: 9")
print(f"  Total: {len(games.columns)}")

print("\n" + "="*60)
print("READY FOR HISTORICAL MODEL TRAINING")
print("="*60)

print("\nNext step: Train models on historical data")
print("Expected improvements:")
print("  • 3x more training data (8,289 train games vs 2,460)")
print("  • Log loss: 0.650 → 0.620-0.630")
print("  • MAE: 7.5 → 6.0-6.5 wins")
print("  • More stable feature importance")


HISTORICAL DATA PROCESSING PIPELINE

✓ Configuration:
  Seasons: 9
  Rolling windows: [5, 10]
  Exponential decay alpha: 0.95

STAGE 1: CONTEXT FEATURES

[1] Loading historical games
✓ Loaded 10,749 games
✓ Date range: 2015-10-27 to 2024-04-14
✓ Seasons: ['2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24']

[2] Building team schedules
✓ Team-game rows: 21,498
✓ Teams: 30

[3] Computing rest days (season-aware)
✓ Rest days computed
  Range: 1-146 days
  Back-to-back rate: 0.180

[4] Adding trade deadline features
✓ Trade deadline features added
  Pre-deadline games: 14,190
  Post-deadline games: 7,206

[5] Merging context features to game level
✓ Context features merged

STAGE 2: ROLLING PERFORMANCE FEATURES

[6] Building team game log
✓ Team game log: 21,498 rows

[7] Precomputing rolling stats (L5, L10)
  ⏳ This may take 30-60 seconds...
    Computing L5 window...
    Computing L10 window...
✓ Rolling stats computed

[8] Computing expone

In [13]:
"""
NBA Win Probability Predictor
HISTORICAL MODEL TRAINING (2015-2024)

Trains ensemble on 9 seasons of data with multiple evaluation strategies:
1. Single holdout: Train on 2015-2022, test on 2023-24
2. Rolling validation: Test on each recent season
3. Full ensemble with optimized weights

Expected improvements:
- Log loss: 0.650 → 0.620-0.630
- MAE: 7.5 → 6.0-6.5 wins
- More stable feature importance
"""

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss
from sklearn.calibration import calibration_curve
from scipy.optimize import minimize
import xgboost as xgb
import pickle
import json

# Setup
Path("evaluation").mkdir(parents=True, exist_ok=True)
Path("models").mkdir(parents=True, exist_ok=True)

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("\n" + "="*60)
print("HISTORICAL MODEL TRAINING (2015-2024)")
print("="*60)

# =============================================================================
# LOAD HISTORICAL DATA
# =============================================================================

print("\n[1] Loading historical dataset")

games = pd.read_csv("data/processed/games_historical_complete_2015_2024.csv")
games["game_date"] = pd.to_datetime(games["game_date"])

print(f"✓ Loaded {len(games):,} games")
print(f"✓ Features: {len(games.columns)}")
print(f"✓ Seasons: {sorted(games['season'].unique())}")

# =============================================================================
# DEFINE FEATURE SET
# =============================================================================

print("\n[2] Defining feature set")

context_features = [
    "home_rest_days", "away_rest_days", "rest_advantage",
    "home_b2b", "away_b2b", "both_b2b",
    "home_days_since_deadline", "away_days_since_deadline",
    "deadline_advantage", "both_post_deadline"
]

# L5 features
l5_features = [
    "home_net_L5_z", "away_net_L5_z",
    "home_pts_L5_z", "away_pts_L5_z",
    "home_pa_L5_z", "away_pa_L5_z",
    "home_winpct_L5", "away_winpct_L5",
    "net_diff_L5", "off_vs_def_L5", "winpct_diff_L5"
]

# L10 features
l10_features = [
    "home_net_L10_z", "away_net_L10_z",
    "home_pts_L10_z", "away_pts_L10_z",
    "home_pa_L10_z", "away_pa_L10_z",
    "home_winpct_L10", "away_winpct_L10",
    "net_diff_L10", "off_vs_def_L10", "winpct_diff_L10"
]

# EW features
ew_features = [
    "home_net_EW10_z", "away_net_EW10_z",
    "home_pts_EW10_z", "away_pts_EW10_z",
    "home_pa_EW10_z", "away_pa_EW10_z",
    "net_diff_EW10", "off_vs_def_EW10"
]

all_features = context_features + l5_features + l10_features + ew_features

print(f"✓ Total features: {len(all_features)}")
print(f"  Context: {len(context_features)}")
print(f"  L5: {len(l5_features)}")
print(f"  L10: {len(l10_features)}")
print(f"  Exponential weighted: {len(ew_features)}")

# =============================================================================
# PREPARE DATA
# =============================================================================

print("\n[3] Preparing data")

y = games["home_win"].astype(int)
X = games[all_features].apply(pd.to_numeric, errors="coerce").fillna(games[all_features].mean())

print(f"✓ X shape: {X.shape}")
print(f"✓ Missing values after imputation: {X.isna().sum().sum()}")

# =============================================================================
# STRATEGY 1: SINGLE HOLDOUT (2015-2022 TRAIN, 2023-24 TEST)
# =============================================================================

print("\n" + "="*60)
print("STRATEGY 1: SINGLE HOLDOUT VALIDATION")
print("="*60)

TRAIN_SEASONS_SINGLE = ["2015-16", "2016-17", "2017-18", "2018-19",
                        "2019-20", "2020-21", "2021-22", "2022-23"]
TEST_SEASON_SINGLE = "2023-24"

print(f"\n[4] Creating train/val/test split")
print(f"  Train: {TRAIN_SEASONS_SINGLE[0]} to {TRAIN_SEASONS_SINGLE[-1]}")
print(f"  Test: {TEST_SEASON_SINGLE}")

train_mask = games["season"].isin(TRAIN_SEASONS_SINGLE)
test_mask = games["season"] == TEST_SEASON_SINGLE

X_train_full = X[train_mask].reset_index(drop=True)
y_train_full = y[train_mask].reset_index(drop=True)
X_test = X[test_mask].reset_index(drop=True)
y_test = y[test_mask].reset_index(drop=True)

# Validation set (last 10% of training)
val_size = int(len(X_train_full) * 0.1)
X_train = X_train_full.iloc[:-val_size]
y_train = y_train_full.iloc[:-val_size]
X_val = X_train_full.iloc[-val_size:]
y_val = y_train_full.iloc[-val_size:]

print(f"\n✓ Train: {len(X_train):,} games")
print(f"✓ Val: {len(X_val):,} games")
print(f"✓ Test: {len(X_test):,} games")
print(f"✓ Train home win rate: {y_train.mean():.3f}")
print(f"✓ Test home win rate: {y_test.mean():.3f}")

# =============================================================================
# TRAIN BASELINE
# =============================================================================

print("\n[5] Training baseline")

baseline_prob = y_train.mean()
baseline_preds = np.full(len(y_test), baseline_prob)

baseline_acc = accuracy_score(y_test, baseline_preds > 0.5)
baseline_logloss = log_loss(y_test, baseline_preds)
baseline_brier = brier_score_loss(y_test, baseline_preds)

print(f"✓ Baseline (constant {baseline_prob:.3f})")
print(f"  Accuracy: {baseline_acc:.4f}")
print(f"  Log Loss: {baseline_logloss:.4f}")

# =============================================================================
# TRAIN LOGISTIC REGRESSION
# =============================================================================

print("\n[6] Training Logistic Regression")

lr = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(
        penalty="l2",
        C=1.0,
        max_iter=1000,
        solver="lbfgs",
        random_state=RANDOM_SEED
    ))
])

lr.fit(X_train, y_train)

lr_probs_val = lr.predict_proba(X_val)[:, 1]
lr_probs_test = lr.predict_proba(X_test)[:, 1]

lr_acc = accuracy_score(y_test, lr_probs_test > 0.5)
lr_logloss = log_loss(y_test, lr_probs_test)

print(f"✓ Logistic Regression")
print(f"  Accuracy: {lr_acc:.4f} ({lr_acc - baseline_acc:+.4f})")
print(f"  Log Loss: {lr_logloss:.4f} ({lr_logloss - baseline_logloss:+.4f})")

# =============================================================================
# TRAIN XGBOOST
# =============================================================================

print("\n[7] Training XGBoost")

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": RANDOM_SEED,
}

model_xgb = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=[(dval, "val")],
    early_stopping_rounds=30,
    verbose_eval=False
)

xgb_probs_val = model_xgb.predict(dval)
xgb_probs_test = model_xgb.predict(dtest)

xgb_acc = accuracy_score(y_test, xgb_probs_test > 0.5)
xgb_logloss = log_loss(y_test, xgb_probs_test)

print(f"✓ XGBoost")
print(f"  Best iteration: {model_xgb.best_iteration}")
print(f"  Accuracy: {xgb_acc:.4f} ({xgb_acc - baseline_acc:+.4f})")
print(f"  Log Loss: {xgb_logloss:.4f} ({xgb_logloss - baseline_logloss:+.4f})")

# =============================================================================
# ENSEMBLE
# =============================================================================

print("\n[8] Creating ensemble")

# Simple average
ensemble_simple = (lr_probs_test + xgb_probs_test) / 2

# Optimized weights
def ensemble_loss(w, p1, p2, y_true):
    p = w[0] * p1 + w[1] * p2
    p = np.clip(p, 1e-15, 1 - 1e-15)
    return log_loss(y_true, p)

result = minimize(
    ensemble_loss,
    x0=[0.5, 0.5],
    args=(lr_probs_val, xgb_probs_val, y_val),
    bounds=[(0, 1), (0, 1)],
    constraints={"type": "eq", "fun": lambda w: w.sum() - 1},
)

optimal_weights = result.x if result.success else np.array([0.5, 0.5])

ensemble_weighted = (
    optimal_weights[0] * lr_probs_test +
    optimal_weights[1] * xgb_probs_test
)

ensemble_acc = accuracy_score(y_test, ensemble_weighted > 0.5)
ensemble_logloss = log_loss(y_test, ensemble_weighted)

print(f"✓ Ensemble (weights: LR={optimal_weights[0]:.2f}, XGB={optimal_weights[1]:.2f})")
print(f"  Accuracy: {ensemble_acc:.4f} ({ensemble_acc - baseline_acc:+.4f})")
print(f"  Log Loss: {ensemble_logloss:.4f} ({ensemble_logloss - baseline_logloss:+.4f})")

# =============================================================================
# MODEL COMPARISON
# =============================================================================

print("\n[9] Model comparison")

comparison = pd.DataFrame({
    "Model": ["Baseline", "Logistic Regression", "XGBoost", "Ensemble"],
    "Accuracy": [baseline_acc, lr_acc, xgb_acc, ensemble_acc],
    "Log Loss": [baseline_logloss, lr_logloss, xgb_logloss, ensemble_logloss]
})

print("\n" + comparison.to_string(index=False))

best_idx = comparison["Log Loss"].idxmin()
best_model = comparison.loc[best_idx, "Model"]

print(f"\n✓ Best model: {best_model}")
print(f"✓ Improvement over original baseline (0.690):")
print(f"  Log loss: {ensemble_logloss:.4f} (Δ = {0.690 - ensemble_logloss:+.4f})")

# =============================================================================
# FEATURE IMPORTANCE
# =============================================================================

print("\n[10] Feature importance analysis")

importance_dict = model_xgb.get_score(importance_type='gain')
importance_df = pd.DataFrame([
    {'feature': k, 'importance': v}
    for k, v in importance_dict.items()
]).sort_values('importance', ascending=False)

print("\nTop 15 features:")
print(importance_df.head(15).to_string(index=False))

importance_df.to_csv("evaluation/feature_importance_historical.csv", index=False)

# =============================================================================
# ROLLING VALIDATION (MULTIPLE TEST SEASONS)
# =============================================================================

print("\n" + "="*60)
print("STRATEGY 2: ROLLING VALIDATION")
print("="*60)

print("\n[11] Testing on multiple recent seasons")

test_seasons = ["2021-22", "2022-23", "2023-24"]
rolling_results = []

for test_season in test_seasons:
    # Train on all prior seasons
    train_seasons = [s for s in games["season"].unique() if s < test_season]

    train_mask = games["season"].isin(train_seasons)
    test_mask = games["season"] == test_season

    X_train_roll = X[train_mask]
    y_train_roll = y[train_mask]
    X_test_roll = X[test_mask]
    y_test_roll = y[test_mask]

    # Train quick LR model
    lr_roll = Pipeline([
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(penalty="l2", C=1.0, max_iter=1000, random_state=RANDOM_SEED))
    ])
    lr_roll.fit(X_train_roll, y_train_roll)

    probs_roll = lr_roll.predict_proba(X_test_roll)[:, 1]
    acc_roll = accuracy_score(y_test_roll, probs_roll > 0.5)
    ll_roll = log_loss(y_test_roll, probs_roll)

    rolling_results.append({
        "test_season": test_season,
        "train_games": len(X_train_roll),
        "test_games": len(X_test_roll),
        "accuracy": acc_roll,
        "log_loss": ll_roll
    })

    print(f"  {test_season}: Acc={acc_roll:.4f}, LL={ll_roll:.4f} (trained on {len(train_seasons)} seasons)")

rolling_df = pd.DataFrame(rolling_results)
print("\n✓ Rolling validation complete")
print(f"  Mean accuracy: {rolling_df['accuracy'].mean():.4f}")
print(f"  Mean log loss: {rolling_df['log_loss'].mean():.4f}")

# =============================================================================
# SAVE MODELS
# =============================================================================

print("\n[12] Saving models and results")

# Save models
with open("models/logistic_regression_historical.pkl", "wb") as f:
    pickle.dump(lr, f)

model_xgb.save_model("models/xgboost_historical.json")

# Save ensemble config
ensemble_config = {
    "weights": optimal_weights.tolist(),
    "models": ["logistic_regression", "xgboost"],
    "seed": RANDOM_SEED,
    "train_seasons": TRAIN_SEASONS_SINGLE,
    "test_season": TEST_SEASON_SINGLE
}

with open("models/ensemble_config_historical.json", "w") as f:
    json.dump(ensemble_config, f, indent=2)

# Save predictions
test_results = games[test_mask].reset_index(drop=True).copy()
test_results["lr_prob"] = lr_probs_test
test_results["xgb_prob"] = xgb_probs_test
test_results["ensemble_prob"] = ensemble_weighted
test_results["actual_home_win"] = y_test.values

test_results.to_csv("data/processed/test_predictions_historical.csv", index=False)

print("✓ Models saved")
print("✓ Predictions saved")

# =============================================================================
# VISUALIZATIONS
# =============================================================================

print("\n[13] Creating visualizations")

# Calibration plot
plt.figure(figsize=(8, 6))
plt.plot([0, 1], [0, 1], "k--", label="Perfect", linewidth=2)

for probs, label in [
    (lr_probs_test, "Logistic Regression"),
    (xgb_probs_test, "XGBoost"),
    (ensemble_weighted, "Ensemble"),
]:
    t, p = calibration_curve(y_test, probs, n_bins=10)
    plt.plot(p, t, marker="o", label=label, markersize=6)

plt.xlabel("Predicted Probability")
plt.ylabel("True Probability")
plt.title("Calibration Plot - Historical Models (2015-2024)")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("evaluation/calibration_historical.png", dpi=150)
plt.close()

print("✓ Calibration plot saved")

# Performance over time
plt.figure(figsize=(10, 6))
seasons_plot = rolling_df["test_season"]
acc_plot = rolling_df["accuracy"]
ll_plot = rolling_df["log_loss"]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

ax1.plot(seasons_plot, acc_plot, marker='o', linewidth=2, markersize=8)
ax1.set_xlabel("Test Season")
ax1.set_ylabel("Accuracy")
ax1.set_title("Accuracy Over Time")
ax1.grid(alpha=0.3)

ax2.plot(seasons_plot, ll_plot, marker='o', linewidth=2, markersize=8, color='orange')
ax2.set_xlabel("Test Season")
ax2.set_ylabel("Log Loss")
ax2.set_title("Log Loss Over Time")
ax2.grid(alpha=0.3)

plt.tight_layout()
plt.savefig("evaluation/performance_over_time.png", dpi=150)
plt.close()

print("✓ Performance over time plot saved")

# =============================================================================
# FINAL SUMMARY
# =============================================================================

print("\n" + "="*60)
print("HISTORICAL MODEL TRAINING COMPLETE")
print("="*60)

print(f"\n✓ Training data: {len(X_train):,} games (2015-2022)")
print(f"✓ Test data: {len(X_test):,} games (2023-24)")
print(f"✓ Best model: {best_model}")
print(f"✓ Test accuracy: {ensemble_acc:.4f}")
print(f"✓ Test log loss: {ensemble_logloss:.4f}")

print(f"\n✓ Improvement vs original 3-season model:")
print(f"  Original log loss: 0.658")
print(f"  Historical log loss: {ensemble_logloss:.4f}")
print(f"  Improvement: {0.658 - ensemble_logloss:.4f} ({100*(0.658-ensemble_logloss)/0.658:.1f}%)")

print("\n✓ Rolling validation (2021-2024):")
print(f"  Mean accuracy: {rolling_df['accuracy'].mean():.4f}")
print(f"  Mean log loss: {rolling_df['log_loss'].mean():.4f}")

print("\n" + "="*60)
print("PROJECT PHASE 2 COMPLETE!")
print("="*60)


HISTORICAL MODEL TRAINING (2015-2024)

[1] Loading historical dataset
✓ Loaded 10,749 games
✓ Features: 72
✓ Seasons: ['2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24']

[2] Defining feature set
✓ Total features: 40
  Context: 10
  L5: 11
  L10: 11
  Exponential weighted: 8

[3] Preparing data
✓ X shape: (10749, 40)
✓ Missing values after imputation: 0

STRATEGY 1: SINGLE HOLDOUT VALIDATION

[4] Creating train/val/test split
  Train: 2015-16 to 2022-23
  Test: 2023-24

✓ Train: 8,568 games
✓ Val: 951 games
✓ Test: 1,230 games
✓ Train home win rate: 0.571
✓ Test home win rate: 0.543

[5] Training baseline
✓ Baseline (constant 0.571)
  Accuracy: 0.5431
  Log Loss: 0.6910

[6] Training Logistic Regression
✓ Logistic Regression
  Accuracy: 0.6488 (+0.1057)
  Log Loss: 0.6350 (-0.0560)

[7] Training XGBoost
✓ XGBoost
  Best iteration: 35
  Accuracy: 0.6512 (+0.1081)
  Log Loss: 0.6360 (-0.0551)

[8] Creating ensemble
✓ Ensemble (weights: LR=

<Figure size 1000x600 with 0 Axes>

In [14]:
import os
import glob

# Find all CSV files in your Drive
print("Searching for nba_predictor files...")
search_paths = [
    "/content/drive/MyDrive/**/nba_predictor/**/*.csv",
    "/content/drive/MyDrive/**/*historical*.csv",
    "/content/drive/MyDrive/**/*games*.csv"
]

for pattern in search_paths:
    files = glob.glob(pattern, recursive=True)
    if files:
        print(f"\nFound {len(files)} files matching pattern:")
        for f in files[:10]:  # Show first 10
            print(f"  • {f}")
        break

Searching for nba_predictor files...

Found 30 files matching pattern:
  • /content/drive/MyDrive/nba_predictor/data/raw/raw_games_historical_2015_2024.csv
  • /content/drive/MyDrive/nba_predictor/data/raw/games_outcomes_historical_2015_2024.csv
  • /content/drive/MyDrive/nba_predictor/data/raw/team_reference.csv
  • /content/drive/MyDrive/nba_predictor/data/raw/checkpoint_after_2015_16.csv
  • /content/drive/MyDrive/nba_predictor/data/raw/checkpoint_after_2016_17.csv
  • /content/drive/MyDrive/nba_predictor/data/raw/checkpoint_after_2017_18.csv
  • /content/drive/MyDrive/nba_predictor/data/raw/checkpoint_after_2018_19.csv
  • /content/drive/MyDrive/nba_predictor/data/raw/checkpoint_after_2019_20.csv
  • /content/drive/MyDrive/nba_predictor/data/raw/checkpoint_after_2020_21.csv
  • /content/drive/MyDrive/nba_predictor/data/raw/checkpoint_after_2021_22.csv
