In [1]:
# Phase 4: Feature Engineering
# Purpose: Create advanced features from raw data for ML models

import pandas as pd
import numpy as np
import warnings
from pathlib import Path
from datetime import datetime, timedelta

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)

print("="*70)
print("PHASE 4: FEATURE ENGINEERING")
print("="*70)
print(f"\nStart time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

# Load cleaned data
PROCESSED_DIR = Path('data/processed')

print("üìÇ Loading cleaned datasets...")
games = pd.read_csv(PROCESSED_DIR / 'games_with_betting.csv')
games_details = pd.read_csv(PROCESSED_DIR / 'games_details_cleaned.csv')

# Convert dates
games['GAME_DATE'] = pd.to_datetime(games['GAME_DATE'])

print(f"‚úÖ Loaded {len(games):,} games with betting data")
print(f"‚úÖ Loaded {len(games_details):,} player-game records")

# Sort by date (CRITICAL for rolling features)
games = games.sort_values('GAME_DATE').reset_index(drop=True)
print(f"\n‚úÖ Sorted by date: {games['GAME_DATE'].min().date()} to {games['GAME_DATE'].max().date()}")



# Create helper functions for feature engineering

def create_team_game_log(games_df):
    """
    Create separate records for home and away teams
    This allows us to calculate rolling stats per team
    """
    # Home team games
    home_games = games_df[['GAME_DATE', 'GAME_ID', 'SEASON', 'HOME_TEAM_ID', 
                           'VISITOR_TEAM_ID', 'PTS_home', 'PTS_away', 
                           'HOME_TEAM_WINS', 'FG_PCT_home', 'FG_PCT_away',
                           'REB_home', 'REB_away', 'AST_home', 'AST_away']].copy()
    
    home_games['TEAM_ID'] = home_games['HOME_TEAM_ID']
    home_games['OPP_TEAM_ID'] = home_games['VISITOR_TEAM_ID']
    home_games['IS_HOME'] = 1
    home_games['TEAM_PTS'] = home_games['PTS_home']
    home_games['OPP_PTS'] = home_games['PTS_away']
    home_games['WON'] = home_games['HOME_TEAM_WINS']
    home_games['FG_PCT'] = home_games['FG_PCT_home']
    home_games['OPP_FG_PCT'] = home_games['FG_PCT_away']
    home_games['REB'] = home_games['REB_home']
    home_games['OPP_REB'] = home_games['REB_away']
    home_games['AST'] = home_games['AST_home']
    home_games['OPP_AST'] = home_games['AST_away']
    
    # Away team games
    away_games = games_df[['GAME_DATE', 'GAME_ID', 'SEASON', 'HOME_TEAM_ID', 
                           'VISITOR_TEAM_ID', 'PTS_home', 'PTS_away', 
                           'HOME_TEAM_WINS', 'FG_PCT_home', 'FG_PCT_away',
                           'REB_home', 'REB_away', 'AST_home', 'AST_away']].copy()
    
    away_games['TEAM_ID'] = away_games['VISITOR_TEAM_ID']
    away_games['OPP_TEAM_ID'] = away_games['HOME_TEAM_ID']
    away_games['IS_HOME'] = 0
    away_games['TEAM_PTS'] = away_games['PTS_away']
    away_games['OPP_PTS'] = away_games['PTS_home']
    away_games['WON'] = 1 - away_games['HOME_TEAM_WINS']
    away_games['FG_PCT'] = away_games['FG_PCT_away']
    away_games['OPP_FG_PCT'] = away_games['FG_PCT_home']
    away_games['REB'] = away_games['REB_away']
    away_games['OPP_REB'] = away_games['REB_home']
    away_games['AST'] = away_games['AST_away']
    away_games['OPP_AST'] = away_games['AST_home']
    
    # Combine
    team_games = pd.concat([home_games, away_games], ignore_index=True)
    team_games = team_games.sort_values(['TEAM_ID', 'GAME_DATE']).reset_index(drop=True)
    
    return team_games

print("‚úÖ Helper functions created")


# Create team-level game log
print("\nüìä Creating team game log...")
print("="*70)

team_games = create_team_game_log(games)

print(f"‚úÖ Created team game log: {len(team_games):,} records")
print(f"   (Each game creates 2 records: one per team)")

# Show sample
print("\nüîç Sample team game log:")
display(team_games[['GAME_DATE', 'TEAM_ID', 'OPP_TEAM_ID', 'IS_HOME', 
                     'TEAM_PTS', 'OPP_PTS', 'WON']].head(10))

# Verify counts
games_per_team = team_games.groupby('TEAM_ID').size()
print(f"\nüìà Games per team:")
print(f"   Min: {games_per_team.min()}")
print(f"   Max: {games_per_team.max()}")
print(f"   Average: {games_per_team.mean():.1f}")


# FEATURE SET 1: Rolling performance metrics
print("\nüîß FEATURE SET 1: Rolling Performance (Last 5 Games)")
print("="*70)

# Calculate rolling stats for each team
rolling_window = 5

# Group by team and calculate rolling metrics
team_games_sorted = team_games.sort_values(['TEAM_ID', 'GAME_DATE']).copy()

print(f"Calculating rolling {rolling_window}-game averages...")

# Rolling averages (shift by 1 to avoid data leakage - don't include current game)
for col in ['TEAM_PTS', 'OPP_PTS', 'WON', 'FG_PCT', 'REB', 'AST']:
    team_games_sorted[f'{col}_L5'] = (
        team_games_sorted.groupby('TEAM_ID')[col]
        .transform(lambda x: x.shift(1).rolling(rolling_window, min_periods=1).mean())
    )

# Point differential
team_games_sorted['PT_DIFF_L5'] = (
    team_games_sorted['TEAM_PTS_L5'] - team_games_sorted['OPP_PTS_L5']
)

print("‚úÖ Created rolling 5-game features:")
print("   - TEAM_PTS_L5 (average points scored)")
print("   - OPP_PTS_L5 (average points allowed)")
print("   - WON_L5 (win rate)")
print("   - FG_PCT_L5 (shooting percentage)")
print("   - REB_L5 (rebounds)")
print("   - AST_L5 (assists)")
print("   - PT_DIFF_L5 (point differential)")

# Show sample
print("\nüîç Sample rolling features:")
display(team_games_sorted[team_games_sorted['TEAM_ID'] == team_games_sorted['TEAM_ID'].iloc[0]]
        [['GAME_DATE', 'TEAM_PTS', 'TEAM_PTS_L5', 'WON', 'WON_L5', 'PT_DIFF_L5']].head(10))


# FEATURE SET 2: Longer rolling windows
print("\nüîß FEATURE SET 2: Rolling 10 and 20 Game Performance")
print("="*70)

# Last 10 games
for col in ['TEAM_PTS', 'OPP_PTS', 'WON']:
    team_games_sorted[f'{col}_L10'] = (
        team_games_sorted.groupby('TEAM_ID')[col]
        .transform(lambda x: x.shift(1).rolling(10, min_periods=3).mean())
    )

# Last 20 games (season-long trends)
for col in ['TEAM_PTS', 'OPP_PTS', 'WON']:
    team_games_sorted[f'{col}_L20'] = (
        team_games_sorted.groupby('TEAM_ID')[col]
        .transform(lambda x: x.shift(1).rolling(20, min_periods=5).mean())
    )

print("‚úÖ Created rolling 10-game and 20-game features")

# Point differentials
team_games_sorted['PT_DIFF_L10'] = (
    team_games_sorted['TEAM_PTS_L10'] - team_games_sorted['OPP_PTS_L10']
)
team_games_sorted['PT_DIFF_L20'] = (
    team_games_sorted['TEAM_PTS_L20'] - team_games_sorted['OPP_PTS_L20']
)

print("‚úÖ Created point differential features for L10 and L20")


# FEATURE SET 3: Home vs Away performance splits
print("\nüîß FEATURE SET 3: Home/Away Split Performance")
print("="*70)

# Calculate separate rolling stats for home and away games
for location, is_home in [('HOME', 1), ('AWAY', 0)]:
    location_games = team_games_sorted[team_games_sorted['IS_HOME'] == is_home].copy()
    
    for col in ['TEAM_PTS', 'WON']:
        # Rolling last 5 home/away games
        location_games[f'{col}_{location}_L5'] = (
            location_games.groupby('TEAM_ID')[col]
            .transform(lambda x: x.shift(1).rolling(5, min_periods=1).mean())
        )
    
    # Merge back
    team_games_sorted = team_games_sorted.merge(
        location_games[['GAME_ID', f'TEAM_PTS_{location}_L5', f'WON_{location}_L5']],
        on='GAME_ID',
        how='left'
    )

# Fill NaN with overall stats (for teams with no home/away games yet)
team_games_sorted['TEAM_PTS_HOME_L5'] = team_games_sorted['TEAM_PTS_HOME_L5'].fillna(
    team_games_sorted['TEAM_PTS_L5']
)
team_games_sorted['TEAM_PTS_AWAY_L5'] = team_games_sorted['TEAM_PTS_AWAY_L5'].fillna(
    team_games_sorted['TEAM_PTS_L5']
)
team_games_sorted['WON_HOME_L5'] = team_games_sorted['WON_HOME_L5'].fillna(
    team_games_sorted['WON_L5']
)
team_games_sorted['WON_AWAY_L5'] = team_games_sorted['WON_AWAY_L5'].fillna(
    team_games_sorted['WON_L5']
)

print("‚úÖ Created home/away split features:")
print("   - TEAM_PTS_HOME_L5 (points at home)")
print("   - TEAM_PTS_AWAY_L5 (points on road)")
print("   - WON_HOME_L5 (home win rate)")
print("   - WON_AWAY_L5 (away win rate)")


# FEATURE SET 4: Days of rest
print("\nüîß FEATURE SET 4: Rest Days Between Games")
print("="*70)

# Calculate days since last game for each team
team_games_sorted['DAYS_REST'] = (
    team_games_sorted.groupby('TEAM_ID')['GAME_DATE']
    .diff()
    .dt.total_seconds() / (24 * 3600)  # Convert to days
)

# First game of season has no rest data
team_games_sorted['DAYS_REST'] = team_games_sorted['DAYS_REST'].fillna(7)  # Assume 1 week

# Back-to-back indicator (playing on consecutive days)
team_games_sorted['BACK_TO_BACK'] = (team_games_sorted['DAYS_REST'] <= 1).astype(int)

# Well-rested indicator (3+ days rest)
team_games_sorted['WELL_RESTED'] = (team_games_sorted['DAYS_REST'] >= 3).astype(int)

print("‚úÖ Created rest features:")
print(f"   - DAYS_REST (days since last game)")
print(f"   - BACK_TO_BACK (playing on consecutive days)")
print(f"   - WELL_RESTED (3+ days rest)")

# Stats
b2b_pct = team_games_sorted['BACK_TO_BACK'].mean() * 100
print(f"\nüìä {b2b_pct:.1f}% of games are back-to-backs")

rested_pct = team_games_sorted['WELL_RESTED'].mean() * 100
print(f"üìä {rested_pct:.1f}% of games are well-rested (3+ days)")


# FEATURE SET 5: Winning and losing streaks
print("\nüîß FEATURE SET 5: Win/Loss Streaks")
print("="*70)

def calculate_streak(series):
    """Calculate current winning/losing streak"""
    streaks = []
    current_streak = 0
    
    for val in series:
        if pd.isna(val):
            streaks.append(0)
        elif val == 1:  # Win
            current_streak = current_streak + 1 if current_streak >= 0 else 1
            streaks.append(current_streak)
        else:  # Loss
            current_streak = current_streak - 1 if current_streak <= 0 else -1
            streaks.append(current_streak)
    
    return pd.Series(streaks, index=series.index)

# Calculate streaks for each team
team_games_sorted['WIN_STREAK'] = (
    team_games_sorted.groupby('TEAM_ID')['WON']
    .transform(lambda x: calculate_streak(x.shift(1)))  # Shift to avoid leakage
)

# Separate win/loss streak indicators
team_games_sorted['ON_WIN_STREAK'] = (team_games_sorted['WIN_STREAK'] > 0).astype(int)
team_games_sorted['ON_LOSS_STREAK'] = (team_games_sorted['WIN_STREAK'] < 0).astype(int)
team_games_sorted['WIN_STREAK_LENGTH'] = team_games_sorted['WIN_STREAK'].abs()

print("‚úÖ Created streak features:")
print("   - WIN_STREAK (positive=winning, negative=losing)")
print("   - ON_WIN_STREAK (binary)")
print("   - ON_LOSS_STREAK (binary)")
print("   - WIN_STREAK_LENGTH (absolute value)")

# Stats
max_win_streak = team_games_sorted['WIN_STREAK'].max()
max_loss_streak = team_games_sorted['WIN_STREAK'].min()
print(f"\nüìä Longest win streak in data: {max_win_streak} games")
print(f"üìä Longest loss streak in data: {abs(max_loss_streak)} games")


# FEATURE SET 6: Head-to-head matchup history
print("\nüîß FEATURE SET 6: Head-to-Head Matchup History")
print("="*70)

# Create matchup identifier (always sort team IDs so order doesn't matter)
team_games_sorted['MATCHUP_ID'] = team_games_sorted.apply(
    lambda row: f"{min(row['TEAM_ID'], row['OPP_TEAM_ID'])}_{max(row['TEAM_ID'], row['OPP_TEAM_ID'])}",
    axis=1
)

# Calculate head-to-head win rate (last 5 meetings)
def h2h_win_rate(group):
    """Calculate rolling head-to-head win rate for a team in this matchup"""
    group = group.sort_values('GAME_DATE')
    group['H2H_WIN_RATE_L5'] = (
        group['WON'].shift(1).rolling(5, min_periods=1).mean()
    )
    return group

team_games_sorted = (
    team_games_sorted.groupby(['MATCHUP_ID', 'TEAM_ID'], group_keys=False)
    .apply(h2h_win_rate)
)

print("‚úÖ Created head-to-head feature:")
print("   - H2H_WIN_RATE_L5 (win rate vs this opponent in last 5 meetings)")

# Show example
print("\nüîç Example H2H history:")
sample_matchup = team_games_sorted[
    team_games_sorted['MATCHUP_ID'] == team_games_sorted['MATCHUP_ID'].value_counts().index[0]
].head(10)
display(sample_matchup[['GAME_DATE', 'TEAM_ID', 'OPP_TEAM_ID', 'WON', 'H2H_WIN_RATE_L5']])


# Merge all team features back to main games dataset
print("\nüîó Merging features back to games dataset...")
print("="*70)

# Split team_games back into home and away
home_features = team_games_sorted[team_games_sorted['IS_HOME'] == 1].copy()
away_features = team_games_sorted[team_games_sorted['IS_HOME'] == 0].copy()

# Rename columns for home team
home_cols_to_keep = ['GAME_ID'] + [col for col in home_features.columns 
                                     if col.endswith(('_L5', '_L10', '_L20', '_HOME_L5', '_AWAY_L5')) 
                                     or col in ['DAYS_REST', 'BACK_TO_BACK', 'WELL_RESTED', 
                                               'WIN_STREAK', 'ON_WIN_STREAK', 'ON_LOSS_STREAK', 
                                               'WIN_STREAK_LENGTH', 'H2H_WIN_RATE_L5']]

home_features_subset = home_features[home_cols_to_keep].copy()
home_features_subset.columns = ['GAME_ID'] + [f'HOME_{col}' for col in home_features_subset.columns[1:]]

# Rename columns for away team
away_features_subset = away_features[home_cols_to_keep].copy()
away_features_subset.columns = ['GAME_ID'] + [f'AWAY_{col}' for col in away_features_subset.columns[1:]]

# Merge with main games dataset
games_featured = games.merge(home_features_subset, on='GAME_ID', how='left')
games_featured = games_featured.merge(away_features_subset, on='GAME_ID', how='left')

print(f"‚úÖ Merged features to games dataset")
print(f"   Original columns: {len(games.columns)}")
print(f"   New columns: {len(games_featured.columns)}")
print(f"   Features added: {len(games_featured.columns) - len(games.columns)}")


# FEATURE SET 7: Differential features (home advantage)
print("\nüîß FEATURE SET 7: Matchup Differential Features")
print("="*70)

# Create differential features (home - away)
differential_features = [
    ('TEAM_PTS_L5', 'Scoring differential'),
    ('WON_L5', 'Win rate differential'),
    ('PT_DIFF_L5', 'Point diff differential'),
    ('FG_PCT_L5', 'Shooting% differential'),
    ('DAYS_REST', 'Rest differential'),
    ('WIN_STREAK_LENGTH', 'Streak differential'),
]

for feat, description in differential_features:
    games_featured[f'DIFF_{feat}'] = (
        games_featured[f'HOME_{feat}'] - games_featured[f'AWAY_{feat}']
    )
    print(f"   ‚úÖ DIFF_{feat} ({description})")

print(f"\n‚úÖ Created {len(differential_features)} differential features")


# FEATURE SET 8: Betting market features
print("\nüîß FEATURE SET 8: Betting Market Features")
print("="*70)

# 1. Home favorite/underdog
games_featured['HOME_FAVORITE'] = (games_featured['HOME_SPREAD'] < 0).astype(int)
games_featured['HOME_UNDERDOG'] = (games_featured['HOME_SPREAD'] > 0).astype(int)

# 2. Spread size categories
games_featured['SPREAD_SIZE'] = games_featured['HOME_SPREAD'].abs()
games_featured['BIG_FAVORITE'] = (games_featured['HOME_SPREAD'] < -7).astype(int)
games_featured['BIG_UNDERDOG'] = (games_featured['HOME_SPREAD'] > 7).astype(int)

# 3. Implied probability edge
# Compare Vegas implied prob to team's recent win rate
games_featured['PROB_EDGE_HOME'] = (
    games_featured['HOME_WIN_PROB'] - games_featured['HOME_WON_L10']
)

# 4. Total line relative to recent scoring
games_featured['TOTAL_VS_RECENT'] = (
    games_featured['TOTAL_LINE'] - 
    (games_featured['HOME_TEAM_PTS_L5'] + games_featured['AWAY_TEAM_PTS_L5'])
)

print("‚úÖ Created betting market features:")
print("   - HOME_FAVORITE (binary)")
print("   - HOME_UNDERDOG (binary)")
print("   - SPREAD_SIZE (absolute spread)")
print("   - BIG_FAVORITE/BIG_UNDERDOG (¬±7 points)")
print("   - PROB_EDGE_HOME (implied prob vs recent performance)")
print("   - TOTAL_VS_RECENT (line vs recent scoring pace)")

# Show distribution
fav_pct = games_featured['HOME_FAVORITE'].mean() * 100
print(f"\nüìä Home team is favorite {fav_pct:.1f}% of the time")


# FEATURE SET 9: Time-based features
print("\nüîß FEATURE SET 9: Temporal Features")
print("="*70)

# Season progression (0 to 1, where 0=start, 1=end of season)
games_featured['SEASON_PROGRESS'] = (
    games_featured.groupby('SEASON')['GAME_DATE']
    .transform(lambda x: (x - x.min()) / (x.max() - x.min()))
)

# Month indicators (some months show different patterns)
games_featured['MONTH_OCT_NOV'] = games_featured['MONTH'].isin([10, 11]).astype(int)
games_featured['MONTH_DEC_JAN'] = games_featured['MONTH'].isin([12, 1]).astype(int)
games_featured['MONTH_FEB_MAR'] = games_featured['MONTH'].isin([2, 3]).astype(int)
games_featured['MONTH_APR_MAY_JUN'] = games_featured['MONTH'].isin([4, 5, 6]).astype(int)

# Day of week
games_featured['WEEKEND'] = games_featured['DAY_OF_WEEK'].isin([5, 6]).astype(int)

# Playoff race indicator (last 20 games of season)
games_featured['PLAYOFF_PUSH'] = (games_featured['SEASON_PROGRESS'] > 0.75).astype(int)

print("‚úÖ Created temporal features:")
print("   - SEASON_PROGRESS (0-1 scale)")
print("   - MONTH_OCT_NOV through MONTH_APR_MAY_JUN")
print("   - WEEKEND (Saturday/Sunday)")
print("   - PLAYOFF_PUSH (last quarter of season)")


# Data quality check
print("\nüîç DATA QUALITY CHECK")
print("="*70)

# Check for missing values in features
feature_cols = [col for col in games_featured.columns if col.startswith(('HOME_', 'AWAY_', 'DIFF_'))]
feature_cols += ['SPREAD_SIZE', 'PROB_EDGE_HOME', 'TOTAL_VS_RECENT', 'SEASON_PROGRESS']

missing_summary = games_featured[feature_cols].isnull().sum()
missing_summary = missing_summary[missing_summary > 0].sort_values(ascending=False)

if len(missing_summary) > 0:
    print(f"\n‚ö†Ô∏è  Features with missing values:")
    print(missing_summary)
    
    # Fill strategy
    print(f"\nüìù Filling missing values with appropriate defaults...")
    
    # For rolling features, NaN means beginning of season - use season average
    for col in missing_summary.index:
        if '_L5' in col or '_L10' in col or '_L20' in col:
            # Fill with median of that feature
            fill_value = games_featured[col].median()
            games_featured[col] = games_featured[col].fillna(fill_value)
            print(f"   Filled {col} with median: {fill_value:.3f}")
        elif 'DAYS_REST' in col:
            games_featured[col] = games_featured[col].fillna(3)  # Average rest
        elif 'STREAK' in col:
            games_featured[col] = games_featured[col].fillna(0)  # No streak
        else:
            games_featured[col] = games_featured[col].fillna(0)
    
    print(f"\n‚úÖ All missing values filled")
else:
    print("‚úÖ No missing values in features!")

# Final check
total_missing = games_featured[feature_cols].isnull().sum().sum()
print(f"\nüìä Final missing value count: {total_missing}")


# Summary of all created features
print("\nüìä FEATURE ENGINEERING SUMMARY")
print("="*70)

# Count features by category
feature_categories = {
    'Rolling Performance (L5)': [col for col in games_featured.columns if '_L5' in col],
    'Rolling Performance (L10)': [col for col in games_featured.columns if '_L10' in col],
    'Rolling Performance (L20)': [col for col in games_featured.columns if '_L20' in col],
    'Home/Away Splits': [col for col in games_featured.columns if ('_HOME_L5' in col or '_AWAY_L5' in col)],
    'Rest & Fatigue': [col for col in games_featured.columns if any(x in col for x in ['DAYS_REST', 'BACK_TO_BACK', 'WELL_RESTED'])],
    'Streaks': [col for col in games_featured.columns if 'STREAK' in col],
    'Head-to-Head': [col for col in games_featured.columns if 'H2H' in col],
    'Differentials': [col for col in games_featured.columns if col.startswith('DIFF_')],
    'Betting Market': [col for col in games_featured.columns if any(x in col for x in ['FAVORITE', 'UNDERDOG', 'PROB_EDGE', 'SPREAD_SIZE'])],
    'Temporal': [col for col in games_featured.columns if any(x in col for x in ['SEASON_PROGRESS', 'MONTH_', 'WEEKEND', 'PLAYOFF'])],
}

print("\nüìã Features by category:\n")
total_features = 0
for category, features in feature_categories.items():
    print(f"{category:30s}: {len(features):3d} features")
    total_features += len(features)

print(f"\n{'TOTAL ENGINEERED FEATURES':30s}: {total_features:3d}")

# Original features (from raw data)
original_features = len([col for col in games_featured.columns if col in games.columns])
print(f"{'Original features':30s}: {original_features:3d}")
print(f"{'Total columns':30s}: {len(games_featured.columns):3d}")


# Show sample of key features
print("\nüîç SAMPLE FEATURE VALUES")
print("="*70)

sample_features = [
    'GAME_DATE', 'MATCHUP', 'HOME_SPREAD', 'SPREAD_HOME_COVER',
    'HOME_WON_L5', 'AWAY_WON_L5', 'DIFF_WON_L5',
    'HOME_PT_DIFF_L5', 'AWAY_PT_DIFF_L5',
    'HOME_DAYS_REST', 'AWAY_DAYS_REST',
    'HOME_BACK_TO_BACK', 'AWAY_BACK_TO_BACK',
    'HOME_FAVORITE', 'PROB_EDGE_HOME',
    'SEASON_PROGRESS', 'PLAYOFF_PUSH'
]

# Filter to features that exist
sample_features = [col for col in sample_features if col in games_featured.columns]

print("\nRecent games with key features:")
display(games_featured[sample_features].tail(10))


# Save the fully featured dataset
print("\nüíæ SAVING ENGINEERED DATASET")
print("="*70)

# Save to processed folder
output_file = PROCESSED_DIR / 'games_with_features.csv'
games_featured.to_csv(output_file, index=False)

print(f"‚úÖ Saved: {output_file}")
print(f"   Rows: {len(games_featured):,}")
print(f"   Columns: {len(games_featured.columns)}")

# Create feature list document
feature_list = pd.DataFrame({
    'Feature': games_featured.columns,
    'Type': games_featured.dtypes.astype(str),
    'Non_Null_Count': games_featured.count(),
    'Unique_Values': [games_featured[col].nunique() for col in games_featured.columns]
})

feature_list.to_csv(PROCESSED_DIR / 'feature_list.csv', index=False)
print(f"‚úÖ Saved: feature_list.csv (documentation)")


# Create train/validation/test splits based on time
print("\nüìä CREATING TRAIN/VAL/TEST SPLITS")
print("="*70)

# Filter to games with all features (no missing data)
games_ml = games_featured.dropna(subset=['SPREAD_HOME_COVER']).copy()

print(f"Games with complete data: {len(games_ml):,}")

# Time-based split (critical for time-series data!)
# Train: 2006-2014
# Validation: 2015-2016
# Test: 2017

train = games_ml[games_ml['SEASON'] <= 2014].copy()
val = games_ml[(games_ml['SEASON'] >= 2015) & (games_ml['SEASON'] <= 2016)].copy()
test = games_ml[games_ml['SEASON'] >= 2017].copy()

print(f"\nüìä SPLIT SUMMARY:")
print(f"{'Dataset':<15} {'Games':<10} {'Date Range':<30} {'Seasons'}")
print("="*70)
print(f"{'Train':<15} {len(train):<10,} {train['GAME_DATE'].min().date()} to {train['GAME_DATE'].max().date()}  {train['SEASON'].min()}-{train['SEASON'].max()}")
print(f"{'Validation':<15} {len(val):<10,} {val['GAME_DATE'].min().date()} to {val['GAME_DATE'].max().date()}  {val['SEASON'].min()}-{val['SEASON'].max()}")
print(f"{'Test':<15} {len(test):<10,} {test['GAME_DATE'].min().date()} to {test['GAME_DATE'].max().date()}  {test['SEASON'].min()}-{test['SEASON'].max()}")
print(f"{'TOTAL':<15} {len(games_ml):<10,}")

# Check class balance in each split
print(f"\nüìä CLASS BALANCE (Home Cover Rate):")
print(f"Train:      {train['SPREAD_HOME_COVER'].mean()*100:.1f}%")
print(f"Validation: {val['SPREAD_HOME_COVER'].mean()*100:.1f}%")
print(f"Test:       {test['SPREAD_HOME_COVER'].mean()*100:.1f}%")

# Save splits
train.to_csv(PROCESSED_DIR / 'train_set.csv', index=False)
val.to_csv(PROCESSED_DIR / 'val_set.csv', index=False)
test.to_csv(PROCESSED_DIR / 'test_set.csv', index=False)

print(f"\n‚úÖ Saved train/val/test splits to {PROCESSED_DIR}")


# Quick feature importance preview
print("\nüîç FEATURE IMPORTANCE PREVIEW")
print("="*70)

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Select feature columns (exclude metadata and targets)
exclude_cols = ['GAME_DATE_EST', 'GAME_ID', 'GAME_STATUS_TEXT', 'SEASON', 
                'TEAM_ID_home', 'TEAM_ID_away', 'HOME_TEAM_ID', 'VISITOR_TEAM_ID',
                'GAME_DATE', 'MATCHUP', 'HOME_TEAM', 'AWAY_TEAM',
                'PTS_home', 'PTS_away', 'HOME_TEAM_WINS',  # Targets/outcomes
                'SPREAD_HOME_COVER', 'TOTAL_OVER', 'ML_HOME_WIN',  # Targets
                'ACTUAL_MARGIN', 'ACTUAL_TOTAL',  # Leakage
                'HOME_MONEYLINE', 'AWAY_MONEYLINE', 'HOME_SPREAD', 'AWAY_SPREAD', 'TOTAL_LINE',  # Already encoded as features
                'FG_PCT_home', 'FG_PCT_away', 'FT_PCT_home', 'FT_PCT_away',  # In-game stats
                'FG3_PCT_home', 'FG3_PCT_away', 'AST_home', 'AST_away', 'REB_home', 'REB_away']

feature_cols = [col for col in train.columns if col not in exclude_cols]

# Remove any remaining object/datetime columns
feature_cols = [col for col in feature_cols if train[col].dtype in ['int64', 'float64']]

print(f"Using {len(feature_cols)} features for modeling")

# Prepare data
X_train = train[feature_cols].fillna(0)
y_train = train['SPREAD_HOME_COVER']

# Quick Random Forest
print("\nTraining quick Random Forest for feature importance...")
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Feature importance
feature_importance = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nüèÜ TOP 20 MOST IMPORTANT FEATURES:\n")
print(feature_importance.head(20).to_string(index=False))

# Save feature importance
feature_importance.to_csv(PROCESSED_DIR / 'feature_importance_preview.csv', index=False)
print(f"\n‚úÖ Saved full feature importance to feature_importance_preview.csv")


# Final summary
print("\n" + "="*70)
print("‚úÖ PHASE 4 COMPLETE - FEATURE ENGINEERING")
print("="*70)

print("\nüìä WHAT WE CREATED:")
print(f"   ‚Ä¢ {total_features} engineered features")
print(f"   ‚Ä¢ {len(train):,} training games (2006-2014)")
print(f"   ‚Ä¢ {len(val):,} validation games (2015-2016)")
print(f"   ‚Ä¢ {len(test):,} test games (2017)")

print("\nüìã KEY FEATURE CATEGORIES:")
for category, features in feature_categories.items():
    if len(features) > 0:
        print(f"   ‚úÖ {category}: {len(features)} features")

print("\nüéØ TARGET VARIABLES:")
print("   ‚Ä¢ SPREAD_HOME_COVER (primary target)")
print("   ‚Ä¢ TOTAL_OVER (secondary target)")
print("   ‚Ä¢ ML_HOME_WIN (tertiary target)")

print("\nüìÅ SAVED FILES:")
print(f"   ‚Ä¢ games_with_features.csv ({len(games_featured):,} rows)")
print(f"   ‚Ä¢ train_set.csv ({len(train):,} rows)")
print(f"   ‚Ä¢ val_set.csv ({len(val):,} rows)")
print(f"   ‚Ä¢ test_set.csv ({len(test):,} rows)")
print(f"   ‚Ä¢ feature_list.csv (documentation)")
print(f"   ‚Ä¢ feature_importance_preview.csv")

print("\nüéØ READY FOR:")
print("   ‚Üí Phase 5: Machine Learning Model Training")
print("   ‚Üí Algorithms: Logistic Regression, Random Forest, XGBoost")
print("   ‚Üí Evaluation: Accuracy, Precision, ROI simulation")
print("   ‚Üí SHAP explainability")

print(f"\nüìÇ All files saved to: {PROCESSED_DIR}")
print("\n" + "="*70)
print(f"Feature engineering completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*70)

print("\nüí¨ REPLY: 'Phase 4 complete - ready for ML modeling!'")


PHASE 4: FEATURE ENGINEERING

Start time: 2026-02-15 10:02:00

üìÇ Loading cleaned datasets...
‚úÖ Loaded 14,816 games with betting data
‚úÖ Loaded 668,628 player-game records

‚úÖ Sorted by date: 2006-11-01 to 2018-06-08
‚úÖ Helper functions created

üìä Creating team game log...
‚úÖ Created team game log: 29,632 records
   (Each game creates 2 records: one per team)

üîç Sample team game log:


Unnamed: 0,GAME_DATE,TEAM_ID,OPP_TEAM_ID,IS_HOME,TEAM_PTS,OPP_PTS,WON
0,2006-11-01,1610612737,1610612755,0,75.0,88.0,0
1,2006-11-03,1610612737,1610612752,1,102.0,92.0,1
2,2006-11-05,1610612737,1610612753,1,95.0,82.0,1
3,2006-11-07,1610612737,1610612739,0,104.0,95.0,1
4,2006-11-10,1610612737,1610612761,0,111.0,102.0,1
5,2006-11-14,1610612737,1610612749,1,101.0,103.0,0
6,2006-11-18,1610612737,1610612748,1,88.0,93.0,0
7,2006-11-22,1610612737,1610612765,0,98.0,103.0,0
8,2006-11-24,1610612737,1610612761,1,97.0,93.0,1
9,2006-11-25,1610612737,1610612753,0,79.0,98.0,0



üìà Games per team:
   Min: 875
   Max: 1083
   Average: 987.7

üîß FEATURE SET 1: Rolling Performance (Last 5 Games)
Calculating rolling 5-game averages...
‚úÖ Created rolling 5-game features:
   - TEAM_PTS_L5 (average points scored)
   - OPP_PTS_L5 (average points allowed)
   - WON_L5 (win rate)
   - FG_PCT_L5 (shooting percentage)
   - REB_L5 (rebounds)
   - AST_L5 (assists)
   - PT_DIFF_L5 (point differential)

üîç Sample rolling features:


Unnamed: 0,GAME_DATE,TEAM_PTS,TEAM_PTS_L5,WON,WON_L5,PT_DIFF_L5
0,2006-11-01,75.0,,0,,
1,2006-11-03,102.0,75.0,1,0.0,-13.0
2,2006-11-05,95.0,88.5,1,0.5,-1.5
3,2006-11-07,104.0,90.666667,1,0.666667,3.333333
4,2006-11-10,111.0,94.0,1,0.75,4.75
5,2006-11-14,101.0,97.4,0,0.8,5.6
6,2006-11-18,88.0,102.6,0,0.8,7.8
7,2006-11-22,98.0,99.8,0,0.6,4.8
8,2006-11-24,97.0,100.4,1,0.4,1.2
9,2006-11-25,79.0,99.0,0,0.4,0.2



üîß FEATURE SET 2: Rolling 10 and 20 Game Performance
‚úÖ Created rolling 10-game and 20-game features
‚úÖ Created point differential features for L10 and L20

üîß FEATURE SET 3: Home/Away Split Performance
‚úÖ Created home/away split features:
   - TEAM_PTS_HOME_L5 (points at home)
   - TEAM_PTS_AWAY_L5 (points on road)
   - WON_HOME_L5 (home win rate)
   - WON_AWAY_L5 (away win rate)

üîß FEATURE SET 4: Rest Days Between Games
‚úÖ Created rest features:
   - DAYS_REST (days since last game)
   - BACK_TO_BACK (playing on consecutive days)
   - WELL_RESTED (3+ days rest)

üìä 21.4% of games are back-to-backs
üìä 23.8% of games are well-rested (3+ days)

üîß FEATURE SET 5: Win/Loss Streaks
‚úÖ Created streak features:
   - WIN_STREAK (positive=winning, negative=losing)
   - ON_WIN_STREAK (binary)
   - ON_LOSS_STREAK (binary)
   - WIN_STREAK_LENGTH (absolute value)

üìä Longest win streak in data: 27 games
üìä Longest loss streak in data: 26 games

üîß FEATURE SET 6: Head-to-He

Unnamed: 0,GAME_DATE,TEAM_ID,OPP_TEAM_ID,WON,H2H_WIN_RATE_L5
1023,2006-11-11,1610612738,1610612739,0,
1046,2007-01-03,1610612738,1610612739,0,0.0
1087,2007-04-01,1610612738,1610612739,1,0.0
1108,2007-11-27,1610612738,1610612739,0,0.333333
1111,2007-12-02,1610612738,1610612739,1,0.25
1139,2008-02-05,1610612738,1610612739,0,0.4
1150,2008-02-27,1610612738,1610612739,1,0.4
1182,2008-05-06,1610612738,1610612739,1,0.6
1183,2008-05-08,1610612738,1610612739,1,0.6
1184,2008-05-10,1610612738,1610612739,0,0.8



üîó Merging features back to games dataset...
‚úÖ Merged features to games dataset
   Original columns: 41
   New columns: 95
   Features added: 54

üîß FEATURE SET 7: Matchup Differential Features
   ‚úÖ DIFF_TEAM_PTS_L5 (Scoring differential)
   ‚úÖ DIFF_WON_L5 (Win rate differential)
   ‚úÖ DIFF_PT_DIFF_L5 (Point diff differential)
   ‚úÖ DIFF_FG_PCT_L5 (Shooting% differential)
   ‚úÖ DIFF_DAYS_REST (Rest differential)
   ‚úÖ DIFF_WIN_STREAK_LENGTH (Streak differential)

‚úÖ Created 6 differential features

üîß FEATURE SET 8: Betting Market Features
‚úÖ Created betting market features:
   - HOME_FAVORITE (binary)
   - HOME_UNDERDOG (binary)
   - SPREAD_SIZE (absolute spread)
   - BIG_FAVORITE/BIG_UNDERDOG (¬±7 points)
   - PROB_EDGE_HOME (implied prob vs recent performance)
   - TOTAL_VS_RECENT (line vs recent scoring pace)

üìä Home team is favorite 31.4% of the time

üîß FEATURE SET 9: Temporal Features
‚úÖ Created temporal features:
   - SEASON_PROGRESS (0-1 scale)
   - MON

Unnamed: 0,GAME_DATE,MATCHUP,HOME_SPREAD,SPREAD_HOME_COVER,HOME_WON_L5,AWAY_WON_L5,DIFF_WON_L5,HOME_PT_DIFF_L5,AWAY_PT_DIFF_L5,HOME_DAYS_REST,AWAY_DAYS_REST,HOME_BACK_TO_BACK,AWAY_BACK_TO_BACK,HOME_FAVORITE,PROB_EDGE_HOME,SEASON_PROGRESS,PLAYOFF_PUSH
14806,2018-05-23,CLE @ BOS,1.5,1.0,0.6,0.6,0.0,0.2,7.2,2.0,2.0,0,0,0,-0.228302,0.936255,1
14807,2018-05-24,GSW @ HOU,1.0,1.0,0.6,0.6,0.0,-3.8,7.6,2.0,2.0,0,0,0,-0.180769,0.940239,1
14808,2018-05-25,BOS @ CLE,6.5,1.0,0.4,0.6,-0.2,-2.4,2.4,2.0,2.0,0,0,0,-0.428997,0.944223,1
14809,2018-05-26,HOU @ GSW,12.5,1.0,0.4,0.6,-0.2,5.0,-5.0,2.0,2.0,0,0,0,-0.480383,0.948207,1
14810,2018-05-27,CLE @ BOS,3.5,0.0,0.4,0.6,-0.2,-4.6,4.6,2.0,2.0,0,0,0,-0.174468,0.952191,1
14811,2018-05-28,GSW @ HOU,-5.5,0.0,0.6,0.4,0.2,-8.2,8.2,2.0,2.0,0,0,1,0.128997,0.956175,1
14812,2018-05-31,CLE @ GSW,13.0,1.0,0.6,0.8,-0.2,14.4,8.8,3.0,4.0,0,0,0,-0.467725,0.968127,1
14813,2018-06-03,CLE @ GSW,11.5,1.0,0.6,0.6,0.0,8.2,0.8,3.0,3.0,0,0,0,-0.516176,0.98008,1
14814,2018-06-06,GSW @ CLE,-3.5,0.0,0.4,0.8,-0.4,-4.8,12.6,3.0,3.0,0,0,1,0.095142,0.992032,1
14815,2018-06-08,GSW @ CLE,-4.0,0.0,0.4,1.0,-0.6,-3.8,15.0,2.0,2.0,0,0,1,0.22406,1.0,1



üíæ SAVING ENGINEERED DATASET
‚úÖ Saved: data\processed\games_with_features.csv
   Rows: 14,816
   Columns: 115
‚úÖ Saved: feature_list.csv (documentation)

üìä CREATING TRAIN/VAL/TEST SPLITS
Games with complete data: 14,816

üìä SPLIT SUMMARY:
Dataset         Games      Date Range                     Seasons
Train           11,068     2006-11-01 to 2015-06-16  2006-2014
Validation      2,538      2015-11-01 to 2017-06-12  2015-2016
Test            1,210      2017-09-30 to 2018-06-08  2017-2017
TOTAL           14,816    

üìä CLASS BALANCE (Home Cover Rate):
Train:      64.0%
Validation: 63.3%
Test:       62.3%

‚úÖ Saved train/val/test splits to data\processed

üîç FEATURE IMPORTANCE PREVIEW
Using 80 features for modeling

Training quick Random Forest for feature importance...

üèÜ TOP 20 MOST IMPORTANT FEATURES:

         Feature  Importance
     ML_AWAY_WIN    0.411669
   HOME_WIN_PROB    0.079191
   AWAY_WIN_PROB    0.067778
   HOME_FAVORITE    0.050280
  PROB_EDGE_HOME    0