In [1]:
%pip install xgboost nfl_data_py pillow

Note: you may need to restart the kernel to use updated packages.


In [2]:
# NFL Game Prediction using nfl_data_py
# Updated version of NFL prediction pipeline using modern, maintained data source

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot  
import warnings
import os
warnings.filterwarnings('ignore')

# Core ML libraries
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.calibration import CalibratedClassifierCV as CCV
from sklearn.pipeline import Pipeline

# Advanced ML libraries
try: 
    import xgboost as xgb
    HAS_XGB = True
except ImportError:
    HAS_XGB = False
    print("XGBoost not available. Install with: pip install xgboost")

# NFL data library
try:
    import nfl_data_py as nfl
    HAS_NFL_DATA = True
except ImportError:
    HAS_NFL_DATA = False
    print("nfl_data_py not available. Install with: pip install nfl_data_py")

class NFLGamePredictor:
    def __init__(self):
        self.models = {}
        self.best_features = []
        self.scaler = StandardScaler()
        self.final_model = None
        
    def collect_data(self, start_year=2010, end_year=2024, save_csv=True, data_folder='nfl_data'):
        """Collect comprehensive NFL data using nfl_data_py and save as CSV files"""
        
        if not HAS_NFL_DATA:
            raise ImportError("nfl_data_py is required. Install with: pip install nfl_data_py")
        
        print(f"Collecting NFL data from {start_year} to {end_year}...")
        
        # Create data folder if it doesn't exist
        if save_csv and not os.path.exists(data_folder):
            os.makedirs(data_folder)
            print(f"Created data folder: {data_folder}")
        
        years = list(range(start_year, end_year + 1))
        
        # Get play-by-play data for game-level statistics
        print("Downloading play-by-play data...")
        try:
            pbp_data = nfl.import_pbp_data(years)
            print(f"✓ Play-by-play data downloaded successfully")
        except Exception as e:
            print(f"Error downloading play-by-play data: {e}")
            # Try with reduced year range
            years = list(range(start_year, 2024 + 1))
            pbp_data = nfl.import_pbp_data(years)
            print(f"✓ Play-by-play data downloaded (adjusted to {start_year}-2024)")
        
        # Get team information
        print("Downloading team information...")
        teams = nfl.import_team_desc()
        team_dict = teams.set_index('team_abbr')['team_name'].to_dict()
        
        # Get weekly data for season-long team statistics
        print("Downloading weekly team data...")
        try:
            weekly_data = nfl.import_weekly_data(years)
            print(f"✓ Weekly data downloaded successfully")
        except Exception as e:
            print(f"Error downloading weekly data: {e}")
            print("Trying with 2015-2024 data only...")
            # Fall back to confirmed available years
            years_safe = list(range(2015, 2024 + 1))
            weekly_data = nfl.import_weekly_data(years_safe)
            print(f"✓ Weekly data downloaded (2015-2024)")
        
        # Get schedule data
        print("Downloading schedule data...")
        try:
            schedule_data = nfl.import_schedules(years)
            print(f"✓ Schedule data downloaded successfully")
        except Exception as e:
            print(f"Error downloading schedule data: {e}")
            print("Trying with 2015-2024 data only...")
            # Fall back to confirmed available years
            years_safe = list(range(2015, 2024 + 1))
            schedule_data = nfl.import_schedules(years_safe)
            print(f"✓ Schedule data downloaded (2015-2024)")
        
        # Try to get 2025 Week 1 data separately if available
        print("Attempting to download 2025 Week 1 data...")
        try:
            pbp_2025 = nfl.import_pbp_data([2025])
            if not pbp_2025.empty:
                pbp_data = pd.concat([pbp_data, pbp_2025], ignore_index=True)
                print("✓ 2025 play-by-play data added")
            
            weekly_2025 = nfl.import_weekly_data([2025])
            if not weekly_2025.empty:
                weekly_data = pd.concat([weekly_data, weekly_2025], ignore_index=True)
                print("✓ 2025 weekly data added")
                
            schedule_2025 = nfl.import_schedules([2025])
            if not schedule_2025.empty:
                schedule_data = pd.concat([schedule_data, schedule_2025], ignore_index=True)
                print("✓ 2025 schedule data added")
        except Exception as e:
            print(f"2025 data not available yet: {e}")
            print("Proceeding with 2015-2024 data only")
        
        print(f"Collected {len(pbp_data)} play-by-play records")
        print(f"Collected {len(weekly_data)} weekly team records") 
        print(f"Collected {len(schedule_data)} scheduled games")
        
        # Save data as CSV files if requested
        if save_csv:
            print(f"\nSaving data to CSV files in '{data_folder}' folder...")
            
            # Save play-by-play data (this will be large)
            pbp_file = os.path.join(data_folder, f'pbp_data_{start_year}_{end_year}.csv')
            pbp_data.to_csv(pbp_file, index=False)
            print(f"✓ Saved play-by-play data: {pbp_file} ({len(pbp_data):,} rows)")
            
            # Save weekly data
            weekly_file = os.path.join(data_folder, f'weekly_data_{start_year}_{end_year}.csv')
            weekly_data.to_csv(weekly_file, index=False)
            print(f"✓ Saved weekly data: {weekly_file} ({len(weekly_data):,} rows)")
            
            # Save schedule data
            schedule_file = os.path.join(data_folder, f'schedule_data_{start_year}_{end_year}.csv')
            schedule_data.to_csv(schedule_file, index=False)
            print(f"✓ Saved schedule data: {schedule_file} ({len(schedule_data):,} rows)")
            
            # Save team information
            teams_file = os.path.join(data_folder, 'team_info.csv')
            teams.to_csv(teams_file, index=False)
            print(f"✓ Saved team info: {teams_file} ({len(teams):,} rows)")
            
            # Save a sample of each dataset for quick inspection
            print(f"\nSaving sample data for quick inspection...")
            
            # Sample play-by-play (first 1000 rows)
            pbp_sample_file = os.path.join(data_folder, 'pbp_sample.csv')
            pbp_data.head(1000).to_csv(pbp_sample_file, index=False)
            print(f"✓ Saved PBP sample: {pbp_sample_file} (1,000 rows)")
            
            # Sample weekly data (recent season)
            weekly_sample = weekly_data[weekly_data['season'] >= end_year - 1]
            weekly_sample_file = os.path.join(data_folder, 'weekly_sample.csv')
            weekly_sample.to_csv(weekly_sample_file, index=False)
            print(f"✓ Saved weekly sample: {weekly_sample_file} ({len(weekly_sample):,} rows from {end_year-1}-{end_year})")
            
            # Show column information
            print(f"\nDATA STRUCTURE OVERVIEW:")
            print("="*50)
            print(f"Play-by-Play Columns ({len(pbp_data.columns)}): {list(pbp_data.columns[:10])}...")
            print(f"Weekly Data Columns ({len(weekly_data.columns)}): {list(weekly_data.columns)}")
            print(f"Schedule Columns ({len(schedule_data.columns)}): {list(schedule_data.columns)}")
            print(f"Team Info Columns ({len(teams.columns)}): {list(teams.columns)}")
            
            print(f"\nDATA SAVED SUCCESSFULLY!")
            print(f"Check the '{data_folder}' folder to examine the downloaded data.")
        
        return pbp_data, weekly_data, schedule_data, team_dict
    
    def _calculate_injury_percentage(self, team_data):
        """
        Calculate estimated injury percentage based on available performance metrics
        
        This method estimates team injury impact by analyzing performance consistency
        and key player availability indicators in the data.
        """
        
        # If we have specific injury data columns, use them
        if 'injuries' in team_data.columns:
            return team_data['injuries'].mean()
        
        # Estimate injury impact based on performance variance and available metrics
        # Higher variance in key stats might indicate injury-related inconsistency
        
        injury_indicators = []
        
        # 1. Passing performance consistency (QB health indicator)
        if 'passing_yards' in team_data.columns and len(team_data) > 1:
            passing_std = team_data['passing_yards'].std()
            passing_mean = team_data['passing_yards'].mean()
            if passing_mean > 0:
                passing_variance = (passing_std / passing_mean) * 100
                injury_indicators.append(min(passing_variance, 30))  # Cap at 30%
        
        # 2. Rushing performance consistency (RB/OL health indicator)
        if 'rushing_yards' in team_data.columns and len(team_data) > 1:
            rushing_std = team_data['rushing_yards'].std()
            rushing_mean = team_data['rushing_yards'].mean()
            if rushing_mean > 0:
                rushing_variance = (rushing_std / rushing_mean) * 100
                injury_indicators.append(min(rushing_variance, 25))  # Cap at 25%
        
        # 3. Completion percentage consistency (QB/WR health indicator)
        if 'completions' in team_data.columns and 'passing_attempts' in team_data.columns:
            comp_pct = team_data['completions'] / (team_data['passing_attempts'] + 0.1)  # Avoid division by zero
            if len(comp_pct) > 1:
                comp_std = comp_pct.std()
                comp_variance = comp_std * 100
                injury_indicators.append(min(comp_variance, 20))  # Cap at 20%
        
        # 4. Fantasy points consistency (overall team health)
        if 'fantasy_points' in team_data.columns and len(team_data) > 1:
            fp_std = team_data['fantasy_points'].std()
            fp_mean = team_data['fantasy_points'].mean()
            if fp_mean > 0:
                fp_variance = (fp_std / fp_mean) * 100
                injury_indicators.append(min(fp_variance, 35))  # Cap at 35%
        
        # Calculate weighted average injury percentage
        if injury_indicators:
            # Weight more recent games higher (if we have game order info)
            weights = [1.0] * len(injury_indicators)  
            weighted_avg = sum(i * w for i, w in zip(injury_indicators, weights)) / sum(weights)
            
            # Normalize to 0-100% range and apply league baseline
            # NFL teams typically have 10-25% of roster dealing with some injury
            baseline_injury_rate = 15.0  # League average baseline
            estimated_injury_pct = min(max(weighted_avg, 5.0), 40.0)  # 5-40% range
            
            # Blend with baseline for more realistic estimates
            final_injury_pct = (estimated_injury_pct * 0.7) + (baseline_injury_rate * 0.3)
            
            return final_injury_pct
        
        # Default injury rate if no data available
        return 15.0  # NFL league average
    
    def _calculate_defensive_stats(self, weekly_data, schedule_data, season, week):
        """
        Calculate defensive statistics (yards/points allowed) for each team.
        OPTIMIZED VERSION - Pre-aggregate weekly data for faster lookups.
        """
        defensive_stats = {}
        
        # Filter to relevant season/weeks
        season_schedule = schedule_data[
            (schedule_data['season'] == season) & 
            (schedule_data['week'] < week)
        ].copy()
        
        if season_schedule.empty:
            return {}
        
        # PRE-AGGREGATE: Create team totals per week (MUCH FASTER)
        season_weekly = weekly_data[
            (weekly_data['season'] == season) & 
            (weekly_data['week'] < week)
        ]
        
        if season_weekly.empty:
            return {}
        
        # Group by team and week, sum all player stats
        team_week_totals = season_weekly.groupby(['recent_team', 'week']).agg({
            'passing_yards': 'sum',
            'rushing_yards': 'sum'
        }).reset_index()
        team_week_totals['total_yards'] = team_week_totals['passing_yards'] + team_week_totals['rushing_yards']
        
        # CRITICAL OPTIMIZATION: Create lookup dictionary for O(1) access instead of repeated filtering
        yards_lookup = {}
        for _, row in team_week_totals.iterrows():
            key = (row['recent_team'], row['week'])
            yards_lookup[key] = row['total_yards']
        
        # Get all unique teams from schedule (vectorized)
        all_teams = set(pd.concat([
            season_schedule['home_team'].dropna(),
            season_schedule['away_team'].dropna()
        ]).unique())
        
        for team in all_teams:
            # Get all games for this team (one filter operation)
            team_home = season_schedule[season_schedule['home_team'] == team]
            team_away = season_schedule[season_schedule['away_team'] == team]
            
            total_yards_allowed = 0
            total_points_allowed = 0
            games_count = 0
            
            # Process home games (defending against away team)
            for _, game in team_home.iterrows():
                if pd.notna(game.get('away_score')):
                    opponent = game['away_team']
                    game_week = game['week']
                    points_allowed = game['away_score']
                    
                    # O(1) dictionary lookup instead of DataFrame filter
                    yards_allowed = yards_lookup.get((opponent, game_week), 0)
                    if yards_allowed > 0:
                        total_yards_allowed += yards_allowed
                        total_points_allowed += points_allowed
                        games_count += 1
            
            # Process away games (defending against home team)
            for _, game in team_away.iterrows():
                if pd.notna(game.get('home_score')):
                    opponent = game['home_team']
                    game_week = game['week']
                    points_allowed = game['home_score']
                    
                    # O(1) dictionary lookup
                    yards_allowed = yards_lookup.get((opponent, game_week), 0)
                    if yards_allowed > 0:
                        total_yards_allowed += yards_allowed
                        total_points_allowed += points_allowed
                        games_count += 1
            
            # Calculate per-game averages
            if games_count > 0:
                defensive_stats[team] = {
                    'def_yards_allowed_pg': total_yards_allowed / games_count,
                    'def_points_allowed_pg': total_points_allowed / games_count
                }
            else:
                defensive_stats[team] = {
                    'def_yards_allowed_pg': 0,
                    'def_points_allowed_pg': 0
                }
        
        return defensive_stats
    
    def create_team_features(self, weekly_data, season, week, schedule_data=None):
        """
        Create team-level features for a specific season/week.
        
        CRITICAL: weekly_data contains PLAYER-level statistics.
        We must aggregate multiple players per team per week to get team totals.
        """
        
        # Filter data up to the current week
        season_data = weekly_data[
            (weekly_data['season'] == season) & 
            (weekly_data['week'] < week)
        ]
        
        if season_data.empty:
            return {}
        
        # Check if 'recent_team' column exists
        if 'recent_team' not in season_data.columns:
            print("Warning: 'recent_team' column not found in weekly_data")
            return {}
        
        # Calculate defensive stats if schedule data provided
        defensive_stats = {}
        if schedule_data is not None:
            defensive_stats = self._calculate_defensive_stats(weekly_data, schedule_data, season, week)
        
        # Calculate season averages for each team
        team_features = {}
        
        for team in season_data['recent_team'].unique():
            team_data = season_data[season_data['recent_team'] == team]
            
            if len(team_data) == 0:
                continue
            
            # CRITICAL FIX: Aggregate player stats into team totals per week
            # Group by week and sum all players' stats to get team totals
            team_weekly = team_data.groupby('week').agg({
                'passing_yards': 'sum',
                'rushing_yards': 'sum',
                'completions': 'sum',
                'passing_tds': 'sum',
                'interceptions': 'sum',
                'rushing_tds': 'sum',
                'fantasy_points': 'sum'
            }).reset_index()
            
            # Now calculate per-game averages from team totals
            total_yards_per_week = team_weekly['passing_yards'] + team_weekly['rushing_yards']
            
            # Recent form (last 3 games)
            recent_games = team_weekly.tail(3)
            recent_points = recent_games['fantasy_points'].mean() if len(recent_games) > 0 else 0
            recent_yards = (recent_games['passing_yards'] + recent_games['rushing_yards']).mean() if len(recent_games) > 0 else 0
            
            # Momentum tracking
            if len(team_weekly) >= 3:
                recent_3 = team_weekly.tail(3)['fantasy_points'].mean()
                earlier_games = team_weekly.iloc[:-3]['fantasy_points'].mean() if len(team_weekly) > 3 else recent_3
                points_trend = recent_3 - earlier_games
                
                recent_3_yards = (team_weekly.tail(3)['passing_yards'] + team_weekly.tail(3)['rushing_yards']).mean()
                earlier_yards = (team_weekly.iloc[:-3]['passing_yards'] + team_weekly.iloc[:-3]['rushing_yards']).mean() if len(team_weekly) > 3 else recent_3_yards
                yards_trend = recent_3_yards - earlier_yards
            else:
                points_trend = 0
                yards_trend = 0
            
            # Get defensive stats
            def_stats = defensive_stats.get(team, {
                'def_yards_allowed_pg': 0,
                'def_points_allowed_pg': 0
            })
                
            # Offensive features (from aggregated team data)
            features = {
                'passing_yards_pg': team_weekly['passing_yards'].mean(),
                'rushing_yards_pg': team_weekly['rushing_yards'].mean(), 
                'total_yards_pg': total_yards_per_week.mean(),
                'points_pg': team_weekly['fantasy_points'].mean(),
                'completions_pg': team_weekly['completions'].mean(),
                'passing_tds_pg': team_weekly['passing_tds'].mean(),
                'interceptions_thrown_pg': team_weekly['interceptions'].mean(),
                'rushing_tds_pg': team_weekly['rushing_tds'].mean(),
                'fumbles_lost_pg': 0,  # Not available in weekly_data
                
                # Defensive features
                'opp_total_yards_pg': def_stats['def_yards_allowed_pg'],
                'opp_points_pg': def_stats['def_points_allowed_pg'],
                
                # Recent form (last 3 games)
                'recent_points_pg': recent_points,
                'recent_yards_pg': recent_yards,
                
                # Momentum indicators
                'points_trend': points_trend,
                'yards_trend': yards_trend,
                
                # Team health and availability metrics
                'injury_percentage': self._calculate_injury_percentage(team_data),
                
                # Advanced metrics
                'turnover_ratio': team_weekly['interceptions'].mean(),
                'games_played': len(team_weekly)
            }
            
            team_features[team] = features
        
        return team_features
    
    def create_game_features(self, home_team, away_team, team_features, 
                           season, week, is_playoff=False, is_neutral=False):
        """Create features for a specific matchup with defensive stats"""
        
        if home_team not in team_features or away_team not in team_features:
            return None
        
        home_stats = team_features[home_team]
        away_stats = team_features[away_team]
        
        # Create matchup features
        features = {
            # Home team offensive stats
            'home_passing_ypg': home_stats['passing_yards_pg'],
            'home_rushing_ypg': home_stats['rushing_yards_pg'],
            'home_total_ypg': home_stats['total_yards_pg'],
            'home_points_pg': home_stats['points_pg'],
            'home_passing_tds_pg': home_stats['passing_tds_pg'],
            'home_turnovers_pg': home_stats.get('fumbles_lost_pg', 0) + home_stats['interceptions_thrown_pg'],
            'home_injury_pct': home_stats['injury_percentage'],
            
            # Home team defensive stats
            'home_def_yards_allowed': home_stats.get('opp_total_yards_pg', 0),
            'home_def_points_allowed': home_stats.get('opp_points_pg', 0),
            
            # Home team recent form
            'home_recent_points': home_stats.get('recent_points_pg', home_stats['points_pg']),
            'home_points_trend': home_stats.get('points_trend', 0),
            
            # Away team offensive stats  
            'away_passing_ypg': away_stats['passing_yards_pg'],
            'away_rushing_ypg': away_stats['rushing_yards_pg'],
            'away_total_ypg': away_stats['total_yards_pg'],
            'away_points_pg': away_stats['points_pg'],
            'away_passing_tds_pg': away_stats['passing_tds_pg'],
            'away_turnovers_pg': away_stats.get('fumbles_lost_pg', 0) + away_stats['interceptions_thrown_pg'],
            'away_injury_pct': away_stats['injury_percentage'],
            
            # Away team defensive stats
            'away_def_yards_allowed': away_stats.get('opp_total_yards_pg', 0),
            'away_def_points_allowed': away_stats.get('opp_points_pg', 0),
            
            # Away team recent form
            'away_recent_points': away_stats.get('recent_points_pg', away_stats['points_pg']),
            'away_points_trend': away_stats.get('points_trend', 0),
            
            # Matchup advantages
            'passing_advantage': home_stats['passing_yards_pg'] - away_stats['passing_yards_pg'],
            'rushing_advantage': home_stats['rushing_yards_pg'] - away_stats['rushing_yards_pg'],
            'scoring_advantage': home_stats['points_pg'] - away_stats['points_pg'],
            'turnover_advantage': away_stats.get('fumbles_lost_pg', 0) + away_stats['interceptions_thrown_pg'] - 
                                (home_stats.get('fumbles_lost_pg', 0) + home_stats['interceptions_thrown_pg']),
            'injury_advantage': away_stats['injury_percentage'] - home_stats['injury_percentage'],
            
            # Defensive matchup advantages
            'defensive_advantage': away_stats.get('opp_points_pg', 0) - home_stats.get('opp_points_pg', 0),
            'offensive_vs_defense': home_stats['points_pg'] - away_stats.get('opp_points_pg', 25),
            
            # Game context
            'home_field_advantage': 0 if is_neutral else 2.5,
            'is_playoff': 1 if is_playoff else 0,
            'is_neutral': 1 if is_neutral else 0,
            'week': week,
            'season': season,
        }
        
        return features
    
    def build_dataset(self, pbp_data, weekly_data, schedule_data, save_csv=True, data_folder='nfl_data'):
        """Build complete dataset from NFL data and optionally save as CSV"""
        
        print("Building dataset from collected data...")
        print(f"Processing {len(schedule_data)} scheduled games...")
        
        game_records = []
        processed_count = 0
        
        # Process each scheduled game
        for idx, game in schedule_data.iterrows():
            processed_count += 1
            if processed_count % 500 == 0:
                print(f"  Processed {processed_count}/{len(schedule_data)} games...")
            season = game['season']
            week = game['week']
            home_team = game['home_team']
            away_team = game['away_team']
            
            # Skip if missing essential data
            if pd.isna(home_team) or pd.isna(away_team):
                continue
            
            # Get team features up to this point in season (with defensive stats)
            team_features = self.create_team_features(weekly_data, season, week, schedule_data)
            
            if not team_features:
                continue
            
            # Create game features
            game_features = self.create_game_features(
                home_team, away_team, team_features, 
                season, week,
                is_playoff=game.get('game_type', '') == 'REG',
                is_neutral=False  # Simplified for now
            )
            
            if game_features is None:
                continue
            
            # Determine result (home team win = 1, loss = 0)
            home_score = game.get('home_score', 0)
            away_score = game.get('away_score', 0)
            
            # Skip games without scores (future games)
            if pd.isna(home_score) or pd.isna(away_score):
                continue
            
            game_features['home_win'] = 1 if home_score > away_score else 0
            game_features['home_score'] = home_score
            game_features['away_score'] = away_score
            game_features['game_id'] = f"{season}_{week}_{home_team}_{away_team}"
            
            game_records.append(game_features)
        
        df = pd.DataFrame(game_records)
        print(f"Created dataset with {len(df)} games")
        
        # Save the processed dataset
        if save_csv and not df.empty:
            if not os.path.exists(data_folder):
                os.makedirs(data_folder)
            
            processed_file = os.path.join(data_folder, 'processed_game_features.csv')
            df.to_csv(processed_file, index=False)
            print(f"✓ Saved processed game features: {processed_file} ({len(df):,} rows)")
            
            # Show feature information
            print(f"\nPROCESSED FEATURES ({len(df.columns)} columns):")
            print("="*50)
            for col in df.columns:
                print(f"  - {col}")
        
        return df
    
    def select_features(self, df, n_features=20):
        """Select best features using RFE with expanded feature set"""
        
        # Prepare data
        feature_cols = [col for col in df.columns 
                       if col not in ['home_win', 'home_score', 'away_score', 'game_id']]
        
        X = df[feature_cols]
        y = df['home_win']
        
        # Remove any columns with all NaN or constant values
        X = X.loc[:, X.var() > 0]
        X = X.fillna(X.mean())
        
        print(f"Starting feature selection with {len(X.columns)} features...")
        print(f"Testing up to {n_features} features for best performance...")
        
        # Use RFE with different numbers of features
        models = {}
        results = []
        
        for i in range(2, min(n_features + 1, len(X.columns) + 1)):
            rfe = RFE(estimator=LDA(), n_features_to_select=i)
            model = DecisionTreeClassifier(random_state=42)
            pipeline = Pipeline(steps=[('s', rfe), ('m', model)])
            
            cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
            scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
            
            results.append(scores)
            models[str(i)] = pipeline
            
            print(f'{i} features: {scores.mean():.3f} (+/- {scores.std():.3f})')
        
        # Find best number of features
        best_idx = np.argmax([np.mean(result) for result in results])
        best_n_features = best_idx + 2
        
        print(f"\nBest number of features: {best_n_features}")
        
        # Get the best feature set
        rfe = RFE(estimator=LDA(), n_features_to_select=best_n_features)
        rfe.fit(X, y)
        
        selected_features = X.columns[rfe.support_].tolist()
        self.best_features = selected_features
        
        print("\nSelected features:")
        for feature in selected_features:
            print(f"  - {feature}")
        
        return selected_features
    
    def train_models(self, df):
        """Train and compare multiple models"""
        
        if not self.best_features:
            self.select_features(df)
        
        # Prepare data
        X = df[self.best_features].fillna(df[self.best_features].mean())
        y = df['home_win']
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        
        # Define models to test
        models_to_test = {
            'Logistic Regression': LogisticRegression(random_state=42),
            'Decision Tree': DecisionTreeClassifier(random_state=42),
            'Random Forest': RandomForestClassifier(random_state=42)
        }
        
        if HAS_XGB:
            models_to_test['XGBoost'] = xgb.XGBClassifier(random_state=42, verbosity=0)
        
        # Train and evaluate models
        model_results = {}
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
        
        print("\nTraining and evaluating models...")
        
        for name, model in models_to_test.items():
            # Use scaled data for logistic regression, raw for tree-based
            X_use = X_train_scaled if 'Logistic' in name else X_train
            scores = cross_val_score(model, X_use, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
            
            model_results[name] = {
                'mean_score': scores.mean(),
                'std_score': scores.std(),
                'scores': scores
            }
            
            print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")
        
        self.models = models_to_test
        return model_results
    
    def tune_best_model(self, df, model_name='Random Forest'):
        """Tune hyperparameters for the best performing model"""
        
        X = df[self.best_features].fillna(df[self.best_features].mean())
        y = df['home_win']
        
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        
        if model_name == 'Random Forest':
            param_grid = {
                'n_estimators': [200, 300, 400, 500],
                'max_depth': [8, 9, 10, 11, 12],
                'min_samples_leaf': [2, 5],
                'criterion': ['gini', 'entropy']
            }
            model = RandomForestClassifier(random_state=42)
        
        elif model_name == 'Logistic Regression':
            param_grid = {
                'C': [0.01, 0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear']
            }
            model = LogisticRegression(random_state=42)
            X_train = self.scaler.fit_transform(X_train)
            X_test = self.scaler.transform(X_test)
        
        elif model_name == 'XGBoost' and HAS_XGB:
            param_grid = {
            'max_depth': [3, 5, 7],
            'learning_rate': [0.01, 0.1, 0.2],
            'n_estimators': [100, 200, 300],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'reg_alpha': [0, 1],
            'reg_lambda': [1, 5]
            }
            model = xgb.XGBClassifier(random_state=42, verbosity=0)
        
        else:
            print(f"Tuning not implemented for {model_name}")
            return None
        
        print(f"\nTuning {model_name}...")
        
        # Grid search
        grid_search = GridSearchCV(
            model, param_grid, 
            cv=5, scoring='accuracy', 
            n_jobs=-1, verbose=1
        )
        
        grid_search.fit(X_train, y_train)
        
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best cross-validation score: {grid_search.best_score_:.3f}")
        
        # Test on holdout set
        test_score = grid_search.score(X_test, y_test)
        print(f"Test set accuracy: {test_score:.3f}")
        
        return grid_search.best_estimator_
    
    def create_ensemble_model(self, df):
        """Create ensemble model combining multiple algorithms"""
        
        X = df[self.best_features].fillna(df[self.best_features].mean())
        y = df['home_win']
        
        # Individual models
        rf = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=42)
        lr = LogisticRegression(C=1, random_state=42)
        
        estimators = [('rf', rf), ('lr', lr)]
        
        if HAS_XGB:
            xgb_model = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=200, 
                                        random_state=42, verbosity=0)
            estimators.append(('xgb', xgb_model))
        
        # Voting classifier
        voting_clf = VotingClassifier(estimators=estimators, voting='soft')
        
        # Calibrated classifier for better probability estimates
        self.final_model = CCV(voting_clf, method='isotonic', cv=3)
        
        print("Training ensemble model...")
        self.final_model.fit(X, y)
        
        return self.final_model
    
    def predict_games(self, games_df, confidence_threshold=0.6):
        """Make predictions on new games"""
        
        if self.final_model is None:
            raise ValueError("Model not trained yet. Call create_ensemble_model() first.")
        
        X = games_df[self.best_features].fillna(games_df[self.best_features].mean())
        
        # Get probability predictions
        probabilities = self.final_model.predict_proba(X)[:, 1]
        predictions = self.final_model.predict(X)
        
        # Create results dataframe
        results = games_df.copy()
        results['home_win_prob'] = probabilities
        results['predicted_home_win'] = predictions
        
        # High-confidence bets
        results['high_confidence_bet'] = (
            (probabilities >= confidence_threshold) | 
            (probabilities <= (1 - confidence_threshold))
        )
        
        return results

# Example usage and testing
if __name__ == "__main__":
    
    # Initialize predictor
    predictor = NFLGamePredictor()
    
    # Check if nfl_data_py is available
    if not HAS_NFL_DATA:
        print("Please install nfl_data_py to use this predictor:")
        print("pip install nfl_data_py")
        exit()
    
    try:
        # Collect data (this may take a few minutes)
        print("This may take a few minutes to download NFL data...")
        pbp_data, weekly_data, schedule_data, team_dict = predictor.collect_data(2015, 2025, save_csv=True)
        
        # Build dataset
        df = predictor.build_dataset(pbp_data, weekly_data, schedule_data, save_csv=True)
        
        if df.empty:
            print("No data collected. Check your internet connection and try again.")
            exit()
        
        # Feature selection with expanded feature set (20 features)
        predictor.select_features(df, n_features=20)
        
        # Train models
        model_results = predictor.train_models(df)
        
        # Create ensemble
        final_model = predictor.create_ensemble_model(df)
        
        # Example prediction on test set
        train_data = df[df['season'] < 2023]
        test_data = df[df['season'] == 2023]
        
        if not test_data.empty:
            predictor.final_model.fit(
                train_data[predictor.best_features].fillna(train_data[predictor.best_features].mean()), 
                train_data['home_win']
            )
            
            predictions = predictor.predict_games(test_data)
            
            # Calculate accuracy
            accuracy = (predictions['predicted_home_win'] == predictions['home_win']).mean()
            print(f"\n2023 season prediction accuracy: {accuracy:.3f}")
            
            # High confidence bets
            high_conf = predictions[predictions['high_confidence_bet']]
            if not high_conf.empty:
                conf_accuracy = (high_conf['predicted_home_win'] == high_conf['home_win']).mean()
                print(f"High confidence bet accuracy: {conf_accuracy:.3f} ({len(high_conf)} games)")
        
        print("\nPredictor trained successfully!")
        print("You can now use predictor.predict_games() on new data.")
        
    except Exception as e:
        print(f"Error: {e}")
        print("Make sure you have a stable internet connection for data download.")

This may take a few minutes to download NFL data...
Collecting NFL data from 2015 to 2025...
Downloading play-by-play data...
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
2024 done.
2025 done.
Downcasting floats.
✓ Play-by-play data downloaded successfully
Downloading team information...
Downloading weekly team data...
Error downloading weekly data: HTTP Error 404: Not Found
Trying with 2015-2024 data only...
Downcasting floats.
✓ Weekly data downloaded (2015-2024)
Downloading schedule data...
✓ Schedule data downloaded successfully
Attempting to download 2025 Week 1 data...
2025 done.
Downcasting floats.
✓ 2025 play-by-play data added
2025 data not available yet: HTTP Error 404: Not Found
Proceeding with 2015-2024 data only
Collected 525457 play-by-play records
Collected 54479 weekly team records
Collected 3015 scheduled games

Saving data to CSV files in 'nfl_data' folder...
✓ Saved play-by-play data: nfl_data/pbp_data_2015_2025.c

In [3]:
import joblib

# Save the trained model

joblib.dump(predictor.final_model, 'final_model.joblib')

# To load later:
# predictor.final_model = joblib.load('nfl_data/final_model.joblib')

['final_model.joblib']

In [4]:
# NFL SPREAD PREDICTION MODEL
# Enhanced version that predicts point spreads instead of just winners

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

class NFLSpreadPredictor:
    def __init__(self):
        self.spread_model = None
        self.best_features = []
        self.scaler = StandardScaler()
        
    def prepare_spread_data(self, df):
        """Convert winner prediction data to spread prediction data"""
        
        # Create point differential (home_score - away_score)
        df['point_spread'] = df['home_score'] - df['away_score']
        
        print(f"Spread Data Summary:")
        print(f"  Games: {len(df)}")
        print(f"  Average spread: {df['point_spread'].mean():.1f} points")
        print(f"  Spread range: {df['point_spread'].min():.0f} to {df['point_spread'].max():.0f}")
        print(f"  Home team wins: {(df['point_spread'] > 0).sum()} ({(df['point_spread'] > 0).mean():.1%})")
        
        return df
    
    def train_spread_model(self, df, use_existing_features=True):
        """Train regression model to predict point spreads"""
        
        # Prepare spread data
        df = self.prepare_spread_data(df)
        
        # Use existing features or select new ones for regression
        if use_existing_features and hasattr(predictor, 'best_features') and predictor.best_features:
            self.best_features = predictor.best_features
            print(f"Using existing features: {len(self.best_features)} features")
        else:
            # Select features for regression
            feature_cols = [col for col in df.columns 
                           if col not in ['home_win', 'home_score', 'away_score', 'game_id', 'point_spread']]
            self.best_features = feature_cols[:15]  # Use top 15 features
            print(f"Selected {len(self.best_features)} features for spread prediction")
        
        # Prepare data
        X = df[self.best_features].fillna(df[self.best_features].mean())
        y = df['point_spread']
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        
        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        
        # Test multiple regression models
        models = {
            'Random Forest': RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42),
            'Linear Regression': LinearRegression(),
        }
        
        if HAS_XGB:
            models['XGBoost'] = xgb.XGBRegressor(n_estimators=200, max_depth=6, random_state=42, verbosity=0)
        
        print("\nTraining spread prediction models...")
        best_model = None
        best_mae = float('inf')
        
        for name, model in models.items():
            # Use scaled data for linear regression, raw for tree-based
            X_use_train = X_train_scaled if 'Linear' in name else X_train
            X_use_test = X_test_scaled if 'Linear' in name else X_test
            
            # Train model
            model.fit(X_use_train, y_train)
            
            # Make predictions
            y_pred = model.predict(X_use_test)
            
            # Calculate metrics
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            
            print(f"{name}:")
            print(f"  MAE: {mae:.2f} points")
            print(f"  RMSE: {rmse:.2f} points")
            
            # Select best model based on MAE
            if mae < best_mae:
                best_mae = mae
                best_model = model
                self.spread_model = model
                print(f"  *** NEW BEST MODEL ***")
            print()
        
        print(f"Best model MAE: {best_mae:.2f} points")
        return self.spread_model
    
    def predict_spreads(self, games_df):
        """Predict point spreads for new games"""
        
        if self.spread_model is None:
            raise ValueError("Spread model not trained yet!")
        
        X = games_df[self.best_features].fillna(games_df[self.best_features].mean())
        
        # Use appropriate scaling based on model type
        if 'Linear' in str(type(self.spread_model)):
            X = self.scaler.transform(X)
        
        # Get spread predictions
        predicted_spreads = self.spread_model.predict(X)
        
        # Create results dataframe
        results = games_df.copy()
        results['predicted_spread'] = predicted_spreads
        
        # Determine winner based on spread
        results['predicted_winner_spread'] = np.where(
            predicted_spreads > 0, 
            results.get('home_team', 'HOME'),  # Home team wins if spread > 0
            results.get('away_team', 'AWAY')   # Away team wins if spread < 0
        )
        
        # Calculate confidence based on magnitude of spread
        results['spread_confidence'] = np.abs(predicted_spreads) / 21.0  # Scale by typical max spread
        results['spread_confidence'] = np.clip(results['spread_confidence'], 0.5, 0.95)  # 50-95% range
        
        return results

# Initialize spread predictor
spread_predictor = NFLSpreadPredictor()

print("NFL SPREAD PREDICTION MODEL READY!")
print("\nThis model predicts:")
print("  - Point spreads (e.g., Chiefs -7.5)")
print("  - How much teams will win/lose by")
print("  - More detailed game analysis")
print("\nNext: Train the spread model using existing data!")

NFL SPREAD PREDICTION MODEL READY!

This model predicts:
  - Point spreads (e.g., Chiefs -7.5)
  - How much teams will win/lose by
  - More detailed game analysis

Next: Train the spread model using existing data!


In [5]:
# TRAIN THE SPREAD MODEL
print("TRAINING NFL SPREAD PREDICTION MODEL")
print("="*50)

# Check if we have the original data
if 'df' in locals() or 'df' in globals():
    try:
        # Train spread model using existing dataset
        print("Training spread model on historical data...")
        
        spread_model = spread_predictor.train_spread_model(df, use_existing_features=True)
        
        if spread_model is not None:
            print("✓ Spread model trained successfully!")
            print("\nModel can now predict:")
            print("  - Point spreads (e.g., 'Chiefs -7.5')")
            print("  - Victory margins")
            print("  - Game competitiveness")
            
            # Test on a few sample games
            print(f"\nTesting on recent games...")
            test_games = df.tail(3)
            if not test_games.empty:
                spread_results = spread_predictor.predict_spreads(test_games)
                
                print("Sample predictions:")
                for _, game in spread_results.iterrows():
                    home_team = game.get('game_id', 'Unknown').split('_')[-2] if 'game_id' in game else 'HOME'
                    away_team = game.get('game_id', 'Unknown').split('_')[-1] if 'game_id' in game else 'AWAY'
                    spread = game['predicted_spread']
                    actual_spread = game.get('point_spread', 0)
                    
                    if spread > 0:
                        print(f"  {home_team} -{abs(spread):.1f} vs {away_team} (actual: {actual_spread:+.0f})")
                    else:
                        print(f"  {away_team} -{abs(spread):.1f} vs {home_team} (actual: {actual_spread:+.0f})")
        
    except Exception as e:
        print(f"Error training spread model: {e}")
        print("Make sure the original model data is available.")
        
else:
    print("ERROR: No training data found!")
    print("Please run Step 2 first to collect and prepare the data.")

TRAINING NFL SPREAD PREDICTION MODEL
Training spread model on historical data...
Spread Data Summary:
  Games: 2465
  Average spread: 2.0 points
  Spread range: -45 to 50
  Home team wins: 1356 (55.0%)
Using existing features: 16 features

Training spread prediction models...
Random Forest:
  MAE: 10.49 points
  RMSE: 13.70 points
  *** NEW BEST MODEL ***

Linear Regression:
  MAE: 10.80 points
  RMSE: 14.12 points

XGBoost:
  MAE: 11.55 points
  RMSE: 14.81 points

Best model MAE: 10.49 points
✓ Spread model trained successfully!

Model can now predict:
  - Point spreads (e.g., 'Chiefs -7.5')
  - Victory margins
  - Game competitiveness

Testing on recent games...
Sample predictions:
  PHI -2.7 vs WAS (actual: +32)
  BUF -0.5 vs KC (actual: +3)
  PHI -11.6 vs KC (actual: +18)


In [6]:
# ENHANCED SPREAD PREDICTION FUNCTION
import re
from datetime import datetime

def predict_multiple_games_with_spreads(predictor, spread_predictor, games_text, season=2025, week=6, show_details=True):
    """
    Predict multiple games with BOTH winner probabilities AND point spreads
    
    Args:
        predictor: Trained NFLGamePredictor instance (for winner predictions)
        spread_predictor: Trained NFLSpreadPredictor instance (for spread predictions)
        games_text: Formatted text with games
        season: Season year
        week: Week number
        show_details: Whether to show detailed output
    
    Returns:
        DataFrame with both winner and spread predictions
    """
    
    if predictor.final_model is None:
        return None
        
    if spread_predictor.spread_model is None:
        return None
    
    # Use existing team mapping from original function
    team_mapping = {
        'TB Buccaneers': 'TB', 'Tampa Bay Buccaneers': 'TB', 'TB': 'TB',
        'ATL Falcons': 'ATL', 'Atlanta Falcons': 'ATL', 'ATL': 'ATL',
        'CIN Bengals': 'CIN', 'Cincinnati Bengals': 'CIN', 'CIN': 'CIN',
        'CLE Browns': 'CLE', 'Cleveland Browns': 'CLE', 'CLE': 'CLE',
        'MIA Dolphins': 'MIA', 'Miami Dolphins': 'MIA', 'MIA': 'MIA',
        'IND Colts': 'IND', 'Indianapolis Colts': 'IND', 'IND': 'IND',
        'CAR Panthers': 'CAR', 'Carolina Panthers': 'CAR', 'CAR': 'CAR',
        'JAX Jaguars': 'JAX', 'Jacksonville Jaguars': 'JAX', 'JAX': 'JAX',
        'LV Raiders': 'LV', 'Las Vegas Raiders': 'LV', 'LV': 'LV',
        'NE Patriots': 'NE', 'New England Patriots': 'NE', 'NE': 'NE',
        'ARI Cardinals': 'ARI', 'Arizona Cardinals': 'ARI', 'ARI': 'ARI',
        'NO Saints': 'NO', 'New Orleans Saints': 'NO', 'NO': 'NO',
        'PIT Steelers': 'PIT', 'Pittsburgh Steelers': 'PIT', 'PIT': 'PIT',
        'NYJ Jets': 'NYJ', 'New York Jets': 'NYJ', 'NYJ': 'NYJ',
        'NYG Giants': 'NYG', 'New York Giants': 'NYG', 'NYG': 'NYG',
        'WAS Commanders': 'WAS', 'Washington Commanders': 'WAS', 'WAS': 'WAS',
        'TEN Titans': 'TEN', 'Tennessee Titans': 'TEN', 'TEN': 'TEN',
        'DEN Broncos': 'DEN', 'Denver Broncos': 'DEN', 'DEN': 'DEN',
        'SF 49ers': 'SF', 'San Francisco 49ers': 'SF', 'SF': 'SF',
        'SEA Seahawks': 'SEA', 'Seattle Seahawks': 'SEA', 'SEA': 'SEA',
        'DET Lions': 'DET', 'Detroit Lions': 'DET', 'DET': 'DET',
        'GB Packers': 'GB', 'Green Bay Packers': 'GB', 'GB': 'GB',
        'HOU Texans': 'HOU', 'Houston Texans': 'HOU', 'HOU': 'HOU',
        'LA Rams': 'LA', 'Los Angeles Rams': 'LA', 'LA': 'LA',
        'LAR Rams': 'LA', 'Los Angeles Rams': 'LA', 'LAR': 'LA',
        'BAL Ravens': 'BAL', 'Baltimore Ravens': 'BAL', 'BAL': 'BAL',
        'BUF Bills': 'BUF', 'Buffalo Bills': 'BUF', 'BUF': 'BUF',
        'KC Chiefs': 'KC', 'Kansas City Chiefs': 'KC', 'KC': 'KC',
        'PHI Eagles': 'PHI', 'Philadelphia Eagles': 'PHI', 'PHI': 'PHI',
        'DAL Cowboys': 'DAL', 'Dallas Cowboys': 'DAL', 'DAL': 'DAL',
        'MIN Vikings': 'MIN', 'Minnesota Vikings': 'MIN', 'MIN': 'MIN',
        'CHI Bears': 'CHI', 'Chicago Bears': 'CHI', 'CHI': 'CHI',
        'LAC Chargers': 'LAC', 'Los Angeles Chargers': 'LAC', 'LAC': 'LAC'
    }
    
    # Parse games
    game_pattern = r'\(Away\)\s+(.*?)\s+vs\.\s+\(Home\)\s+(.*)'
    matches = re.findall(game_pattern, games_text, re.IGNORECASE)
    
    if not matches:
        return None
    
    # Get team features
    try:
        team_features = predictor.create_team_features(weekly_data, 2025, week, schedule_data)
        if not team_features:
            team_features = predictor.create_team_features(weekly_data, 2024, 19, schedule_data)
            data_source = "2024 season-end"
        else:
            data_source = f"2025 Week {week-1}"
            
        if not team_features:
            return None
            
    except Exception as e:
        return None
    
    all_predictions = []
    successful_predictions = 0
    
    for i, (away_full, home_full) in enumerate(matches, 1):
        away_full = away_full.strip()
        home_full = home_full.strip()
        
        # Map to abbreviations
        away_team = team_mapping.get(away_full, away_full.split()[-1] if away_full.split() else away_full)
        home_team = team_mapping.get(home_full, home_full.split()[-1] if home_full.split() else home_full)
        
        away_team = away_team.strip().upper()
        home_team = home_team.strip().upper()
        
        try:
            # Check if teams exist
            if away_team not in team_features or home_team not in team_features:
                continue
            
            # Create game features
            game_features = predictor.create_game_features(
                home_team, away_team, team_features,
                season, week, is_playoff=False, is_neutral=False
            )
            
            if game_features is None:
                continue
            
            game_df = pd.DataFrame([game_features])
            
            # Get BOTH winner and spread predictions
            winner_result = predictor.predict_games(game_df)
            spread_result = spread_predictor.predict_spreads(game_df)
            
            # Extract results
            home_win_prob = winner_result['home_win_prob'].iloc[0]
            predicted_home_win = winner_result['predicted_home_win'].iloc[0]
            predicted_spread = spread_result['predicted_spread'].iloc[0]
            
            # CRITICAL: Ensure spread and winner are consistent
            # The team with the negative spread MUST be the predicted winner
            # Calculate confidence based on spread: Win Probability ≈ 50% + (spread × 2.5%)
            spread_magnitude = abs(predicted_spread)
            spread_based_confidence = 0.50 + (spread_magnitude * 0.025)
            spread_based_confidence = min(spread_based_confidence, 0.95)  
            
            if predicted_spread > 0:
                # Home team favored (positive spread means home team wins)
                spread_display = f"{home_team} -{abs(predicted_spread):.1f}"
                favored_team = home_team
                winner = home_team  # Winner must match the favored team
                confidence = spread_based_confidence
            else:
                # Away team favored (negative spread means away team wins)
                spread_display = f"{away_team} -{abs(predicted_spread):.1f}"
                favored_team = away_team
                winner = away_team  # Winner must match the favored team
                confidence = spread_based_confidence
            
            # Store comprehensive prediction
            prediction_data = {
                'game_num': i,
                'away_team': away_team,
                'home_team': home_team,
                'matchup': f"{away_team} @ {home_team}",
                'predicted_winner': winner,
                'confidence': confidence,
                'home_win_prob': home_win_prob,
                'away_win_prob': 1 - home_win_prob,
                'predicted_spread': predicted_spread,
                'spread_display': spread_display,
                'favored_team': favored_team,
                'spread_magnitude': abs(predicted_spread)
            }
            
            all_predictions.append(prediction_data)
            successful_predictions += 1
            
        except Exception as e:
            continue
    
    if not all_predictions:
        return None
    
    # Create results DataFrame
    predictions_df = pd.DataFrame(all_predictions)
    
    return predictions_df

print("ENHANCED SPREAD PREDICTION FUNCTION READY!")
print("\nThis function provides:")
print("  - Winner predictions with confidence")
print("  - Point spread predictions")
print("  - Game competitiveness analysis")
print("  - Betting-focused insights")

ENHANCED SPREAD PREDICTION FUNCTION READY!

This function provides:
  - Winner predictions with confidence
  - Point spread predictions
  - Game competitiveness analysis
  - Betting-focused insights


In [7]:
# EXECUTE WEEK 8 SPREAD PREDICTIONS
print("EXECUTING NFL WEEK 8 SPREAD PREDICTIONS")
print("="*60)
# Week 8 Complete Schedule (October 2025)
week8_games = """
(Away) Minnesota Vikings vs. (Home) Los Angeles Chargers
(Away) Miami Dolphins vs. (Home) Atlanta Falcons
(Away) Chicago Bears vs. (Home) Baltimore Ravens
(Away) New York Jets vs. (Home) Cincinnati Bengals
(Away) Buffalo Bills vs. (Home) Carolina Panthers
(Away) San Francisco 49ers vs. (Home) Houston Texans
(Away) Cleveland Browns vs. (Home) New England Patriots
(Away) New York Giants vs. (Home) Philadelphia Eagles
(Away) Tampa Bay Buccaneers vs. (Home) New Orleans Saints
(Away) Tennessee Titans vs. (Home) Indianapolis Colts
(Away) Dallas Cowboys vs. (Home) Denver Broncos
(Away) Green Bay Packers vs. (Home) Pittsburgh Steelers
(Away) Washington Commanders vs. (Home) Kansas City Chiefs
"""
# Check if both models are trained
if ('predictor' in locals() and hasattr(predictor, 'final_model') and predictor.final_model is not None and
    'spread_predictor' in locals() and hasattr(spread_predictor, 'spread_model') and spread_predictor.spread_model is not None):
    
    print("Both models are trained and ready!")
    print("Generating comprehensive predictions with spreads...")
    print()

    # Run enhanced predictions with spreads
    week8_spread_results = predict_multiple_games_with_spreads(
        predictor, spread_predictor, week8_games, 
        season=2025, week=8, show_details=False
    )

    if week8_spread_results is not None:
        print(f"\n\nWEEK 8 PREDICTIONS")
        print("=" * 70)
        print(f"{'MATCHUP':<20} {'SPREAD':<15} {'WINNER':<12} {'CONFIDENCE':<12}")
        print("-" * 70)

        # Week 8 game schedule
        game_schedule = {
            'MIN @ LAC': ('Thursday', '8:15 PM ET'),
            'MIA @ ATL': ('Sunday', '1:00 PM ET'),
            'CHI @ BAL': ('Sunday', '1:00 PM ET'),
            'NYJ @ CIN': ('Sunday', '1:00 PM ET'),
            'BUF @ CAR': ('Sunday', '1:00 PM ET'),
            'SF @ HOU': ('Sunday', '1:00 PM ET'),
            'CLE @ NE': ('Sunday', '1:00 PM ET'),
            'NYG @ PHI': ('Sunday', '1:00 PM ET'),
            'TB @ NO': ('Sunday', '4:05 PM ET'),
            'TEN @ IND': ('Sunday', '4:25 PM ET'),
            'DAL @ DEN': ('Sunday', '4:25 PM ET'),
            'GB @ PIT': ('Sunday', '8:20 PM ET'),
            'WAS @ KC': ('Monday', '8:15 PM ET')
        }


        time_order = [
            'MIN @ LAC',
            'MIA @ ATL', 'CHI @ BAL', 'NYJ @ CIN', 'BUF @ CAR', 'SF @ HOU', 'CLE @ NE', 'NYG @ PHI',
            'TB @ NO',
            'TEN @ IND', 'DAL @ DEN',
            'GB @ PIT',
            'WAS @ KC'
            ]
        
        # Team name mapping for display
        team_names = {
            'WAS': 'Commanders', 'GB': 'Packers', 'NYG': 'Giants', 'DAL': 'Cowboys',
            'SEA': 'Seahawks', 'PIT': 'Steelers', 'LA': 'Rams', 'TEN': 'Titans',
            'BUF': 'Bills', 'NYJ': 'Jets', 'NE': 'Patriots', 'MIA': 'Dolphins',
            'JAX': 'Jaguars', 'CIN': 'Bengals', 'SF': '49ers', 'NO': 'Saints',
            'CLE': 'Browns', 'BAL': 'Ravens', 'CHI': 'Bears', 'DET': 'Lions',
            'DEN': 'Broncos', 'IND': 'Colts', 'CAR': 'Panthers', 'ARI': 'Cardinals',
            'PHI': 'Eagles', 'KC': 'Chiefs', 'ATL': 'Falcons', 'MIN': 'Vikings',
            'TB': 'Buccaneers', 'HOU': 'Texans', 'LAC': 'Chargers', 'LV': 'Raiders'
        }
        
        # Display results with spreads
        for matchup in time_order:
            game_row = week8_spread_results[week8_spread_results['matchup'] == matchup]
            if not game_row.empty:
                game = game_row.iloc[0]
                winner_abbr = game['predicted_winner']
                winner_name = team_names.get(winner_abbr, winner_abbr)
                spread_display = game['spread_display']
                
                print(f"{game['matchup']:<20} {spread_display:<15} {winner_name:<12} {game['confidence']:.1%}")
        
        print("-" * 70)
        print(f"Total Games: {len(week8_spread_results)} | Avg Spread: {week8_spread_results['spread_magnitude'].mean():.1f} pts")
        
        # Additional insights
        close_games = week8_spread_results[week8_spread_results['spread_magnitude'] <= 3.5]
        blowouts = week8_spread_results[week8_spread_results['spread_magnitude'] >= 10.0]
        print(f"\nGAME ANALYSIS:")
        print(f"  Close games (≤3.5 pts): {len(close_games)}")
        print(f"  Potential blowouts (≥10 pts): {len(blowouts)}")
        print(f"  Home teams favored: {(week8_spread_results['predicted_spread'] > 0).sum()}")
        print(f"  Away teams favored: {(week8_spread_results['predicted_spread'] < 0).sum()}")

        print(f"\nResults saved to 'week8_spread_results' variable")

    else:
        print("ERROR: Failed to generate spread predictions")
        
else:
    print("ERROR: Models not trained yet!")
    print("\nPlease run:")
    print("1. Step 2 to train the winner prediction model")
    print("2. Step 8 to train the spread prediction model")

EXECUTING NFL WEEK 8 SPREAD PREDICTIONS
Both models are trained and ready!
Generating comprehensive predictions with spreads...



WEEK 8 PREDICTIONS
MATCHUP              SPREAD          WINNER       CONFIDENCE  
----------------------------------------------------------------------
MIN @ LAC            LAC -1.7        Chargers     54.2%
MIA @ ATL            MIA -0.2        Dolphins     50.5%
CHI @ BAL            BAL -12.2       Ravens       80.5%
NYJ @ CIN            CIN -9.3        Bengals      73.4%
BUF @ CAR            BUF -7.4        Bills        68.6%
SF @ HOU             SF -1.9         49ers        54.8%
CLE @ NE             NE -1.7         Patriots     54.3%
NYG @ PHI            PHI -11.4       Eagles       78.6%
TB @ NO              TB -4.7         Buccaneers   61.7%
TEN @ IND            IND -6.8        Colts        66.9%
DAL @ DEN            DEN -10.4       Broncos      75.9%
GB @ PIT             GB -0.9         Packers      52.4%
WAS @ KC             KC -0.1         Chiefs 

In [8]:
# ===== CONFIGURATION (Update these for each week) =====
WEEK_NUMBER = 8
SEASON = 2025

# ===== AUTOMATED RESULT FETCHING =====
def fetch_actual_results(predictions_df, season, week):
    """
    Automatically fetch actual game results from nfl_data_py
    No more manual data entry needed!
    """
    try:
        # Import fresh schedule data for the target week
        current_schedule = nfl.import_schedules([season])
        week_games = current_schedule[
            (current_schedule['season'] == season) & 
            (current_schedule['week'] == week)
        ]
        
        actual_results = {}
        games_found = 0
        games_played = 0
        
        # Process each prediction to find actual results
        for _, pred_row in predictions_df.iterrows():
            matchup = pred_row['matchup']
            away_team = pred_row['away_team'] 
            home_team = pred_row['home_team']
            
            # Find corresponding game in schedule
            game_row = week_games[
                (week_games['away_team'] == away_team) & 
                (week_games['home_team'] == home_team)
            ]
            
            if not game_row.empty:
                game = game_row.iloc[0]
                games_found += 1
                
                # Check if game has been played (has scores)
                home_score = game.get('home_score', None)
                away_score = game.get('away_score', None)
                
                if pd.notna(home_score) and pd.notna(away_score):
                    # Game completed - extract results
                    winner = home_team if home_score > away_score else away_team
                    
                    actual_results[matchup] = {
                        'winner': winner,
                        'home_score': int(home_score),
                        'away_score': int(away_score),
                        'game_id': game.get('game_id', ''),
                        'gameday': game.get('gameday', '')
                    }
                    games_played += 1
        
        print(f"Found {games_played}/{len(predictions_df)} completed games")
        
        if games_played == 0:
            print(f"No completed games found for Week {week}, {season}")
            return None
            
        return actual_results
        
    except Exception as e:
        print(f"Error fetching results: {e}")
        return None

def quick_results_check(week_num=None, season_year=None):
    """
    One-click function to check results for any week
    """
    # Use config values if not specified
    if week_num is None:
        week_num = WEEK_NUMBER
    if season_year is None:
        season_year = SEASON
    
    # Find the predictions variable for this week
    predictions_var = f'week{week_num}_spread_results'
    
    if predictions_var in locals() or predictions_var in globals():
        try:
            # Get the predictions dataframe
            if predictions_var in locals():
                predictions_df = locals()[predictions_var]
            else:
                predictions_df = globals()[predictions_var]
            
            # Fetch actual results automatically
            actual_results = fetch_actual_results(predictions_df, season_year, week_num)
            
            if actual_results:
                # Run the analysis with fetched results
                results = analyze_week(week_num, season_year, predictions_df, actual_results)
                
                # Save to global variable
                globals()[f'week{week_num}_final_results'] = results
                globals()[f'week{week_num}_actual_results'] = actual_results
                
                return results
            else:
                print("No results available yet - games may not be completed.")
                return None
                
        except Exception as e:
            print(f"Error: {e}")
            return None
    else:
        print(f"'{predictions_var}' not found - run predictions first")
        return None

# Smart result fetcher - tries auto-fetch, falls back gracefully
def get_week_results(predictions_df=None, manual_results=None):
    """
    Smart function that tries automatic fetching first, 
    then falls back to manual results if provided
    """
    if predictions_df is None:
        # Try to find week1_results
        if 'week1_results' in locals():
            predictions_df = week1_results
        elif 'week1_results' in globals():
            predictions_df = globals()['week1_results']
        else:
            print("No predictions found. Run predictions first.")
            return None
    
    # Try automatic fetching first
    actual_results = fetch_actual_results(predictions_df, SEASON, WEEK_NUMBER)
    
    if actual_results:
        return actual_results
    elif manual_results:
        return manual_results
    else:
        print("No results available - games may not be completed yet")
        return None

print("Result system ready. Use: quick_results_check()")

# ===== AUTOMATED ANALYSIS CODE (No changes needed below) =====

def analyze_week(week_num, season_year, predictions_df, actuals):
    """
    Universal function to analyze any week's predictions vs actual results
    """
    print("=" * 95)
    print(f"{' ' * 25}NFL WEEK {week_num} {season_year} - FINAL RESULTS")
    print("=" * 95)
    
    # Build comparison
    results = {}
    for matchup, actual in actuals.items():
        pred_row = predictions_df[predictions_df['matchup'] == matchup]
        if not pred_row.empty:
            pred = pred_row.iloc[0]['predicted_winner']
            conf = pred_row.iloc[0]['confidence'] * 100
            results[matchup] = {
                'predicted': pred,
                'actual': actual['winner'],
                'score': f"{actual['away_score']}-{actual['home_score']}",
                'correct': pred == actual['winner'],
                'confidence': conf
            }
    
    # Display table
    print(f"\n{'GAME':<20} {'PREDICTED':<12} {'ACTUAL':<12} {'SCORE':<12} {'RESULT':<10}")
    print("-" * 95)
    
    correct = sum(1 for r in results.values() if r['correct'])
    total = len(results)
    
    for matchup, result in results.items():
        status = "CORRECT" if result['correct'] else "WRONG"
        print(f"{matchup:<20} {result['predicted']:<12} {result['actual']:<12} {result['score']:<12} {status:<10}")
    
    # Summary
    accuracy = (correct / total * 100) if total > 0 else 0
    print("-" * 95)
    print(f"\nOVERALL ACCURACY: {correct}/{total} = {accuracy:.1f}%\n")
    
    # Breakdown
    correct_games = [(m, r) for m, r in results.items() if r['correct']]
    incorrect_games = [(m, r) for m, r in results.items() if not r['correct']]
    
    print("=" * 95)
    print(f"CORRECT PREDICTIONS ({len(correct_games)}/{total}):")
    print("=" * 95)
    for matchup, result in correct_games:
        print(f"  {matchup:<20} {result['actual']:<5} | Score: {result['score']:<10} | Confidence: {result['confidence']:.1f}%")
    
    print("\n" + "=" * 95)
    print(f"INCORRECT PREDICTIONS ({len(incorrect_games)}/{total}):")
    print("=" * 95)
    for matchup, result in incorrect_games:
        print(f"  {matchup:<20} Predicted: {result['predicted']:<5} | Actual: {result['actual']:<5} | Score: {result['score']}")
    
    # High confidence analysis
    high_conf = {m: r for m, r in results.items() if r['confidence'] > 65}
    if high_conf:
        hc_correct = sum(1 for r in high_conf.values() if r['correct'])
        hc_accuracy = (hc_correct / len(high_conf) * 100)
        print("\n" + "=" * 95)
        print(f"HIGH CONFIDENCE PICKS (>65%): {hc_correct}/{len(high_conf)} = {hc_accuracy:.1f}%")
        print("=" * 95)
        for matchup, result in high_conf.items():
            status = "✓" if result['correct'] else "✗"
            print(f"{status} {matchup:<20} {result['predicted']:<5} ({result['confidence']:.1f}%) → Actual: {result['actual']}")
    
    print(f"\nResults saved to 'week{week_num}_final_results'")
    return results

# ===== SEAMLESS EXECUTION =====
# Auto-run analysis if predictions exist
if f'week{WEEK_NUMBER}_spread_results' in globals():
    quick_results_check()
else:
    print("Run predictions first, then return here for results analysis.")


Result system ready. Use: quick_results_check()
Found 13/13 completed games
                         NFL WEEK 8 2025 - FINAL RESULTS

GAME                 PREDICTED    ACTUAL       SCORE        RESULT    
-----------------------------------------------------------------------------------------------
MIN @ LAC            LAC          LAC          10-37        CORRECT   
MIA @ ATL            MIA          MIA          34-10        CORRECT   
CHI @ BAL            BAL          BAL          16-30        CORRECT   
NYJ @ CIN            CIN          NYJ          39-38        WRONG     
BUF @ CAR            BUF          BUF          40-9         CORRECT   
SF @ HOU             SF           HOU          15-26        WRONG     
CLE @ NE             NE           NE           13-32        CORRECT   
NYG @ PHI            PHI          PHI          20-38        CORRECT   
TB @ NO              TB           TB           23-3         CORRECT   
TEN @ IND            IND          IND          14-38        

In [9]:
# Save Week 8 predictions to CSV for Plot.ipynb
# This allows Plot.ipynb to load predictions even in different Jupyter sessions

if 'week8_spread_results' in globals() and week8_spread_results is not None:
    import os
    # Save in the Week8 directory so Plot.ipynb can find it
    csv_path = os.path.join("week8_predictions.csv")
    week8_spread_results.to_csv(csv_path, index=False)
    print(f"✓ Saved Week 8 predictions to {os.path.abspath(csv_path)}")
    print(f"  Total predictions: {len(week8_spread_results)}")
    print(f"  This file can now be loaded by Plot.ipynb automatically!")
else:
    print("⚠ week8_spread_results not found. Run the prediction cell first.")


✓ Saved Week 8 predictions to /Users/akulaggarwal/Desktop/NFL Performance Prediction/Week8/week8_predictions.csv
  Total predictions: 13
  This file can now be loaded by Plot.ipynb automatically!
