# Failure Pattern Analysis: Week 10+ Performance Degradation

**Objective**: Analyze existing Week 1-14 prediction results to identify systematic failure patterns.

**Key Questions**:
1. Why did high-confidence picks fail catastrophically (33.3% in Weeks 13-14)?
2. Are there specific game types that the model struggles with?
3. Is there home team bias?
4. How does calibration differ between the legacy model (Weeks 1-9) and the enhanced model (Weeks 10-14)?

**Data Source**: Existing CSV prediction files in `Week*/week*_predictions.csv`

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nfl_data_py as nfl
from pathlib import Path

# Plot defaults shared by every chart drawn later in the notebook:
# seaborn's white grid theme and a 12x6 inch default figure.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✅ Libraries imported")

## Load All Prediction Results (Weeks 1-14)

In [None]:
# Collect each week's prediction CSV (Weeks 1-14) into a list of frames,
# tagging every row with its week number and the model generation that
# produced it, then stack them into a single `predictions_df`.
base_path = Path('/Users/akulaggarwal/Desktop/NFL Performance Prediction')
all_predictions = []

for week in range(1, 15):
    csv_path = base_path.joinpath(f'Week{week}', f'week{week}_predictions.csv')

    # Guard clause: report the missing week and move on to the next one.
    if not csv_path.exists():
        print(f"❌ Week {week} not found")
        continue

    # Weeks 1-9 are labelled 'legacy'; week 10 onwards is labelled 'enhanced'.
    df = pd.read_csv(csv_path).assign(
        week=week,
        model_type='enhanced' if week > 9 else 'legacy',
    )
    all_predictions.append(df)
    print(f"✅ Loaded Week {week}: {len(df)} games")

if not all_predictions:
    # Nothing on disk: fall back to an empty frame so later cells can
    # test `predictions_df.empty` and skip themselves.
    print("\n❌ No predictions found. Please ensure CSV files exist.")
    predictions_df = pd.DataFrame()
else:
    predictions_df = pd.concat(all_predictions, ignore_index=True)
    print(f"\n✅ Total predictions loaded: {len(predictions_df)} games across {predictions_df['week'].nunique()} weeks")

## Fetch Actual Results (2025 Season)

In [None]:
# Attach actual game results to every prediction and keep only gradable games.
#
# Adds four columns to `predictions_df`: `actual_winner`, `home_score`,
# `away_score` and `correct`, then drops every row that could not be graded
# (game not found in the schedule, not played yet, or ended in a tie).
if not predictions_df.empty:
    print("Fetching 2025 NFL schedule and results...")

    # Load 2025 schedule
    schedule_2025 = nfl.import_schedules([2025])

    if schedule_2025.empty:
        # NOTE(review): 2024 results cannot grade 2025 predictions; this
        # fallback only keeps the notebook running - confirm it is still wanted.
        print("⚠️ 2025 data not available, trying 2024...")
        schedule_2025 = nfl.import_schedules([2024])

    print(f"✅ Loaded {len(schedule_2025)} games from schedule")

    # Index the finished games once by (week, AWAY, HOME) so that matching a
    # prediction is a dict lookup instead of a scan of the whole schedule.
    played = schedule_2025[schedule_2025['home_score'].notna()]
    results_lookup = {}
    for sched_week, sched_away, sched_home, sched_home_pts, sched_away_pts in zip(
        played['week'],
        played['away_team'],
        played['home_team'],
        played['home_score'],
        played['away_score'],
    ):
        key = (int(sched_week), str(sched_away).upper(), str(sched_home).upper())
        # setdefault keeps the first schedule row if a key ever repeats.
        results_lookup.setdefault(key, (sched_home_pts, sched_away_pts))

    # Match predictions to actual results
    actual_winners = []
    home_scores = []
    away_scores = []
    correct_flags = []
    tie_count = 0

    for week, away_team, home_team, predicted_winner in zip(
        predictions_df['week'],
        predictions_df['away_team'],
        predictions_df['home_team'],
        predictions_df['predicted_winner'],
    ):
        # Team codes are compared case-insensitively.
        result = results_lookup.get(
            (int(week), str(away_team).upper(), str(home_team).upper())
        )
        actual_winner = home_score = away_score = is_correct = None

        if result is not None:
            home_score, away_score = result
            if home_score == away_score:
                # A tie has no winner, so a winner pick is neither right nor
                # wrong. Leave `correct` unset so the game is excluded below
                # instead of being scored as an away-team win.
                actual_winner = 'TIE'
                tie_count += 1
            else:
                actual_winner = home_team if home_score > away_score else away_team
                is_correct = bool(predicted_winner == actual_winner)

        actual_winners.append(actual_winner)
        home_scores.append(home_score)
        away_scores.append(away_score)
        correct_flags.append(is_correct)

    predictions_df['actual_winner'] = actual_winners
    predictions_df['home_score'] = home_scores
    predictions_df['away_score'] = away_scores
    predictions_df['correct'] = correct_flags

    # Filter to completed (gradable) games only
    completed_mask = predictions_df['correct'].notna()
    predictions_df = predictions_df[completed_mask].copy()

    # Give the new columns real dtypes. An object-dtype `correct` column makes
    # `~predictions_df['correct']` return -1/-2 instead of booleans.
    predictions_df['correct'] = predictions_df['correct'].astype(bool)
    predictions_df['home_score'] = pd.to_numeric(predictions_df['home_score'])
    predictions_df['away_score'] = pd.to_numeric(predictions_df['away_score'])

    print(f"\n✅ Matched {len(predictions_df)} completed games with actual results")
    if tie_count:
        print(f"   Excluded {tie_count} tied game(s): no winner to grade against")
    print(f"   Legacy (Weeks 1-9): {len(predictions_df[predictions_df['model_type']=='legacy'])} games")
    print(f"   Enhanced (Weeks 10-14): {len(predictions_df[predictions_df['model_type']=='enhanced'])} games")

## Analysis 1: Overall Performance Comparison

In [None]:
# Analysis 1: overall accuracy, high-confidence accuracy, mean confidence and
# week-to-week spread for each model generation, followed by the accuracy
# delta between the enhanced and the legacy model.
if not predictions_df.empty:
    print("="*70)
    print("PERFORMANCE COMPARISON: LEGACY vs ENHANCED")
    print("="*70)

    accuracy_by_model = {}

    for model_type in ['legacy', 'enhanced']:
        subset = predictions_df[predictions_df['model_type'] == model_type]

        # A model with no graded games has no week range or accuracy to
        # report; skip it instead of failing on weeks[0].
        if subset.empty:
            print(f"\n{model_type.upper()} MODEL: no completed games")
            continue

        # Work on a real boolean series; `correct` may be stored as object dtype.
        correct = subset['correct'].astype(bool)

        accuracy = correct.mean()
        n_games = len(subset)
        n_correct = int(correct.sum())
        accuracy_by_model[model_type] = accuracy

        # High-confidence accuracy (>65%)
        hc_mask = subset['confidence'] > 0.65
        hc_games = int(hc_mask.sum())
        hc_correct = int(correct[hc_mask].sum())
        hc_accuracy = hc_correct / hc_games if hc_games > 0 else None

        weeks = sorted(subset['week'].unique())
        weekly_accuracy = correct.groupby(subset['week']).mean()

        print(f"\n{model_type.upper()} MODEL (Weeks {weeks[0]}-{weeks[-1]}):")
        print(f"  Overall Accuracy: {accuracy:.1%} ({n_correct}/{n_games})")
        # Compare against None explicitly: a 0.0% hit rate is a real (and
        # important) result, not a missing one.
        if hc_accuracy is not None:
            print(f"  High-Conf Picks (>65%): {hc_accuracy:.1%} ({hc_correct}/{hc_games})")
        else:
            print("  High-Conf Picks: N/A")
        print(f"  Avg Confidence: {subset['confidence'].mean():.1%}")
        # The sample standard deviation needs at least two weeks.
        if len(weekly_accuracy) > 1:
            print(f"  Std Dev (week-to-week): {weekly_accuracy.std():.1%}")
        else:
            print("  Std Dev (week-to-week): N/A")

    # Calculate delta (only meaningful when both models have graded games)
    legacy_acc = accuracy_by_model.get('legacy')
    enhanced_acc = accuracy_by_model.get('enhanced')

    print(f"\n{'='*70}")
    if legacy_acc is not None and enhanced_acc is not None:
        delta = enhanced_acc - legacy_acc
        print(f"DELTA: Enhanced vs Legacy = {delta:+.1%} ({(delta*100):+.1f} percentage points)")
    else:
        print("DELTA: Enhanced vs Legacy = N/A (need graded games for both models)")
    print(f"{'='*70}")

## Analysis 2: Calibration Analysis (Confidence vs Actual Accuracy)

In [None]:
# Analysis 2: calibration. Buckets predictions by stated confidence, compares
# each bucket's mean confidence with its realised accuracy, prints one table
# per model and saves a side-by-side bar chart to calibration_analysis.png.
if not predictions_df.empty:
    print("\n" + "="*70)
    print("CALIBRATION ANALYSIS: Predicted Confidence vs Actual Accuracy")
    print("="*70)

    # Create confidence bins. pd.cut bins are right-closed, so e.g. '65-70%'
    # covers (0.65, 0.70]; include_lowest makes the first bin contain 0.50.
    # Confidences outside [0.50, 1.0] fall in no bin and are left as NaN.
    bins = [0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 1.0]
    bin_labels = ['50-55%', '55-60%', '60-65%', '65-70%', '70-75%', '75%+']

    predictions_df['conf_bin'] = pd.cut(predictions_df['confidence'], bins=bins, labels=bin_labels, include_lowest=True)

    # Numeric copy of the correctness flag so that means are well defined even
    # when `correct` is stored as an object-dtype column.
    graded = predictions_df.assign(correct=predictions_df['correct'].astype(float))

    for model_type in ['legacy', 'enhanced']:
        subset = graded[graded['model_type'] == model_type]

        print(f"\n{model_type.upper()} MODEL:")
        print(f"{'Confidence Bin':<15} {'N Games':<10} {'Actual Acc':<15} {'Calibration Gap':<20}")
        print("-"*70)

        for bin_label in bin_labels:
            bin_data = subset[subset['conf_bin'] == bin_label]

            if len(bin_data) > 0:
                actual_acc = bin_data['correct'].mean()
                expected_conf = bin_data['confidence'].mean()
                # Negative gap: the model claimed more than it delivered.
                gap = actual_acc - expected_conf

                gap_str = f"{gap:+.1%} {'(overconfident)' if gap < 0 else '(underconfident)' if gap > 0 else '(calibrated)'}"

                print(f"{bin_label:<15} {len(bin_data):<10} {actual_acc:<14.1%} {gap_str}")
            else:
                print(f"{bin_label:<15} {0:<10} {'N/A':<14} {'N/A'}")

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    for idx, model_type in enumerate(['legacy', 'enhanced']):
        subset = graded[graded['model_type'] == model_type]

        # observed=False keeps every bin on the x-axis (empty bins become NaN
        # bars), so both panels share the same six categories.
        calibration = subset.groupby('conf_bin', observed=False).agg({
            'correct': 'mean',
            'confidence': 'mean'
        }).reset_index()

        ax = axes[idx]
        x = np.arange(len(calibration))
        width = 0.35

        ax.bar(x - width/2, calibration['confidence'], width, label='Predicted Confidence', alpha=0.7)
        ax.bar(x + width/2, calibration['correct'], width, label='Actual Accuracy', alpha=0.7)
        ax.axhline(y=0.5, color='r', linestyle='--', alpha=0.3, label='Baseline')

        ax.set_xlabel('Confidence Bin')
        ax.set_ylabel('Percentage')
        ax.set_title(f'{model_type.title()} Model Calibration')
        ax.set_xticks(x)
        ax.set_xticklabels(calibration['conf_bin'], rotation=45)
        ax.set_ylim([0, 1])
        # Build the legend last so the 'Baseline' line is included in it.
        ax.legend()

    plt.tight_layout()
    plt.savefig('calibration_analysis.png', dpi=150)
    print("\n✅ Calibration chart saved to: calibration_analysis.png")

## Analysis 3: Home Team Bias

In [None]:
# Analysis 3: home-team bias. Compares how often each model predicts a home
# win with how often the home team actually won, and reports accuracy and
# recall separately for home-win and away-win outcomes.
if not predictions_df.empty:
    print("\n" + "="*70)
    print("HOME TEAM BIAS ANALYSIS")
    print("="*70)

    def _pct(value):
        """Format a rate as a percentage, or 'N/A' when it is undefined (NaN)."""
        return "N/A" if pd.isna(value) else f"{value:.1%}"

    predictions_df['predicted_home_win'] = predictions_df['predicted_winner'] == predictions_df['home_team']
    # False for every game whose recorded winner is not the home team.
    predictions_df['actual_home_win'] = predictions_df['actual_winner'] == predictions_df['home_team']

    for model_type in ['legacy', 'enhanced']:
        subset = predictions_df[predictions_df['model_type'] == model_type]

        if subset.empty:
            print(f"\n{model_type.upper()} MODEL: no completed games")
            continue

        # Real boolean series; `correct` may be stored as object dtype.
        correct = subset['correct'].astype(bool)
        predicted_home = subset['predicted_home_win']
        actual_home = subset['actual_home_win']

        # How often does model predict home win?
        pred_home_rate = predicted_home.mean()

        # What's actual home win rate?
        actual_home_rate = actual_home.mean()

        # Accuracy on home-win picks vs away-win picks
        home_win_preds = subset[predicted_home]
        away_win_preds = subset[~predicted_home]

        home_acc = correct[predicted_home].mean()
        away_acc = correct[~predicted_home].mean()

        # Recall: Of actual home wins, how many did we predict?
        home_recall = predicted_home[actual_home].mean()

        # Recall: Of actual away wins, how many did we predict?
        away_recall = (~predicted_home[~actual_home]).mean()

        bias = pred_home_rate - actual_home_rate
        if bias > 0:
            bias_label = '(over-predicting home wins)'
        elif bias < 0:
            bias_label = '(under-predicting home wins)'
        else:
            bias_label = '(no home/away bias)'

        print(f"\n{model_type.upper()} MODEL:")
        print(f"  Predicted home win rate: {_pct(pred_home_rate)}")
        print(f"  Actual home win rate: {_pct(actual_home_rate)}")
        print(f"  BIAS: {bias:+.1%} {bias_label}")
        print(f"\n  When predicting home win: {_pct(home_acc)} accuracy ({len(home_win_preds)} games)")
        print(f"  When predicting away win: {_pct(away_acc)} accuracy ({len(away_win_preds)} games)")
        print(f"\n  Home win recall: {_pct(home_recall)} (caught {_pct(home_recall)} of actual home wins)")
        print(f"  Away win recall: {_pct(away_recall)} (caught {_pct(away_recall)} of actual away wins)")

        # Comparisons against NaN are False, so an undefined recall never
        # triggers the warning.
        if home_recall > 0.70 and away_recall < 0.40:
            print(f"  ⚠️ SEVERE HOME BIAS DETECTED")

## Analysis 4: Division Game Performance

In [None]:
# Analysis 4: division games. Flags every prediction whose two teams share a
# division and compares accuracy on division vs non-division games per model.
if not predictions_df.empty:
    print("\n" + "="*70)
    print("DIVISION GAME ANALYSIS")
    print("="*70)

    # Division membership by team abbreviation.
    # NOTE(review): only these exact codes are recognised (e.g. 'LA', not
    # 'LAR'); confirm they match the codes used in the prediction CSVs.
    afc_east = ['BUF', 'MIA', 'NE', 'NYJ']
    afc_north = ['BAL', 'CIN', 'CLE', 'PIT']
    afc_south = ['HOU', 'IND', 'JAX', 'TEN']
    afc_west = ['DEN', 'KC', 'LAC', 'LV']
    nfc_east = ['DAL', 'NYG', 'PHI', 'WAS']
    nfc_north = ['CHI', 'DET', 'GB', 'MIN']
    nfc_south = ['ATL', 'CAR', 'NO', 'TB']
    nfc_west = ['ARI', 'LA', 'SF', 'SEA']

    divisions = [afc_east, afc_north, afc_south, afc_west, nfc_east, nfc_north, nfc_south, nfc_west]

    # team code -> index of its division, built once for O(1) lookups.
    team_division = {
        team: div_index
        for div_index, div in enumerate(divisions)
        for team in div
    }

    def is_division_game(away, home):
        """Return True when both teams belong to the same division.

        Team codes are compared case-insensitively; a code that is not in
        any division list never counts as a division game.
        """
        away_division = team_division.get(str(away).upper())
        home_division = team_division.get(str(home).upper())
        return away_division is not None and away_division == home_division

    predictions_df['is_division_game'] = [
        is_division_game(away, home)
        for away, home in zip(predictions_df['away_team'], predictions_df['home_team'])
    ]

    for model_type in ['legacy', 'enhanced']:
        subset = predictions_df[predictions_df['model_type'] == model_type]

        div_games = subset[subset['is_division_game']]
        non_div_games = subset[~subset['is_division_game']]

        # astype(bool) guards against `correct` being stored as object dtype.
        div_acc = div_games['correct'].astype(bool).mean() if len(div_games) > 0 else None
        non_div_acc = non_div_games['correct'].astype(bool).mean() if len(non_div_games) > 0 else None

        print(f"\n{model_type.upper()} MODEL:")
        # Compare against None explicitly: 0.0% accuracy is a valid result
        # and must not be reported as N/A.
        if div_acc is not None:
            print(f"  Division games: {div_acc:.1%} accuracy ({len(div_games)} games)")
        else:
            print("  Division games: N/A")
        if non_div_acc is not None:
            print(f"  Non-division games: {non_div_acc:.1%} accuracy ({len(non_div_games)} games)")
        else:
            print("  Non-division games: N/A")

        if div_acc is not None and non_div_acc is not None:
            delta = div_acc - non_div_acc
            print(f"  DELTA: {delta:+.1%} {'(division games harder)' if delta < 0 else '(division games easier)'}")

## Analysis 5: Worst Performing Games (Enhanced Model)

In [None]:
# Analysis 5: list every enhanced-model pick made with more than 65%
# confidence that turned out wrong, then summarise the failures.
if not predictions_df.empty:
    print("\n" + "="*70)
    print("HIGH-CONFIDENCE FAILURES (Enhanced Model, Weeks 10-14)")
    print("="*70)

    enhanced = predictions_df[predictions_df['model_type'] == 'enhanced']

    # Cast before negating: `correct` may be an object-dtype column, and `~`
    # on Python bools gives -1/-2 (both truthy), which would flag every
    # high-confidence game as a failure.
    was_wrong = ~enhanced['correct'].astype(bool)

    # High-confidence failures (>65% confidence but wrong)
    hc_failures = enhanced[(enhanced['confidence'] > 0.65) & was_wrong]

    print(f"\nFound {len(hc_failures)} high-confidence failures:")
    print(f"\n{'Week':<6} {'Matchup':<20} {'Predicted':<10} {'Actual':<10} {'Confidence':<12} {'Score':<12} {'Is Div?'}")
    print("-"*90)

    for _, game in hc_failures.sort_values('confidence', ascending=False).iterrows():
        is_div = 'YES' if game.get('is_division_game', False) else 'NO'
        # Score is shown away-home, matching the "away @ home" matchup order.
        score = f"{int(game['away_score'])}-{int(game['home_score'])}"

        # Fall back to the team columns when the CSV has no usable matchup label.
        matchup = game.get('matchup')
        if pd.isna(matchup):
            matchup = f"{game['away_team']} @ {game['home_team']}"

        print(f"{game['week']:<6} {str(matchup):<20} {game['predicted_winner']:<10} {game['actual_winner']:<10} {game['confidence']:<11.1%} {score:<12} {is_div}")

    # Patterns
    if len(hc_failures) > 0:
        # The division flag only exists if the division-game cell has been run.
        div_failures = int(hc_failures['is_division_game'].sum()) if 'is_division_game' in hc_failures.columns else 0
        print(f"\nPATTERNS:")
        print(f"  Division games: {div_failures}/{len(hc_failures)} ({div_failures/len(hc_failures):.1%})")
        print(f"  Avg confidence: {hc_failures['confidence'].mean():.1%}")
        print(f"  Weeks affected: {sorted(hc_failures['week'].unique())}")

## Save Results

In [None]:
# Write the annotated predictions (including every column added by the
# analysis cells) to CSV and print a closing checklist.
if not predictions_df.empty:
    predictions_df.to_csv('failure_pattern_analysis_results.csv', index=False)
    print("\n✅ Full analysis saved to: failure_pattern_analysis_results.csv")

    rule = "=" * 70
    next_steps = (
        "1. Review calibration analysis - are high-confidence picks overconfident?",
        "2. Check home team bias - is model systematically favoring home teams?",
        "3. Examine division game performance - NFL parity makes these harder",
        "4. Run debug_enhanced_model.ipynb for full ablation study (2-4 hours)",
    )

    print("\n" + rule)
    print("FAILURE PATTERN ANALYSIS COMPLETE")
    print(rule)
    print("\nNext steps:")
    for step in next_steps:
        print(step)