In [3]:
# NBA Game Data Exploration
# This notebook explores the historical NBA game data collected from the 2017-18 through 2023-24 seasons
# to identify patterns and features for our prediction system.

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from datetime import datetime
from IPython.display import display

# Set visualization style
plt.style.use('fivethirtyeight')
%matplotlib inline

# Configure visualizations for better readability
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10

# 1. Load the most recent game data file
print("Finding the most recent processed game data file...")
game_file_pattern = '../data/processed/nba_games_processed_*.csv'
game_files = glob.glob(game_file_pattern)

if not game_files:
    # The pattern is relative to the kernel's working directory, so report the resolved
    # location: a wrong working directory is otherwise indistinguishable from missing data.
    raise FileNotFoundError(
        "No processed game data files found. Run collect_games.py first. "
        f"(searched: {os.path.abspath(game_file_pattern)})"
    )

# Rank by last-modification time. os.path.getctime is the creation time only on Windows;
# on Unix it is the last metadata change, so it does not reliably identify the newest data.
latest_file = max(game_files, key=os.path.getmtime)
print(f"Loading data from: {latest_file}")

# Load the data
games_df = pd.read_csv(latest_file)

# 2. Basic Data Exploration
print("\n--- Dataset Overview ---")
print(f"Shape: {games_df.shape} (rows, columns)")
# deep=True measures the actual contents of object (string) columns; the default only
# counts the references to them and under-reports a frame loaded from CSV.
print(f"Memory usage: {games_df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

# Display the first few rows to understand structure
print("\nFirst few records:")
display(games_df.head())

# Examine data types
print("\nData Types:")
display(games_df.dtypes)

# Check for missing values: show only the columns that actually have gaps
print("\nMissing Values:")
missing = games_df.isnull().sum()
display(missing[missing > 0] if missing.sum() > 0 else "No missing values")

# Convert GAME_DATE back to datetime if it's not already
# (read_csv does not parse dates here, and later sections rely on the .dt accessor)
if not pd.api.types.is_datetime64_dtype(games_df['GAME_DATE']):
    games_df['GAME_DATE'] = pd.to_datetime(games_df['GAME_DATE'])

# 3. Understanding the Dataset Structure

# Summarise the distinct values of the key columns to get a feel for the data
print("\n--- Key Column Analysis ---")

team_count = games_df['TEAM_NAME'].nunique()
print(f"Number of unique teams: {team_count}")

season_values = games_df['SEASON']
print(f"Number of unique seasons: {season_values.nunique()}")
print(f"Seasons included: {sorted(season_values.unique())}")

game_dates = games_df['GAME_DATE']
print(f"Date range: {game_dates.min()} to {game_dates.max()}")

# GAME_TYPE is optional; fall back to a placeholder message when it is absent
if 'GAME_TYPE' in games_df.columns:
    game_types = sorted(games_df['GAME_TYPE'].unique())
else:
    game_types = 'Game type not specified'
print(f"Game types: {game_types}")

# 4. Games per season analysis
print("\n--- Games Per Season ---")
season_counts = games_df['SEASON'].value_counts().sort_index()
print(season_counts)

plt.figure(figsize=(12, 6))
ax = season_counts.plot.bar()
ax.set_title('Number of Game Records per Season')
ax.set_xlabel('Season')
ax.set_ylabel('Number of Records')
plt.xticks(rotation=45)
# Annotate every bar with its record count, slightly above the bar top
for bar_position, record_count in enumerate(season_counts):
    ax.text(bar_position, record_count + 20, str(record_count), ha='center')
plt.tight_layout()
plt.show()

# Note about COVID-19 impact
print("\nNote: The reduced number of games in 2019-20 and 2020-21 seasons reflects the impact of the COVID-19 pandemic.")

# 5. Home vs. Away Performance Analysis

# Home/away status comes either from an existing HOME_GAME column or, failing that,
# from the MATCHUP text, where a 'vs.' marker is taken to mean a home game.
print("\n--- Home vs. Away Performance ---")

home_indicator = None
if 'HOME_GAME' in games_df.columns:
    home_indicator = 'HOME_GAME'
elif 'MATCHUP' in games_df.columns:
    # regex=False: match the literal text 'vs.' (as a regex the dot matches any character).
    # na=False: a missing MATCHUP counts as "not home" instead of producing NaN, which
    # would break the boolean masks and the `~` negation used below.
    is_home_matchup = games_df['MATCHUP'].str.contains('vs.', regex=False, na=False)
    if is_home_matchup.any():
        games_df['HOME_GAME'] = is_home_matchup
        home_indicator = 'HOME_GAME'
        print("Created HOME_GAME indicator based on MATCHUP column")

if home_indicator is None:
    print("Could not determine home/away status from available columns")

# If we can determine home/away games, analyze performance
if home_indicator:
    is_home = games_df[home_indicator]

    # Count home vs away games
    home_away_counts = is_home.value_counts()
    print("\nHome vs Away Games:")
    print(f"Home Games: {home_away_counts.get(True, 0)}")
    print(f"Away Games: {home_away_counts.get(False, 0)}")

    # Win percentage by home/away
    if 'WL' in games_df.columns:
        games_df['WIN'] = games_df['WL'] == 'W'

        # Calculate win percentages (mean of a boolean column = share of wins)
        home_win_pct = games_df.loc[is_home, 'WIN'].mean() * 100
        away_win_pct = games_df.loc[~is_home, 'WIN'].mean() * 100

        print("\nWin Percentages:")
        print(f"Home Win %: {home_win_pct:.2f}%")
        print(f"Away Win %: {away_win_pct:.2f}%")
        print(f"Home Court Advantage: {home_win_pct - away_win_pct:.2f} percentage points")

        # Visualize home court advantage
        plt.figure(figsize=(10, 6))
        win_pcts = pd.Series([home_win_pct, away_win_pct], index=['Home', 'Away'])
        win_pcts.plot(kind='bar', color=['green', 'orange'])
        plt.title('Win Percentage: Home vs. Away')
        plt.ylabel('Win Percentage (%)')
        plt.ylim(0, 100)
        for i, v in enumerate(win_pcts):
            plt.text(i, v + 1, f"{v:.1f}%", ha='center')
        plt.tight_layout()
        plt.show()

        # Home court advantage by season.
        # Selecting the two needed columns before apply keeps the grouping column out of
        # the per-group frame, which avoids the pandas deprecation warning for that case.
        print("\nHome Court Advantage by Season:")
        season_hca = (
            games_df.groupby('SEASON')[[home_indicator, 'WIN']]
            .apply(
                lambda season: (
                    season.loc[season[home_indicator], 'WIN'].mean()
                    - season.loc[~season[home_indicator], 'WIN'].mean()
                ) * 100
            )
            .sort_index()
        )
        print(season_hca)

        plt.figure(figsize=(12, 6))
        season_hca.plot(kind='bar', color='purple')
        plt.title('Home Court Advantage by Season (Percentage Points)')
        plt.xlabel('Season')
        plt.ylabel('Home Advantage (pp)')
        plt.axhline(y=0, color='gray', linestyle='--')
        # Place the label above positive bars and below negative ones
        for i, v in enumerate(season_hca):
            plt.text(i, v + 0.5 if v >= 0 else v - 1.5, f"{v:.1f}", ha='center')
        plt.tight_layout()
        plt.show()
    else:
        print("Win/Loss indicator not found in dataset")

# 6. Scoring Analysis
print("\n--- Scoring Analysis ---")

if 'PTS' not in games_df.columns:
    print("Points (PTS) column not found in dataset")
else:
    points = games_df['PTS']
    mean_pts = points.mean()
    median_pts = points.median()

    # Headline numbers, including which team produced each extreme
    print(f"Average Points Per Game: {mean_pts:.2f}")
    print(f"Median Points Per Game: {median_pts:.2f}")
    highest_row = points.idxmax()
    print(f"Max Points in a Game: {points.max()} (by {games_df.loc[highest_row, 'TEAM_NAME']})")
    lowest_row = points.idxmin()
    print(f"Min Points in a Game: {points.min()} (by {games_df.loc[lowest_row, 'TEAM_NAME']})")

    # Histogram of single-team scores with mean/median reference lines
    plt.figure(figsize=(12, 6))
    sns.histplot(points, kde=True, bins=30)
    plt.title('Distribution of Points Scored Per Team Per Game')
    plt.xlabel('Points')
    plt.ylabel('Frequency')
    plt.axvline(mean_pts, color='red', linestyle='--', label=f'Mean: {mean_pts:.1f}')
    plt.axvline(median_pts, color='green', linestyle='--', label=f'Median: {median_pts:.1f}')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Season-by-season average
    yearly_scoring = games_df.groupby('SEASON')['PTS'].mean().reset_index()
    plt.figure(figsize=(12, 6))
    plt.plot(yearly_scoring['SEASON'], yearly_scoring['PTS'], marker='o', linewidth=2)
    plt.title('Average Points Per Game by Season')
    plt.xlabel('Season')
    plt.ylabel('Average Points')
    plt.xticks(rotation=45)
    plt.grid(True, linestyle='--', alpha=0.7)

    # Value labels: after reset_index the row position doubles as the x position
    for position, season_avg in enumerate(yearly_scoring['PTS']):
        plt.text(position, season_avg + 0.3, f"{season_avg:.1f}", ha='center')

    plt.tight_layout()
    plt.show()

    # Split by location when a home/away flag is available
    if home_indicator:
        at_home = games_df[home_indicator]
        home_pts = points[at_home].mean()
        away_pts = points[~at_home].mean()
        print("\nScoring by Location:")
        print(f"Home Teams Average: {home_pts:.2f} points")
        print(f"Away Teams Average: {away_pts:.2f} points")
        print(f"Difference: {home_pts - away_pts:.2f} points")

        plt.figure(figsize=(10, 6))
        scoring_loc = pd.Series([home_pts, away_pts], index=['Home', 'Away'])
        scoring_loc.plot(kind='bar', color=['blue', 'red'])
        plt.title('Average Points Scored: Home vs. Away')
        plt.ylabel('Average Points')
        for position, avg_pts in enumerate(scoring_loc):
            plt.text(position, avg_pts + 0.5, f"{avg_pts:.1f}", ha='center')
        plt.tight_layout()
        plt.show()

# 7. Team Performance Analysis
print("\n--- Team Performance Analysis ---")

if 'TEAM_NAME' not in games_df.columns or 'WL' not in games_df.columns:
    print("Team name or win/loss columns not found in dataset")
else:
    # Boolean win flag makes the mean equal to the win rate
    if 'WIN' not in games_df.columns:
        games_df['WIN'] = games_df['WL'] == 'W'

    # One row per team: records played and win percentage, best teams first
    team_performance = (
        games_df.groupby('TEAM_NAME')['WIN']
        .agg(['count', 'mean'])
        .reset_index()
        .rename(columns={'count': 'Games', 'mean': 'Win_PCT'})
        .assign(Win_PCT=lambda frame: frame['Win_PCT'] * 100)
        .sort_values('Win_PCT', ascending=False)
    )

    print("Overall Team Performance (2017-2024):")
    display(team_performance.head(10))

    # Same horizontal bar chart for both ends of the ranking
    ranking_charts = (
        (team_performance.head(10),
         'Top 10 NBA Teams by Win Percentage (2017-2024)'),
        (team_performance.tail(10).sort_values('Win_PCT'),
         'Bottom 10 NBA Teams by Win Percentage (2017-2024)'),
    )
    for ranked_slice, chart_title in ranking_charts:
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Win_PCT', y='TEAM_NAME', data=ranked_slice, palette='viridis')
        plt.title(chart_title)
        plt.xlabel('Win Percentage (%)')
        plt.tight_layout()
        plt.show()

    # Team performance over time
    if 'SEASON' in games_df.columns:
        # Track the five best and five worst teams overall
        top_teams = team_performance.head(5)['TEAM_NAME'].tolist()
        bottom_teams = team_performance.tail(5)['TEAM_NAME'].tolist()
        selected_teams = top_teams + bottom_teams

        # Win percentage per (season, team) for the selected teams only
        is_selected = games_df['TEAM_NAME'].isin(selected_teams)
        team_season_perf = (
            games_df[is_selected]
            .groupby(['SEASON', 'TEAM_NAME'])['WIN']
            .mean()
            .reset_index()
        )
        team_season_perf['WIN_PCT'] = team_season_perf['WIN'] * 100

        # One line per team
        plt.figure(figsize=(14, 8))
        for team_name in selected_teams:
            one_team = team_season_perf.loc[team_season_perf['TEAM_NAME'] == team_name]
            plt.plot(one_team['SEASON'], one_team['WIN_PCT'], marker='o', linewidth=2, label=team_name)

        plt.title('Win Percentage Trends for Selected Teams')
        plt.xlabel('Season')
        plt.ylabel('Win Percentage (%)')
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# 8. Conference Analysis
print("\n--- Conference Performance Analysis ---")

# Use conference data from the game dataset when present; otherwise try to attach it
# from the most recent processed team file. conference_col stays None on any failure.
if 'TEAM_CONFERENCE' in games_df.columns:
    conference_col = 'TEAM_CONFERENCE'
else:
    print("Conference information not found in game data. Attempting to merge with team data...")
    conference_col = None

    # Find the most recent team data file
    team_files = glob.glob('../data/processed/nba_teams_*.csv')
    if not team_files:
        print("No team data files found")
    else:
        # Modification time, not getctime: on Unix ctime is the last metadata change
        latest_team_file = max(team_files, key=os.path.getmtime)
        print(f"Loading team data from: {latest_team_file}")

        # Load team data
        teams_df = pd.read_csv(latest_team_file)

        if 'id' not in teams_df.columns or 'Conference' not in teams_df.columns:
            print("Team data does not contain required conference information")
        else:
            # Prefer joining on team ID; fall back to the full team name
            if 'TEAM_ID' in games_df.columns:
                merge_keys = ('TEAM_ID', 'id')
            elif 'TEAM_NAME' in games_df.columns and 'full_name' in teams_df.columns:
                merge_keys = ('TEAM_NAME', 'full_name')
            else:
                merge_keys = None

            if merge_keys is None:
                print("Could not merge team data - no matching columns found")
            else:
                game_key, team_key = merge_keys
                # One row per team key: a repeated key in the lookup would make the left
                # merge duplicate game rows and skew every statistic computed afterwards.
                team_lookup = teams_df[[team_key, 'Conference']].drop_duplicates(subset=team_key)
                games_df = pd.merge(
                    games_df,
                    team_lookup,
                    left_on=game_key,
                    right_on=team_key,
                    how='left'
                )
                conference_col = 'Conference'

# Conference-level performance, only when a usable conference column exists
has_conference_data = bool(conference_col) and conference_col in games_df.columns

if not has_conference_data:
    print("Conference analysis could not be performed due to missing data")
else:
    # Report how many rows could not be assigned a conference
    missing_conf = games_df[conference_col].isna().sum()
    if missing_conf > 0:
        print(f"Warning: {missing_conf} rows ({missing_conf/len(games_df)*100:.1f}%) are missing conference data")

    if 'WIN' not in games_df.columns:
        print("Win/Loss indicator not available for conference analysis")
    else:
        # Overall record count and win percentage per conference
        conf_perf = (
            games_df.groupby(conference_col)['WIN']
            .agg(['count', 'mean'])
            .reset_index()
            .rename(columns={'count': 'Games', 'mean': 'Win_PCT'})
        )
        conf_perf['Win_PCT'] = conf_perf['Win_PCT'] * 100

        print("\nOverall Conference Performance:")
        display(conf_perf)

        # Win percentage per (season, conference)
        conf_season_perf = games_df.groupby(['SEASON', conference_col])['WIN'].mean().reset_index()
        conf_season_perf['WIN_PCT'] = conf_season_perf['WIN'] * 100

        # One line per conference, with a 50% reference line
        plt.figure(figsize=(12, 6))
        for conference in games_df[conference_col].dropna().unique():
            one_conf = conf_season_perf.loc[conf_season_perf[conference_col] == conference]
            plt.plot(one_conf['SEASON'], one_conf['WIN_PCT'], marker='o', linewidth=2, label=conference)

        plt.title('Conference Win Percentage by Season')
        plt.xlabel('Season')
        plt.ylabel('Win Percentage (%)')
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.xticks(rotation=45)
        plt.axhline(y=50, color='gray', linestyle='--')
        plt.tight_layout()
        plt.show()

        # Inter-conference games
        if 'MATCHUP' in games_df.columns and 'OPP_TEAM_ID' in games_df.columns:
            print("\nAttempting to analyze inter-conference games...")
            # Placeholder: nothing is computed here yet.
            # Doing so requires each opponent's conference, to tell East-vs-West
            # games apart; that is left for a separate analysis.

# 9. Seasonal Patterns
print("\n--- Seasonal Patterns Analysis ---")

if 'GAME_DATE' in games_df.columns and 'WIN' in games_df.columns:
    # Extract month from game date
    games_df['MONTH'] = games_df['GAME_DATE'].dt.month
    # Convert to basketball season month: Oct=1, Nov=2, ..., Jun=9, Jul=10, Aug=11, Sep=12
    # (same mapping as "m - 9 if m >= 10 else m + 3", computed without a Python-level apply)
    games_df['SEASON_MONTH'] = (games_df['MONTH'] - 10) % 12 + 1
    # NOTE(review): games a season played in the following October (e.g. after a delayed
    # restart) land in the 'Oct' bucket with season openers - confirm whether that matters.

    # Labels for every value SEASON_MONTH can take. Jul-Sep must be included: seasons that
    # ran into the summer produce values 10-12, which previously had no label or tick.
    month_names = {
        1: 'Oct', 2: 'Nov', 3: 'Dec', 4: 'Jan', 5: 'Feb', 6: 'Mar',
        7: 'Apr', 8: 'May', 9: 'Jun', 10: 'Jul', 11: 'Aug', 12: 'Sep'
    }

    # Get win percentage by month for home teams
    if home_indicator:
        monthly_home_win = games_df[games_df[home_indicator]].groupby('SEASON_MONTH')['WIN'].mean()

        plt.figure(figsize=(12, 6))
        monthly_home_win.plot(kind='line', marker='o')
        plt.title('Home Team Win Percentage by Month of Season')
        plt.xlabel('Month')
        plt.ylabel('Win Percentage')
        # Tick exactly the months that have data, so every plotted point is labelled
        home_month_ticks = list(monthly_home_win.index)
        plt.xticks(home_month_ticks, [month_names[m] for m in home_month_ticks])
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.ylim(0.4, 0.7)

        # Add value labels
        for i, v in monthly_home_win.items():
            plt.text(i, v + 0.01, f"{v:.3f}", ha='center')

        plt.tight_layout()
        plt.show()

    # Scoring by month
    if 'PTS' in games_df.columns:
        monthly_scoring = games_df.groupby('SEASON_MONTH')['PTS'].mean()

        plt.figure(figsize=(12, 6))
        monthly_scoring.plot(kind='line', marker='o', color='green')
        plt.title('Average Points Per Game by Month of Season')
        plt.xlabel('Month')
        plt.ylabel('Average Points')
        scoring_month_ticks = list(monthly_scoring.index)
        plt.xticks(scoring_month_ticks, [month_names[m] for m in scoring_month_ticks])
        plt.grid(True, linestyle='--', alpha=0.7)

        # Add value labels
        for i, v in monthly_scoring.items():
            plt.text(i, v + 0.3, f"{v:.1f}", ha='center')

        plt.tight_layout()
        plt.show()

    # Day of week patterns (pandas convention: Monday=0 ... Sunday=6)
    games_df['WEEKDAY'] = games_df['GAME_DATE'].dt.dayofweek
    weekday_names = {
        0: 'Monday', 1: 'Tuesday', 2: 'Wednesday',
        3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'
    }

    # Game frequency by day of week, in calendar order, relabelled with day names
    weekday_counts = games_df['WEEKDAY'].value_counts().sort_index()
    weekday_counts.index = weekday_counts.index.map(weekday_names)

    plt.figure(figsize=(12, 6))
    weekday_counts.plot(kind='bar')
    plt.title('Number of Games by Day of Week')
    plt.xlabel('Day of Week')
    plt.ylabel('Number of Games')
    for i, v in enumerate(weekday_counts):
        plt.text(i, v + 50, str(v), ha='center')
    plt.tight_layout()
    plt.show()

    # Home win percentage by day of week
    if home_indicator:
        weekday_home_win = games_df[games_df[home_indicator]].groupby('WEEKDAY')['WIN'].mean()
        weekday_home_win.index = weekday_home_win.index.map(weekday_names)

        plt.figure(figsize=(12, 6))
        weekday_home_win.plot(kind='bar', color='purple')
        plt.title('Home Team Win Percentage by Day of Week')
        plt.xlabel('Day of Week')
        plt.ylabel('Win Percentage')
        plt.ylim(0.4, 0.7)
        for i, v in enumerate(weekday_home_win):
            plt.text(i, v + 0.01, f"{v:.3f}", ha='center')
        plt.tight_layout()
        plt.show()

else:
    print("Required date or win/loss columns not found for seasonal analysis")

# 10. Margin of Victory Analysis
print("\n--- Margin of Victory Analysis ---")

# Use an existing PLUS_MINUS column, or derive it from own and opponent points.
# mov_col stays None when neither is possible and the rest of the section is skipped.
if 'PLUS_MINUS' in games_df.columns:
    mov_col = 'PLUS_MINUS'
    print("Using PLUS_MINUS column for margin of victory")
elif all(col in games_df.columns for col in ['PTS', 'OPP_PTS']):
    games_df['PLUS_MINUS'] = games_df['PTS'] - games_df['OPP_PTS']
    mov_col = 'PLUS_MINUS'
    print("Calculated PLUS_MINUS from PTS and OPP_PTS")
else:
    print("Cannot calculate margin of victory from available columns")
    mov_col = None

if mov_col:
    # Basic MOV stats
    print(f"\nAverage Margin of Victory (absolute): {games_df[mov_col].abs().mean():.2f} points")
    print(f"Median Margin of Victory (absolute): {games_df[mov_col].abs().median():.2f} points")
    print(f"Maximum Margin of Victory: {games_df[mov_col].max():.2f} points")
    print(f"Maximum Margin of Defeat: {games_df[mov_col].min():.2f} points")

    # Distribution of margins
    plt.figure(figsize=(12, 6))
    sns.histplot(games_df[mov_col], kde=True, bins=50)
    plt.title('Distribution of Game Point Differentials')
    plt.xlabel('Point Differential (positive = win, negative = loss)')
    plt.ylabel('Frequency')
    plt.axvline(x=0, color='red', linestyle='--')
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    # Close games analysis
    close_threshold = 3  # games decided by 3 points or less
    close_games_pct = (games_df[mov_col].abs() <= close_threshold).mean() * 100
    print(f"\nPercentage of games decided by {close_threshold} points or less: {close_games_pct:.2f}%")

    blowout_threshold = 20  # games decided by 20+ points
    blowout_games_pct = (games_df[mov_col].abs() >= blowout_threshold).mean() * 100
    print(f"Percentage of games decided by {blowout_threshold}+ points (blowouts): {blowout_games_pct:.2f}%")

    # Margin categories.
    # pd.cut needs exactly one label per interval; the previous 8 edges / 6 labels raised
    # ValueError and had no category for 4-9 point games. Intervals are right-closed, so
    # the edges below give (for whole-point margins):
    #   <=-20 | -19..-10 | -9..-4 | -3..0 | 1..3 | 4..9 | 10..19 | >=20
    # Infinite outer edges keep any margin from falling outside the bins.
    # NOTE(review): a margin of exactly 0 would land in 'Close Loss (1-3)'; this assumes
    # the data has no tied games - confirm.
    bins = [-np.inf, -20, -10, -4, 0, 3, 9, 19, np.inf]
    labels = ['Blown Out (20+)', 'Big Loss (10-19)', 'Loss (4-9)', 'Close Loss (1-3)',
              'Close Win (1-3)', 'Win (4-9)', 'Big Win (10-19)', 'Blowout Win (20+)']
    games_df['MARGIN_CAT'] = pd.cut(games_df[mov_col], bins=bins, labels=labels)

    # sort_index on a categorical index keeps the loss-to-win order defined above
    margin_counts = games_df['MARGIN_CAT'].value_counts().sort_index()
    margin_pcts = margin_counts / len(games_df) * 100

    plt.figure(figsize=(14, 8))
    margin_pcts.plot(kind='bar', color=sns.color_palette("viridis", len(margin_pcts)))
    plt.title('Distribution of Game Outcomes by Margin Category')
    plt.xlabel('Margin Category')
    plt.ylabel('Percentage of Games')
    for i, v in enumerate(margin_pcts):
        plt.text(i, v + 0.5, f"{v:.1f}%", ha='center')
    plt.xticks(rotation=45)
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

    # Margin of victory by season
    if 'SEASON' in games_df.columns:
        season_mov = games_df.groupby('SEASON')[mov_col].agg(['mean', 'median', 'std']).reset_index()
        # Attach the mean absolute margin by season key rather than by row position,
        # so the values cannot silently misalign with the SEASON column.
        abs_mean_by_season = games_df[mov_col].abs().groupby(games_df['SEASON']).mean()
        season_mov['abs_mean'] = season_mov['SEASON'].map(abs_mean_by_season)

        plt.figure(figsize=(12, 6))
        plt.plot(season_mov['SEASON'], season_mov['abs_mean'], marker='o', linewidth=2,
                 label='Average Absolute Margin')
        plt.plot(season_mov['SEASON'], season_mov['std'], marker='s', linewidth=2,
                 label='Standard Deviation')
        plt.title('Game Margin Trends by Season')
        plt.xlabel('Season')
        plt.ylabel('Points')
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.xticks(rotation=45)

        # Add value labels (row position doubles as the x position after reset_index)
        for i, row in season_mov.iterrows():
            plt.text(i, row['abs_mean'] + 0.2, f"{row['abs_mean']:.1f}", ha='center')
            plt.text(i, row['std'] + 0.2, f"{row['std']:.1f}", ha='center', color='orange')

        plt.tight_layout()
        plt.show()

# 11. Key Insights and Next Steps
# Static, hand-written summary; none of the numbers below are computed from the data.
insights_summary = """
Based on this exploratory analysis, several potential predictive features emerge:

1. Home Court Advantage:
   - Home teams consistently win more often (typically by 4-8 percentage points)
   - Home court advantage varies by season and may have been affected by COVID

2. Team Strength Indicators:
   - Historical win percentage (overall and recent)
   - Scoring differentials
   - Conference performance patterns

3. Temporal Patterns:
   - Month of season effects on performance
   - Day of week patterns
   - Season-to-season trends

4. Game Context:
   - Close vs. blowout game likelihood
   - Scoring environment changes over time

Next steps for model development:
1. Engineer features based on these observations
2. Create rolling performance metrics (last 10/20 games)
3. Develop head-to-head matchup statistics
4. Build baseline prediction models using these features
5. Evaluate which features provide the most predictive power
"""

print("\n--- Key Insights and Potential Predictive Features ---")
print(insights_summary)

# 12. Save insights to a file for future reference
# Writes a Markdown report built from results computed in the sections above
# (team_performance, monthly_home_win, weekday_counts, close/blowout percentages).
insights_path = '../documentation/game_data_exploration_insights.md'
os.makedirs(os.path.dirname(insights_path), exist_ok=True)

# Explicit UTF-8: the report contains '≤' and '≥', which a non-UTF-8 platform default
# encoding (e.g. cp1252 on Windows) cannot encode and would fail on mid-write.
with open(insights_path, 'w', encoding='utf-8') as f:
    f.write("# NBA Game Data Exploration Insights\n\n")
    f.write(f"Analysis conducted on {datetime.now().strftime('%Y-%m-%d')}\n\n")
    f.write("## Dataset Overview\n")
    # NOTE(review): len(games_df) counts rows, which the earlier bar chart calls
    # "Game Records" - confirm whether a row is a whole game or one team's side of it.
    f.write(f"- Total games analyzed: {len(games_df)}\n")
    # map(str, ...) so the join also works when SEASON was parsed as a number
    f.write(f"- Seasons covered: {', '.join(map(str, sorted(games_df['SEASON'].unique())))}\n")
    f.write(f"- Date range: {games_df['GAME_DATE'].min().strftime('%Y-%m-%d')} to {games_df['GAME_DATE'].max().strftime('%Y-%m-%d')}\n\n")

    f.write("## Key Insights\n\n")

    # Home court advantage
    if home_indicator and 'WIN' in games_df.columns:
        home_win_pct = games_df[games_df[home_indicator]]['WIN'].mean() * 100
        away_win_pct = games_df[~games_df[home_indicator]]['WIN'].mean() * 100
        hca = home_win_pct - away_win_pct

        f.write("### Home Court Advantage\n")
        f.write(f"- Home teams win {home_win_pct:.1f}% of games\n")
        f.write(f"- Away teams win {away_win_pct:.1f}% of games\n")
        f.write(f"- Overall home court advantage: {hca:.1f} percentage points\n\n")

    # Scoring trends
    if 'PTS' in games_df.columns:
        f.write("### Scoring Patterns\n")
        f.write(f"- Average points per team per game: {games_df['PTS'].mean():.1f}\n")
        if 'SEASON' in games_df.columns:
            latest_season = games_df['SEASON'].max()
            latest_avg = games_df[games_df['SEASON'] == latest_season]['PTS'].mean()
            first_season = games_df['SEASON'].min()
            first_avg = games_df[games_df['SEASON'] == first_season]['PTS'].mean()
            f.write(f"- Scoring trend: {first_avg:.1f} PPG in {first_season} to {latest_avg:.1f} PPG in {latest_season}\n")
        if home_indicator:
            # Recomputed here rather than relying on values left over from the scoring section
            home_pts = games_df[games_df[home_indicator]]['PTS'].mean()
            away_pts = games_df[~games_df[home_indicator]]['PTS'].mean()
            f.write(f"- Home teams score {home_pts - away_pts:.1f} more points on average than away teams\n")
        # Always end the section with a blank line so the next heading renders correctly
        f.write("\n")

    # Close games
    if mov_col:
        f.write("### Game Margins\n")
        f.write(f"- Average margin of victory: {games_df[mov_col].abs().mean():.1f} points\n")
        f.write(f"- Close games (≤3 points): {close_games_pct:.1f}% of all games\n")
        f.write(f"- Blowouts (≥20 points): {blowout_games_pct:.1f}% of all games\n\n")

    # Team performance (team_performance is sorted best-first)
    if 'TEAM_NAME' in games_df.columns and 'WIN' in games_df.columns:
        f.write("### Team Performance\n")
        top_team = team_performance.iloc[0]
        bottom_team = team_performance.iloc[-1]
        f.write(f"- Best performing team: {top_team['TEAM_NAME']} ({top_team['Win_PCT']:.1f}% wins)\n")
        f.write(f"- Worst performing team: {bottom_team['TEAM_NAME']} ({bottom_team['Win_PCT']:.1f}% wins)\n\n")

    # Seasonal patterns
    if 'MONTH' in games_df.columns and 'WIN' in games_df.columns:
        f.write("### Seasonal Patterns\n")
        if 'SEASON_MONTH' in games_df.columns and home_indicator:
            best_month = monthly_home_win.idxmax()
            worst_month = monthly_home_win.idxmin()
            # .get with a fallback: a season month without a name in month_names must not
            # abort the report with a KeyError halfway through the file.
            best_label = month_names.get(best_month, f"Month {best_month}")
            worst_label = month_names.get(worst_month, f"Month {worst_month}")
            f.write(f"- Strongest home advantage: {best_label} ({monthly_home_win[best_month]:.3f} win %)\n")
            f.write(f"- Weakest home advantage: {worst_label} ({monthly_home_win[worst_month]:.3f} win %)\n")

        if 'WEEKDAY' in games_df.columns:
            most_games_day = weekday_counts.idxmax()
            least_games_day = weekday_counts.idxmin()
            f.write(f"- Most games played on: {most_games_day}s\n")
            f.write(f"- Fewest games played on: {least_games_day}s\n")
        f.write("\n")

    # Static recommendations and next steps
    f.writelines([
        "## Potential Predictive Features\n\n",
        "Based on this exploration, these features may be most valuable for prediction:\n\n",
        "1. **Team Strength Metrics**\n",
        "   - Recent win percentage (last 10, 20, 40 games)\n",
        "   - Point differential trends\n",
        "   - Home/away specific performance\n\n",

        "2. **Matchup Factors**\n",
        "   - Historical head-to-head performance\n",
        "   - Conference matchup dynamics\n",
        "   - Style matchup (high-scoring vs. defensive teams)\n\n",

        "3. **Contextual Elements**\n",
        "   - Home court advantage\n",
        "   - Day of week\n",
        "   - Time of season\n",
        "   - Rest days between games\n\n",

        "4. **Advanced Possibilities**\n",
        "   - Game importance (playoff implications)\n",
        "   - Team momentum (streaks)\n",
        "   - Travel distance factors\n\n",

        "## Next Steps\n\n",
        "1. Engineer the identified features from raw game data\n",
        "2. Develop a baseline prediction model using logistic regression\n",
        "3. Evaluate feature importance and refine the feature set\n",
        "4. Experiment with more advanced modeling approaches\n",
        "5. Integrate player-level statistics for enhanced predictions\n",
    ])

print(f"\nExploration insights saved to {insights_path}")

Finding the most recent processed game data file...


FileNotFoundError: No processed game data files found. Run collect_games.py first.