# Feature Engineering for March Madness Predictions

This notebook creates predictive features by:
1. Calculating advanced team statistics
2. Incorporating historical tournament performance
3. Engineering matchup-specific features
4. Preparing features for modeling

In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os
# Directory holding the raw input CSVs; change it here to point the notebook
# at a different data location instead of editing every read_csv call.
DATA_DIR = "../data"

# Regular-season game results; passed to calculate_season_stats further down.
regular_season = pd.read_csv(os.path.join(DATA_DIR, "MRegularSeasonCompactResults.csv"))
# Contents of MNCAATourneySeeds.csv, loaded for later use.
seeds = pd.read_csv(os.path.join(DATA_DIR, "MNCAATourneySeeds.csv"))

## Team Performance Metrics

Calculate season stats for each team:

In [18]:
def calculate_season_stats(df, season):
    """Aggregate one season's games from the winner's and the loser's side.

    Parameters
    ----------
    df : pandas.DataFrame
        Game results with the columns 'Season', 'WTeamID', 'LTeamID',
        'WScore', 'LScore' and 'NumOT'.
    season
        Value compared against the 'Season' column to select the games.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(winning_stats, losing_stats)``, one row per team.
        Columns are two-level (source column, aggregation) in this order:

        * winning_stats: WTeamID, WScore count, WScore mean, LScore mean,
          NumOT sum -- computed over the games the team won.
        * losing_stats: LTeamID, WScore mean, LScore count, LScore mean,
          NumOT sum -- computed over the games the team lost.
    """
    games = df.loc[df['Season'] == season]

    # Aggregations applied to the games grouped by their winner.
    as_winner_spec = {
        'WScore': ['count', 'mean'],
        'LScore': ['mean'],
        'NumOT': ['sum'],
    }
    # Aggregations applied to the games grouped by their loser.
    as_loser_spec = {
        'WScore': ['mean'],
        'LScore': ['count', 'mean'],
        'NumOT': ['sum'],
    }

    by_winner = games.groupby('WTeamID').agg(as_winner_spec).reset_index()
    by_loser = games.groupby('LTeamID').agg(as_loser_spec).reset_index()
    return by_winner, by_loser

In [19]:
# Save engineered features
def save_features(features_df, filename, output_dir="../data/features/"):
    """Write a feature table to ``<output_dir>/<filename>`` as CSV.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Table to persist. The index is not written.
    filename : str
        Name of the CSV file to create inside ``output_dir``.
    output_dir : str, optional
        Destination directory. Defaults to "../data/features/", the location
        this function always used before the parameter existed. The directory
        (including missing parents) is created when absent.

    Returns
    -------
    None
        The function only writes the file and prints a confirmation line.
    """
    # Create the features directory if it doesn't exist.
    os.makedirs(output_dir, exist_ok=True)
    # os.path.join works whether or not output_dir ends with a separator.
    target = os.path.join(output_dir, filename)
    features_df.to_csv(target, index=False)
    print(f"Saved features to {target}")

# Example usage:
# NOTE: process_stats is defined in a later cell of this notebook. On a fresh
# kernel that cell has to be executed before this one (or its definition moved
# above), otherwise the loop below fails with a NameError.
season_frames = []
for season in regular_season['Season'].unique():
    winning_stats, losing_stats = calculate_season_stats(regular_season, season)
    # Process and combine stats...
    season_stats = process_stats(winning_stats, losing_stats)
    # Tag every row with its season: without this column the rows of the same
    # team from different seasons cannot be told apart in the combined table.
    tagged = season_stats.assign(Season=season)
    ordered_cols = ['Season'] + [col for col in tagged.columns if col != 'Season']
    season_frames.append(tagged[ordered_cols])

# Concatenate once instead of growing a DataFrame inside the loop (which
# re-copies all accumulated rows on every iteration). pd.concat raises on an
# empty list, so fall back to an empty frame when there are no seasons.
all_season_stats = (
    pd.concat(season_frames, ignore_index=True) if season_frames else pd.DataFrame()
)

# Save the features
save_features(all_season_stats, "team_season_stats.csv")

Saved features to ../data/features/team_season_stats.csv


In [20]:
def process_stats(winning_stats, losing_stats):
    """Combine per-team winning and losing aggregates into team features.

    Columns of both inputs are identified by POSITION, not by name, so each
    frame must have exactly five columns in the order shown below (this is
    the layout produced by ``calculate_season_stats``).

    Parameters
    ----------
    winning_stats : pandas.DataFrame
        Columns, in order: team id, number of wins, mean points scored in
        wins, mean points allowed in wins, overtime total in wins.
    losing_stats : pandas.DataFrame
        Columns, in order: team id, mean opponent points in losses, number of
        losses, mean points scored in losses, overtime total in losses.

    Returns
    -------
    pandas.DataFrame
        One row per team with the columns 'TeamID', 'Games', 'WinPct',
        'AvgPointsScored', 'AvgPointsAllowed' and 'OTRate'.
        'OTRate' is the overtime total divided by games played; when the
        inputs come from ``calculate_season_stats`` that total is the sum of
        'NumOT', so the value is overtime periods per game rather than the
        share of games that went to overtime.

    Raises
    ------
    ValueError
        If either input does not have exactly five columns (raised by pandas
        when the new column names are assigned).

    Notes
    -----
    The inputs are not modified; renaming happens on copies.
    """
    # Rename on copies so the caller's frames keep their original columns.
    wins_df = winning_stats.copy()
    wins_df.columns = ['TeamID', 'Wins', 'PointsScored', 'PointsAllowed', 'OTGames']
    losses_df = losing_stats.copy()
    losses_df.columns = ['TeamID', 'OppPointsScored', 'Losses', 'PointsScored2', 'OTGames2']

    # Outer merge keeps teams that only won or only lost; their missing side
    # becomes 0, which drops out of the weighted averages below because the
    # matching weight (Wins or Losses) is 0 as well.
    team_stats = pd.merge(wins_df, losses_df, on='TeamID', how='outer').fillna(0)

    # The merge turns the count columns into floats whenever it had to fill a
    # gap; cast so 'Games' has the same integer dtype for every input.
    team_stats['Games'] = (team_stats['Wins'] + team_stats['Losses']).astype(int)
    team_stats['WinPct'] = team_stats['Wins'] / team_stats['Games']
    # Per-game averages: weight the "in wins" and "in losses" means by the
    # number of wins and losses respectively.
    team_stats['AvgPointsScored'] = (team_stats['PointsScored'] * team_stats['Wins'] +
                                     team_stats['PointsScored2'] * team_stats['Losses']) / team_stats['Games']
    team_stats['AvgPointsAllowed'] = (team_stats['PointsAllowed'] * team_stats['Wins'] +
                                      team_stats['OppPointsScored'] * team_stats['Losses']) / team_stats['Games']
    team_stats['OTRate'] = (team_stats['OTGames'] + team_stats['OTGames2']) / team_stats['Games']

    # Keep only the final feature columns.
    final_cols = ['TeamID', 'Games', 'WinPct', 'AvgPointsScored', 'AvgPointsAllowed', 'OTRate']
    return team_stats[final_cols]