# March Madness Prediction

## Overview

### Goal
Submissions are scored with the Brier score, so the goal is to minimize the Brier score between the predicted probabilities and the actual game outcomes. The Brier score measures the accuracy of probabilistic predictions; in this case it is the mean squared error between each predicted probability and the observed outcome.

The Brier score can be thought of as a cost function that measures the average squared difference between the predicted probabilities and the actual outcomes.

$$
Brier = \frac{1}{N} \sum_{i=1}^{N} (p_i - o_i)^2
$$

where $p_i$ is the predicted probability of the event and $o_i$ is the actual outcome (1 if the event happened, 0 otherwise). The score is averaged over all items in a set of $N$ predictions.

Therefore, minimizing the Brier score will result in a more accurate prediction.


## Import Libraries
Numpy for numerical operations
Pandas for data manipulation
Matplotlib, Seaborn, Plotly for plotting


In [2]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.subplots as sp
import xgboost as xgb
import sklearn as sk

## Load Data

Set up a data dictionary that stores the contents of each file, keyed by file name. This makes it easier to access the data from the CSVs.


In [3]:
data_dir = 'data/'
files = glob.glob(data_dir + '*.csv')

# Map each CSV's base name (directory and extension removed) to its DataFrame
data = {}
for file in files:
    # Treat '\' like '/' so the last path component is found on any platform
    basename = file.replace('\\', '/').rsplit('/', 1)[-1]
    filename = basename.split('.', 1)[0]
    try:
        frame = pd.read_csv(file, encoding='latin-1')
    except Exception as e:
        # Best effort: report the unreadable file and carry on with the rest
        print(f"Warning: Could not load {filename}: {e}")
    else:
        data[filename] = frame



Now that we have the data loaded, let's start by loading the team and team-spelling data.


In [None]:
# Stack the men's and women's team tables into a single frame
teams = pd.concat([data['MTeams'], data['WTeams']])

# Count how many spellings are recorded for each TeamID
teams_spelling = (
    pd.concat([data['MTeamSpellings'], data['WTeamSpellings']])
    .groupby(by='TeamID', as_index=False)['TeamNameSpelling']
    .count()
    .rename(columns={'TeamNameSpelling': 'TeamNameCount'})
)

# Left join keeps every team, including any without a spelling entry
teams = teams.merge(teams_spelling, how='left', on=['TeamID'])
print(teams.head())

Add the season and tourney results, both compact and detailed. There is a chance we only end up using the detailed results.

In [5]:
def _stack_results(table, stage):
    """Concatenate the men's and women's `table` and set column ST to `stage` on every row."""
    return pd.concat([data['M' + table], data['W' + table]]).assign(ST=stage)


# ST is 'S' for the regular-season tables and 'T' for the NCAA tournament tables
season_compact_results = _stack_results('RegularSeasonCompactResults', 'S')
season_detailed_results = _stack_results('RegularSeasonDetailedResults', 'S')
tourney_compact_results = _stack_results('NCAATourneyCompactResults', 'T')
tourney_detailed_results = _stack_results('NCAATourneyDetailedResults', 'T')


In [None]:
# Create a summary of the datasets: one row per combined results table
_summary_sources = {
    'Regular Season Compact': season_compact_results,
    'Regular Season Detailed': season_detailed_results,
    'Tournament Compact': tourney_compact_results,
    'Tournament Detailed': tourney_detailed_results,
}

summary = pd.DataFrame({
    'Dataset': list(_summary_sources),
    'Shape': [frame.shape for frame in _summary_sources.values()],
    'Columns': [len(frame.columns) for frame in _summary_sources.values()],
    # First and last season present in each table
    'Sample Years': [
        f"{frame['Season'].min()}-{frame['Season'].max()}"
        for frame in _summary_sources.values()
    ],
})

# Display the summary with bordered, centred cells
_cell_style = {
    'border-color': 'black',
    'border-style': 'solid',
    'border-width': '1px',
    'text-align': 'center',
}
display(summary.style.set_properties(**_cell_style))

# To preview the first few rows of any dataset, run for example:
#   display(season_compact_results.head(3))
#   display(season_detailed_results.head(3))
#   display(tourney_compact_results.head(3))
#   display(tourney_detailed_results.head(3))

In [20]:
# Load other data that might prove useful (men's and women's tables stacked)
slots = pd.concat([data[league + 'NCAATourneySlots'] for league in 'MW'])
game_cities = pd.concat([data[league + 'GameCities'] for league in 'MW'])
seasons = pd.concat([data[league + 'Seasons'] for league in 'MW'])

seeds = pd.concat([data[league + 'NCAATourneySeeds'] for league in 'MW'])
# Numeric seed: the first run of digits found in the Seed string
seeds['SeedValue'] = seeds['Seed'].str.extract(r'(\d+)', expand=False).astype(int)
# Lookup keyed by "<Season>_<TeamID>"; the value is characters 1-2 of Seed as an int
seeds_dict = {
    f"{int(season)}_{team_id}": int(seed[1:3])
    for season, seed, team_id in zip(seeds['Season'], seeds['Seed'], seeds['TeamID'])
}

# Combine all game results (regular season first, then tournament)
all_compact_results = pd.concat([season_compact_results, tourney_compact_results])
all_detailed_results = pd.concat([season_detailed_results, tourney_detailed_results])


## Feature Engineering



In [21]:
# --- Outcome features shared by the compact and detailed tables ---
_location_code = {'H': 1, 'N': 0, 'A': -1}  # WLoc value -> numeric code
for _games in (all_compact_results, all_detailed_results):
    _games['ScoreDiff'] = _games['WScore'] - _games['LScore']
    _games['HomeAdvantage'] = _games['WLoc'].map(_location_code)

# --- Shooting percentages (0 where there were no attempts) ---
_shooting = (
    ('FGPct', 'FGM', 'FGA'),
    ('FG3Pct', 'FGM3', 'FGA3'),
    ('FTPct', 'FTM', 'FTA'),
)
for _side in ('W', 'L'):
    for _pct, _made, _attempted in _shooting:
        _attempts = all_detailed_results[_side + _attempted]
        all_detailed_results[_side + _pct] = np.where(
            _attempts > 0,
            all_detailed_results[_side + _made] / _attempts,
            0,
        )

# --- Winner-minus-loser statistical differences ---
_winner_rebounds = all_detailed_results['WOR'] + all_detailed_results['WDR']
_loser_rebounds = all_detailed_results['LOR'] + all_detailed_results['LDR']
all_detailed_results['ReboundDiff'] = _winner_rebounds - _loser_rebounds
for _label, _stat in (
    ('Assist', 'Ast'),
    ('Turnover', 'TO'),
    ('Steal', 'Stl'),
    ('Block', 'Blk'),
    ('Foul', 'PF'),
):
    all_detailed_results[f'{_label}Diff'] = (
        all_detailed_results['W' + _stat] - all_detailed_results['L' + _stat]
    )


# --- Seed information for tournament games ---
def _with_seed(games, side):
    """Left-join SeedValue for the `side` ('W' or 'L') team and name it `<side>SeedValue`."""
    seeded = games.merge(
        seeds[['Season', 'TeamID', 'SeedValue']],
        how='left',
        left_on=['Season', side + 'TeamID'],
        right_on=['Season', 'TeamID'],
    )
    # The join key from the seeds table is redundant once the seed is attached
    return seeded.rename(columns={'SeedValue': side + 'SeedValue'}).drop('TeamID', axis=1)


tourney_compact = all_compact_results[all_compact_results['ST'] == 'T'].copy()
tourney_detailed = all_detailed_results[all_detailed_results['ST'] == 'T'].copy()

# Winner seeds first, then loser seeds, for both tables
for _side in ('W', 'L'):
    tourney_compact = _with_seed(tourney_compact, _side)
    tourney_detailed = _with_seed(tourney_detailed, _side)

# Seed difference: lower is better in seeding, so LSeed - WSeed is positive if the favorite won
tourney_compact['SeedDiff'] = tourney_compact['LSeedValue'] - tourney_compact['WSeedValue']
tourney_detailed['SeedDiff'] = tourney_detailed['LSeedValue'] - tourney_detailed['WSeedValue']

Before creating the model, we need to prepare the features. This method is critical for transforming the raw basketball data into structured data that is suitable for machine learning.

In [None]:
def prepare_features(self, min_season=None, max_season=None, use_detailed=True):
    """
    Prepare features for model training from the detailed game results.

    The men's and women's regular-season and tournament detailed results in
    ``self.data`` are stacked, optionally restricted to a season range, and
    expanded so that every game yields two consecutive rows: the winner as
    Team1 (Target = 1) followed by the loser as Team1 (Target = 0).

    The rows are built column-wise instead of with a Python loop over games,
    and a season filter that matches no games returns an empty ``X`` and ``y``
    that still carry the full set of columns.

    Parameters:
    -----------
    min_season : int, optional
        Minimum season to include in training data
    max_season : int, optional
        Maximum season to include in training data
    use_detailed : bool, default=True
        Whether to add box-score stats, shooting percentages, total rebounds
        and Team1-minus-Team2 differentials, or only the compact features

    Returns:
    --------
    X : DataFrame
        Feature matrix. Keeps Season and DayNum; Team1, Team2 and Target are
        dropped. Team1Home is 1 (Team1 at home), 0 (Team2 at home) or 0.5
        (neutral / unknown) and is always stored as float.
    y : Series
        Target variable (1 for team1 win, 0 for team2 win)

    Raises:
    -------
    ValueError
        If any of the four detailed result tables is missing from
        ``self.data`` or is None.
    """
    print("Preparing features for model training...")

    # Check if required data exists
    required_files = [
        'MRegularSeasonDetailedResults',
        'WRegularSeasonDetailedResults',
        'MNCAATourneyDetailedResults',
        'WNCAATourneyDetailedResults',
    ]
    missing_files = [f for f in required_files if f not in self.data or self.data[f] is None]
    if missing_files:
        print(f"Error: Missing required data files: {missing_files}")
        # List available files
        print(f"Available files: {list(self.data.keys())}")
        raise ValueError(f"Missing required data files: {missing_files}")

    games = pd.concat([self.data[name] for name in required_files], ignore_index=True)

    # Filter by season if specified
    if min_season is not None:
        games = games[games['Season'] >= min_season]
    if max_season is not None:
        games = games[games['Season'] <= max_season]

    box_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
                 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
    include_box = use_detailed and all(col in games.columns for col in ['WFGM', 'LFGM'])

    # Home indicator from each side's point of view; anything other than
    # 'H' or 'A' (neutral site, missing value) counts as 0.5 for both teams.
    home_by_side = None
    if 'WLoc' in games.columns:
        winner_home = games['WLoc'].eq('H').fillna(False).to_numpy(dtype=bool)
        winner_away = games['WLoc'].eq('A').fillna(False).to_numpy(dtype=bool)
        home_by_side = {
            'W': np.where(winner_home, 1.0, np.where(winner_away, 0.0, 0.5)),
            'L': np.where(winner_home, 0.0, np.where(winner_away, 1.0, 0.5)),
        }

    def build_view(team1_side, team2_side, target):
        """Return one row per game with `team1_side` ('W' or 'L') playing the role of Team1."""
        columns = {
            'Season': games['Season'].to_numpy(),
            'DayNum': games['DayNum'].to_numpy(),
            'Team1': games[f'{team1_side}TeamID'].to_numpy(),
            'Team2': games[f'{team2_side}TeamID'].to_numpy(),
            'Target': np.full(len(games), target, dtype='int64'),
        }
        if home_by_side is not None:
            columns['Team1Home'] = home_by_side[team1_side]
        if include_box:
            # NOTE(review): these box-score stats come from the same game whose
            # result is the target, so they are not available before the game
            # is played. Confirm this is intended before using X for forecasting.
            for label, side in (('Team1', team1_side), ('Team2', team2_side)):
                for stat in box_stats:
                    columns[f'{label}_{stat}'] = games[f'{side}{stat}'].to_numpy()
        return pd.DataFrame(columns)

    winner_view = build_view('W', 'L', 1)
    loser_view = build_view('L', 'W', 0)

    # Both views share a 0..n-1 index, so a stable sort on it interleaves them
    # game by game with the winner's row first.
    games_df = (
        pd.concat([winner_view, loser_view])
        .sort_index(kind='mergesort')
        .reset_index(drop=True)
    )

    # Add team seed features if available
    if getattr(self, 'seeds', None) is not None:
        seed_lookup = self.seeds[['Season', 'TeamID', 'SeedValue']]
        for team_col, seed_col in (('Team1', 'Team1Seed'), ('Team2', 'Team2Seed')):
            games_df = pd.merge(
                games_df,
                seed_lookup,
                left_on=['Season', team_col],
                right_on=['Season', 'TeamID'],
                how='left',
            ).rename(columns={'SeedValue': seed_col}).drop('TeamID', axis=1)

        # Seed difference; NaN when either team has no seed for that season
        games_df['SeedDiff'] = games_df['Team1Seed'] - games_df['Team2Seed']

    # Add derived features
    if use_detailed:
        for team in [1, 2]:
            prefix = f'Team{team}_'
            # Shooting percentages, defined as 0 when there were no attempts
            for pct, made, attempted in (
                ('FGPct', 'FGM', 'FGA'),
                ('FG3Pct', 'FGM3', 'FGA3'),
                ('FTPct', 'FTM', 'FTA'),
            ):
                attempts = games_df[f'{prefix}{attempted}']
                games_df[f'{prefix}{pct}'] = np.where(
                    attempts > 0,
                    games_df[f'{prefix}{made}'] / attempts,
                    0,
                )
            # Total rebounds
            games_df[f'{prefix}TotalReb'] = games_df[f'{prefix}OR'] + games_df[f'{prefix}DR']

        # Team1-minus-Team2 differential for every raw and derived stat
        diff_stats = [
            'FGM', 'FGA', 'FGPct',
            'FGM3', 'FGA3', 'FG3Pct',
            'FTM', 'FTA', 'FTPct',
            'OR', 'DR', 'TotalReb',
            'Ast', 'TO', 'Stl', 'Blk', 'PF',
        ]
        for stat in diff_stats:
            games_df[f'{stat}Diff'] = games_df[f'Team1_{stat}'] - games_df[f'Team2_{stat}']

    # Team IDs are dropped (team statistics are used instead); Season and
    # DayNum are kept for evaluation and predictions.
    X = games_df.drop(['Target', 'Team1', 'Team2'], axis=1)
    y = games_df['Target']

    print(f"Prepared {len(X)} samples with {X.shape[1]} features")
    return X, y

In [None]:
    def create_model(self):
        """
        Creates XGBoost models for prediction and calibration.

        Builds two ``xgb.XGBRegressor`` estimators and stores them on the
        instance as ``self.model`` and ``self.calibration_model``. Both use
        the 'binary:logistic' objective, a fixed ``random_state`` of 42 and
        ``n_jobs=-1``. The calibration model is configured with fewer and
        shallower trees, a lower learning rate and smaller row/column
        subsampling fractions than the main model.

        NOTE(review): how the calibration model is fitted and applied is not
        shown in this part of the file - confirm against the training code.
        """
        # Main prediction model
        self.model = xgb.XGBRegressor(
            n_estimators=500,         # Number of boosting rounds
            learning_rate=0.05,       # Smaller learning rate for better generalization
            max_depth=6,              # Control model complexity
            min_child_weight=3,       # Helps prevent overfitting
            subsample=0.8,            # Use 80% of data for each tree
            colsample_bytree=0.8,     # Use 80% of features for each tree
            objective='binary:logistic',  # Binary classification with probability output
            random_state=42,
            n_jobs=-1                 # Use all CPU cores
        )
        
        # Calibration model to fine-tune probabilities
        self.calibration_model = xgb.XGBRegressor(
            n_estimators=200,
            learning_rate=0.03,
            max_depth=4,
            min_child_weight=2,
            subsample=0.7,
            colsample_bytree=0.7,
            objective='binary:logistic',
            random_state=42,
            n_jobs=-1
        )