# March Madness Prediction

## Overview

### Goal
Submissions are scored with the Brier score, so the goal is to minimize the Brier score between the predicted probabilities and the actual game outcomes. The Brier score measures the accuracy of probabilistic predictions; in this case it is the mean squared error.

The Brier score can be thought of as a cost function that measures the average squared difference between the predicted probabilities and the actual outcomes.

$$
Brier = \frac{1}{N} \sum_{i=1}^{N} (p_i - o_i)^2
$$

where $p_i$ is the predicted probability of the event, $o_i$ is the actual outcome (1 if the event occurred, 0 otherwise), and $N$ is the number of predictions over which the squared errors are averaged.

Therefore, minimizing the Brier score will result in a more accurate prediction.




## Import Libraries
- NumPy for numerical operations
- Pandas for data manipulation
- Matplotlib, Seaborn, Plotly for plotting
- XGBoost for the prediction and calibration models



In [2]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.subplots as sp
import seaborn as sns
import sklearn as sk
import xgboost as xgb


## Load Data

We want a baseline model that we can improve upon. To do this effectively, I will use a class to hold all the data and functions used throughout the process. This will make it easier to improve and maintain the prediction pipeline.


In [None]:
class MarchMadnessPredictor:
    """Load the tournament CSV files and build XGBoost models for them.

    Every attribute starts as ``None``. ``load_data`` fills the data frames
    and ``create_model`` fills the two estimators.
    """

    # Numeric encoding applied to the ``WLoc`` column to produce ``HomeAdvantage``.
    _LOCATION_CODES = {'H': 1, 'N': 0, 'A': -1}

    # (derived column suffix, made column suffix, attempted column suffix)
    _SHOOTING_STATS = (
        ('FGPct', 'FGM', 'FGA'),
        ('FG3Pct', 'FGM3', 'FGA3'),
        ('FTPct', 'FTM', 'FTA'),
    )

    # (derived column prefix, box-score column suffix) for winner-minus-loser gaps
    _DIFF_STATS = (
        ('Assist', 'Ast'),
        ('Turnover', 'TO'),
        ('Steal', 'Stl'),
        ('Block', 'Blk'),
        ('Foul', 'PF'),
    )

    def __init__(self, data_dir):
        """Remember where the CSV files live; nothing is read until ``load_data``.

        Args:
            data_dir: Directory containing the competition ``*.csv`` files.
                A trailing path separator is optional.
        """
        self.data_dir = data_dir
        self.data = None
        self.teams = None
        self.seeds = None
        self.submission = None
        self.all_compact_results = None
        self.all_detailed_results = None
        self.tourney_compact_results = None
        self.tourney_detailed_results = None
        self.model = None
        self.calibration_model = None

    def load_data(self):
        """
        Read every CSV in ``self.data_dir`` and derive the game-level tables.

        ``self.data`` becomes a dictionary keyed by file name without its
        directory or extension, e.g.
        self.data = {
            'MTeams': [DataFrame read from MTeams.csv],
            'WTeams': [DataFrame read from WTeams.csv],
            'SampleSubmissionStage1': [DataFrame read from SampleSubmissionStage1.csv]
        }

        The men's (``M...``) and women's (``W...``) files are concatenated and
        stored on ``self.teams``, ``self.seeds``, ``self.all_compact_results``,
        ``self.all_detailed_results``, ``self.tourney_compact_results`` and
        ``self.tourney_detailed_results``. Previews of the seeds and of the
        four result tables are printed.

        Raises:
            KeyError: If one of the expected CSV files is missing from
                ``self.data_dir``.
        """
        # os.path handles both '/' and '\\' correctly for the running platform,
        # and join() works whether or not data_dir ends with a separator.
        pattern = os.path.join(self.data_dir, '*.csv')
        self.data = {
            os.path.basename(path).split('.')[0]: pd.read_csv(path, encoding='latin-1')
            for path in glob.glob(pattern)
        }

        self.submission = self.data['SampleSubmissionStage1']

        # Teams, with the number of known spellings attached to each team.
        teams = self._men_and_women('Teams')
        spelling_counts = (
            self._men_and_women('TeamSpellings')
            .groupby(by='TeamID', as_index=False)['TeamNameSpelling']
            .count()
        )
        spelling_counts.columns = ['TeamID', 'TeamNameCount']
        self.teams = pd.merge(teams, spelling_counts, how='left', on=['TeamID'])

        # ST tags the source of each game: 'S' = regular season, 'T' = tournament.
        season_compact = self._men_and_women('RegularSeasonCompactResults').assign(ST='S')
        season_detailed = self._men_and_women('RegularSeasonDetailedResults').assign(ST='S')
        tourney_compact = self._men_and_women('NCAATourneyCompactResults').assign(ST='T')
        tourney_detailed = self._men_and_women('NCAATourneyDetailedResults').assign(ST='T')

        # Extract the numeric seed value from the seed string (first run of digits).
        seeds = self._men_and_women('NCAATourneySeeds')
        seeds['SeedValue'] = seeds['Seed'].str.extract(r'(\d+)', expand=False).astype(int)
        self.seeds = seeds
        print(self.seeds)

        # Combine regular season and tournament results, then add derived features.
        all_compact_results = pd.concat([season_compact, tourney_compact])
        all_detailed_results = pd.concat([season_detailed, tourney_detailed])
        self._add_margin_and_location(all_compact_results)
        self._add_margin_and_location(all_detailed_results)
        self._add_box_score_features(all_detailed_results)

        # Tournament-only views carry the seeds of both teams and their difference.
        tourney_compact = self._attach_seed_values(
            all_compact_results[all_compact_results['ST'] == 'T'], seeds
        )
        tourney_detailed = self._attach_seed_values(
            all_detailed_results[all_detailed_results['ST'] == 'T'], seeds
        )

        # Store all processed data
        self.all_compact_results = all_compact_results
        self.all_detailed_results = all_detailed_results
        self.tourney_compact_results = tourney_compact
        self.tourney_detailed_results = tourney_detailed

        print("All Compact Resullts: \n", self.all_compact_results.head())
        print(self.all_detailed_results.head())
        print(self.tourney_compact_results.head())
        print(self.tourney_detailed_results.head())

    def _men_and_women(self, name):
        """Concatenate the ``M<name>`` and ``W<name>`` tables from ``self.data``."""
        return pd.concat([self.data['M' + name], self.data['W' + name]])

    @classmethod
    def _add_margin_and_location(cls, games):
        """Add ``ScoreDiff`` and ``HomeAdvantage`` columns to ``games`` in place."""
        games['ScoreDiff'] = games['WScore'] - games['LScore']
        games['HomeAdvantage'] = games['WLoc'].map(cls._LOCATION_CODES)

    @classmethod
    def _add_box_score_features(cls, games):
        """Add shooting percentages and winner-minus-loser stat gaps in place."""
        # Shooting percentages; rows with zero attempts get 0 instead of inf/NaN.
        for side in ('W', 'L'):
            for pct, made, attempted in cls._SHOOTING_STATS:
                attempts = games[side + attempted]
                games[side + pct] = np.where(attempts > 0, games[side + made] / attempts, 0)

        # Statistical differences (winner minus loser).
        games['ReboundDiff'] = (games['WOR'] + games['WDR']) - (games['LOR'] + games['LDR'])
        for label, stat in cls._DIFF_STATS:
            games[label + 'Diff'] = games['W' + stat] - games['L' + stat]

    @staticmethod
    def _attach_seed_values(games, seeds):
        """Return ``games`` with ``WSeedValue``, ``LSeedValue`` and ``SeedDiff`` added.

        Teams without a seed entry for the season get NaN seed values (left join).
        """
        seed_lookup = seeds[['Season', 'TeamID', 'SeedValue']]
        for side in ('W', 'L'):
            side_seeds = seed_lookup.rename(
                columns={'TeamID': side + 'TeamID', 'SeedValue': side + 'SeedValue'}
            )
            games = pd.merge(games, side_seeds, how='left', on=['Season', side + 'TeamID'])

        # Lower is better in seeding, so LSeed - WSeed is positive if the favorite won.
        games['SeedDiff'] = games['LSeedValue'] - games['WSeedValue']
        return games

    def create_model(self):
        """
        Creates XGBoost models for prediction and calibration.

        Stores the untrained estimators on ``self.model`` and
        ``self.calibration_model``; no fitting happens here.
        """
        # NOTE(review): 'reg:squarederror' is a plain regression objective, so
        # raw predictions are not constrained to [0, 1] - confirm that they are
        # clipped before being scored as probabilities.

        # Main prediction model
        self.model = xgb.XGBRegressor(
            n_estimators=500,         # Number of boosting rounds
            learning_rate=0.05,       # Smaller learning rate for better generalization
            max_depth=6,              # Control model complexity
            min_child_weight=3,       # Helps prevent overfitting
            subsample=0.8,            # Use 80% of data for each tree
            colsample_bytree=0.8,     # Use 80% of features for each tree
            objective='reg:squarederror',  # Optimizes for MSE which aligns with Brier score
            random_state=42,
            n_jobs=-1                 # Use all CPU cores
        )

        # Calibration model to fine-tune probabilities
        self.calibration_model = xgb.XGBRegressor(
            n_estimators=200,
            learning_rate=0.03,
            max_depth=4,
            min_child_weight=2,
            subsample=0.7,
            colsample_bytree=0.7,
            objective='reg:squarederror',
            random_state=42,
            n_jobs=-1
        )

        


In [None]:
if __name__ == '__main__':
    # Script entry point: build a predictor over the local CSV folder and
    # load every table so the processed frames are ready on `predictor`.
    data_dir = 'data/'
    predictor = MarchMadnessPredictor(data_dir=data_dir)
    predictor.load_data()
