# **March Madness Machine Learning 2025** #
##### By: CoNiya Butler & Daniel Davis #####

### **Introduction** ###

This notebook explores historical NCAA Division I men's and women's basketball data to predict the outcomes of March Madness tournament games. We'll leverage provided datasets containing team information, game results, and tournament seeds to build a predictive model. Data files are prefixed with 'M' for men's, 'W' for women's,and some span both.

##### Goal: #####


Minimize the Brier score, the evaluation metric for this competition.


##### Approach: #####

1. Start with a model with the features on the list provided below.
    - Seeding differences
    - Average points Per Game
    - Win percentage regular season
    - Win percentage tournament
    - Location
    - Average scoring difference per game
2. Apply feature engineering and model tuning techniques

### **Import Libraries** ###

In [2]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss, brier_score_loss, mean_squared_error, roc_curve, auc
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, accuracy_score
import warnings
warnings.filterwarnings("ignore")

### **Load Data** ###

Load the datasets need for the model M is for men's basketball and W is for women's basketball

In [89]:
M_seed_df = pd.read_csv("/home/pysparkdevcontainer/code/data/MNCAATourneySeeds.csv")
W_seed_df = pd.read_csv("/home/pysparkdevcontainer/code/data/WNCAATourneySeeds.csv")
M_regular_results = pd.read_csv("/home/pysparkdevcontainer/code/data/MRegularSeasonDetailedResults.csv")
M_tourney_results = pd.read_csv("/home/pysparkdevcontainer/code/data/MNCAATourneyDetailedResults.csv")
W_regular_results = pd.read_csv("/home/pysparkdevcontainer/code/data/WRegularSeasonDetailedResults.csv")
W_tourney_results = pd.read_csv("/home/pysparkdevcontainer/code/data/WNCAATourneyDetailedResults.csv")
submission_df = pd.read_csv("/home/pysparkdevcontainer/code/data/SampleSubmissionStage2.csv")
submissionstage1_df = pd.read_csv("/home/pysparkdevcontainer/code/data/SampleSubmissionStage1.csv")

### **Data Preprocessing** ###

In [4]:
# Join the men's and women's dataset together
regular_results = pd.concat([M_regular_results, W_regular_results])
tourney_results = pd.concat([M_tourney_results, W_tourney_results])
seed_df = pd.concat([M_seed_df, W_seed_df])

In [5]:
regular_results

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80621,2025,106,3242,63,3416,58,H,0,17,46,...,26,5,11,15,21,11,12,4,2,21
80622,2025,106,3329,68,3428,64,A,0,23,63,...,23,9,16,8,18,16,11,8,6,20
80623,2025,106,3349,72,3194,39,H,0,30,63,...,17,16,23,4,25,5,10,4,3,9
80624,2025,106,3378,70,3150,52,A,0,25,59,...,17,11,14,7,24,5,15,8,2,15


In [92]:
#This function will caculate the teams' average win score, loss score, points and point differential, for each season.
def compute_seasonal_stats(results):
    wins = results.groupby(["Season", "WTeamID"])["WScore"].agg(["count", "mean"]).rename(columns={"count": "Wins", "mean": "AvgWinScore"}).reset_index()
    wins = wins.rename(columns={'WTeamID': 'Team1'})
    losses = results.groupby(["Season", "LTeamID"])["LScore"].agg(["count", "mean"]).rename(columns={"count": "Losses", "mean": "AvgLossScore"}).reset_index()
    losses = losses.rename(columns={'LTeamID' : 'Team1'})

    stats = wins.merge(losses, on=["Season", "Team1"]).fillna(0)
    stats["TotalGames"] = stats["Wins"] + stats["Losses"]
    stats["WinRate"] = stats["Wins"] / stats["TotalGames"]
    
   
    stats["AvgScore"] = (stats["Wins"] * stats["AvgWinScore"] + stats["Losses"] * stats["AvgLossScore"]) / stats["TotalGames"]
    

    stats["AvgScoreDiff"] = stats["AvgWinScore"] - stats["AvgLossScore"]
    
    return stats

season_stats = compute_seasonal_stats(regular_results)

season_stats

Unnamed: 0,Season,Team1,Wins,AvgWinScore,Losses,AvgLossScore,TotalGames,WinRate,AvgScore,AvgScoreDiff
0,2003,1102,12,68.750000,16,48.625000,28,0.428571,57.250000,20.125000
1,2003,1103,13,87.769231,14,70.428571,27,0.481481,78.777778,17.340659
2,2003,1104,17,74.705882,11,60.909091,28,0.607143,69.285714,13.796791
3,2003,1105,7,79.428571,19,68.947368,26,0.269231,71.769231,10.481203
4,2003,1106,13,68.307692,15,59.533333,28,0.464286,63.607143,8.774359
...,...,...,...,...,...,...,...,...,...,...
13531,2025,3476,11,70.818182,12,57.000000,23,0.478261,63.608696,13.818182
13532,2025,3477,4,72.250000,18,59.666667,22,0.181818,61.954545,12.583333
13533,2025,3478,4,67.000000,20,49.300000,24,0.166667,52.250000,17.700000
13534,2025,3479,6,71.500000,15,58.733333,21,0.285714,62.380952,12.766667


In [109]:
def feature_selection(results_df, df):
    df['Season'] = df['ID'].map(lambda x: x.split('_')[0]).astype(int)
    df['Team1'] = df['ID'].map(lambda x: x.split('_')[1]).astype(int)
    df['Team2'] = df['ID'].map(lambda x: x.split('_')[2]).astype(int)

    feature_df = df.merge(results_df[['Season', 'Team1' , 'WinRate', 'AvgScore', 'AvgScoreDiff']], on=['Season', 'Team1'])
    feature_df = feature_df.rename(columns={'WinRate': 'Team1WinRatio', 'AvgScore': 'Team1AvgScore', 'AvgScoreDiff': 'Team1AvgScoreDiff'})
    results_df = results_df.rename(columns={'Team1': 'Team2'})
    feature_df = feature_df.merge(results_df[['Season', 'Team2' , 'WinRate', 'AvgScore', 'AvgScoreDiff']], on=['Season', 'Team2'])
    feature_df = feature_df.rename(columns={'WinRate': 'Team2WinRatio', 'AvgScore': 'Team2AvgScore', 'AvgScoreDiff': 'Team2AvgScoreDiff'})
    feature_df['Team1WinPrediction'] = 1 / (1 + 10 ** ((feature_df['Team2WinRatio']*1000 - feature_df['Team1WinRatio']*1000) / 400))
    
    feature_df = feature_df.drop(['ID','Pred'], axis=1)
    return feature_df


In [110]:
training_df = feature_selection(season_stats, submissionstage1_df)
training_df

Unnamed: 0,Season,Team1,Team2,Team1WinRatio,Team1AvgScore,Team1AvgScoreDiff,Team2WinRatio,Team2AvgScore,Team2AvgScoreDiff,Team1WinPrediction
0,2021,1101,1102,0.826087,76.304348,15.197368,0.200000,58.720000,8.350000,0.973508
1,2021,1101,1103,0.826087,76.304348,15.197368,0.619048,76.571429,8.596154,0.767065
2,2021,1101,1104,0.826087,76.304348,15.197368,0.800000,79.566667,19.666667,0.537472
3,2021,1101,1105,0.826087,76.304348,15.197368,0.400000,63.333333,21.666667,0.920763
4,2021,1101,1106,0.826087,76.304348,15.197368,0.222222,64.222222,5.821429,0.970001
...,...,...,...,...,...,...,...,...,...,...
499801,2024,3475,3477,0.785714,71.535714,14.893939,0.448276,67.620690,13.932692,0.874621
499802,2024,3475,3478,0.785714,71.535714,14.893939,0.562500,57.031250,15.801587,0.783287
499803,2024,3476,3477,0.133333,52.200000,14.480769,0.448276,67.620690,13.932692,0.140281
499804,2024,3476,3478,0.133333,52.200000,14.480769,0.562500,57.031250,15.801587,0.077954


In [111]:
training_df.columns

Index(['Season', 'Team1', 'Team2', 'Team1WinRatio', 'Team1AvgScore',
       'Team1AvgScoreDiff', 'Team2WinRatio', 'Team2AvgScore',
       'Team2AvgScoreDiff', 'Team1WinPrediction'],
      dtype='object')

### **Exploratory Data Analysis (EDA)** ###

In [4]:
seed_df['Seed'].unique()

array(['W01', 'W02', 'W03', 'W04', 'W05', 'W06', 'W07', 'W08', 'W09',
       'W10', 'W11', 'W12', 'W13', 'W14', 'W15', 'W16', 'X01', 'X02',
       'X03', 'X04', 'X05', 'X06', 'X07', 'X08', 'X09', 'X10', 'X11',
       'X12', 'X13', 'X14', 'X15', 'X16', 'Y01', 'Y02', 'Y03', 'Y04',
       'Y05', 'Y06', 'Y07', 'Y08', 'Y09', 'Y10', 'Y11', 'Y12', 'Y13',
       'Y14', 'Y15', 'Y16', 'Z01', 'Z02', 'Z03', 'Z04', 'Z05', 'Z06',
       'Z07', 'Z08', 'Z09', 'Z10', 'Z11', 'Z12', 'Z13', 'Z14', 'Z15',
       'Z16', 'Y16a', 'Y16b', 'W16a', 'W16b', 'X16a', 'X16b', 'Z16a',
       'Z16b', 'W12a', 'W12b', 'Z11a', 'Z11b', 'X12a', 'X12b', 'Z14a',
       'Z14b', 'Y11a', 'Y11b', 'Z13a', 'Z13b', 'Y12a', 'Y12b', 'W11a',
       'W11b', 'X11a', 'X11b', 'Y10a', 'Y10b', 'Z10a', 'Z10b', 'Z12a',
       'Z12b'], dtype=object)

### **Model Training** ###

In [114]:
# Split features and target
X = training_df[['Season', 'Team1', 'Team2', 'Team1WinRatio', 'Team1AvgScore', 'Team1AvgScoreDiff', 'Team2WinRatio', 'Team2AvgScore', 'Team2AvgScoreDiff']]
y = training_df["Team1WinPrediction"]

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the numerical features (excluding TeamID)
scaler = StandardScaler()
X_train.iloc[:, 2:] = scaler.fit_transform(X_train.iloc[:, 2:])
X_test.iloc[:, 2:] = scaler.transform(X_test.iloc[:, 2:])

# Convert to DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define model parameters
params = {
    "objective": "binary:logistic",  # Probability prediction
    "eval_metric": "logloss",
    "eta": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

# Train model
num_rounds = 100
bst = xgb.train(params, dtrain, num_rounds)

# Predict probabilities
y_pred_proba = bst.predict(dtest)

print("Example Predictions:", y_pred_proba[:5])

Example Predictions: [0.9144868  0.10187875 0.60800904 0.4736491  0.49600658]


### **Prediction Submission** ###

In [None]:
def submission_generator():
    pass