# March Madness Predictor: Team-Season Features + XGBoost

This notebook builds team-season features from regular-season box scores, constructs tournament matchup features as pairwise differences, and trains a binary classifier to predict win probabilities for tournament games. We use a season-based split to mimic predicting the final season.

Pipeline:
- Aggregate regular-season data into per-team, per-season stats (eFG%, TOV%, ORB%, FTR, offensive/defensive rating, win rate).
- Create tournament matchup rows by joining both teams' season features and computing feature differences.
- Train an XGBoost classifier; evaluate with accuracy, log loss, and ROC AUC.
- Report feature importances and test-season metrics.

In [12]:
# Imports and setup
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

import warnings
pd.set_option('display.max_columns', 200)
warnings.filterwarnings('ignore')

RNG_SEED = 42

In [13]:
# Load regular-season and tournament data
REGULAR_PATH = 'data/processed/RecentMRegularSeasonDetailedResults.csv'
TOURNEY_PATH = 'data/processed/RecentMNCAATourneyDetailedResults.csv'
TEAMS_PATH = 'data/relevant_data/MTeams.csv'
SEEDS_PATH = 'data/relevant_data/MNCAATourneySeeds.csv'

regular_df = pd.read_csv(REGULAR_PATH)
tourney_df = pd.read_csv(TOURNEY_PATH)
teams_df = pd.read_csv(TEAMS_PATH) if True else None
seeds_df = pd.read_csv(SEEDS_PATH)

print('Regular season shape:', regular_df.shape)
print('Tourney shape:', tourney_df.shape)
print('Teams shape:', teams_df.shape if teams_df is not None else None)
print('Seeds shape:', seeds_df.shape)
regular_df.head(2), tourney_df.head(2)

Regular season shape: (58364, 34)
Tourney shape: (602, 34)
Teams shape: (380, 4)
Seeds shape: (2626, 3)


(   Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT  WFGM  WFGA  \
 0    2015      11     1103      74     1420      57    H      0    25    53   
 1    2015      11     1104      82     1406      54    H      0    29    63   
 
    WFGM3  WFGA3  WFTM  WFTA  WOR  WDR  WAst  WTO  WStl  WBlk  WPF  LFGM  LFGA  \
 0     12     30    12    21    9   25    17   12     7     5   22    20    48   
 1      7     23    17    19   14   20    17    8    14     9   16    18    50   
 
    LFGM3  LFGA3  LFTM  LFTA  LOR  LDR  LAst  LTO  LStl  LBlk  LPF  
 0      3     12    14    28   12   23    13   16     9     0   19  
 1      7     21    11    15   14   20     9   22     2     2   20  ,
    Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT  WFGM  WFGA  \
 0    2015     134     1214      74     1264      64    N      0    26    53   
 1    2015     134     1279      94     1140      90    N      0    36    80   
 
    WFGM3  WFGA3  WFTM  WFTA  WOR  WDR  WAst  WTO  WStl  WBl

In [14]:
# Build team-season features from regular-season box scores
def _safe_div(a, b):
    return np.divide(a, b, out=np.zeros_like(a, dtype=float), where=b!=0)

def build_team_season_features(regular_df):
    # Create per-team-per-game rows from W/L boxscores
    rows = []
    for _, r in regular_df.iterrows():
        season = r['Season']
        # Winner row
        rows.append({
            'Season': season,
            'TeamID': r['WTeamID'],
            'OppTeamID': r['LTeamID'],
            'TeamScore': r['WScore'],
            'OppScore': r['LScore'],
            'FGM': r['WFGM'], 'FGA': r['WFGA'],
            'FGM3': r['WFGM3'], 'FGA3': r['WFGA3'],
            'FTM': r['WFTM'], 'FTA': r['WFTA'],
            'OR': r['WOR'], 'DR': r['WDR'],
            'AST': r['WAst'], 'TO': r['WTO'], 'STL': r['WStl'], 'BLK': r['WBlk'], 'PF': r['WPF'],
            'Opp_FGM': r['LFGM'], 'Opp_FGA': r['LFGA'],
            'Opp_FGM3': r['LFGM3'], 'Opp_FGA3': r['LFGA3'],
            'Opp_FTM': r['LFTM'], 'Opp_FTA': r['LFTA'],
            'Opp_OR': r['LOR'], 'Opp_DR': r['LDR'],
            'Opp_AST': r['LAst'], 'Opp_TO': r['LTO'], 'Opp_STL': r['LStl'], 'Opp_BLK': r['LBlk'], 'Opp_PF': r['LPF'],
            'is_win': 1,
        })
        # Loser row (mirror)
        # Map location: winner H-> loser A, winner A-> loser H, N->N
        rows.append({
            'Season': season,
            'TeamID': r['LTeamID'],
            'OppTeamID': r['WTeamID'],
            'TeamScore': r['LScore'],
            'OppScore': r['WScore'],
            'FGM': r['LFGM'], 'FGA': r['LFGA'],
            'FGM3': r['LFGM3'], 'FGA3': r['LFGA3'],
            'FTM': r['LFTM'], 'FTA': r['LFTA'],
            'OR': r['LOR'], 'DR': r['LDR'],
            'AST': r['LAst'], 'TO': r['LTO'], 'STL': r['LStl'], 'BLK': r['LBlk'], 'PF': r['LPF'],
            'Opp_FGM': r['WFGM'], 'Opp_FGA': r['WFGA'],
            'Opp_FGM3': r['WFGM3'], 'Opp_FGA3': r['WFGA3'],
            'Opp_FTM': r['WFTM'], 'Opp_FTA': r['WFTA'],
            'Opp_OR': r['WOR'], 'Opp_DR': r['WDR'],
            'Opp_AST': r['WAst'], 'Opp_TO': r['WTO'], 'Opp_STL': r['WStl'], 'Opp_BLK': r['WBlk'], 'Opp_PF': r['WPF'],
            'is_win': 0,
        })
    team_games = pd.DataFrame(rows)

    # Possession estimates (team-side)
    team_games['Poss'] = team_games['FGA'] - team_games['OR'] + team_games['TO'] + 0.44 * team_games['FTA']
    team_games['Opp_Poss'] = team_games['Opp_FGA'] - team_games['Opp_OR'] + team_games['Opp_TO'] + 0.44 * team_games['Opp_FTA']

    # Aggregate totals per team-season
    agg = team_games.groupby(['Season', 'TeamID'], as_index=False).agg({
        'TeamScore': 'sum', 'OppScore': 'sum',
        'FGM': 'sum', 'FGA': 'sum', 'FGM3': 'sum', 'FGA3': 'sum',
        'FTM': 'sum', 'FTA': 'sum',
        'OR': 'sum', 'DR': 'sum',
        'TO': 'sum',
        'Opp_DR': 'sum',
        'Poss': 'sum', 'Opp_Poss': 'sum',
        'is_win': 'mean',
    })

    # Season-level features
    agg['eFG'] = _safe_div(agg['FGM'] + 0.5 * agg['FGM3'], agg['FGA'])
    agg['TOVp'] = _safe_div(agg['TO'], agg['Poss'])
    agg['ORBp'] = _safe_div(agg['OR'], agg['OR'] + agg['Opp_DR'])
    agg['FTR'] = _safe_div(agg['FTA'], agg['FGA'])
    agg['ORtg'] = 100.0 * _safe_div(agg['TeamScore'], agg['Poss'])
    agg['DRtg'] = 100.0 * _safe_div(agg['OppScore'], agg['Poss'])
    agg['win_rate'] = agg['is_win']

    # Keep only features we need
    features = agg[['Season','TeamID','eFG','TOVp','ORBp','FTR','ORtg','DRtg','win_rate']].copy()
    return features

team_features = build_team_season_features(regular_df)
print('Team-season feature shape:', team_features.shape)
team_features.head()

Team-season feature shape: (3904, 9)


Unnamed: 0,Season,TeamID,eFG,TOVp,ORBp,FTR,ORtg,DRtg,win_rate
0,2015,1101,0.473347,0.200071,0.241632,0.282726,95.187142,112.129116,0.25
1,2015,1102,0.536005,0.193538,0.294457,0.308424,106.844418,108.722876,0.413793
2,2015,1103,0.494259,0.1893,0.332516,0.313152,104.709648,98.308185,0.588235
3,2015,1104,0.503851,0.193271,0.300407,0.44095,106.479477,103.077906,0.548387
4,2015,1105,0.447422,0.221017,0.352227,0.415875,95.773893,107.38285,0.285714


In [15]:
# Build tournament matchup dataset from team-season features + seeds
def parse_seed_num(seed):
    # Seed strings look like 'W01', 'X16b', 'Y08', etc. Extract leading integer.
    if pd.isna(seed):
        return np.nan
    s = ''.join(ch for ch in str(seed) if ch.isdigit())
    return int(s) if s else np.nan

def build_tourney_matchups(tourney_df, team_features, seeds_df):
    # Prepare base with Team1=Winner, Team2=Loser
    base = tourney_df[['Season','WTeamID','LTeamID']].copy()
    base.rename(columns={'WTeamID':'Team1','LTeamID':'Team2'}, inplace=True)

    # Prepare seeds numeric
    seeds_df = seeds_df.copy()
    seeds_df['seed_num'] = seeds_df['Seed'].apply(parse_seed_num)
    seeds_feat = seeds_df[['Season','TeamID','seed_num']]

    # Merge features for Team1
    df = base.merge(team_features, left_on=['Season','Team1'], right_on=['Season','TeamID'], how='left')
    df = df.rename(columns={
        'eFG':'eFG_1','TOVp':'TOVp_1','ORBp':'ORBp_1','FTR':'FTR_1','ORtg':'ORtg_1','DRtg':'DRtg_1','win_rate':'win_rate_1'
    })
    df = df.drop(columns=['TeamID'])

    # Merge seed for Team1
    df = df.merge(seeds_feat, left_on=['Season','Team1'], right_on=['Season','TeamID'], how='left')
    df = df.rename(columns={'seed_num':'seed1'})
    df = df.drop(columns=['TeamID'])

    # Merge features for Team2
    df = df.merge(team_features, left_on=['Season','Team2'], right_on=['Season','TeamID'], how='left')
    df = df.rename(columns={
        'eFG':'eFG_2','TOVp':'TOVp_2','ORBp':'ORBp_2','FTR':'FTR_2','ORtg':'ORtg_2','DRtg':'DRtg_2','win_rate':'win_rate_2'
    })
    df = df.drop(columns=['TeamID'])

    # Merge seed for Team2
    df = df.merge(seeds_feat, left_on=['Season','Team2'], right_on=['Season','TeamID'], how='left')
    df = df.rename(columns={'seed_num':'seed2'})
    df = df.drop(columns=['TeamID'])

    # Compute diffs for numeric features (Team1 - Team2)
    feature_list = ['eFG','TOVp','ORBp','FTR','ORtg','DRtg','win_rate']
    for c in feature_list:
        df[f'{c}_diff'] = df[f'{c}_1'] - df[f'{c}_2']

    # Seed feature: lower number is stronger; to align positive values with Team1 advantage, use seed_num_diff = seed2 - seed1
    df['seed_num_diff'] = df['seed2'] - df['seed1']

    # Positive class (Team1 won)
    df['y'] = 1

    # Create swapped rows for negative class by negating diffs
    diff_cols = [f'{c}_diff' for c in feature_list] + ['seed_num_diff']
    pos = df[['Season'] + diff_cols + ['y']]
    neg = pos.copy()
    for col in diff_cols:
        neg[col] = -neg[col]
    neg['y'] = 0

    full = pd.concat([pos, neg], axis=0, ignore_index=True)

    X_cols = diff_cols
    X = full[X_cols].astype(float)
    y = full['y'].astype(int)
    seasons = full['Season'].astype(int)
    return X, y, seasons, X_cols

X, y, seasons, feature_names = build_tourney_matchups(tourney_df, team_features, seeds_df)
print('Matchup dataset:', X.shape, 'labels:', y.shape)
X.head()

Matchup dataset: (1204, 8) labels: (1204,)


Unnamed: 0,eFG_diff,TOVp_diff,ORBp_diff,FTR_diff,ORtg_diff,DRtg_diff,win_rate_diff,seed_num_diff
0,-0.038427,-0.006502,0.01811,-0.045023,-5.080124,-0.345614,-0.108902,0.0
1,-0.048002,0.005162,0.010512,-0.047605,-7.914302,-2.863319,-0.09375,0.0
2,-0.006363,0.017508,-0.033718,0.158137,-4.153356,-1.391516,0.01564,0.0
3,-0.030818,0.023878,0.022943,-0.034113,-7.222514,-0.996866,-0.069404,0.0
4,0.041718,-0.031885,0.001285,0.006654,10.665427,-15.107935,0.264706,13.0


In [16]:
# Train XGBoost with season-based split and evaluate
last_season = int(seasons.max())
train_mask = seasons < last_season
test_mask = seasons == last_season

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

# Handle missing values by imputing with train means
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

print(f'Train seasons: <= {last_season-1} | Test season: {last_season}')
print('Train size:', X_train.shape, 'Test size:', X_test.shape)

model = XGBClassifier(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=RNG_SEED,
    eval_metric='logloss',
)

model.fit(X_train, y_train)
probs = model.predict_proba(X_test)[:, 1]
preds = (probs >= 0.5).astype(int)

acc = accuracy_score(y_test, preds)
ll = log_loss(y_test, probs)
try:
    auc = roc_auc_score(y_test, probs)
except Exception:
    auc = float('nan')

print({'accuracy': round(acc, 4), 'log_loss': round(ll, 4), 'roc_auc': (None if np.isnan(auc) else round(auc, 4))})

# Feature importances
importances = model.feature_importances_
for name, imp in sorted(zip(feature_names, importances), key=lambda x: -x[1]):
    print(f'{name}: {imp:.4f}')

Train seasons: <= 2023 | Test season: 2024
Train size: (1070, 8) Test size: (134, 8)
{'accuracy': 0.709, 'log_loss': 0.5782, 'roc_auc': 0.7939}
seed_num_diff: 0.3081
ORtg_diff: 0.1094
DRtg_diff: 0.1032
ORBp_diff: 0.1004
win_rate_diff: 0.1000
FTR_diff: 0.0981
eFG_diff: 0.0951
TOVp_diff: 0.0856
{'accuracy': 0.709, 'log_loss': 0.5782, 'roc_auc': 0.7939}
seed_num_diff: 0.3081
ORtg_diff: 0.1094
DRtg_diff: 0.1032
ORBp_diff: 0.1004
win_rate_diff: 0.1000
FTR_diff: 0.0981
eFG_diff: 0.0951
TOVp_diff: 0.0856
