### Import Dependencies

In [74]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

SEX = 'M' # Change to for MEN/WOMEN league

### Aggregate Seasonal Measures per Team

In the following cell, we will aggregate the performance of each team per season.

The dataset we are interested in particular is:
- `[M/F]RegularSeasonDetailedResults.csv`
    - All game results from past regular seasons

In [57]:
season_results_df = pd.read_csv(f'../data/{SEX}RegularSeasonDetailedResults.csv')
season_agg_df = pd.DataFrame(columns=[
    'Season',
    'TeamID',
    'GamesPlayed',
    'Win%',
    'MoV',
    'FGM',
    'FGA',
    'FG%',
    '3PTM',
    '3PTA',
    '3PT%',
    'FTM',
    'FTA',
    'FT%',
    'OR',
    'DR',
    'AST',
    'TO',
    'STL',
    'BLK',
    'PF'])

seasons = season_results_df['Season'].unique()
teams = season_results_df['WTeamID'].unique()

for season in seasons:
    for team in teams:
        win_df = season_results_df[(season_results_df['Season'] == season) & (season_results_df['WTeamID'] == team)]
        loss_df = season_results_df[(season_results_df['Season'] == season) & (season_results_df['LTeamID'] == team)]

        num_wins = len(win_df)
        num_losses = len(loss_df)
        if num_wins > 0:
            games_played = num_wins + num_losses
            win_perc = num_wins / games_played
            mov = (win_df['WScore'] - win_df['LScore']).mean()
            fgm = win_df['WFGM'].sum() + loss_df['LFGM'].sum()
            fga = win_df['WFGA'].sum() + loss_df['LFGA'].sum()
            fg_perc = fgm / fga
            threes_made = win_df['WFGM3'].sum() + loss_df['LFGM3'].sum()
            threes_attempted = win_df['WFGA3'].sum() + loss_df['LFGA3'].sum()
            threes_perc = threes_made / threes_attempted
            ftm = win_df['WFTM'].sum() + loss_df['LFTM'].sum()
            fta = win_df['WFTA'].sum() + loss_df['LFTA'].sum()
            ft_perc = ftm / fta
            off_rating = (win_df['WOR'].sum() + loss_df['LOR'].sum()) / games_played
            def_rating = (win_df['WDR'].sum() + loss_df['LDR'].sum()) / games_played
            ast = win_df['WAst'].sum() + loss_df['LAst'].sum()
            to = win_df['WTO'].sum() + loss_df['LTO'].sum()
            stl = win_df['WStl'].sum() + loss_df['LStl'].sum()
            blk = win_df['WBlk'].sum() + loss_df['LBlk'].sum()
            pf = win_df['WPF'].sum() + loss_df['LPF'].sum()

            season_agg_row = {
                'Season': season,
                'TeamID': team,
                'GamesPlayed': games_played,
                'Win%': win_perc,
                'MoV': mov,
                'FGM': fgm,
                'FGA': fga,
                'FG%': fg_perc,
                '3PTM': threes_made,
                '3PTA': threes_attempted,
                '3PT%': threes_perc,
                'FTM': ftm,
                'FTA': fta,
                'FT%': ft_perc,
                'OR': off_rating,
                'DR': def_rating,
                'AST': ast,
                'TO': to,
                'STL': stl,
                'BLK': blk,
                'PF': pf
            }
            season_agg_df = pd.concat([season_agg_df, pd.DataFrame([season_agg_row])], ignore_index=True)

season_agg_df.to_csv(f'data/{SEX}NCAASeasonAggregates.csv', index=False)

  season_agg_df = pd.concat([season_agg_df, pd.DataFrame([season_agg_row])], ignore_index=True)


In [33]:
season_agg_df = pd.read_csv('data/MNCAASeasonAggregates.csv')
season_agg_df

Unnamed: 0,Season,TeamID,GamesPlayed,Win%,MoV,FGM,FGA,FG%,3PTM,3PTA,...,FTM,FTA,FT%,OR,DR,AST,TO,STL,BLK,PF
0,2003,1104,28,0.607143,13.176471,673,1601,0.420362,178,556,...,416,586,0.709898,13.571429,23.928571,339,372,185,106,505
1,2003,1272,29,0.793103,12.695652,762,1740,0.437931,203,582,...,434,664,0.653614,14.068966,25.965517,482,400,214,147,544
2,2003,1266,28,0.821429,14.826087,762,1575,0.483810,162,427,...,509,661,0.770045,13.107143,24.071429,457,380,168,102,522
3,2003,1296,31,0.548387,10.176471,755,1645,0.458967,195,509,...,453,694,0.652738,13.000000,22.645161,393,527,236,112,614
4,2003,1400,28,0.785714,14.818182,784,1748,0.448513,164,470,...,476,666,0.714715,16.178571,26.142857,406,376,179,108,570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7604,2024,1476,30,0.100000,10.666667,688,1738,0.395857,246,816,...,274,393,0.697201,6.866667,23.066667,356,389,188,71,464
7605,2024,1475,29,0.172414,8.000000,649,1630,0.398160,187,591,...,456,613,0.743883,8.482759,24.206897,315,372,178,80,627
7606,2024,1477,30,0.333333,6.800000,699,1782,0.392256,241,820,...,379,548,0.691606,7.500000,20.333333,368,330,221,137,530
7607,2024,1473,28,0.214286,5.500000,602,1549,0.388638,138,476,...,432,593,0.728499,7.142857,21.214286,212,324,166,69,513


### Pair Tournament Matchups with Season Data
This is the dataset that we will be using to train our model.

- `[M/F]NCAATourneyCompactResults.csv`
    - All game results from past tournaments
- `[M\F]NCAATourneySeeds.csv`
    - The seeding for the tournaments

Each row will encompass a tournament game of two teams along with their respective season performance. The predicted variable will be the 'Win' column. `1` represents Team1 won. `0` represents Team2 won.

In [34]:
tourney_results_df = pd.read_csv(f'../data/{SEX}NCAATourneyCompactResults.csv')
tourney_results_df.drop(columns=['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], inplace=True)
tourney_results_df.rename(columns={'WTeamID': 'T1', 'LTeamID': 'T2'}, inplace=True)
tourney_results_df['Win'] = 1

for index, _ in tourney_results_df.iterrows():
    if np.random.rand() < 0.5:
        tourney_results_df.at[index, 'T1'], tourney_results_df.at[index, 'T2'] = tourney_results_df.at[index, 'T2'], tourney_results_df.at[index, 'T1']
        tourney_results_df.at[index, 'Win'] = 0

print(tourney_results_df['Win'].value_counts())
tourney_results_df

Win
1    1227
0    1224
Name: count, dtype: int64


Unnamed: 0,Season,T1,T2,Win
0,1985,1234,1116,0
1,1985,1345,1120,0
2,1985,1207,1250,1
3,1985,1425,1229,0
4,1985,1325,1242,0
...,...,...,...,...
2446,2023,1274,1400,1
2447,2023,1361,1166,1
2448,2023,1163,1274,1
2449,2023,1194,1361,0


In [35]:
def getSeed(seeds_df, season, team):
    seed = seeds_df[(seeds_df['Season'] == season) & (seeds_df['TeamID'] == team)]['Seed'].values[0][1:]
    try:
        return int(seed)
    except ValueError:
        return int(seed[:-1])

In [55]:
tourney_df_columns = ['Season', 'T1', 'T1Seed', 'T2', 'T2Seed', 'SeedDiff', 'Win']
for i in range(1, 3):
    for col in season_agg_df.columns[2:]:
        tourney_df_columns.append(f'T{i}{col}')

tourney_df = pd.DataFrame(columns=tourney_df_columns)

seeds_df = pd.read_csv(f'../data/{SEX}NCAATourneySeeds.csv')
for index, row in tourney_results_df.iterrows():
    t1_season = season_agg_df[(season_agg_df['Season'] == row['Season']) & (season_agg_df['TeamID'] == row['T1'])].drop(columns=['TeamID'])
    t2_season = season_agg_df[(season_agg_df['Season'] == row['Season']) & (season_agg_df['TeamID'] == row['T2'])].drop(columns=['TeamID'])
    if len(t1_season) > 0 and len(t2_season) > 0:
        t1_seed = getSeed(seeds_df, row['Season'], row['T1'])
        t2_seed = getSeed(seeds_df, row['Season'], row['T2'])
        
        matchup = {
            'Season': row['Season'],
            'T1': row['T1'],
            'T1Seed': t1_seed,
            'T2': row['T2'],
            'T2Seed': t2_seed,
            'SeedDiff': t2_seed - t1_seed,
            'Win': row['Win']
        }
        matchup_df = pd.DataFrame([matchup])

        t1_season.columns = ['T1' + col if col != 'Season' else col for col in t1_season.columns]
        t2_season.columns = ['T2' + col if col != 'Season' else col for col in t2_season.columns]

        tourney_row_df = pd.merge(matchup_df, t1_season, on='Season', how='right')
        tourney_row_df = pd.merge(tourney_row_df, t2_season, on='Season', how='right')
        tourney_df = pd.concat([tourney_df, tourney_row_df], ignore_index=True)

tourney_df.to_csv(f'data/{SEX}NCAATourneyMatchups.csv', index=False)

  tourney_df = pd.concat([tourney_df, tourney_row_df], ignore_index=True)


In [56]:
tourney_df = pd.read_csv(f'data/{SEX}NCAATourneyMatchups.csv')
tourney_df = tourney_df.sample(frac=1).reset_index(drop=True)
tourney_df

Unnamed: 0,Season,T1,T1Seed,T2,T2Seed,SeedDiff,Win,T1GamesPlayed,T1Win%,T1MoV,...,T2FTM,T2FTA,T2FT%,T2OR,T2DR,T2AST,T2TO,T2STL,T2BLK,T2PF
0,2013,1276,4,1257,1,-3,0,32,0.781250,16.760000,...,545,766,0.711488,13.617647,23.882353,510,431,365,144,603
1,2012,1435,5,1458,4,-1,0,34,0.705882,14.250000,...,397,536,0.740672,9.696970,24.090909,385,296,172,115,469
2,2019,1295,16,1300,16,0,1,31,0.516129,9.312500,...,387,550,0.703636,11.533333,24.933333,433,465,158,77,541
3,2012,1313,15,1196,7,-8,0,32,0.750000,8.583333,...,445,625,0.712000,11.454545,23.696970,495,360,214,108,523
4,2004,1181,1,1228,5,4,1,32,0.843750,18.851852,...,371,533,0.696060,12.033333,22.933333,503,377,205,114,573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1310,2012,1452,10,1211,7,-3,0,32,0.593750,15.210526,...,549,768,0.714844,10.967742,26.161290,426,414,211,102,470
1311,2022,1266,9,1314,8,-1,0,31,0.612903,11.105263,...,461,597,0.772194,9.303030,27.636364,489,374,178,127,474
1312,2014,1110,15,1458,2,-13,0,32,0.625000,13.150000,...,573,770,0.744156,8.787879,24.393939,400,266,165,116,493
1313,2019,1181,1,1416,9,8,1,34,0.852941,19.931034,...,495,767,0.645372,9.806452,26.935484,411,369,178,139,524


#### Prepare Model Dataset

In [77]:
x = tourney_df.drop(columns=['Season', 'Win'])
y = tourney_df['Win']
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=1)

### Baseline Performance
To get a benchmark to compare our model, we will naively predict that the higher seed will automatically win the match and test our accuracy on the dataset. 

In [78]:
baseline_pred = []
for index, row in x_val.iterrows():
    if row['SeedDiff'] >= 0:
        baseline_pred.append(1)
    else:
        baseline_pred.append(0)
baseline_pred = np.array(baseline_pred)

acc = accuracy_score(y_val, baseline_pred)
print(f'Accuracy: {acc}')

Accuracy: 0.7110266159695817


### Training the Model
The dataset has been prepared. The baseline accuracy is 71.103%. Now we will train/eval a model which should beat the baseline accuracy.

In [92]:
# model = LogisticRegression()
# model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)
model = xgb.XGBClassifier(
    max_depth=1,
    n_estimators=100,
    learning_rate=0.01,
    n_jobs=-1,
    objective='binary:logistic'
)
model.fit(x_train, y_train)
y_pred = model.predict(x_val)

acc = accuracy_score(y_val, y_pred)
print(f'Accuracy: {acc}')

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, x, y, cv=kfold, scoring='accuracy')
print("Cross-Validation Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.7110266159695817
Cross-Validation Accuracy: 0.6928 (+/- 0.0784)


### Generating the Test Set
We will generate a test set of all possible 2024 tournament matchups for each team and predict on those values. We will also pair these games with the corresponding 2024 regular season statistics for each team.

- `2024_tourney_seeds.csv`
    - File that will be updated with 2024 seeds once released (2023 seeds prior to that)

In [75]:
tourney_results_df = pd.read_csv(f'../data/2024_tourney_seeds.csv')
tourney_results_df

Unnamed: 0,Tournament,Seed,TeamID
0,M,W01,1345
1,M,W02,1266
2,M,W03,1243
3,M,W04,1397
4,M,W05,1181
...,...,...,...
123,W,Z12,3405
124,W,Z13,3387
125,W,Z14,3241
126,W,Z15,3436


### Predictions and Bracket Simulation
