# Predicting March Madness Brackets

Can we use historical results to predict march madness results? 

Use 2 datasets
cbb.csv - End of seasons stats for each team from 2013 to 2021 (https://www.kaggle.com/andrewsundberg/college-basketball-dataset)
big_dance.csv - Tournament results from 1985 to 2019 (https://data.world/michaelaroy/ncaa-tournament-results/)

In [328]:
# std imports
from os.path import join as path_join
from os import makedirs
import math

# tpl imports
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import SGDClassifier, Perceptron, LinearRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [329]:
# set some parameters
MIN_YEAR = 2008
MAX_YEAR = 2021
EXCLUDE_YEARS = [2020]  # just 2020 for now cause of covid
DUPLICATE_GAMES = True  # swap order of teams in row and duplicate game in dataset
ENCODE_CONF = True  # use Conference as training feature
NORMALIZE = True # normalize features
NUM_FEATURES = None # set to None to do no dimensionality reduction
SEED = 42

In [330]:
# read in the data sets

# keyed on TEAM, CONF, YEAR
season_stats_df = pd.read_csv(path_join('data', 'raw_cbb.csv'), index_col=0)

# keyed on year, round, region_number, seed, seed_2 (or year, round, team, team_2)
tournament_stats_df = pd.read_csv(path_join('data', 'tournament-results.csv'))

In [331]:
# preprocess datasets

# select valid years in each
season_stats_df.drop(season_stats_df[
    (season_stats_df['YEAR'] < MIN_YEAR) | (season_stats_df['YEAR'] > MAX_YEAR) | (season_stats_df['YEAR'].isin(EXCLUDE_YEARS))
    ].index, inplace=True)
tournament_stats_df.drop(tournament_stats_df[
    (tournament_stats_df['Year'] < MIN_YEAR) | (tournament_stats_df['Year'] > MAX_YEAR) | (tournament_stats_df['Year'].isin(EXCLUDE_YEARS))
    ].index, inplace=True)

# handle difference in team naming (remove period from 'St.' in stat dataset; other fixed rules)
season_stats_df['TEAM'] = season_stats_df['TEAM'].str.replace('St.', 'St', regex=False)
substitutions = {'Saint Louis': 'St Louis', 'Saint Joseph\'s': 'St Josephs', 'St John\'s': 'St Johns', 
                'North Carolina St': 'NC State', 'Saint Mary\'s': 'St Marys', 'Mississippi': 'Ole Miss',
                'Stephen F. Austin': 'Stephen F Austin', 'Middle Tennessee': 'Middle Tennessee St',
                'Miami OH': 'Miami Ohio', 'Miami FL': 'Miami', 'Penn': 'Pennsylvania', 
                'Mount St Mary\'s': 'Mount St Marys', 'Cal Irvine': 'UC Irvine', 'UCF': 'Central Florida',
                'Green Bay': 'Wisconsin Green Bay', 'Milwaukee': 'Wisconsin Milwaukee', 'UTSA': 'Texas San Antonio',
                'UC Santa Barbara': 'Santa Barbara', 'Southern Miss': 'Southern Mississippi',
                'UT Arlington': 'Texas Arlington', 'Saint Peter\'s': 'St Peters', 
                'LIU Brooklyn': 'Long Island Brooklyn', 'Loyola MD': 'Loyola Maryland'}
season_stats_df.replace(to_replace=substitutions, value=None, inplace=True)

stat_teams = set(season_stats_df['TEAM'].values)
tournament_teams = set(tournament_stats_df['Team_1'].values)
tournament_teams.update(tournament_stats_df['Team_2'].values)
assert len(tournament_teams.difference(stat_teams)) == 0, "Teams in Tournament, but not in Stats: {}".format(tournament_teams.difference(stat_teams))

# duplicate games
if DUPLICATE_GAMES:
    cols2dup = ['Seed_1', 'Score_1', 'Team_1'], ['Seed_2', 'Score_2', 'Team_2']
    swap = tournament_stats_df.rename(columns={**dict(zip(cols2dup[0], cols2dup[1])), **dict(zip(cols2dup[1], cols2dup[0]))})
    tournament_stats_df = tournament_stats_df.append(swap).sort_index(ignore_index=True)

# add win column (1 if team_1 won; 0 if team_2 won)
tournament_stats_df['winner'] = 0
tournament_stats_df.loc[tournament_stats_df['Score_1'] > tournament_stats_df['Score_2'], 'winner'] = 1

# add win_rate column
season_stats_df['WR'] = season_stats_df['W'] / season_stats_df['G']

# add conference embedding
if ENCODE_CONF:
    season_stats_df['CONF_ID'] = season_stats_df.groupby('CONF').ngroup().add(1)

# join team_1 with season_stats
data = tournament_stats_df.merge(season_stats_df, how='left', left_on=['Year', 'Team_1'], right_on=['YEAR', 'TEAM'], validate='m:1')
data.drop(columns=['TEAM', 'YEAR'], inplace=True)

# join team_2 with season_stats
data = data.merge(season_stats_df, how='left', left_on=['Year', 'Team_2'], right_on=['YEAR', 'TEAM'], suffixes=('_team_1', '_team_2'), validate='m:1')
data.drop(columns=['TEAM', 'YEAR'], inplace=True)

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1512 entries, 0 to 1511
Data columns (total 57 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            1512 non-null   int64  
 1   Round           1512 non-null   int64  
 2   Region Number   1512 non-null   int64  
 3   Region Name     1512 non-null   object 
 4   Seed_1          1512 non-null   int64  
 5   Score_1         1512 non-null   int64  
 6   Team_1          1512 non-null   object 
 7   Team_2          1512 non-null   object 
 8   Score_2         1512 non-null   int64  
 9   Seed_2          1512 non-null   int64  
 10  winner          1512 non-null   int64  
 11  ADJOE_team_1    1512 non-null   float64
 12  ADJDE_team_1    1512 non-null   float64
 13  BARTHAG_team_1  1512 non-null   float64
 14  RECORD_team_1   1512 non-null   object 
 15  W_team_1        1512 non-null   int64  
 16  G_team_1        1512 non-null   int64  
 17  EFG_O_team_1    1512 non-null   f

## Machine Learning
We want to predict the the column _winner_ using  the rest of the stat columns.

In [332]:
def get_classifier_best(Classifier, X, y, tune=None, **params):
    ''' Find an approximate best score from clf on X and y.
    '''
    print('Training classifier \'{}\'...'.format(Classifier.__name__))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=SEED, shuffle=True)
    clf = Classifier(**params)
    best_score = 0

    if tune:
        search = GridSearchCV(clf, tune, refit=True)
        search.fit(X_train, y_train)

        best_params = search.best_params_
        best_score = search.best_score_
        print('{} scores: {}\twith {}'.format(Classifier.__name__, best_score, best_params))
        clf = search.best_estimator_
    else:
        cv_results = cross_validate(clf, X, y, cv=5)
        scores = cv_results['test_score']
        best_score = np.mean(scores)
        print('{} scores: {} ± {}'.format(Classifier.__name__, best_score, np.std(scores)))

        clf.fit(X_train, y_train)

    #y_true, y_pred = y_test, clf.predict(X_test)
    #print(classification_report(y_true, y_pred))
    
    if hasattr(clf, 'feature_importances_') and hasattr(clf, 'feature_names_in_'):
        importances = sorted(zip(clf.feature_names_in_, clf.feature_importances_), key=lambda x: x[1], reverse=True)
        print('Feature Importances: {}'.format(importances))
    
    return clf, best_score

In [333]:
# prepare data set
stat_features = ['G', 'W', 'ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O', 'EFG_D', 'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD',
                '2P_O', '2P_D', '3P_O', '3P_D', 'ADJ_T']
if ENCODE_CONF:
    stat_features.append('CONF_ID')
stat_features = [x + suffix for suffix in ('_team_1', '_team_2') for x in stat_features]
features = ['Round', 'Seed_1', 'Seed_2', *stat_features]

# normalize columns
normalized_features = ['Seed_1', 'Seed_2', *stat_features]
#normalized_features = [x + suffix for suffix in ('_team_1', '_team_2') for x in normalized_features]
scaler = StandardScaler()
if NORMALIZE:
    scaler.fit(data[normalized_features])
    data[normalized_features] = scaler.transform(data[normalized_features])

X, y = data[features], data['winner']

# dimensionality reduction
dimensions_reduced = False
if NUM_FEATURES and NUM_FEATURES < len(features):
    assert NUM_FEATURES > 0, 'Must have positive number of features'
    dimensions_reduced = True
    
    pca = PCA(n_components=NUM_FEATURES)
    X = pca.fit_transform(X)
    print('Reduced to {} features.'.format(pca.n_components_))
    

In [337]:
# run ML models
models = []

tune_params = [{'strategy': ['uniform'], 'random_state': [SEED]}]
result = get_classifier_best(DummyClassifier, X, y, tune=None, strategy='uniform')
models.append(result)

tune_params = [{'fit_intercept': [True, False], 'positive': [True, False]}]
result = get_classifier_best(LinearRegression, X, y, tune=tune_params)
models.append(result)

print()
tune_params = [{'learning_rate': [0.01,0.05,0.1,0.15], 'n_estimators': [50,100,200]}]
result = get_classifier_best(GradientBoostingClassifier, X, y, tune=tune_params)
models.append(result)

print()
tune_params = {'C': [0.01, 0.1, 1.0, 2.0, 10.0, 20.0], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
result = get_classifier_best(SVC, X, y, tune=tune_params)
models.append(result)

print()
tune_params = {'loss': ['hinge','log','modified_huber','squared_hinge'], 'alpha': [0.0001, 0.001], 
    'learning_rate': ['optimal'], 'max_iter': [1E5]}
result = get_classifier_best(SGDClassifier, X, y, tune=tune_params)
models.append(result)

print()
tune_params = {}
result = get_classifier_best(GaussianNB, X, y, tune=tune_params)
models.append(result)

print()
tune_params = {}
result = get_classifier_best(Perceptron, X, y, tune=tune_params)
models.append(result)

print()
tune_params = {'solver': ['lbfgs', 'adam'], 'batch_size': [16,32], 'hidden_layer_sizes': [(128), (128,64), (128,128,64), (512, 512, 128, 64)], 'max_iter': [10000]}
result = get_classifier_best(MLPClassifier, X, y, tune=tune_params)
models.append(result)

print()
tune_params = {'n_estimators': [20,50,100,200], 'learning_rate': [0.01,0.1,0.5]}
result = get_classifier_best(AdaBoostClassifier, X, y, tune=tune_params)
models.append(result)

print()
tune_params = {'n_neighbors': [3, 15, 25, 31], 'weights': ['uniform', 'distance'], 'p': [1, 2, 3]}
result = get_classifier_best(KNeighborsClassifier, X, y, tune=tune_params)
models.append(result)

print()
tune_params = {'criterion': ['entropy', 'gini'], 'splitter': ['best', 'random']}
result = get_classifier_best(DecisionTreeClassifier, X, y, tune=tune_params)
models.append(result)

print()
tune_params = {'n_estimators': [50, 100, 200], 'criterion': ['entropy', 'gini']}
result = get_classifier_best(RandomForestClassifier, X, y, tune=tune_params)
models.append(result)

print()
tune_params = {'max_iter_predict': [10, 100, 200]}
result = get_classifier_best(GaussianProcessClassifier, X, y, tune=tune_params)
models.append(result)

best_model, best_score = max(models, key=lambda x: x[1])
print('\nSelecting \'{}\' as best model with score: {}'.format(best_model.__class__.__name__, best_score))

Training classifier 'DummyClassifier'...
DummyClassifier scores: 0.5079251633772648 ± 0.012154371254998228
Training classifier 'LinearRegression'...
LinearRegression scores: 0.2471556717003315	with {'fit_intercept': True, 'positive': False}

Training classifier 'GradientBoostingClassifier'...
GradientBoostingClassifier scores: 0.7035019455252918	with {'learning_rate': 0.01, 'n_estimators': 50}
Feature Importances: [('BARTHAG_team_2', 0.4055078849702826), ('BARTHAG_team_1', 0.38163695823873645), ('Seed_1', 0.12405866806084544), ('ADJOE_team_1', 0.031812075048366105), ('Seed_2', 0.01742546405913501), ('ADJDE_team_1', 0.010422295842411406), ('ADJOE_team_2', 0.005949237903461367), ('FTR_team_1', 0.005778330042747228), ('ORB_team_1', 0.00519844572157676), ('3P_O_team_2', 0.0037321625707840344), ('FTR_team_2', 0.0036777897701690252), ('3P_D_team_2', 0.0022489891260217346), ('2P_D_team_1', 0.0011898167230785582), ('DRB_team_2', 0.0007145172292130313), ('3P_D_team_1', 0.0006473646931713006), (

## Creating a Bracket
Now we can use the model to fill out this years tournament bracket.

In [335]:
def get_winner(team1, team2, round, model, dataset):
    assert team1[1] in dataset['TEAM'].values, '{} not in data set'.format(team1[1])
    assert team2[1] in dataset['TEAM'].values, '{} not in data set'.format(team2[1])

    columns = ['G', 'W', 'ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O', 'EFG_D', 'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD',
                '2P_O', '2P_D', '3P_O', '3P_D', 'ADJ_T']
    if ENCODE_CONF:
        columns.append('CONF_ID')

    # ['Round', 'Seed_1', 'Seed_2', *stat_features]
    game_df = pd.DataFrame({'Round': [round], 'Seed_1': [team1[0]], 'Seed_2': [team2[0]]})
    stats_team_1 = dataset[dataset['TEAM'] == team1[1]][columns].reset_index(drop=True)
    stats_team_1 = stats_team_1.add_suffix('_team_1')
    stats_team_2 = dataset[dataset['TEAM'] == team2[1]][columns].reset_index(drop=True)
    stats_team_2 = stats_team_2.add_suffix('_team_2')
    joined_df = pd.concat([game_df, stats_team_1, stats_team_2], axis=1)

    if NORMALIZE:
        joined_df[normalized_features] = scaler.transform(joined_df[normalized_features])
    
    pred = model.predict(joined_df)
    return team1 if pred == 1 else team2


def get_winners(games, round, model, dataset):
    return [get_winner(t1, t2, round, model, dataset) for (t1, t2) in games]

def simulate_tournament(bracket, model, dataset):

    results = {
        'west': [bracket['west']], 'east': [bracket['east']], 
        'south': [bracket['south']], 'midwest': [bracket['midwest']]
        }

    # go to final four (i.e. simulate each region until one team is left)
    for region, brkt in bracket.items():
        current_games = brkt
        for round in range(1, 5):
            current_games = get_winners(current_games, round, model, dataset)
            if len(current_games) > 1:
                current_games = [x for x in zip(current_games[::2], current_games[1::2])]
            results[region].append(current_games)

    # now do final-four, championship, and winner
    results['final-four'] = [(results['west'][-1][0], results['east'][-1][0]), (results['south'][-1][0], results['midwest'][-1][0])]
    
    championship = get_winners(results['final-four'], 5, model, dataset)
    results['championship'] = (championship[0], championship[1])
    results['champion'] = get_winners([results['championship']], 6, model, dataset)[0]

    return results


def print_bracket(brkt, fp):

    for region in ['west', 'east', 'south', 'midwest']:
        print(region.upper(), file=fp)

        line_ending = ' ---'
        format_team = '({rank}) {team}'

        rounds = brkt[region]

        # compute widths
        max_widths = []
        for round in rounds[:-1]:
            widths = []
            for game in round:
                team1, team2 = game
                widths.append(len( format_team.format(rank=team1[0], team=team1[1]) ))
                widths.append(len( format_team.format(rank=team2[0], team=team2[1]) ))

            max_widths.append(max(widths))

        def print_line(lineno):
            round = int(math.log(lineno - (lineno & lineno - 1), 2))
            indent = ' ' * (sum(max_widths[0:round]) + round*len(line_ending))

            if round >= len(rounds):
                return
            
            idx = (lineno//2**round)//2
            games = rounds[round]
            game = games[idx // 2]
            if round == len(rounds)-1:
                team = games[0]
            else:
                team = game[idx % 2]

            print(indent + format_team.format(rank=team[0], team=team[1]) + line_ending, file=fp)

        for lineno in range(len(rounds[0])*4):
            print_line(lineno+1)

        print('\n', file=fp)

    print('Final Four + Championship', file=fp)
    final_four = brkt['final-four']
    championship = brkt['championship']
    champion = brkt['champion']
    indent1 = ' '*25
    indent2 = ' '*55
    print(format_team.format(rank=final_four[0][0][0], team=final_four[0][0][1]).ljust(20) + line_ending, file=fp)
    print(indent1 + format_team.format(rank=championship[0][0], team=championship[0][1]).ljust(20) + line_ending, file=fp)
    print(format_team.format(rank=final_four[0][1][0], team=final_four[0][1][1]).ljust(20) + line_ending, file=fp)
    print(indent2 + format_team.format(rank=champion[0], team=champion[1]).ljust(20), file=fp)
    print(format_team.format(rank=final_four[1][0][0], team=final_four[1][0][1]).ljust(20) + line_ending, file=fp)
    print(indent1 + format_team.format(rank=championship[1][0], team=championship[1][1]).ljust(20) + line_ending, file=fp)
    print(format_team.format(rank=final_four[1][1][0], team=final_four[1][1][1]).ljust(20) + line_ending, file=fp)


In [336]:
# 2022 brackets
WEST = [((1, 'Gonzaga'), (16, 'Georgia St.')),
	((8, 'Boise St.'), (9, 'Memphis')),
	((5, 'Connecticut'), (12, 'New Mexico St.')),
	((4, 'Arkansas'), (13, 'Vermont')),
	((6, 'Alabama'), (11, 'Rutgers')),
	((3, 'Texas Tech'), (14, 'Montana St.')),
	((7, 'Michigan St.'), (10, 'Davidson')),
	((2, 'Duke'), (15, 'Cal St. Fullerton'))
	]
	
SOUTH = [((1, 'Arizona'), (16, 'Wright St.')),
	((8, 'Seton Hall'), (9, 'TCU')),
	((5, 'Houston'), (12, 'UAB')),
	((4, 'Illinois'), (13, 'Chattanooga')),
	((6, 'Colorado St.'), (11, 'Michigan')),
	((3, 'Tennessee'), (14, 'Longwood')),
	((7, 'Ohio St.'), (10, 'Loyola Chicago')),
	((2, 'Villanova'), (15, 'Delaware'))
	]
	
EAST = [((1, 'Baylor'), (16, 'Norfolk St.')),
	((8, 'North Carolina'), (9, 'Marquette')),
	((5, 'Saint Mary\'s'), (12, 'Indiana')),
	((4, 'UCLA'), (13, 'Akron')),
	((6, 'Texas'), (11, 'Virginia Tech')),
	((3, 'Purdue'), (14, 'Yale')),
	((7, 'Murray St.'), (10, 'San Francisco')),
	((2, 'Kentucky'), (15, 'Saint Peter\'s'))
	]

MIDWEST = [((1, 'Kansas'), (16, 'Texas Southern')),
	((8, 'San Diego St.'), (9, 'Creighton')),
	((5, 'Iowa'), (12, 'Richmond')),
	((4, 'Providence'), (13, 'South Dakota St.')),
	((6, 'LSU'), (11, 'Iowa St.')),
	((3, 'Wisconsin'), (14, 'Colgate')),
	((7, 'USC'), (10, 'Miami FL')),
	((2, 'Auburn'), (15, 'Jacksonville St.'))
	]

R64 = {'west': WEST, 'south': SOUTH, 'east': EAST, 'midwest': MIDWEST}

team_stats_df = pd.read_csv(path_join('data', 'raw_cbb.csv'), index_col=0)
if ENCODE_CONF:
	team_stats_df['CONF_ID'] = team_stats_df.groupby('CONF').ngroup().add(1)

makedirs('results', exist_ok=True)
for model, score in models:
	fpath = path_join('results', '{}_tournament.txt'.format(model.__class__.__name__))
	with open(fpath, 'w') as fp:
		fp.write('model: {}\n'.format(model.__class__.__name__))
		fp.write('score: {}\n'.format(score))
		fp.write('bracket: \n')
		
		brkt = simulate_tournament(R64, model, team_stats_df[team_stats_df['YEAR'] == 2022])

		print_bracket(brkt, fp)