In [12]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.svm import SVC

In [13]:
data_path = "data/"

seeds = pd.read_csv(data_path + 'MNCAATourneySeeds.csv')
teams = pd.read_csv(data_path + 'MTeams.csv')
reg_szn = pd.read_csv(data_path + 'MRegularSeasonDetailedResults.csv')
tourney_seeds = pd.read_csv(data_path + '2024_tourney_seeds.csv')
tourney_results = pd.read_csv(data_path + 'MNCAATourneyDetailedResults.csv')

In [18]:
#Shuffle classes
results = tourney_results[['Season', 'WTeamID', 'LTeamID']].rename(columns={'WTeamID': 'Team1', 'LTeamID': 'Team2'})
shuffled_games = results.sample(frac=1)
win_2 = shuffled_games[:results.shape[0]//2].rename(columns={'Team1': 'Team2', 'Team2': 'Team1'})[['Season', 'Team1', 'Team2']]
win_2['won/lost'] = 0
win_1 = shuffled_games.sample(frac=1)[results.shape[0]//2:]
win_1['won/lost'] = 1
train_data = pd.concat((win_2, win_1))

In [19]:
train_data

Unnamed: 0,Season,Team1,Team2,won/lost
988,2018,1382,1196,0
1046,2018,1242,1437,0
1093,2019,1326,1222,0
880,2016,1421,1437,0
1065,2019,1330,1345,0
...,...,...,...,...
281,2007,1397,1253,1
259,2007,1207,1125,1
1092,2019,1181,1416,1
581,2012,1378,1143,1


In [26]:
def convert_seed(s):
    """
    converts the the string s given by the seed of a particular
    team in the seeds DataFrame into the integer equivalent so
    that the seeding of a team can be accounted for in the model.
    """
    if s[1] == '0':
        return int(s[2])
    else:
        return int(s[1:3])

In [27]:
def preprocess_train(tourney_games, regular_season, seeds):
    results = tourney_games[['Season', 'WTeamID', 'LTeamID']].rename(columns={'WTeamID': 'Team1', 'LTeamID': 'Team2'})
    shuffled_games = results.sample(frac=1)
    win_2 = shuffled_games[:results.shape[0]//2].rename(columns={'Team1': 'Team2', 'Team2': 'Team1'})[['Season', 'Team1', 'Team2']]
    win_2['won/lost'] = 0
    win_1 = shuffled_games.sample(frac=1)[results.shape[0]//2:]
    win_1['won/lost'] = 1
    train_data = pd.concat((win_2, win_1))
    
    Drop_cols = ['DayNum', 'LTeamID', 'NumOT', 'WStl', 'WBlk', 'WPF', 'LAst', 'LStl', 'LBlk', 'LPF', 'LFTA', 'LFTM']
    
    #aggregate team stats by wins
    w_updated = regular_season.rename(columns={'WTeamID': 'TeamID'})
    w_stat_totals = w_updated.groupby(['Season', 'TeamID']).sum()
    w_stat_totals = w_stat_totals.drop(columns=Drop_cols)
    w_stat_totals = w_stat_totals.rename(columns={'WScore': 'Pts', 'LScore': 'OPts',
                                        'WFGM': 'FGM', 'WFGA': 'FGA', 'WFGM3': 'FGM3', 
                                        'WFGA3': 'FGA3', 'WOR': 'OR', 'WDR': 'DR', 'WAst': 'Ast',
                                        'WTO': 'TO', 'LFGM': 'OFGM', 'LFGA': 'OFGA',
                                        'LFGM3': 'OFGM3', 'LFGA3': 'OFGA3', 'WFTM': 'FTM',
                                        'WFTA': 'FTA', 'LOR': 'OOR', 'LDR': 'ODR', 'LTO': 'OTO'})
    
    #aggregate team stats by losses
    updated = regular_season.rename(columns={'LTeamID': 'TeamID'})
    l_stat_totals = updated.groupby(['Season', 'TeamID']).sum()
    Drop_co = ['DayNum', 'WTeamID', 'NumOT', 'LStl', 'LBlk', 'LPF', 'WAst', 'WStl', 'WBlk', 'WPF', 'WFTA', 'WFTM']
    l_stat_totals = l_stat_totals.drop(columns=Drop_co)
    l_stat_totals = l_stat_totals.rename(columns={'WScore': 'OPts', 'LScore': 'Pts',
                                        'WFGM': 'OFGM', 'WFGA': 'OFGA', 'WFGM3': 'OFGM3', 
                                        'WFGA3': 'OFGA3', 'WOR': 'OOR', 'WDR': 'ODR', 'LAst': 'Ast',
                                        'WTO': 'OTO', 'LFGM': 'FGM', 'LFGA': 'FGA',
                                        'LFGM3': 'FGM3', 'LFGA3': 'FGA3', 'LFTM': 'FTM',
                                        'LFTA': 'FTA', 'LOR': 'OR', 'LDR': 'DR', 'LTO': 'TO'})
    index1 = pd.MultiIndex.from_tuples([(2015, 1246)], names=["Season", "TeamID"])
    index2 = pd.MultiIndex.from_tuples([(2014, 1455)], names=["Season", "TeamID"])
    index3 = pd.MultiIndex.from_tuples([(2021, 1211)], names=["Season", "TeamID"])
    kentucky_tot = pd.DataFrame([[0]*l_stat_totals.shape[1]], columns = list(l_stat_totals.columns), index = index1)
    wichita_tot = pd.DataFrame([[0]*l_stat_totals.shape[1]], columns = list(l_stat_totals.columns), index = index2)
    gonzaga_tot = pd.DataFrame([[0]*l_stat_totals.shape[1]], columns = list(l_stat_totals.columns), index = index3)
    l_stat_totals = pd.concat((l_stat_totals, kentucky_tot))
    l_stat_totals = pd.concat((l_stat_totals, wichita_tot))
    l_stat_totals = pd.concat((l_stat_totals, gonzaga_tot))
    stat_totals = l_stat_totals.add(w_stat_totals)
    
    #aggregating number of wins and losses by team
    w_tot_games = w_updated.groupby(['Season', 'TeamID']).count()

    l_tot_games = updated.groupby(['Season', 'TeamID']).count()
    idx1 = pd.MultiIndex.from_tuples([(2015, 1246)], names=["Season", "TeamID"])
    idx2 = pd.MultiIndex.from_tuples([(2014, 1455)], names=["Season", "TeamID"])
    idx3 = pd.MultiIndex.from_tuples([(2021, 1211)], names=["Season", "TeamID"])
    kentucky_num = pd.DataFrame([[0]*l_tot_games.shape[1]], columns = list(l_tot_games.columns), index = idx1)
    #Kentucky was undefeated entering the tournament in 2015
    wichita_num = pd.DataFrame([[0]*l_tot_games.shape[1]], columns = list(l_tot_games.columns), index = idx2)
    #Wichita State was undefeated entering the tournament in 2014
    gonzaga_num = pd.DataFrame([[0]*l_tot_games.shape[1]], columns = list(l_tot_games.columns), index = idx3)
    #Gonzaga was undefeated entering the tournament in 2021
    Drops = ['DayNum', 'NumOT', 'WStl', 'WBlk', 'WPF', 'LAst', 'LStl', 'LBlk', 'LPF', 'LFTA', 'LFTM', 'TeamID', 'WTeamID', 'WLoc']
    l_tot_games = pd.concat((l_tot_games, kentucky_num))
    l_tot_games = pd.concat((l_tot_games, wichita_num))
    l_tot_games = pd.concat((l_tot_games, gonzaga_num))
    
    l_tot_games['losses'] = l_tot_games['DayNum']
    l_tot_games['wins'] = 0
    w_tot_games['wins'] = w_tot_games['DayNum']
    w_tot_games['losses'] = 0
    
    Drops = ['DayNum', 'NumOT', 'WStl', 'WBlk', 'WPF', 'LAst', 'LStl', 'LBlk', 'LPF', 'LFTA', 'LFTM', 'WTeamID', 'WLoc', 'LTeamID']
    games_count = l_tot_games + w_tot_games
    games_count = games_count.rename(columns={'WScore': 'Pts', 'LScore': 'OPts',
                                        'WFGM': 'FGM', 'WFGA': 'FGA', 'WFGM3': 'FGM3', 
                                        'WFGA3': 'FGA3', 'WOR': 'OR', 'WDR': 'DR', 'WAst': 'Ast',
                                        'WTO': 'TO', 'LFGM': 'OFGM', 'LFGA': 'OFGA',
                                        'LFGM3': 'OFGM3', 'LFGA3': 'OFGA3', 'WFTM': 'FTM',
                                        'WFTA': 'FTA', 'LOR': 'OOR', 'LDR': 'ODR', 'LTO': 'OTO'})
    games_count = games_count.drop(columns=Drops).dropna()
    #drops the N/A values that represents a team that finished a season with 0 wins, as this team is assumed to have not made the tournament.
    
    #averages stats by team
    team_averages = stat_totals/games_count[games_count.columns[:-2]]
    team_averages = team_averages.merge(games_count[['wins', 'losses']], left_index=True, right_index=True)
    
    seeds['seed_conv'] = seeds['Seed'].apply(convert_seed)
    seeds = seeds[['Season', 'TeamID', 'seed_conv']]
    
    train_data = train_data.merge(seeds, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID']).drop(columns=['TeamID']).rename(columns={'seed_conv': 'Seed_1'})
    train_data = train_data.merge(seeds, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID']).drop(columns=['TeamID']).rename(columns={'seed_conv': 'Seed_2'})
    
    team_averages['FT'] = team_averages['FTM']/team_averages['FTA']*100
    team_averages['FG'] = team_averages['FGM']/team_averages['FGA']*100
    team_averages = team_averages[['Pts', 'OPts', 'FG', 'FGM3', 'FT', 'Ast', 'DR', 'OR', 'TO', 'OTO', 'wins', 'losses']]
    team_averages = team_averages.reset_index()
    
    train_data = train_data.merge(team_averages, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID']).drop(columns=['TeamID'])
    train_data.columns = np.append(train_data.columns[:6], train_data.columns[6:]+'_1')
    train_data = train_data.merge(team_averages, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID']).drop(columns=['TeamID'])
    train_data.columns = np.append(train_data.columns[:18], train_data.columns[18:]+'_2')
    train_data['W/L'] = train_data['won/lost']
    train_data = train_data.drop(columns=['won/lost'])
    
    return train_data

In [42]:
train_data = preprocess_train(tourney_results, reg_szn, seeds)

  w_stat_totals = w_updated.groupby(['Season', 'TeamID']).sum()
  l_stat_totals = updated.groupby(['Season', 'TeamID']).sum()


In [43]:
X_train, X_test, y_train, y_test = train_test_split(train_data[train_data.columns[:29]], train_data['W/L'])

In [45]:
def train_model(mdl, X, y):
    if mdl == "logistic regression":
        clf = LogisticRegression(random_state=0)
        clf.fit(X[X.columns[3:]], y)
    elif mdl == "random forest":
        clf = RandomForestClassifier(max_depth=5, random_state=0)
        clf.fit(X[X.columns[3:]], y)
    elif mdl == "svm":
        clf = SVC(gamma='auto')
        clf.fit(X[X.columns[3:]], y)
    return clf

In [None]:
def predict_outcome(team_1, team_2):
    team_averages = stat_totals/games_count[games_count.columns[:-2]]
    team_averages = team_averages.merge(games_count[['wins', 'losses']], left_index=True, right_index=True)
    
    seeds['seed_conv'] = seeds['Seed'].apply(convert_seed)
    seeds = seeds[['Season', 'TeamID', 'seed_conv']]
    
    train_data = train_data.merge(seeds, left_on=['Season', 'Team1'], right_on=['Season', 'TeamID']).drop(columns=['TeamID']).rename(columns={'seed_conv': 'Seed_1'})
    train_data = train_data.merge(seeds, left_on=['Season', 'Team2'], right_on=['Season', 'TeamID']).drop(columns=['TeamID']).rename(columns={'seed_conv': 'Seed_2'})
    
    team_averages['FT'] = team_averages['FTM']/team_averages['FTA']*100
    team_averages['FG'] = team_averages['FGM']/team_averages['FGA']*100
    team_averages = team_averages[['Pts', 'OPts', 'FG', 'FGM3', 'FT', 'Ast', 'DR', 'OR', 'TO', 'OTO', 'wins', 'losses']]
    team_averages = team_averages.reset_index()
    
    tourney_2024 = team_averages[team_averages['Season']==2024]
    tourney_teams = tourney_2024.merge(seeds, left_on=['Season', 'TeamID'], right_on=['Season', 'TeamID']).rename(columns={'seed_conv': 'Seed'})
    
    
    first = tourney_teams[tourney_teams['TeamID']==team_1]
    second = tourney_teams[tourney_teams['TeamID']==team_2]
    game = first.merge(second, on='Season').drop(columns=['Season', 'TeamID_x', 'TeamID_y'])
    cols = np.array([])
    for c in game.columns:
        if c[-1] == 'x':
            cols = np.append(cols, c[:-1] + '1')
        else:
            cols = np.append(cols, c[:-1] + '2')
    game.columns = cols
    columns_1 = np.append(game.columns[12], game.columns[25])
    columns_2 = np.append(columns_1, game.columns[:12])
    columns_3 = np.append(columns_2, game.columns[13:25])
    game = game[list(columns_3)]
    return clf.predict(game)[0]

In [46]:
def predict_prob(team_1, team_2):
    first = tourney_teams[tourney_teams['TeamName']==team_1]
    second = tourney_teams[tourney_teams['TeamName']==team_2]
    game = first.merge(second, on='Season').drop(columns=['Season', 'TeamID_x', 'TeamID_y'])
    cols = np.array([])
    for c in game.columns:
        if c[-1] == 'x':
            cols = np.append(cols, c[:-1] + '1')
        else:
            cols = np.append(cols, c[:-1] + '2')
    game.columns = cols
    columns_1 = np.append(game.columns[12], game.columns[25])
    columns_2 = np.append(columns_1, game.columns[:12])
    columns_3 = np.append(columns_2, game.columns[13:25])
    game = game[list(columns_3)]
    return clf.predict_proba(game)[0][0], clf.predict_proba(game)[0][1]

In [47]:
def predict_results(team_name):
    for i in tourney_teams['TeamID']:
        team_id = teams[teams['TeamName']==team_name]['TeamID'].iloc[0]
        output = predict_outcome(team_id, i)
        print(teams[teams['TeamID']==i]['TeamName'].iloc[0], output)