# Simple linear regression
Use this as the baseline. Any fancier algorithm should be able to beat it.
## Input
- Average score
- Average FGA
- Average FGM
- Average 3FPA
- Average 3FPM
- Seed
## Output
- Probability of winning

In [1]:
# Import src into Python path
import os, sys
src_dir = os.path.join(os.path.dirname(os.path.abspath('')), 'src')
if not src_dir in sys.path:
  sys.path.append(src_dir)

In [26]:
import constants as const
import util
import pandas as pd
import numpy as np

from tqdm import tqdm

datapath = os.path.join('..', 'data', 'march-machine-learning-mania-2023')

def load_csv(filename):
  df = pd.read_csv(filename)
  if 'Season' in df.columns:
    df.drop(df[df['Season'] < const.first_season].index, inplace=True)
  return df.reset_index()

In [3]:
# Make a dictionary of conference IDs
conf_df = load_csv(os.path.join(datapath, 'Conferences.csv'))
conf_df['ConfID'] = conf_df.index
conf_dict = dict(zip(conf_df.ConfAbbrev, conf_df.ConfID))

In [4]:
# Load teams and add conference ID
team_df = load_csv(os.path.join(datapath, 'MTeamConferences.csv'))
team_df['ConfId'] = list(map(conf_dict.get, team_df['ConfAbbrev']))
team_df

Unnamed: 0,index,Season,TeamID,ConfAbbrev,ConfId
0,5407,2003,1102,mwc,34
1,5408,2003,1103,mac,27
2,5409,2003,1104,sec,42
3,5410,2003,1105,swac,47
4,5411,2003,1106,swac,47
...,...,...,...,...,...
7250,12657,2023,1473,ovc,37
7251,12658,2023,1474,a_sun,0
7252,12659,2023,1475,ovc,37
7253,12660,2023,1476,nec,36


In [69]:
# Add average fields to teams
base_fields = ['Score', 'FGA', 'FGM', 'FGA3', 'FGM3']

for_fields = [f'For{f}Avg' for f in base_fields]
opp_fields = [f'Opp{f}Avg' for f in base_fields]

wavg_fields = [f'W{f}' for f in for_fields] + [f'W{f}' for f in opp_fields]
lavg_fields = [f'L{f}' for f in for_fields] + [f'L{f}' for f in opp_fields]

w_fields = [f'W{f}' for f in base_fields]
l_fields = [f'L{f}' for f in base_fields]

team_df[for_fields] = np.nan
team_df[opp_fields] = np.nan
team_df['NumGames'] = np.nan

In [70]:
# Load box scores and add averages to teams
box_df = load_csv(os.path.join(datapath, 'MRegularSeasonDetailedResults.csv'))
box_df[wavg_fields] = np.nan
box_df[lavg_fields] = np.nan
box_df['WNumGames'] = np.nan
box_df['LNumGames'] = np.nan

def compute_avg(games, is_winner, new_fields):
  return games[w_fields].where(
    is_winner,
    games[l_fields].values
  ).mean().rename(
    dict(zip(w_fields, new_fields))
  )

def get_team_averages(games, team):
  is_winner = games['WTeamID'] == team
  for_avg = compute_avg(games, is_winner, for_fields)
  opp_avg = compute_avg(games, ~is_winner, opp_fields)
  return for_avg, opp_avg

In [76]:
team = 1210
seas = 2013
seas_df = box_df.loc[seas == box_df['Season']].reset_index()
games = seas_df[(seas_df['WTeamID'] == team) | (seas_df['LTeamID'] == team)]
for_avg, opp_avg = get_team_averages(games, team)

pd.concat([for_avg, opp_avg])

ForScoreAvg    64.290323
ForFGAAvg      57.774194
ForFGMAvg      24.225806
ForFGA3Avg     17.548387
ForFGM3Avg      5.580645
OppScoreAvg    62.354839
OppFGAAvg      55.322581
OppFGMAvg      22.161290
OppFGA3Avg     18.161290
OppFGM3Avg      5.870968
dtype: float64

In [81]:
team_df.loc[i, for_fields + opp_fields] = pd.concat([for_avg, opp_avg])
team_df.loc[i]

index               5985
Season              2004
TeamID              1385
ConfAbbrev      big_east
ConfId                 7
ScoreFor       61.962963
FGAFor         59.555556
FGMFor          23.62963
FGA3For        14.111111
FGM3For         4.444444
ScoreOpp       70.888889
FGAOpp         56.518519
FGMOpp          24.62963
FGA3Opp        19.481481
FGM3Opp         7.148148
NumGames             NaN
ForScoreAvg    64.290323
ForFGAAvg      57.774194
ForFGMAvg      24.225806
ForFGA3Avg     17.548387
ForFGM3Avg      5.580645
OppScoreAvg    62.354839
OppFGAAvg      55.322581
OppFGMAvg       22.16129
OppFGA3Avg      18.16129
OppFGM3Avg      5.870968
Name: 578, dtype: object

In [41]:
# Compute averages
iterable = zip(team_df.index, team_df['Season'], team_df['TeamID'])
seas_df = None
for (i, seas, team) in tqdm(iterable, total=team_df.shape[0]):
  if seas_df is None or seas_df['Season'][0] != seas:
    seas_df = box_df.loc[seas == box_df['Season']].reset_index()

  # Get full season averages
  games = seas_df[(seas_df['WTeamID'] == team) | (seas_df['LTeamID'] == team)]
  for_avg, opp_avg = get_team_averages(games, team)
  team_df.loc[i, for_fields] = for_avg
  team_df.loc[i, opp_fields] = opp_avg
  team_df.loc[i, 'NumGames'] = games.shape[0]

  # Get averages minus each game
  for (j, day) in zip(games.index, games['DayNum']):
    for_avg, opp_avg = get_team_averages(games.loc[games['DayNum'] != day], team)
    fields = wavg_fields if games.loc[j, 'WTeamID'] == team else lavg_fields
    box_df.loc[j, for_fields] = for_avg
    box_df.loc[j, opp_fields] = opp_avg
    box_df.loc[j, 'NumGames'] = games.shape[0] - 1

team_df

  8%|▊         | 578/7255 [01:36<18:31,  6.01it/s]


KeyboardInterrupt: 

In [24]:
team_df.tail(25)

Unnamed: 0,index,Season,TeamID,ConfAbbrev,ConfId,ScoreFor,FGAFor,FGMFor,FGA3For,FGM3For,ScoreOpp,FGAOpp,FGMOpp,FGA3Opp,FGM3Opp,NumGames
7230,12637,2023,1453,horizon,23,59.137931,51.551724,20.448276,22.068966,6.586207,76.413793,56.965517,27.655172,19.758621,6.827586,29.0
7231,12638,2023,1454,horizon,23,74.28,58.68,26.36,22.84,8.04,75.48,64.16,26.4,27.68,8.68,25.0
7232,12639,2023,1455,aac,2,69.961538,57.653846,25.269231,21.730769,6.461538,67.5,61.0,24.307692,24.307692,7.884615,26.0
7233,12640,2023,1456,caa,14,64.703704,54.740741,23.518519,19.666667,7.148148,72.481481,56.592593,26.777778,21.333333,7.555556,27.0
7234,12641,2023,1457,big_south,10,71.666667,51.074074,23.666667,23.0,8.740741,75.0,58.444444,27.851852,22.185185,7.925926,27.0
7235,12642,2023,1458,big_ten,11,64.653846,57.153846,23.5,23.884615,8.576923,63.230769,53.5,23.615385,18.961538,6.346154,26.0
7236,12643,2023,1459,southern,43,71.961538,56.615385,26.115385,21.961538,7.153846,73.807692,57.769231,27.230769,24.576923,8.615385,26.0
7237,12644,2023,1460,horizon,23,78.888889,63.074074,30.814815,17.333333,6.074074,75.074074,62.592593,27.962963,23.62963,7.703704,27.0
7238,12645,2023,1461,mwc,34,67.884615,53.576923,23.576923,24.192308,8.307692,71.384615,55.461538,25.538462,22.615385,8.230769,26.0
7239,12646,2023,1462,big_east,7,81.857143,61.142857,30.428571,19.214286,7.571429,74.785714,62.142857,27.678571,21.071429,7.678571,28.0


In [25]:
def remove_game_influence(box_df, team_df, team, seas, daynum):
  team_entry = team_df[(team_df['TeamID'] == team) & (team_df['Season'] == seas)]
  games = util.find_team_games(box_df, seas, team)[]
  games.loc[games['DayNum'] == daynum]
  return team_entry

remove_game_influence(team_df, 1455, 2023, 108)

Unnamed: 0,index,Season,TeamID,ConfAbbrev,ConfId,ScoreFor,FGAFor,FGMFor,FGA3For,FGM3For,ScoreOpp,FGAOpp,FGMOpp,FGA3Opp,FGM3Opp,NumGames
7232,12639,2023,1455,aac,2,69.961538,57.653846,25.269231,21.730769,6.461538,67.5,61.0,24.307692,24.307692,7.884615,26.0


In [33]:
games = util.find_team_games(box_df, 2023, 1455)
game = games.loc[games['DayNum'] == 108]
compute_avg(game, game['WTeamID'] == 1455, for_fields)

ScoreFor    79.0
FGAFor      58.0
FGMFor      27.0
FGA3For     26.0
FGM3For     10.0
dtype: float64

In [None]:
# Split into test & train sets
def split_dataframe(df, train_ratio):
  is_train = np.random.random(df.shape[0]) < train_ratio
  return df.loc[is_train], df.loc[~is_train]

train_box_df, test_box_df = split_dataframe(box_df, 0.9)

In [None]:
# Import src into Python path
import os, sys
src_dir = os.path.join(os.path.dirname(os.path.abspath('')), 'src')
if not src_dir in sys.path:
  sys.path.append(src_dir)

In [62]:
# Determine which teams are in the tournament
seed_df = pd.read_csv(os.path.join(datapath, 'MNCAATourneySeeds.csv'))
teams = list(np.sort(seed_df[seed_df['Season'] == 2019]['TeamID']))