In [1]:
# Pip install fuzzywuzzy
import sys
!{sys.executable} -m pip install fuzzywuzzy

# Fuzzywuzzy module used for matching similar strings in college team names
from fuzzywuzzy import fuzz

import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from TournamentBracket import *

# Setting up multiple options per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'





In [2]:
def elo_chance(team1_elo: int, team2_elo: int) -> float:
    """Returns the probability that TEAM 1 wins based on their ELO rating."""
    diff = team2_elo - team1_elo
    return 1 / (1 + 10 ** (diff / 400))

def match_schools(series1, series2) -> dict:
    """Returns a dictionary of the most similar school names to correct for inconsistent hyphenations between websites"""
    schools_dict = {}

    for school1 in series1:
        for school2 in series2:
            ratio = fuzz.ratio(school1, school2)
            if ratio > 90:  # Adjustable threshold based on how correct we want it to be
                schools_dict[school1] = school2

    return schools_dict
    
def get_key(my_dict: dict, val):
   
    for key, value in my_dict.items():
        if val == value:
            return key
 
    return "key doesn't exist"

Adding valuable point differential data and determining the winner and loser (or tie) of every conference game.

In [3]:
sched_df = pd.read_csv('Conference_Games.csv', index_col=0)
sched_df['Score_Diff'] = sched_df['PTS'] - sched_df['PTS.1']

score_diff = list(sched_df['Score_Diff'])
visitor = list(sched_df['Visitor/Neutral'])
home = list(sched_df['Home/Neutral'])

winners = []
losers = []

for i in range(len(score_diff)):
    
    if score_diff[i] > 0:
        winners.append(visitor[i])
        losers.append(home[i])

    elif score_diff[i] < 0:
        winners.append(home[i])
        losers.append(visitor[i])

    else:
        winners.append(home[i])
        losers.append(visitor[i])

sched_df['Winner'] = winners
sched_df['Loser'] = losers
sched_df['Score_Diff'] = abs(sched_df['Score_Diff'])

sched_df = sched_df.drop(columns=['Date', 'OT', 'Notes'])
sched_df = sched_df.reindex(columns=['Home/Neutral', 'Visitor/Neutral', 'Winner', 'PTS', 'PTS.1', 'Score_Diff'])

sched_df.head(3)

Unnamed: 0,Home/Neutral,Visitor/Neutral,Winner,PTS,PTS.1,Score_Diff
0,Oregon,Southern California,Oregon,74.0,82.0,8.0
1,Oregon State,UCLA,UCLA,69.0,62.0,7.0
2,California,Arizona,Arizona,100.0,81.0,19.0


The ELO website has slightly different team names for each school. Here, we make a dictionary of the teams that are different and apply the correct ELO to each school.

In [4]:
stats_df = pd.read_csv('CombinedStats.csv', index_col=0)
elo_df = pd.read_csv('ELO_Ratings.csv', index_col=0)

# Fixing inconsistencies between websites
from replacement_dict import replacement_dict

elo_df['Team'] = elo_df['Team'].replace(replacement_dict)

stats_schools = stats_df['School']
elo_schools = elo_df['Team']

stat_to_elo_dict = match_schools(stats_schools, elo_schools)
stats_elo_scores = [elo_df[elo_df['Team'] == stat_to_elo_dict[school]]['ELO'].values[0] for school in stats_schools]

In [5]:
stats_df['ELO'] = stats_elo_scores
stats_df = stats_df.rename(columns={'Rank': 'School_id'})
stats_df.insert(2, 'ELO', stats_df.pop('ELO'))

stats_df.head(3)

Unnamed: 0,School_id,School,ELO,Games Played,Wins,Losses,W-L%,SRS,SOS,Conf W,...,O_3PAr,O_TS%,O_TRB%,O_AST%,O_STL%,O_BLK%,O_eFG%,O_TOV%,O_ORB%,O_FT/FGA
0,1,Abilene Christian,1327.19,32,15,17,0.469,-3.81,-1.17,10.0,...,0.315,0.55,52.3,46.6,8.9,7.8,0.517,18.3,29.2,0.277
1,2,Air Force,1172.15,31,9,22,0.29,-4.33,1.87,2.0,...,0.367,0.58,53.8,49.9,9.0,11.3,0.542,15.3,31.5,0.292
2,3,Akron,1439.76,34,24,10,0.706,3.02,-2.52,13.0,...,0.372,0.51,46.7,45.9,9.0,7.4,0.482,15.1,25.3,0.191


Giving each school a unique ID

In [6]:
school_id_df = pd.DataFrame()
school_id_df['School_id'] = stats_df['School_id']
school_id_df['School'] = stats_df['School']

Making sure each team (both Home and Visitors) have their IDs on the schedule_df

In [7]:
# Adding ID and ELO stats to the schedule dataframe
home_ids = [stats_df.loc[school_id_df['School']==school]['School_id'] for school in list(sched_df['Home/Neutral'])]
sched_df['Home_id'] = [ids.to_numpy()[0] if len(ids.to_numpy() > 0) else None for ids in home_ids]
sched_df['Home'] = sched_df['Home/Neutral']

visitor_ids = [stats_df.loc[school_id_df['School']==school]['School_id'] for school in list(sched_df['Visitor/Neutral'])]
sched_df['Visitor_id'] = [ids.to_numpy()[0] if len(ids.to_numpy() > 0) else None for ids in visitor_ids]
sched_df['Visitor'] = sched_df['Visitor/Neutral']

winner_ids = [stats_df.loc[school_id_df['School']==school]['School_id'] for school in list(sched_df['Winner'])]
sched_df['Winner_id'] = [ids.to_numpy()[0] if len(ids.to_numpy() > 0) else None for ids in winner_ids]

sched_df = sched_df.drop(columns=['Home/Neutral', 'Visitor/Neutral'])
sched_df.head(3)

Unnamed: 0,Winner,PTS,PTS.1,Score_Diff,Home_id,Home,Visitor_id,Visitor,Winner_id
0,Oregon,74.0,82.0,8.0,229.0,Oregon,280.0,Southern California,229.0
1,UCLA,69.0,62.0,7.0,230.0,Oregon State,324.0,UCLA,324.0
2,Arizona,100.0,81.0,19.0,40.0,California,11.0,Arizona,11.0


In [20]:
stats_df.columns
cols = ['School', 'School_id', 'ELO', 'Conf W', 'O_FTA', '3P%', 'SRS']

teams_df = stats_df[cols]
teams_df.head(3)

Index(['School_id', 'School', 'ELO', 'Games Played', 'Wins', 'Losses', 'W-L%',
       'SRS', 'SOS', 'Conf W', 'Conf L', 'Home W', 'Home L', 'Away W',
       'Away L', 'Points Tm', 'Points Opp', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', 'FT', 'FTA', 'FT%', 'ORB', 'TRB', 'AST', 'STL', 'BLK',
       'TOV', 'PF', 'O_MP', 'O_FG', 'O_FGA', 'O_FG%', 'O_3P', 'O_3PA', 'O_3P%',
       'O_FT', 'O_FTA', 'O_FT%', 'O_ORB', 'O_TRB', 'O_AST', 'O_STL', 'O_BLK',
       'O_TOV', 'O_PF', 'Pace', 'ORtg', 'FTr', '3PAr', 'TS%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'O_Pace', 'O_ORtg',
       'O_FTr', 'O_3PAr', 'O_TS%', 'O_TRB%', 'O_AST%', 'O_STL%', 'O_BLK%',
       'O_eFG%', 'O_TOV%', 'O_ORB%', 'O_FT/FGA'],
      dtype='object')

Unnamed: 0,School,School_id,ELO,Conf W,O_FTA,3P%,SRS
0,Abilene Christian,1,1327.19,10.0,723,0.341,-3.81
1,Air Force,2,1172.15,2.0,637,0.361,-4.33
2,Akron,3,1439.76,13.0,541,0.328,3.02


In [21]:
merged_df = pd.merge(sched_df, teams_df, left_on='Home', right_on='School', how='left')
merged_df = pd.merge(merged_df, teams_df, left_on='Visitor', right_on='School', how='left')
merged_df['Home Wins'] = merged_df['Winner'] == merged_df['Home']
merged_df['Home Wins'] = [int(win) for win in merged_df['Home Wins']]
merged_df = merged_df.dropna()

merged_df = merged_df.drop(columns=['Winner', 'Home', 'Visitor', 'School_x', 'School_y', 'Home_id', 'Visitor_id', 'PTS', 'PTS.1', 'Score_Diff', 'Winner_id', 'School_id_x', 'School_id_y'])
merged_df.head(3)

Unnamed: 0,ELO_x,Conf W_x,O_FTA_x,3P%_x,SRS_x,ELO_y,Conf W_y,O_FTA_y,3P%_y,SRS_y,Home Wins
0,1619.81,12.0,632.0,0.336,12.26,1505.3,8.0,626.0,0.355,9.59,1
1,1312.17,5.0,651.0,0.328,3.54,1465.55,10.0,616.0,0.332,7.73,0
2,1353.77,9.0,646.0,0.338,5.24,1667.74,15.0,530.0,0.371,25.35,0


In [22]:
X = merged_df.drop(columns=['Home Wins'])
y = merged_df['Home Wins']

## Creating Deep Neural Net

- Standardizes the features in the dataset.
- Splits the dataset into training and testing sets.
- Converts the data into PyTorch tensors.
- Defines a neural network architecture using PyTorch's nn.Module.
- Initializes the network, loss function, and optimizer.
- Trains the network on the training data for a specified number of epochs.
- Evaluates the trained network on the test data and calculates accuracy.

In [23]:
# Create an instance of StandardScaler to standardize the features in X
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [24]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.20, random_state=69)

# Convert training and testing data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

In [25]:
# Define a neural network class
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)   # Define the first fully connected layer with 64 neurons
        self.fc2 = nn.Linear(64, 32)           # Define the second fully connected layer with 32 neurons
        self.fc3 = nn.Linear(32, 1)            # Define the third fully connected layer with 1 neuron

    def forward(self, x):
        x = torch.relu(self.fc1(x))            # Apply ReLU activation function to the first layer
        x = torch.relu(self.fc2(x))            # Apply ReLU activation function to the second layer
        x = torch.sigmoid(self.fc3(x))         # Apply sigmoid activation function to the output layer
        return x

In [26]:
# Determine the input size based on the number of features
input_size = X_train_tensor.shape[1]
net = Net(input_size)

# Define binary cross-entropy loss function
criterion = nn.BCELoss()
# Use Adam optimizer with learning rate 0.001
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [27]:
# Train the neural network
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()                      # Clear gradients
    outputs = net(X_train_tensor)              # Forward pass
    loss = criterion(outputs, y_train_tensor)  # Compute the loss
    loss.backward()                            # Backward pass
    optimizer.step()                           # Update weights

    # Print loss every 10 epochs
    if (epoch+1) % 20 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [20/100], Loss: 0.6263
Epoch [40/100], Loss: 0.5474


Epoch [60/100], Loss: 0.5123
Epoch [80/100], Loss: 0.5045
Epoch [100/100], Loss: 0.5013


In [28]:
# Evaluate the neural network on the test set
with torch.no_grad():
    outputs = net(X_test_tensor)                                                  # Forward pass on the test data
    predicted = torch.round(outputs)                                              # Round the predictions to get binary outputs
    accuracy = (predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)  # Calculating accuracy
    print(f'Accuracy on test set: {accuracy:.3f}')

Accuracy on test set: 0.781


### Individual Stat Neural Network Testing

In [29]:
optimal_columns = stats_df.columns[2:]
optimal_dfs = [stats_df[['School', 'School_id', column]] for column in optimal_columns]

optimal_merged_dfs = []

for optimal_df in optimal_dfs:
    optimal_merged_df = pd.merge(sched_df, optimal_df, left_on='Home', right_on='School', how='left')
    optimal_merged_df = pd.merge(optimal_merged_df, optimal_df, left_on='Visitor', right_on='School', how='left')
    optimal_merged_df['Home Wins'] = optimal_merged_df['Winner'] == optimal_merged_df['Home']
    optimal_merged_df['Home Wins'] = [int(win) for win in optimal_merged_df['Home Wins']]
    optimal_merged_df = optimal_merged_df.dropna()

    optimal_merged_df = optimal_merged_df.drop(columns=['Winner', 'Home', 'Visitor', 'School_x', 'School_y', 'Home_id', 'Visitor_id', 'PTS', 'PTS.1', 'Score_Diff', 'Winner_id', 'School_id_x', 'School_id_y'])
    optimal_merged_dfs.append(optimal_merged_df)

optimal_X = [o_df.drop(columns=['Home Wins']) for o_df in optimal_merged_dfs]

opt_X_scaled = [scaler.fit_transform(x) for x in optimal_X]

opt_X_train_tensors = []
opt_X_test_tensors = []
opt_y_train_tensors = []
opt_y_test_tensors = []

for scaled_X in opt_X_scaled:
    # Split the dataset into training and testing sets
    opt_X_train, opt_X_test, opt_y_train, opt_y_test = train_test_split(scaled_X, y, test_size=0.20, random_state=69)

    # Convert training and testing data to PyTorch tensors
    opt_X_train_tensors.append(torch.tensor(X_train, dtype=torch.float32))
    opt_X_test_tensors.append(torch.tensor(X_test, dtype=torch.float32))
    opt_y_train_tensors.append(torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1))
    opt_y_test_tensors.append(torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1))

opt_nets = [Net(input_size) for opt_X_test_tensor in opt_X_test_tensors]

for opt_X_train_tensor, opt_y_train_tensor, opt_net in zip(opt_X_train_tensors, opt_y_train_tensors, opt_nets):
    for epoch in range(epochs):
        optimizer.zero_grad()                      # Clear gradients
        outputs = opt_net(opt_X_train_tensor)              # Forward pass
        loss = criterion(outputs, opt_y_train_tensor)  # Compute the loss
        loss.backward()                            # Backward pass
        optimizer.step()                           # Update weights

        # Print loss every 10 epochs
        if (epoch+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

i = 0
accuracies = {}
for opt_net, opt_X_test_tensor, opt_y_test_tensor in zip(opt_nets, opt_X_test_tensors, opt_y_test_tensors):
    with torch.no_grad():
        outputs = opt_net(opt_X_test_tensor)                                                  # Forward pass on the test data
        predicted = torch.round(outputs)                                              # Round the predictions to get binary outputs
        accuracy = (predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)  # Calculating accuracy
        accuracies[i] = accuracy
        print(f'Accuracy on test set: {accuracy:.3f}    Stat: {optimal_columns[i]}')
        i += 1

sorted(list(accuracies.values()))[::-1]

Epoch [100/100], Loss: 0.7050
Epoch [100/100], Loss: 0.7098
Epoch [100/100], Loss: 0.6925
Epoch [100/100], Loss: 0.6916
Epoch [100/100], Loss: 0.6761
Epoch [100/100], Loss: 0.7087
Epoch [100/100], Loss: 0.6986
Epoch [100/100], Loss: 0.6934
Epoch [100/100], Loss: 0.6940
Epoch [100/100], Loss: 0.7079
Epoch [100/100], Loss: 0.6930
Epoch [100/100], Loss: 0.7273
Epoch [100/100], Loss: 0.7242
Epoch [100/100], Loss: 0.7049
Epoch [100/100], Loss: 0.6742
Epoch [100/100], Loss: 0.7230
Epoch [100/100], Loss: 0.7130
Epoch [100/100], Loss: 0.7000
Epoch [100/100], Loss: 0.7394
Epoch [100/100], Loss: 0.7048
Epoch [100/100], Loss: 0.6846
Epoch [100/100], Loss: 0.7008
Epoch [100/100], Loss: 0.7286
Epoch [100/100], Loss: 0.6842
Epoch [100/100], Loss: 0.7016
Epoch [100/100], Loss: 0.6873
Epoch [100/100], Loss: 0.6991
Epoch [100/100], Loss: 0.6647
Epoch [100/100], Loss: 0.6850
Epoch [100/100], Loss: 0.6820
Epoch [100/100], Loss: 0.6914
Epoch [100/100], Loss: 0.6876
Epoch [100/100], Loss: 0.6726
Epoch [100

[0.6060606060606061,
 0.6046176046176046,
 0.6031746031746031,
 0.6031746031746031,
 0.6031746031746031,
 0.6031746031746031,
 0.6031746031746031,
 0.6031746031746031,
 0.6031746031746031,
 0.6031746031746031,
 0.6031746031746031,
 0.6031746031746031,
 0.6017316017316018,
 0.6017316017316018,
 0.6017316017316018,
 0.6017316017316018,
 0.6017316017316018,
 0.6002886002886003,
 0.5988455988455988,
 0.5974025974025974,
 0.5974025974025974,
 0.5974025974025974,
 0.5959595959595959,
 0.5916305916305916,
 0.5844155844155844,
 0.5786435786435786,
 0.5786435786435786,
 0.5743145743145743,
 0.56998556998557,
 0.5656565656565656,
 0.556998556998557,
 0.5324675324675324,
 0.5165945165945166,
 0.5108225108225108,
 0.49783549783549785,
 0.4935064935064935,
 0.4834054834054834,
 0.455988455988456,
 0.455988455988456,
 0.4531024531024531,
 0.44155844155844154,
 0.4401154401154401,
 0.4314574314574315,
 0.4314574314574315,
 0.42568542568542567,
 0.4199134199134199,
 0.4155844155844156,
 0.405483405483

# Implementing the Model

In [30]:
def game_predict(team_1: str, team_2: str) -> str:
    team1 = teams_df[teams_df['School'] == team_1]
    team2 = teams_df[teams_df['School'] == team_2]
    t1 = list(team1.drop(columns=['School', 'School_id']).values[0])
    t2 = list(team2.drop(columns=['School', 'School_id']).values[0])
    pred = torch.tensor(t1 + t2, dtype=torch.float32)

    output = int(torch.round(net(pred)))

    if output == 1:
        return team_1
    elif output == 0:
        return team_2
    else:
        print("Error")

def first_round(region: dict) -> list:
    
    winners = []

    for pairing in first_round_order:
        winners.append(game_predict(region[pairing[0]], region[pairing[1]]))

    seeds = [get_key(region, winner) for winner in winners]
    next_round = list(np.resize(seeds, (4,2)))

    return winners, next_round



In [31]:
first_round(east)
first_round(midwest)
first_round(south)
first_round(west)

(['Connecticut',
  'Florida Atlantic',
  'UAB',
  'Auburn',
  'Duquesne',
  'Illinois',
  'Drake',
  'Iowa State'],
 [array([1, 8]), array([12,  4]), array([11,  3]), array([10,  2])])

(['Purdue',
  'Utah State',
  'Gonzaga',
  'Kansas',
  'Oregon',
  'Creighton',
  'Texas',
  'Tennessee'],
 [array([1, 8]), array([5, 4]), array([11,  3]), array([7, 2])])

(['Houston',
  'Texas A&M',
  'James Madison',
  'Duke',
  'NC State',
  'Kentucky',
  'Utah State',
  'Marquette'],
 [array([1, 9]), array([12,  4]), array([11,  3]), array([10,  2])])

(['North Carolina',
  'Michigan State',
  "Saint Mary's (CA)",
  'College of Charleston',
  'New Mexico',
  'Baylor',
  'Nevada',
  'Arizona'],
 [array([1, 9]), array([ 5, 13]), array([11,  3]), array([10,  2])])