In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf
from tensorflow.keras.regularizers import l2
import random

# Load your data
data_2013_2023 = pd.read_csv('KenPom 2013-2023.csv')
data_2024 = pd.read_csv('KenPom 2024.csv')

# Preprocess and merge data for scaling
columns_to_use = ['Team', 'AdjEM', 'AdjO', 'AdjD', 'AdjT', 'SOS AdjEM', 'SOS OppO', 'SOS OppD', 'NCSOS AdjEM']
data_2013_2023_prepped = data_2013_2023[columns_to_use].dropna()
data_2024_prepped = data_2024[columns_to_use].dropna()
# Assuming the 'Team' column still exists in the 'data_2024_prepped' DataFrame after preprocessing
all_teams_2024 = data_2024_prepped['Team'].unique().tolist()


# Combining datasets for scaling
combined_data = pd.concat([data_2013_2023_prepped, data_2024_prepped])

scaler = StandardScaler()
combined_data_scaled = scaler.fit_transform(combined_data.iloc[:, 1:])  # Avoid scaling 'Team' column
combined_data_scaled = pd.DataFrame(combined_data_scaled, columns=columns_to_use[1:], index=combined_data.index)
combined_data_scaled['Team'] = combined_data['Team']

# Split back into training and testing data
training_data = combined_data_scaled.loc[data_2013_2023_prepped.index]
testing_data = combined_data_scaled.loc[data_2024_prepped.index]

# row-wise matchups for training data
features_diff = training_data.iloc[:-1:2, :-1].values - training_data.iloc[1::2, :-1].values
adj_em_diff = training_data['AdjEM'].values[:-1:2] - training_data['AdjEM'].values[1::2]
labels = (adj_em_diff > 0).astype(int)  # 1 if the first team is better

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features_diff, labels, test_size=0.2, random_state=42)

# Neural Network Model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(features_diff.shape[1],), kernel_regularizer=l2(0.01)),
    tf.keras.layers.Dropout(0.5),  # Dropout 50% of the neurons
    tf.keras.layers.Dense(16, activation='relu', kernel_regularizer=l2(0.01)),
    tf.keras.layers.Dropout(0.5),  # Another dropout layer
    tf.keras.layers.Dense(1, activation='sigmoid')

])


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

model.fit(X_train, y_train, epochs=25, validation_data=(X_test, y_test), callbacks=[early_stopping])





Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 32)                288       
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 dense_4 (Dense)             (None, 16)                528       
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 833 (3.25 KB)
Trainable params: 833 (3.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Ep

In [4]:
def get_team_features(team_name, dataset):
    try:
        # Filter the dataset for the team and ensure only numeric data is handled
        team_data = dataset[dataset['Team'].str.contains(team_name, case=False, na=False)]
        if team_data.empty:
            raise ValueError(f"Data for team {team_name} not found.")
        return team_data.drop(columns=['Team']).iloc[0].astype(float).values  # Convert all entries to float
    except IndexError:
        raise ValueError(f"Data for team {team_name} not found.")

def predict_matchup(team1_name, team2_name, dataset, model, scaler):
    # Extract features for both teams
    team1_features = get_team_features(team1_name, dataset)
    team2_features = get_team_features(team2_name, dataset)

    # Scale features
    team1_features_scaled = scaler.transform([team1_features])
    team2_features_scaled = scaler.transform([team2_features])

    # Calculate the difference
    features_diff = team1_features_scaled - team2_features_scaled

    # Make a prediction
    prediction = model.predict(features_diff)
    return prediction[0][0]

# Assuming all_teams_2024 is a list of exactly 64 unique team names
random.shuffle(all_teams_2024)  # Shuffle the list to randomize pairings
matchups = [(all_teams_2024[i], all_teams_2024[i + 1]) for i in range(0, len(all_teams_2024), 2)]

def run_matchup(team1, team2, dataset, model, scaler):
    probability_team1_wins = predict_matchup(team1, team2, dataset, model, scaler)
    return team1 if probability_team1_wins >= 0.5 else team2

def tournament_round(teams, dataset, model, scaler):
    random.shuffle(teams)  # Shuffle to randomize matchups
    winners = []
    for i in range(0, len(teams), 2):
        winner = run_matchup(teams[i], teams[i+1], dataset, model, scaler)
        winners.append(winner)
    return winners

# Initialize the list of teams (assuming all_teams_2024 has exactly 64 unique teams)
teams = all_teams_2024[:]  # Create a copy of the list to work with
results = {}

# Run the tournament
current_round = teams
round_number = 0
while len(current_round) > 1:
    round_number += 1
    current_round = tournament_round(current_round, testing_data, model, scaler)
    results[f"Round_{len(current_round)}"] = current_round  # Save results of each round

    # Print the results of the current round
    # print(f"Results of Round {round_number} ({len(current_round)} teams): {current_round}")

# Output the champion
champion = current_round[0]
print(f"The champion of the tournament is: {champion}")

# Optionally, print or use saved results
for round_count, winners in results.items():
    print(f"{round_count} winners: {winners}")



































































































The champion of the tournament is: Connecticut 
Round_32 winners: ['Washington St. ', 'Wisconsin ', 'Baylor ', "Saint Mary's ", 'Dayton ', 'Colorado St. ', 'Texas ', 'Mississippi St. ', 'Duke ', 'Arizona ', 'Clemson ', 'Northwestern ', 'North Carolina ', 'Purdue ', 'Morehead St. ', 'Texas Tech ', 'Michigan St. ', 'Iowa St. ', 'Colorado ', 'Houston ', 'Creighton ', 'Nebraska ', 'Duquesne ', 'Florida ', 'Connecticut ', 'Illinois ', 'Texas A&M ', 'TCU ', 'N.C. State ', 'New Mexico ', 'Tennessee ', 'Kansas ']
Round_16 winners: ['Houston ', 'Colorado St. ', 'Texas Tech ', 'Texas ', 'Duke ', 'Florida ', 'Purdue ', 'Clemson ', 'Baylor ', 'Tennessee ', 'Wisconsin ', 'Iowa St. ', 'Connecticut ', 'N.C. State ', 'Texas A&M ', 'Creighton ']
Round_8 winners: ['Purdue ', 'Connecticut ', 'Texas ', 'Florida ', 'Creighton ', 'Iowa St. ', 'Houston ', 'Tennessee ']
Round_4 winners: ['Iowa St. ', 'Connecticut ', 'Florida ', 'Houston ']
Round_2 winners: ['Florida ', 'Connecticut ']
Round_1 winners: ['Conne

