In [None]:
# Install the third-party packages imported by the next cell into the kernel's environment.
# NOTE(review): no versions are pinned, and pandas is installed twice (plain, then --upgrade).
%pip install pandas
%pip install --upgrade pandas
%pip install numpy
%pip install scikit-learn
%pip install xgboost


In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

%cd data/Mens/Season/
games_df = pd.DataFrame()
for season in range(2003 , 2025):
    season_games = pd.read_csv(f'{season}/MRegularSeasonDetailedResults_{season}.csv')
    games_df = pd.concat([games_df, season_games])

ordinal_df = pd.DataFrame()
for season in range(2003 , 2025):
    ordinal_games = pd.read_csv(f'{season}/MMasseyOrdinals_{season}.csv')
    ordinal_df = pd.concat([ordinal_df, ordinal_games])

# ordinal_df = pd.read_csv('2024/MMasseyOrdinals_2024.csv')
# games_df = pd.read_csv('2024/MRegularSeasonDetailedResults_2024.csv')


In [None]:
ordinal_df


In [None]:
# games_df['Week'] = ((games_df['DayNum']-1)/7 +1)
# games_df['Week'] = games_df['Week'].apply(np.floor)


# ordinal_df['Week'] = ((ordinal_df['RankingDayNum']-1)/7 +1)
# ordinal_df['Week'] = ordinal_df['Week'].apply(np.floor)
# games_df


In [None]:
def calculate_additional_stats(df):
    """
    Adds two-point field goal columns derived from the total and three-point columns.

    For both the winning ('W') and losing ('L') side, two-pointers made/attempted
    are computed as total field goals minus three-point field goals.

    Parameters:
    - df (DataFrame): Game results with the columns WFGM, WFGM3, WFGA, WFGA3,
      LFGM, LFGM3, LFGA and LFGA3.

    Returns:
    - DataFrame: A new DataFrame with the extra columns WFGM2, WFGA2, LFGM2 and
      LFGA2 appended. The input DataFrame is left untouched.
    """
    # assign() returns a new frame, so the caller's DataFrame is not mutated as
    # a side effect of computing the derived columns.
    return df.assign(
        WFGM2=df['WFGM'] - df['WFGM3'],
        WFGA2=df['WFGA'] - df['WFGA3'],
        LFGM2=df['LFGM'] - df['LFGM3'],
        LFGA2=df['LFGA'] - df['LFGA3'],
    )

In [None]:
def prepare_team_stats(df):
    """
    Averages each team's box-score stats, and the stats recorded against it, per season.

    Every game contributes two rows to each table: one from the winner's point of
    view and one from the loser's. "Own" stats are the team's own numbers;
    "against" stats are the opponent's numbers in the same game.

    Parameters:
    - df (DataFrame): Game results with the W*/L* box-score columns, 'Season',
      'WTeamID' and 'LTeamID'.

    Returns:
    - DataFrame: One row per (Season, TeamID) with mean own stats and mean
      against stats (suffixed '_A'), rounded to two decimals.
    """
    df = calculate_additional_stats(df)

    stat_names = ['FGM', 'FGA', 'FGM2', 'FGA2', 'FGM3', 'FGA3', 'FTM', 'FTA',
                  'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
    team_keys = ['Season', 'TeamID']

    def side_view(team_side, stat_side):
        # Rows keyed by the `team_side` team ('W' or 'L'), carrying the stats
        # of the `stat_side` team, under side-neutral column names.
        source_columns = ['Season', f'{team_side}TeamID'] + [f'{stat_side}{name}' for name in stat_names]
        view = df[source_columns].copy()
        view.columns = team_keys + stat_names
        return view

    # A team's own numbers: winner's stats when it won, loser's stats when it lost.
    own_stats = pd.concat([side_view('W', 'W'), side_view('L', 'L')])
    # The opponent's numbers in those same games.
    against_stats = pd.concat([side_view('W', 'L'), side_view('L', 'W')])

    own_avg = own_stats.groupby(team_keys).mean().reset_index()
    against_avg = against_stats.groupby(team_keys).mean().reset_index()

    return pd.merge(own_avg, against_avg, on=team_keys, suffixes=('', '_A')).round(2)

In [None]:

# seasons = games_df['Season'].unique()
# games_by_season = [games_df[games_df['Season'] == season] for season in seasons]
    

# teams_avg_stats = [prepare_team_stats(season) for season in games_by_season]
# Per-(Season, TeamID) averages of own stats and stats against, over every loaded game.
team_avg_stats = prepare_team_stats(games_df)
team_avg_stats

In [None]:



# ordinal_df = ordinal_df.sort_values(by=['TeamID', 'RankingDayNum']).reset_index(drop=True)
# ordinal_df = ordinal_df.rename(columns={'RankingDayNum':'DayNum'})

# system_names = ordinal_df['SystemName'].unique()
# teams_names = ordinal_df['TeamID'].unique()
# system_no_rank_all_teams = []

# for system in system_names:
#     teams_in_system = ordinal_df[ordinal_df['SystemName'] == system]['TeamID'].unique()
#     if len(teams_in_system) != len(teams_names):
#         system_no_rank_all_teams.append(system)

# ordinal_df = ordinal_df[~ordinal_df['SystemName'].isin(system_no_rank_all_teams)]
# ordinal_pivot = ordinal_df.pivot_table(index=['TeamID', 'DayNum', 'Week'], columns='SystemName', values='OrdinalRank').reset_index()
# ordinal_pivot.sort_values(by=['TeamID', 'DayNum'])
# ordinal_pivot = ordinal_pivot.ffill()
# ordinal_pivot = ordinal_pivot.groupby('TeamID').apply(lambda x: x.interpolate(method='linear', limit_direction='both')).reset_index(drop=True)

# ordinal_df = ordinal_df.sort_values(by=['TeamID']).reset_index(drop=True)
# ordinal_df = ordinal_df.drop(columns=['RankingDayNum'])

# system_names = ordinal_df['SystemName'].unique()
# teams_names = ordinal_df['TeamID'].unique()
# system_no_rank_all_teams = []

# for system in system_names:
#     teams_in_system = ordinal_df[ordinal_df['SystemName'] == system]['TeamID'].unique()
#     if len(teams_in_system) != len(teams_names):
#         system_no_rank_all_teams.append(system)

# ordinal_df = ordinal_df[~ordinal_df['SystemName'].isin(system_no_rank_all_teams)]
# ordinal_pivot = ordinal_df.pivot_table(index=['TeamID'], columns='SystemName', values='OrdinalRank').reset_index()
# ordinal_pivot.sort_values(by=['TeamID'])
# ordinal_pivot = ordinal_pivot.groupby('TeamID').mean().reset_index()

def prep_ordinal_ratings_for_merge(ordinal_df):
    """
    Reshapes long-format ordinal rankings into one row per (Season, TeamID).

    Ranking systems that do not rank every team appearing in `ordinal_df` are
    discarded. For the remaining systems, each team's ranks within a season are
    averaged, producing one column per system.

    Args:
        ordinal_df (pandas.DataFrame): Rankings with the columns 'Season',
            'TeamID', 'SystemName' and 'OrdinalRank'.

    Returns:
        pandas.DataFrame: Columns 'Season', 'TeamID' and one column per retained
        system holding the mean rank, sorted by Season then TeamID.
    """
    # Keep only the systems that ranked every team present in the data. Coverage
    # is counted over all seasons together, not season by season.
    total_teams = ordinal_df['TeamID'].nunique()
    teams_per_system = ordinal_df.groupby('SystemName')['TeamID'].nunique()
    complete_systems = teams_per_system.index[teams_per_system == total_teams]
    complete_rankings = ordinal_df[ordinal_df['SystemName'].isin(complete_systems)]

    # pivot_table averages duplicate (Season, TeamID, SystemName) entries and
    # returns rows sorted by the index, so each (Season, TeamID) pair appears
    # exactly once. No further sorting or per-group aggregation is needed.
    ordinal_pivot = complete_rankings.pivot_table(index=['Season', 'TeamID'], columns='SystemName',
                                                  values='OrdinalRank').reset_index()

    # NOTE(review): this forward-fill runs down the rows of the pivot, i.e. across
    # different (Season, TeamID) rows, so a missing rank is filled with the value
    # from the previous row (another team). Kept to preserve the existing output;
    # confirm whether this is intended.
    ordinal_pivot = ordinal_pivot.ffill()

    return ordinal_pivot


# ordinal_pivot[(ordinal_pivot['TeamID'] == 1104)]


In [None]:
ordinals_pivot = prep_ordinal_ratings_for_merge(ordinal_df)

In [None]:
team_avg_stats[(team_avg_stats['TeamID'] == 1104)]


In [None]:
ordinals_pivot.isna().sum()

In [None]:
# """
# This code performs the following operations:

# 1. Sorts the 'ordinal_pivot' DataFrame by 'TeamID' and 'DayNum' columns.
# 2. Sorts the 'teams_stats_weekly_df' DataFrame by 'TeamID' and 'DayNum' columns.
# 3. Merges the sorted 'teams_stats_weekly_df' and 'ordinal_pivot' DataFrames using 'DayNum' and 'TeamID' columns in a backward direction.
# 4. Sorts the resulting DataFrame by 'TeamID' and 'DayNum' columns and resets the index.
# 5. Extracts the columns starting from the 35th column and assigns them to 'rank_columns' variable.
# 6. Fills the missing values in 'rank_columns' for each 'TeamID' using backward filling.
# 7. Filters the resulting DataFrame to include only rows where 'TeamID' is equal to 1104.

# Parameters:
#     - ordinal_pivot: DataFrame containing ordinal rankings.
#     - teams_stats_weekly_df: DataFrame containing weekly team statistics.

# Returns:
#     - weekly_stats_w_rating: DataFrame with sorted and merged data, filled with missing values, and filtered by 'TeamID' 1104.
# """
# ordinals_pivot = ordinals_pivot.sort_values(by=['TeamID'])

# teams_avg_stats = teams_avg_stats.sort_values(by=['TeamID'])
# weekly_stats_w_rating = pd.merge(teams_avg_stats, ordinals_pivot, on='TeamID', suffixes=('', '_A'))
# weekly_stats_w_rating = weekly_stats_w_rating.sort_values(by=['TeamID']).reset_index(drop=True)
# rank_columns = weekly_stats_w_rating.columns[34:]

# weekly_stats_w_rating[rank_columns] = weekly_stats_w_rating.groupby('TeamID')[rank_columns].bfill()
# weekly_stats_w_rating = weekly_stats_w_rating.dropna(axis = 1, how= 'any')
# weekly_stats_w_rating

def merge_ratings_stats(ordinal_df, teams_stats_avg_df):
    """
    Joins per-season team ratings onto per-season average team stats.

    The two frames are inner-joined on ('Season', 'TeamID'). Rating columns that
    still contain a missing value after a within-group backward fill are dropped,
    so the result has no NaN.

    Args:
        ordinal_df (pandas.DataFrame): Ratings with 'Season', 'TeamID' and one
            column per ranking system.
        teams_stats_avg_df (pandas.DataFrame): Average stats with 'Season' and
            'TeamID' plus stat columns.

    Returns:
        pandas.DataFrame: The stat columns followed by the surviving rating
        columns, sorted by Season then TeamID with a fresh index.
    """
    keys = ['Season', 'TeamID']
    ordinal_df = ordinal_df.sort_values(by=keys, kind='stable')
    teams_stats_avg_df = teams_stats_avg_df.sort_values(by=keys)

    avg_stats_w_rating = pd.merge(teams_stats_avg_df, ordinal_df, on=keys, suffixes=('', '_A'))
    avg_stats_w_rating = avg_stats_w_rating.sort_values(by=keys).reset_index(drop=True)

    # Rating columns are whatever the merge added on top of the stats frame.
    # Deriving them this way avoids hard-coding the number of stat columns.
    stat_columns = set(teams_stats_avg_df.columns)
    rank_columns = [col for col in avg_stats_w_rating.columns if col not in stat_columns]

    if rank_columns:
        # Within each (Season, TeamID) group, fill a missing rank with the next
        # available one.
        avg_stats_w_rating[rank_columns] = avg_stats_w_rating.groupby(keys)[rank_columns].bfill()

    # Any column that still has a gap is removed entirely.
    return avg_stats_w_rating.dropna(axis=1, how='any')

In [None]:
# Join the per-season ratings onto the per-season average stats.
merged_stats = merge_ratings_stats(ordinals_pivot, team_avg_stats)
# Seasons that survived the inner join; later cells iterate over these.
merged_stats['Season'].unique()

In [None]:
def prepare_matchup_data(games_df, stats):
    """
    Merges game data with team stats to prepare matchup data.

    For every game the two teams are ordered by TeamID: 'team_1' is the lower
    ID and 'team_2' the higher. 'team_1_won' is 1 when the lower-ID team won.
    Every column of `stats` is attached twice, prefixed with 'team_1_' and
    'team_2_'.

    Stats are matched on 'TeamID' only, using the last row for each team in
    `stats`. Pass stats for a single season (as the per-season caller does);
    otherwise every game receives the team's last listed season.

    Parameters:
    - games_df (DataFrame): Game results with 'Season', 'DayNum', 'WTeamID'
      and 'LTeamID'.
    - stats (DataFrame): Team stats with a 'TeamID' column.

    Returns:
    - DataFrame: One row per game, in the order of `games_df`, with the columns
      Season, DayNum, team_1, team_2, team_1_won, then the team_1_* and
      team_2_* stat columns.

    Raises:
    - IndexError: If a team that appears in `games_df` has no row in `stats`.
    """
    games = games_df.reset_index(drop=True)
    pairing = games[['WTeamID', 'LTeamID']]

    matchups = pd.DataFrame({
        'Season': games['Season'],
        'DayNum': games['DayNum'],
        'team_1': pairing.min(axis=1),
        'team_2': pairing.max(axis=1),
    })
    matchups['team_1_won'] = (matchups['team_1'] == games['WTeamID']).astype(int)

    # One stats row per team: the last one listed.
    latest_stats = stats.drop_duplicates(subset='TeamID', keep='last')

    # Fail loudly rather than emitting rows full of NaN. IndexError is what the
    # positional lookup raised before, so existing handlers keep working.
    known_teams = set(latest_stats['TeamID'])
    playing_teams = set(matchups['team_1']) | set(matchups['team_2'])
    missing_teams = sorted(playing_teams - known_teams)
    if missing_teams:
        raise IndexError(f'no stats available for TeamID(s): {missing_teams}')

    # Two merges replace a per-game scan of the whole stats table.
    for side in ('team_1', 'team_2'):
        side_stats = latest_stats.add_prefix(f'{side}_')
        matchups = matchups.merge(side_stats, how='left', left_on=side,
                                  right_on=f'{side}_TeamID', validate='many_to_one')

    return matchups

In [None]:
# Build one matchup frame per season, pairing each season's games only with
# that same season's team stats.
matchup_data = []
for season in merged_stats['Season'].unique():
    season_games = games_df[games_df['Season'] == season]
    season_stats = merged_stats[merged_stats['Season'] == season]
    matchup_data.append(prepare_matchup_data(season_games, season_stats))

In [None]:
# Stack the per-season matchup frames into one table. No ignore_index is passed,
# so each season's row labels are kept as they were.
test = pd.concat(matchup_data)
# Per-column NaN counts; drop_duplicates keeps only the first column for each distinct count.
test.isna().sum().drop_duplicates()

In [None]:

def fit_model_scalar(model_param, model_name, X, y, test_size=0.2, random_state=3270):
    """
    Fits a model on standardized features and reports its hold-out accuracy.

    The data is split into train and test parts. A StandardScaler is fitted on
    the training part only and applied to both parts before fitting and scoring.

    Parameters:
    - model_param: Estimator exposing fit(X, y) and predict(X). It is fitted in place.
    - model_name (str): Label used in the printed message.
    - X: Feature matrix.
    - y: Target values.
    - test_size (float): Fraction of rows held out for testing.
    - random_state (int): Seed for the train/test split.

    Returns:
    - float: Accuracy on the held-out rows (also printed).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Fit the scaler on the training rows only so the test rows do not
    # influence the scaling parameters.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model_param.fit(X_train_scaled, y_train)
    y_pred = model_param.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name} scalar accuracy: {accuracy:.2f}')
    # Returned so callers can collect and compare results instead of reading stdout.
    return accuracy

In [None]:
def fit_model(model_param, model_name, X, y, test_size=0.2, random_state=3270):
    """
    Fits a model on the raw (unscaled) features and reports its hold-out accuracy.

    Parameters:
    - model_param: Estimator exposing fit(X, y) and predict(X). It is fitted in place.
    - model_name (str): Label used in the printed message.
    - X: Feature matrix.
    - y: Target values.
    - test_size (float): Fraction of rows held out for testing.
    - random_state (int): Seed for the train/test split.

    Returns:
    - float: Accuracy on the held-out rows (also printed).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    model_param.fit(X_train, y_train)
    y_pred = model_param.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name} accuracy: {accuracy:.2f}')
    # Returned so callers can collect and compare results instead of reading stdout.
    return accuracy

In [None]:
# matchups = prepare_matchup_data(games_df, weekly_stats_w_rating)

# Features are every matchup column except the day number and the label.
# NOTE(review): only 'DayNum' and 'team_1_won' are dropped, so identifier columns
# such as 'Season', 'team_1', 'team_2' and the prefixed TeamID/Season columns
# remain in X as features. Confirm that is intended.
X = test.drop(['DayNum','team_1_won'], axis=1)
y = test['team_1_won']

# Candidate classifiers, all seeded with the same random_state.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=3270, max_depth=10),
    'Random Forest': RandomForestClassifier(random_state=3270, n_estimators=200, max_depth=10, min_samples_split=10, n_jobs=-1),
    'Logistic Regression': LogisticRegression(random_state=3270, max_iter=1000, penalty = None, solver = 'lbfgs', ),
    'XGBoost': XGBClassifier(random_state = 3270, n_estimators = 100, max_depth = 3, learning_rate = 0.1, gamma = 0, subsample = 0.8, colsample_bytree = 0.8)
}

# Each estimator instance is fitted twice: first on scaled features, then on raw features.
for name, model in models.items():
    fit_model_scalar(model, name, X, y)
    fit_model(model, name, X, y)

In [None]:
lgr = LogisticRegression(random_state=3270, max_iter=1000, penalty = None, solver = 'lbfgs')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3270)

lgr.fit(X_train, y_train)
y_pred = lgr.predict(X_test)

# Show the predictions and the actual values side by side. display() is needed
# because only the last expression of a cell is rendered automatically.
display(pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).tail(50))

# Accuracy over the last 50 rows of the held-out split.
# NOTE(review): train_test_split shuffles by default, so these are not the 50
# most recent games.
accuracy_score(y_test.iloc[-50:], y_pred[-50:])



In [None]:
# seasons_matchup_avgs_10_data = pd.DataFrame()
# for season in range(2003 , 2025):
#     season_games = pd.read_csv(f'{season}/MRegularSeasonDetailedResults_{season}_matchups_avg_w_rating.csv')
#     seasons_matchup_avgs_10_data = pd.concat([seasons_matchup_avgs_10_data, season_games])

# Full matchup table, kept under `data` for later reference.
data = test
# Seasons in order of first appearance in the table.
seasons = data['Season'].unique()
# One DataFrame per season, in the same order as `seasons`.
seasons_matchup_avgs_10_data = [data[data['Season'] == season] for season in seasons]
    

In [None]:
test[(test['Season'] == 2024)]

In [None]:
def TimeSeriesSplit_by_season(seasons_data):
    """
    Evaluates a logistic regression with an expanding window over seasons.

    For each season after the first, the model is trained on all earlier seasons
    and tested on that season. Features are standardized with a scaler fitted on
    the training seasons of the fold. Each fold's accuracy and the mean accuracy
    are printed.

    Parameters:
    - seasons_data (list): DataFrames of matchup data, one per season, in
      chronological order. Each needs the columns 'Season', 'DayNum',
      'team_1_TeamID', 'team_2_TeamID' and 'team_1_won'.

    Returns:
    - LogisticRegression: The model as fitted in the last fold, i.e. trained on
      every season except the last one. It is unfitted if fewer than two seasons
      are given.
    """
    non_feature_columns = ['Season', 'DayNum', 'team_1_TeamID', 'team_2_TeamID', 'team_1_won']
    # NOTE(review): 'team_1' and 'team_2' (raw team IDs) are not excluded here and
    # therefore remain features, as before. Confirm that is intended.

    scaler = StandardScaler()
    model = LogisticRegression(random_state=3270, max_iter=1000, penalty = None, solver = 'lbfgs')
    accuracies = []
    for i in range(1, len(seasons_data)):
        print(f'Testing on Season {seasons_data[i]["Season"].unique()[0]}')
        train = pd.concat(seasons_data[:i]).dropna(axis = 'columns', how= 'any')
        holdout = seasons_data[i].dropna(axis = 'columns', how= 'any')

        # Use only the columns that are complete in BOTH parts. Dropping NaN
        # columns from each part independently can leave train and test with
        # different feature sets.
        feature_columns = [col for col in train.columns
                           if col in holdout.columns and col not in non_feature_columns]

        X_train = scaler.fit_transform(train[feature_columns])
        y_train = train['team_1_won']
        X_test = scaler.transform(holdout[feature_columns])
        y_test = holdout['team_1_won']

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        print(f'accuracy: {accuracy:.5f}')

    # With fewer than two seasons there is no fold, so there is nothing to average.
    if accuracies:
        print(f'Average accuracy: {np.mean(accuracies):.5f}')
    return model

    

In [None]:
model = TimeSeriesSplit_by_season(seasons_matchup_avgs_10_data)

In [145]:
season_matchups = seasons_matchup_avgs_10_data


In [154]:

from sklearn.neural_network import MLPClassifier
param_grid = {
    # (60,) is a one-layer tuple; a bare (60) is just the integer 60.
    'hidden_layer_sizes': [(60,), (30,15), (60,15), (60,30,15)],
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant','adaptive', 'invscaling'],
    'learning_rate_init': [0.001, 0.01],
}

# Use a dedicated name for the MLP. Rebinding `model` here would replace the
# fitted model returned by TimeSeriesSplit_by_season with an unfitted estimator,
# and the bracket cells below call model.predict.
mlp = MLPClassifier(random_state=3270)
scaler = StandardScaler()
# NOTE(review): 4 * 3 * 2 * 3 * 2 = 144 candidates, each cross-validated, makes
# this search slow; the recorded run of this cell ended in a KeyboardInterrupt.
grid_search = GridSearchCV(mlp, param_grid, n_jobs=-1, verbose=2)

# Train on every season but the last; hold the last season out for testing.
all_except_last_season = pd.concat(season_matchups[:-1])
last_season = pd.DataFrame(season_matchups[-1])
X_train = all_except_last_season.drop(['Season','DayNum', 'team_1_TeamID', 'team_2_TeamID', 'team_1_won'], axis=1)
y_train_ = all_except_last_season['team_1_won']
X_train = scaler.fit_transform(X_train)

X_test = last_season.drop(['Season','DayNum','team_1_TeamID', 'team_2_TeamID', 'team_1_won'], axis=1)
y_test = last_season['team_1_won']
X_test = scaler.transform(X_test)
fitted = grid_search.fit(X_train, y_train_)
print(f'Best parameters: {fitted.best_params_}')
y_pred = fitted.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'accuracy: {accuracy:.5f}')


Fitting 5 folds for each of 144 candidates, totalling 720 fits


KeyboardInterrupt: 

In [None]:
# Load the compact tournament results for every season into a list, one
# DataFrame per season. Paths are relative to the current working directory.
seasons_tourney_data = []
for season in range(2003 , 2025):
    season_games = pd.read_csv(f'{season}/MNCAATourneyCompactResults_{season}.csv')
    seasons_tourney_data.append(season_games)
    

In [None]:
# Index the merged stats by season: season -> rows of merged_stats for that season.
team_stats_season = {}
for season in merged_stats['Season'].unique():
    season_stats = merged_stats[merged_stats['Season'] == season]
    team_stats_season[season] = season_stats

In [None]:
def build_game_matchups(bracket_team_matchups, team_stats):
    """
    Assembles the feature rows for every matchup of a bracket round.

    Parameters:
    - bracket_team_matchups (DataFrame): One row per game with the columns
      'Slot', 'StrongSeed' and 'WeakSeed', the latter two holding team IDs.
    - team_stats (DataFrame): Team stats with a 'TeamID' column.

    Returns:
    - DataFrame: For each game, the strong team's stats prefixed 'team_1_', the
      weak team's stats prefixed 'team_2_', and the game's 'Slot'. Each game's
      rows are numbered from 0, so index labels repeat across games.
    """
    def prefixed_stats(team_id, prefix):
        # Stats rows of one team with every column prefixed, renumbered from 0
        # so both sides line up when placed side by side.
        team_rows = team_stats.loc[team_stats['TeamID'] == team_id]
        return team_rows.add_prefix(prefix).reset_index(drop=True)

    per_game_frames = []
    for _, game in bracket_team_matchups.iterrows():
        game_frame = pd.concat(
            [prefixed_stats(game['StrongSeed'], 'team_1_'),
             prefixed_stats(game['WeakSeed'], 'team_2_')],
            axis=1,
        )
        game_frame['Slot'] = game['Slot']
        per_game_frames.append(game_frame)

    return pd.concat(per_game_frames, axis=0)

In [None]:
def preprocess_playins(seeds_df):
    """
    Pairs up the play-in teams that compete for the same seed.

    Play-in seeds carry a lowercase 'a' or 'b'. The leading run of digits and
    uppercase letters of the seed is kept, and the teams sharing that value are
    paired.

    Parameters:
    - seeds_df (DataFrame): Tournament seeds with 'Seed' and 'TeamID' columns.

    Returns:
    - DataFrame: One row per play-in game with the columns 'Slot' (the shared
      seed), 'StrongSeed' (first team listed for it) and 'WeakSeed' (second team).
    """
    seed_text = seeds_df['Seed'].str
    is_playin = seed_text.contains('a') | seed_text.contains('b')

    playin_teams = seeds_df.loc[is_playin].copy()
    playin_teams['Seed'] = playin_teams['Seed'].str.extract('([0-9A-Z]+)', expand=False)

    # Collect the team IDs that share a seed, in their original order.
    paired = playin_teams.groupby('Seed')['TeamID'].agg(list).reset_index()
    paired['StrongSeed'] = [teams[0] for teams in paired['TeamID']]
    paired['WeakSeed'] = [teams[1] for teams in paired['TeamID']]

    return paired.rename(columns={'Seed': 'Slot'}).drop(columns='TeamID')


In [None]:
def predict_bracket_winners(bracket_matchups, model, scalar):
    """
    Predicts the winner of every matchup in a bracket round.

    Parameters:
    - bracket_matchups (DataFrame): Matchup features plus a 'Slot' column;
      'Slot' is excluded from scaling and prediction.
    - model: Fitted estimator exposing predict(X).
    - scalar: Scaler exposing transform(X) and fit_transform(X). If it has
      already been fitted, it is applied as is. If it is unfitted, a copy of it
      is fitted on the round's own rows and the passed-in object stays unfitted.

    Returns:
    - DataFrame: A copy of `bracket_matchups` with an added 'team_1_won' column
      holding the model's prediction. The input frame is not modified.
    """
    import copy

    bracket_matchups = bracket_matchups.copy()
    features = bracket_matchups.drop(columns=['Slot'])

    # scikit-learn's convention: a fitted estimator has attributes ending in "_".
    is_fitted = any(name.endswith('_') and not name.startswith('__')
                    for name in getattr(scalar, '__dict__', {}))

    if is_fitted:
        # Honor the pre-fitted scaler. Re-fitting here would replace the scaling
        # it was fitted with by the statistics of a handful of bracket rows.
        scaled_features = scalar.transform(features)
    else:
        # Backward-compatible fallback for callers that pass a fresh scaler.
        # A copy is fitted so repeated calls with the same unfitted scaler keep
        # scaling each round on its own rows, as before.
        scaled_features = copy.deepcopy(scalar).fit_transform(features)

    bracket_matchups['team_1_won'] = model.predict(scaled_features)
    return bracket_matchups


In [None]:
def update_seeds_with_winners(bracket, seeds_df):
    """
    Records each predicted winner in the seeds table under its slot name.

    Parameters:
    - bracket (DataFrame): Predicted matchups with 'Slot', 'team_1_TeamID',
      'team_2_TeamID' and 'team_1_won'.
    - seeds_df (DataFrame): Two-column table (seed, team ID). It is modified in
      place: one row is appended per slot, then it is sorted by 'Seed'.

    Returns:
    - DataFrame: The same `seeds_df` object, updated with the winners.
    """
    # Only the first winner seen for a slot is recorded. Dict insertion order
    # preserves the order in which slots appear in the bracket.
    first_winner_by_slot = {}
    for _, game in bracket.iterrows():
        if game['team_1_won'] == 1:
            winner = game['team_1_TeamID']
        else:
            winner = game['team_2_TeamID']
        first_winner_by_slot.setdefault(game['Slot'], winner)

    for slot_name, winner in first_winner_by_slot.items():
        # New row goes under the label equal to the current row count.
        seeds_df.loc[len(seeds_df.index)] = [slot_name, winner]

    seeds_df.sort_values(by='Seed', inplace=True)
    return seeds_df


In [None]:
def build_round_matchups(team_seeds, round_slots):
    """
    Resolves the seeds of a round's slots to the teams currently holding them.

    Parameters:
    - team_seeds (DataFrame): Current seed assignments with 'Seed' and 'TeamID'
      columns. If a seed is listed more than once, its first row is used.
    - round_slots (DataFrame): Slots of the round with 'StrongSeed' and
      'WeakSeed' columns holding seed labels. It is not modified.

    Returns:
    - DataFrame: A copy of `round_slots` in which 'StrongSeed' and 'WeakSeed'
      hold the team IDs for those seeds.

    Raises:
    - IndexError: If a seed referenced by `round_slots` is absent from `team_seeds`.
    """
    seed_to_team = team_seeds.drop_duplicates(subset='Seed', keep='first').set_index('Seed')['TeamID']

    # Work on a copy: callers typically pass a filtered slice of the full slots
    # table, and writing into such a slice is ambiguous in pandas.
    resolved = round_slots.copy()
    for column in ('StrongSeed', 'WeakSeed'):
        unknown_seeds = [seed for seed in resolved[column].unique() if seed not in seed_to_team.index]
        if unknown_seeds:
            # Same exception type the positional lookup raised before.
            raise IndexError(f'no team found for {column} value(s): {unknown_seeds}')
        resolved[column] = resolved[column].map(seed_to_team)

    return resolved


In [None]:
tourney_seeds_df = pd.read_csv('2023/MNCAATourneySeeds_2023.csv')
tourney_seeds_df.drop(columns=['Season'], inplace=True)
tourney_slots_df = pd.read_csv('2023/MNCAATourneySlots_2023.csv')

# NOTE(review): this scaler is created fresh here and is not the one used while
# training `model`. Confirm the predictions are scaled the way the model expects.
scalar = StandardScaler()

# One stats row per team for 2023, computed once and reused for every round.
team_stats_2023 = team_stats_season[2023].groupby('TeamID').last().reset_index()

# Play-in games: the winners take over the shared seed.
playin_teams_match_df = preprocess_playins(tourney_seeds_df)
bracket_matchups = build_game_matchups(playin_teams_match_df, team_stats_2023)
bracket_matchups = predict_bracket_winners(bracket_matchups, model, scalar)
tourney_seeds_df = update_seeds_with_winners(bracket_matchups, tourney_seeds_df)
print('Done with play-ins')

# Main bracket: resolve each round's slots to teams, predict the winners, and
# record them under the slot name so the next round can look them up.
rounds = ['R1', 'R2', 'R3', 'R4', 'R5', 'R6']
matchups = []
for current_round in rounds:
    # .copy() so that any in-place writes made while resolving seeds never
    # target a slice of tourney_slots_df.
    curr_round_slots = tourney_slots_df[tourney_slots_df['Slot'].str.contains(current_round)].copy()

    round_matchups = build_round_matchups(tourney_seeds_df, curr_round_slots)
    matchups.append(round_matchups)
    current_round_bracket = build_game_matchups(round_matchups, team_stats_2023)
    current_round_bracket = predict_bracket_winners(current_round_bracket, model, scalar)
    tourney_seeds_df = update_seeds_with_winners(current_round_bracket, tourney_seeds_df)
    


In [None]:
# Stack the resolved matchups of every round into one table with a fresh 0..n-1 index.
complete_bracket = pd.concat(matchups)
complete_bracket.reset_index(drop=True, inplace=True)
# complete_bracket.to_csv('2023/MNCAATourneyPredictions_matchup_2023.csv', index=False)
complete_bracket

# tourney_seeds_df
# test_dict = {}
# test_dict[type(lgr).__name__] = [accuracy_score(y_test, y_pred)]
# test_dict

In [None]:
%cd ../../../

In [None]:
# Project-local module; it is not defined in this notebook.
import cs3270p1_the_overfitting_overlords_train_save_load_all_models as tsm
import os


# NOTE(review): this rebinds `models` (previously the dict of candidate
# classifiers) and the loop rebinds `model`, so cells run after this one see
# the loaded objects instead.
models = tsm.load_models()
# Print each entry stored under the 'avg_10' key.
for model_name, model in models['avg_10'].items():
    print(model_name)
    print(model)




In [None]:

prediction_dir = 'data/Mens/Season/2024/predictions'
prediction_file = 'MNCAATourneyPredictions_matchups_2024_rating_rol10_LogisticRegression.csv'
# Separate name for the file, so `seeds_df` only ever refers to the DataFrame.
seeds_file = 'MNCAATourneyPredictions__seeds_2024_rating_rol10_LogisticRegression.csv'

# Base table: the logistic-regression matchups with the predicted winner of each slot.
matchups_df = pd.read_csv(os.path.join(prediction_dir, prediction_file))
seeds_df = pd.read_csv(os.path.join(prediction_dir, seeds_file)).head(63)
matchups_df.drop(columns=['Season'], inplace=True)
seeds_df.rename(columns={'TeamID': 'Winner'}, inplace=True)

merged_df = pd.merge(matchups_df, seeds_df, left_on='Slot', right_on='Seed', how='left')
merged_df.drop(columns=['Seed'], inplace=True)

# os.listdir returns entries in arbitrary order; sorting makes the column order
# of the output reproducible.
files = sorted(os.listdir(prediction_dir))

# NOTE(review): `prediction_file` also contains "matchups" and lives in this
# directory, so the loop picks it up again and it appears a second time under
# model-suffixed column names.
per_model_frames = [merged_df]
for file in files:
    if "matchups" in file:
        curr_matchups = pd.read_csv(os.path.join(prediction_dir, file))
        # The seeds file of a model has the same name with 'matchups' replaced by '_seeds'.
        seed_file = file.replace('matchups', '_seeds')
        curr_seeds = pd.read_csv(os.path.join(prediction_dir, seed_file))
        curr_matchups.drop(columns=['Season'], inplace=True)
        # Model name: the part of the file name after '2024_', without the extension.
        model_name = file.split('2024_')[-1].split('.')[0]
        curr_matchups.rename(columns = {'StrongSeed' : 'StrongSeed_'+ model_name, 'WeakSeed' : 'WeakSeed_'+ model_name}, inplace=True)
        curr_seeds.rename(columns = {'TeamID' : 'Winner_'+ model_name}, inplace=True)

        curr_merge = pd.merge(curr_matchups, curr_seeds, left_on='Slot', right_on='Seed', how='left')
        curr_merge.drop(columns=['Seed', 'Slot'], inplace=True)

        per_model_frames.append(curr_merge)

# Place all models side by side in a single concat instead of growing the frame per file.
merged_df = pd.concat(per_model_frames, axis=1)

teams_names_df = pd.read_csv('data/Mens/MTeams.csv')

team_names_map = teams_names_df.set_index('TeamID')['TeamName'].to_dict()

# Every column after the first ('Slot') holds team IDs; replace them with team names.
for col in merged_df.columns[1:]:
    merged_df[col] = merged_df[col].map(team_names_map)

merged_df.to_csv(os.path.join(prediction_dir,'MNCAATourneyPredictions.csv'), index=False)

In [None]:
print(tourney_seeds_df.head(63))

In [None]:
stats = pd.read_csv('data/Mens/Season/2023/MRegularSeasonDetailedResults_2023_avg_10_games.csv')
stats.columns

In [None]:
stats = pd.read_csv(f'data/Mens/Season/2023/MRegularSeasonDetailedResults_2023_avg_w_rating.csv')
stats.columns

In [None]:
stats = pd.read_csv(f'data/Mens/Season/2023/MRegularSeasonDetailedResults_2023_avg.csv')
stats.columns