In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss

# Load Data
# The five competition CSVs are read in a fixed order and unpacked, in that
# same order, into the module-level frames used by the rest of the notebook.
_csv_paths = (
    '/app/MarchMadness/Kaggle-March-Madness-Prediction/Data/march-machine-learning-mania-2025/MNCAATourneySeeds.csv',
    '/app/MarchMadness/Kaggle-March-Madness-Prediction/Data/march-machine-learning-mania-2025/MNCAATourneyCompactResults.csv',
    '/app/MarchMadness/Kaggle-March-Madness-Prediction/Data/march-machine-learning-mania-2025/WNCAATourneySeeds.csv',
    '/app/MarchMadness/Kaggle-March-Madness-Prediction/Data/march-machine-learning-mania-2025/WNCAATourneyCompactResults.csv',
    '/app/MarchMadness/Kaggle-March-Madness-Prediction/Data/march-machine-learning-mania-2025/SampleSubmissionStage1.csv',
)
(
    men_seed_df,
    men_match_results,
    women_seed_df,
    women_match_results,
    submission_df,
) = [pd.read_csv(csv_path) for csv_path in _csv_paths]

def extract_game_info(id_str):
    """Split an underscore-separated matchup ID into three integers.

    The first three '_'-delimited fields of ``id_str`` are converted with
    ``int`` and returned as a tuple; any further fields are ignored.

    Raises:
        ValueError: if one of the first three fields is not an integer.
        IndexError: if ``id_str`` has fewer than three fields.
    """
    fields = id_str.split('_')
    season = int(fields[0])
    first_team = int(fields[1])
    second_team = int(fields[2])
    return season, first_team, second_team

def extract_seed_value(seed_str):
    """Return the numeric seed encoded in a tournament seed string.

    The first character of ``seed_str`` is skipped and the next two
    characters are parsed as the seed number, so ``'W01'`` gives 1 and a
    play-in seed with a trailing letter such as ``'W11a'`` gives 11.

    Previously the whole remainder (``seed_str[1:]``) was passed to ``int``,
    so any seed with a trailing letter raised ``ValueError`` and was silently
    turned into 16 regardless of its real value.

    Returns:
        The seed as an ``int``, or 16 when the two characters after the
        first one are not a valid integer.
    """
    try:
        # Only the two digit characters; a trailing 'a'/'b' marker is ignored.
        return int(seed_str[1:3])
    except ValueError:
        # Unparseable seed: fall back to the value the original code used.
        return 16

# Prepare Seed Data
# Add a 'SeedValue' column to each seed table in place, computed by applying
# extract_seed_value to every entry of its 'Seed' column.
for frame in (men_seed_df, women_seed_df):
    frame['SeedValue'] = frame['Seed'].apply(extract_seed_value)

# Combine Men and Women Seed Data
# Stacked row-wise with a fresh 0..n-1 index.
total_seed_df = pd.concat((men_seed_df, women_seed_df), ignore_index=True)

def prepare_data(seed_df, match_results):
    """Build a symmetric, seed-annotated training table from game results.

    Every game in ``match_results`` is emitted twice: once as recorded with
    ``Winner = 1``, and once with the 'WTeamID' and 'LTeamID' columns swapped
    and ``Winner = 0``. Both teams' seeds are then attached by inner-joining
    ``seed_df`` on (Season, team id), so games where either team has no row
    in ``seed_df`` for that season are dropped.

    Args:
        seed_df: DataFrame with 'Season', 'TeamID' and 'SeedValue' columns.
        match_results: DataFrame with 'Season', 'WTeamID' and 'LTeamID'
            columns; other columns are ignored. It is not modified.

    Returns:
        DataFrame with columns 'Season', 'WTeamID', 'LTeamID', 'Winner',
        'SeedValue1' (seed of the team in 'WTeamID'), 'SeedValue2' (seed of
        the team in 'LTeamID') and 'SeedDiff' = SeedValue1 - SeedValue2.
    """
    # Work on an explicit copy: assigning 'Winner' to a column slice of the
    # caller's frame is what triggered pandas' SettingWithCopyWarning.
    games = match_results[['Season', 'WTeamID', 'LTeamID']].copy()
    games['Winner'] = 1

    # Mirror image of every game: same row, teams swapped, label flipped.
    # rename() returns a new frame, so this assignment is safe.
    mirrored = games.rename(columns={'WTeamID': 'LTeamID', 'LTeamID': 'WTeamID'})
    mirrored['Winner'] = 0

    games = pd.concat([games, mirrored], ignore_index=True)

    # Attach each side's seed; the temporary 'TeamID' join key is dropped
    # after every merge so the second merge does not collide with it.
    seed_lookup = seed_df[['Season', 'TeamID', 'SeedValue']]
    for team_col, seed_col in (('WTeamID', 'SeedValue1'), ('LTeamID', 'SeedValue2')):
        games = games.merge(
            seed_lookup,
            left_on=['Season', team_col],
            right_on=['Season', 'TeamID'],
        )
        games = games.rename(columns={'SeedValue': seed_col}).drop(columns=['TeamID'])

    games['SeedDiff'] = games['SeedValue1'] - games['SeedValue2']
    return games

# Build the seed-annotated training table for each tournament, then stack
# the two into a single frame with a fresh index.
men_data = prepare_data(men_seed_df, men_match_results)
women_data = prepare_data(women_seed_df, women_match_results)
combined_data = pd.concat((men_data, women_data), ignore_index=True)

# Single-feature problem: predict 'Winner' from the seed difference alone.
X = combined_data[['SeedDiff']]
y = combined_data['Winner']

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
# NOTE(review): prepare_data emits every game twice (once per perspective),
# and this row-level split can place the two copies on opposite sides, so the
# held-out score may be optimistic -- consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = LogisticRegression().fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1 (Winner == 1).
y_pred = model.predict_proba(X_test)[:, 1]
brier_score = brier_score_loss(y_test, y_pred)
print(f'Brier Score: {brier_score}')

# Prepare Submission Data
# Parse every ID into its season and the two team ids it names.
submission_df[['Season', 'TeamID1', 'TeamID2']] = [
    extract_game_info(game_id) for game_id in submission_df['ID']
]

# Attach each side's seed with a left join so that every submission row is
# kept even when a team has no seed entry for that season.
seed_lookup = total_seed_df[['Season', 'TeamID', 'SeedValue']]
for slot in ('1', '2'):
    submission_df = (
        submission_df
        .merge(
            seed_lookup,
            left_on=['Season', f'TeamID{slot}'],
            right_on=['Season', 'TeamID'],
            how='left',
        )
        .rename(columns={'SeedValue': f'SeedValue{slot}'})
        .drop(columns=['TeamID'])
    )

# Teams without a seed entry are given seed 16.
seed_columns = ['SeedValue1', 'SeedValue2']
submission_df[seed_columns] = submission_df[seed_columns].fillna(16)
submission_df['SeedDiff'] = submission_df['SeedValue1'] - submission_df['SeedValue2']

# Class-1 probability from the fitted model, bounded to [0.05, 0.95] so no
# prediction is ever fully confident.
submission_df['Pred'] = model.predict_proba(submission_df[['SeedDiff']])[:, 1].clip(0.05, 0.95)

submission_df[['ID', 'Pred']].to_csv(
    '/app/MarchMadness/Kaggle-March-Madness-Prediction/SubmissionPredictions/submission.csv',
    index=False,
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  match_results['Winner'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  match_results['Winner'] = 1


Brier Score: 0.1727726203820053
