In [None]:
import sys
import numpy as np
import pandas as pd
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import brier_score_loss

# Locations of the competition data and of every artifact this notebook reads or writes
data_dir = "/app/MarchMadness/Kaggle-March-Madness-Prediction/Data/march-machine-learning-mania-2025/"
train_data_path = "/app/MarchMadness/Kaggle-March-Madness-Prediction/Training/training_data.csv"
train_split_path = "/app/MarchMadness/Kaggle-March-Madness-Prediction/Training/train_indices.csv"
test_split_path = "/app/MarchMadness/Kaggle-March-Madness-Prediction/Training/test_indices.csv"
model_path = "/app/MarchMadness/Kaggle-March-Madness-Prediction/ModelPath/model.pkl"
submission_path = "/app/MarchMadness/Kaggle-March-Madness-Prediction/SubmissionPredictions/submission.csv"
log_path = "/app/MarchMadness/Kaggle-March-Madness-Prediction/Training/training_log.txt"

# Read the seed tables, the compact tournament results and the sample submission.
# The files are read in the order listed; each name is resolved relative to data_dir.
(
    men_seed_df,
    men_match_results,
    women_seed_df,
    women_match_results,
    submission_df,
) = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in (
        "MNCAATourneySeeds.csv",
        "MNCAATourneyCompactResults.csv",
        "WNCAATourneySeeds.csv",
        "WNCAATourneyCompactResults.csv",
        "SampleSubmissionStage1.csv",
    )
)

# Parse a matchup ID of the form "<Season>_<Team1>_<Team2>"
def extract_game_info(id_str):
    """Return ``(season, team1, team2)`` as ints from an underscore-separated ID.

    Only the first three underscore-separated fields are used; any further
    fields are ignored. A field that is not an integer raises ``ValueError``
    and an ID with fewer than three fields raises ``IndexError``.
    """
    tokens = id_str.split('_')
    # Convert the fields left to right so the first bad field determines the error.
    return tuple(int(tokens[position]) for position in range(3))

# Function to extract numeric seed value from string (e.g., 'W01' -> 1)
def extract_seed_value(seed_str):
    """Return the numeric seed encoded in a seed string.

    The two characters after the first one are parsed as an integer, so
    'W01' -> 1 and 'X16a' -> 16 (anything after the third character is
    ignored).

    Returns 16 as a fallback when the seed cannot be parsed: either the
    characters are not digits (``ValueError``) or the value is not a string
    at all, e.g. ``None`` or a NaN read from a CSV (``TypeError``).
    """
    try:
        return int(seed_str[1:3])
    except (TypeError, ValueError):
        # Unknown / malformed seed: treat the team as a 16 seed.
        return 16

# Add a numeric 'SeedValue' column to the men's and the women's seed tables (in place)
for seed_df in (men_seed_df, women_seed_df):
    seed_df['SeedValue'] = seed_df['Seed'].map(extract_seed_value)

# Stack both seed tables into a single lookup table with a fresh 0..n-1 index
total_seed_df = pd.concat([men_seed_df, women_seed_df], ignore_index=True)

# Function to Prepare Matchup Data by adding seed values and computing seed difference
def prepare_data(seed_df, match_results):
    """Build a symmetric, seed-annotated matchup table from game results.

    Every game is emitted twice: once as played (``Winner == 1``, the team in
    the 'WTeamID' column won) and once with the two team columns swapped
    (``Winner == 0``), so both labels are equally represented.

    Parameters
    ----------
    seed_df : DataFrame with columns 'Season', 'TeamID', 'SeedValue'.
    match_results : DataFrame with at least 'Season', 'WTeamID', 'LTeamID'.
        It is not modified.

    Returns
    -------
    DataFrame with columns 'Season', 'WTeamID', 'LTeamID', 'Winner',
    'SeedValue1', 'SeedValue2', 'SeedDiff'. Rows where either team has no
    seed for that season are dropped (inner merges).
    """
    # Take an explicit copy of the needed columns: adding 'Winner' to a bare
    # column selection triggers pandas' SettingWithCopyWarning.
    games = match_results[['Season', 'WTeamID', 'LTeamID']].copy()

    # Assign winner label (1: the team in the 'WTeamID' column won)
    games['Winner'] = 1

    # Mirror every game by swapping the two team columns; label it 0
    mirrored = games.rename(columns={'WTeamID': 'LTeamID', 'LTeamID': 'WTeamID'})
    mirrored['Winner'] = 0

    # Stack both orientations so we have all matchups (Win-Loss and Loss-Win)
    games = pd.concat([games, mirrored], ignore_index=True)

    # Attach the seed of the first team (SeedValue1) and of the second team (SeedValue2)
    seeds = seed_df[['Season', 'TeamID', 'SeedValue']]
    for team_col, seed_col in (('WTeamID', 'SeedValue1'), ('LTeamID', 'SeedValue2')):
        games = (
            games.merge(seeds, left_on=['Season', team_col], right_on=['Season', 'TeamID'])
            .rename(columns={'SeedValue': seed_col})
            .drop(columns=['TeamID'])
        )

    # Seed difference: first team's seed minus second team's seed
    games['SeedDiff'] = games['SeedValue1'] - games['SeedValue2']

    return games

# Prepare men's and women's datasets and combine them
men_data = prepare_data(men_seed_df, men_match_results)
women_data = prepare_data(women_seed_df, women_match_results)
combined_data = pd.concat([men_data, women_data], ignore_index=True)

# Define features (X) and target labels (y)
X = combined_data[['SeedDiff']]
y = combined_data['Winner']

# NOTE: previously saved training rows (train_data_path) are intentionally NOT
# merged back into X / y. The dataset above is rebuilt in full from the source
# CSVs on every run, so prepending the saved rows only duplicated them, and it
# shifted the row positions that the persisted split indices refer to: the
# saved "test" indices then selected rows the model had already been trained
# on. Knowledge from earlier runs is carried by the persisted model instead.

# Load or Generate Train-Test Split
split_is_usable = False
if os.path.exists(train_split_path) and os.path.exists(test_split_path):
    train_indices = pd.read_csv(train_split_path)['train_indices'].values
    test_indices = pd.read_csv(test_split_path)['test_indices'].values

    # Only reuse the saved split if it is an exact partition of the current
    # rows (every position 0..len(X)-1 appears exactly once).
    saved_positions = np.sort(np.concatenate([train_indices, test_indices]))
    split_is_usable = np.array_equal(saved_positions, np.arange(len(X)))
    if not split_is_usable:
        print("Saved train/test split does not match the current dataset; regenerating it.")

if not split_is_usable:
    indices = np.arange(len(X))
    train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)

    # Make sure the target directories exist before persisting the split
    for split_path in (train_split_path, test_split_path):
        os.makedirs(os.path.dirname(split_path), exist_ok=True)
    pd.DataFrame({'train_indices': train_indices}).to_csv(train_split_path, index=False)
    pd.DataFrame({'test_indices': test_indices}).to_csv(test_split_path, index=False)

X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

# Load existing model or train a new one
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only point model_path at a file this notebook wrote itself.
if os.path.exists(model_path):
    model = joblib.load(model_path)
    print("Loaded existing model.")
else:
    model = SGDClassifier(loss="log_loss")
    print("Training new model.")

sys.stdout.flush()

# Train or update model: partial_fit updates the existing weights, so a loaded
# model continues learning instead of starting over.
for _ in range(5):  # Multiple passes over the training rows
    model.partial_fit(X_train, y_train, classes=np.array([0, 1]))

# Evaluate model on test set (predicted probability of class 1)
y_pred = model.predict_proba(X_test)[:, 1]
brier_score = brier_score_loss(y_test, y_pred)
print(f'Brier Score: {brier_score}')

# Save the updated model; create the directory first so a missing folder does
# not discard the training that was just done.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

# Persist the training partition (feature column plus its label) as a CSV without the index
train_data = X_train[['SeedDiff']].assign(Winner=y_train)
train_data.to_csv(train_data_path, index=False)

# Prepare Submission Data: split every matchup ID into Season / TeamID1 / TeamID2
submission_df[['Season', 'TeamID1', 'TeamID2']] = submission_df['ID'].apply(extract_game_info).tolist()

# Merge seed values for both teams (left merges keep matchups whose teams have no seed)
submission_df = submission_df.merge(total_seed_df[['Season', 'TeamID', 'SeedValue']], left_on=['Season', 'TeamID1'], right_on=['Season', 'TeamID'], how='left')
submission_df = submission_df.rename(columns={'SeedValue': 'SeedValue1'}).drop(columns=['TeamID'])

submission_df = submission_df.merge(total_seed_df[['Season', 'TeamID', 'SeedValue']], left_on=['Season', 'TeamID2'], right_on=['Season', 'TeamID'], how='left')
submission_df = submission_df.rename(columns={'SeedValue': 'SeedValue2'}).drop(columns=['TeamID'])

# Teams without a seed are treated as 16 seeds (the same fallback value used when parsing seeds)
submission_df[['SeedValue1', 'SeedValue2']] = submission_df[['SeedValue1', 'SeedValue2']].fillna(16)

# Compute seed difference for prediction (first team's seed minus second team's seed)
submission_df['SeedDiff'] = submission_df['SeedValue1'] - submission_df['SeedValue2']

# Predict the probability of class 1 for every matchup in the submission
submission_df['Pred'] = model.predict_proba(submission_df[['SeedDiff']])[:, 1]

# Clip predictions between 0.05 and 0.95 to prevent extreme values
submission_df['Pred'] = submission_df['Pred'].clip(0.05, 0.95)

# Save final predictions to CSV for Kaggle submission; create the output
# directory first so a missing folder does not fail the run at the last step.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_df[['ID', 'Pred']].to_csv(submission_path, index=False)

# Append this run's Brier score to the training log so successive runs can be compared
log_entry = f"Brier Score: {brier_score}\n"
with open(log_path, "a") as log_file:
    log_file.write(log_entry)


Loaded existing model.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  match_results['Winner'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  match_results['Winner'] = 1


Brier Score: 0.17543167769336993
