In [95]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load datasets
train = pd.read_csv("bracket_training.csv")
test = pd.read_csv("bracket_test.csv")


# Load institutions data
institutions = pd.read_csv("ccac_institutions.csv")

# Select relevant features
institutions = institutions[['InstitutionID', 'InstitutionEnrollment_Total', 
                             'RegularSeasonWins', 'RegularSeasonLosses', 'RegularSeasonAverageScore']]



In [96]:
# Identify column types
date_columns = ['NCAACustomerRecordCreated', 'BracketEntryCreatedDate']
numerical_columns = [
    'CustomerAreaCode',
    'CustomerPostalCodeLatitude',
    'CustomerPostalCodeLongitude',
    'CustomerDMACode',
    'BracketEntryId'
]
categorical_columns = [
    'CustomerID',
    'CustomerPostalCode',
    'CustomerDMADescription',
    #'BracketEntryId',
    'RegionWinner_East',
    'RegionWinner_West',
    'RegionWinner_South',
    'RegionWinner_Midwest',
    'SemifinalWinner_East_West',
    'SemifinalWinner_South_Midwest',
    'NationalChampion'
]
target_columns = [
    'SemifinalWinner_East_West',
    'SemifinalWinner_South_Midwest',
    'NationalChampion'
]


# Identify new columns from ccac_institutions
new_numerical_cols = [
    'InstitutionEnrollment_Total',
    'RegularSeasonWins',
    'RegularSeasonLosses',
    'RegularSeasonAverageScore'
]
#new_categorical_cols = ['InstitutionConference']
#new_date_cols = ['InstitutionNCAAMemberSinceDate']

# Append them to the respective lists
numerical_columns += new_numerical_cols
#categorical_columns += new_categorical_cols
#date_columns += new_date_cols

# Merge institutions data for each region winner
for region in ['RegionWinner_East', 'RegionWinner_West', 'RegionWinner_South', 'RegionWinner_Midwest']:
    # Update the suffixes to match what we'll use in create_matchup_features
    train = train.merge(institutions, left_on=region, right_on='InstitutionID', how='left', 
                       suffixes=('', f'_{region}'))
    test = test.merge(institutions, left_on=region, right_on='InstitutionID', how='left', 
                     suffixes=('', f'_{region}'))
    
    # Rename the columns to match our expected format
    for col in new_numerical_cols:
        if f'{col}' in train.columns:  # If this is the first merge
            train = train.rename(columns={f'{col}': f'{col}_{region}'})
            test = test.rename(columns={f'{col}': f'{col}_{region}'})

# Drop duplicate InstitutionID columns
train.drop(columns=['InstitutionID'], inplace=True, errors='ignore')
test.drop(columns=['InstitutionID'], inplace=True, errors='ignore')

# Handle missing values
# For categorical columns (including target variables)
for col in categorical_columns:
    if col in train.columns:
        train[col].fillna(train[col].mode()[0], inplace=True)
    if col in test.columns:
        test[col].fillna(test[col].mode()[0], inplace=True)

# For numerical columns
for col in numerical_columns:
    if col in train.columns:
        train[col].fillna(train[col].median(), inplace=True)
    if col in test.columns:
        test[col].fillna(test[col].median(), inplace=True)

# Convert date columns to datetime format
for col in date_columns:
    if col in train.columns:
        train[col] = pd.to_datetime(train[col], errors='coerce')
    if col in test.columns:
        test[col] = pd.to_datetime(test[col], errors='coerce')

print("Data preprocessing complete! 🚀")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Data preprocessing complete! 🚀


In [110]:
# Create features for each team's matchup
def create_matchup_features(df, region1, region2):
    features = []
    for col in ['InstitutionEnrollment_Total', 'RegularSeasonWins', 'RegularSeasonLosses', 'RegularSeasonAverageScore']:
        # Get stats for both teams
        team1_col = f"{col}_{region1}"
        team2_col = f"{col}_{region2}"
        
        # Create relative features
        features.append(f"{col}_diff_{region1}_{region2}")
        df[f"{col}_diff_{region1}_{region2}"] = df[team1_col] - df[team2_col]
        
        # Create ratio features (avoiding division by zero)
        features.append(f"{col}_ratio_{region1}_{region2}")
        df[f"{col}_ratio_{region1}_{region2}"] = df[team1_col] / df[team2_col].replace(0, 1)
    
    return features

# Create features for East vs West matchup
east_west_features = create_matchup_features(train, 'RegionWinner_East', 'RegionWinner_West')
create_matchup_features(test, 'RegionWinner_East', 'RegionWinner_West')

# Create features for South vs Midwest matchup
south_midwest_features = create_matchup_features(train, 'RegionWinner_South', 'RegionWinner_Midwest')
create_matchup_features(test, 'RegionWinner_South', 'RegionWinner_Midwest')

# Train models for each semifinal matchup
# East vs West
X_east_west = train[east_west_features]
y_east_west = (train['SemifinalWinner_East_West'] == train['RegionWinner_East']).astype(int)
X_train_ew, X_val_ew, y_train_ew, y_val_ew = train_test_split(X_east_west, y_east_west, test_size=0.2, random_state=42)

# South vs Midwest
X_south_midwest = train[south_midwest_features]
y_south_midwest = (train['SemifinalWinner_South_Midwest'] == train['RegionWinner_South']).astype(int)
X_train_sm, X_val_sm, y_train_sm, y_val_sm = train_test_split(X_south_midwest, y_south_midwest, test_size=0.2, random_state=42)

# Train models
model_east_west = RandomForestClassifier(random_state=42)
model_south_midwest = RandomForestClassifier(random_state=42)

# Fit and evaluate East vs West model
model_east_west.fit(X_train_ew, y_train_ew)
train_acc_ew = accuracy_score(y_train_ew, model_east_west.predict(X_train_ew))
val_acc_ew = accuracy_score(y_val_ew, model_east_west.predict(X_val_ew))
print(f"\nEast vs West Accuracy Scores:")
print(f"Training accuracy: {train_acc_ew:.3f}")
print(f"Validation accuracy: {val_acc_ew:.3f}")

# Fit and evaluate South vs Midwest model
model_south_midwest.fit(X_train_sm, y_train_sm)
train_acc_sm = accuracy_score(y_train_sm, model_south_midwest.predict(X_train_sm))
val_acc_sm = accuracy_score(y_val_sm, model_south_midwest.predict(X_val_sm))
print(f"\nSouth vs Midwest Accuracy Scores:")
print(f"Training accuracy: {train_acc_sm:.3f}")
print(f"Validation accuracy: {val_acc_sm:.3f}")

#model_east_west.fit(X_east_west, y_east_west)
#model_south_midwest.fit(X_south_midwest, y_south_midwest)

# Make predictions
# East vs West
east_west_probs = model_east_west.predict_proba(test[east_west_features])
test['East_Win_Probability'] = east_west_probs[:, 1]
test['SemifinalWinner_East_West'] = np.where(
    test['East_Win_Probability'] >= 0.5,
    test['RegionWinner_East'],
    test['RegionWinner_West']
)

# South vs Midwest
south_midwest_probs = model_south_midwest.predict_proba(test[south_midwest_features])
test['South_Win_Probability'] = south_midwest_probs[:, 1]
test['SemifinalWinner_South_Midwest'] = np.where(
    test['South_Win_Probability'] >= 0.5,
    test['RegionWinner_South'],
    test['RegionWinner_Midwest']
)


# Train model for championship game
X_championship = train[championship_features]
y_championship = (train['NationalChampion'] == train['SemifinalWinner_East_West']).astype(int)
# Split championship data
X_train_ch, X_val_ch, y_train_ch, y_val_ch = train_test_split(X_championship, y_championship, test_size=0.2, random_state=42)
# Train and evaluate championship model
model_championship = RandomForestClassifier(random_state=42)
model_championship.fit(X_train_ch, y_train_ch)
train_acc_ch = accuracy_score(y_train_ch, model_championship.predict(X_train_ch))
val_acc_ch = accuracy_score(y_val_ch, model_championship.predict(X_val_ch))
print(f"\nChampionship Game Accuracy Scores:")
print(f"Training accuracy: {train_acc_ch:.3f}")
print(f"Validation accuracy: {val_acc_ch:.3f}")

# Train championship model
model_championship = RandomForestClassifier(random_state=42)
model_championship.fit(X_championship, y_championship)

# Make championship predictions
championship_probs = model_championship.predict_proba(test[championship_features])
test['EastWest_Win_Probability'] = championship_probs[:, 1]
test['NationalChampion'] = np.where(
    test['EastWest_Win_Probability'] >= 0.5,
    test['SemifinalWinner_East_West'],
    test['SemifinalWinner_South_Midwest']
)

# Select the relevant columns
output_columns = [
    'BracketEntryId',
    'RegionWinner_East',
    'RegionWinner_West',
    'RegionWinner_South',
    'RegionWinner_Midwest',
    'SemifinalWinner_East_West',
    'SemifinalWinner_South_Midwest',
    'NationalChampion'  # Added this column
]

# Create a new DataFrame with only these columns
output_df = test[output_columns]

# Export to Excel
output_df.to_csv('bracket_predictions.csv', index=False)

print("Predictions exported to bracket_predictions.csv! 📊")
print("Predictions complete! 🏀")


East vs West Accuracy Scores:
Training accuracy: 0.676
Validation accuracy: 0.672

South vs Midwest Accuracy Scores:
Training accuracy: 0.632
Validation accuracy: 0.629

Championship Game Accuracy Scores:
Training accuracy: 0.625
Validation accuracy: 0.618
Predictions exported to bracket_predictions.csv! 📊
Predictions complete! 🏀
