In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load datasets
train = pd.read_csv("bracket_training.csv")
test = pd.read_csv("bracket_test.csv")


In [55]:

# Identify column types
date_columns = ['NCAACustomerRecordCreated', 'BracketEntryCreatedDate']
numerical_columns = [
    'CustomerAreaCode',
    'CustomerPostalCodeLatitude',
    'CustomerPostalCodeLongitude',
    'CustomerDMACode'
]
categorical_columns = [
    'CustomerID',
    'CustomerPostalCode',
    'CustomerDMADescription',
    'BracketEntryId',
    'RegionWinner_East',
    'RegionWinner_West',
    'RegionWinner_South',
    'RegionWinner_Midwest',
    'SemifinalWinner_East_West',
    'SemifinalWinner_South_Midwest',
    'NationalChampion'
]

target_columns = [
    'SemifinalWinner_East_West',
    'SemifinalWinner_South_Midwest',
    'NationalChampion'
]

# Handle missing values
# For categorical columns (including target variables)
for col in categorical_columns:
    if col in train.columns:
        train[col].fillna(train[col].mode()[0], inplace=True)
    if col in test.columns:
        test[col].fillna(test[col].mode()[0], inplace=True)

# For numerical columns
for col in numerical_columns:
    if col in train.columns:
        train[col].fillna(train[col].median(), inplace=True)
    if col in test.columns:
        test[col].fillna(test[col].median(), inplace=True)




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

In [None]:
train.drop(columns=date_columns, inplace=True, errors='ignore')
test.drop(columns=date_columns, inplace=True, errors='ignore')

# Convert categorical columns to numeric codes
for col in categorical_columns + target_columns:
    train[col] = train[col].astype('category').cat.codes
    if col in test.columns:  
        test[col] = test[col].astype('category').cat.codes

# Split train data into features and target
X = train.drop(columns=target_columns)  # Features
y = train[target_columns]  # Targets

# Drop unnecessary columns
X = X.drop(columns=['CustomerID', 'BracketEntryId'], errors='ignore')
test = test.drop(columns=['CustomerID', 'BracketEntryId'], errors='ignore')  # ✅ Drop from test too

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)


In [None]:

model = RandomForestClassifier(
    n_estimators=100,    # Keep 100 trees
    max_depth=5,         # Limit tree depth (reduce complexity)
    min_samples_split=10, # Require more samples to split a node
    min_samples_leaf=5,   # Minimum samples per leaf
    random_state=42
)


# Train and predict separately for each target variable
predictions = {}
for target in target_columns:
    model.fit(X_train, y_train[target])
    
    # Predictions
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train[target], y_train_pred)
    val_accuracy = accuracy_score(y_val[target], y_val_pred)
    
    print(f"Accuracy for {target}:")
    print(f"  ✅ Training Accuracy: {train_accuracy:.4f}")
    print(f"  ✅ Validation Accuracy: {val_accuracy:.4f}\n")
    
    # Predict on test data
    predictions[target] = model.predict(test)

# Convert predictions to DataFrame and save to CSV
predictions_df = pd.DataFrame(predictions)
predictions_df.to_csv("predictions.csv", index=False)
print("Predictions saved to predictions.csv")

Accuracy for SemifinalWinner_East_West:
  ✅ Training Accuracy: 0.6054
  ✅ Validation Accuracy: 0.6007

Accuracy for SemifinalWinner_South_Midwest:
  ✅ Training Accuracy: 0.5610
  ✅ Validation Accuracy: 0.5542

Accuracy for NationalChampion:
  ✅ Training Accuracy: 0.3957
  ✅ Validation Accuracy: 0.3896

Predictions saved to predictions.csv


In [62]:
# Drop date columns

# Initialize RandomForest model
model = RandomForestClassifier(
    n_estimators=100,    
    max_depth=6,         
    min_samples_split=10, 
    min_samples_leaf=3,  
    random_state=42
)

# Train and predict separately for each target variable
predictions = {}
for target in target_columns:
    model.fit(X_train, y_train[target])
    
    # Predictions
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    
    # Calculate accuracy
    train_accuracy = accuracy_score(y_train[target], y_train_pred)
    val_accuracy = accuracy_score(y_val[target], y_val_pred)
    
    print(f"Accuracy for {target}:")
    print(f"  ✅ Training Accuracy: {train_accuracy:.4f}")
    print(f"  ✅ Validation Accuracy: {val_accuracy:.4f}\n")
    
    # Predict on test data
    predictions[target] = model.predict(test)  # ✅ No mismatch now

# Convert predictions to DataFrame and save to CSV
predictions_df = pd.DataFrame(predictions)
predictions_df.to_csv("predictions.csv", index=False)
print("Predictions saved to predictions.csv")


Accuracy for SemifinalWinner_East_West:
  ✅ Training Accuracy: 0.6480
  ✅ Validation Accuracy: 0.6428

Accuracy for SemifinalWinner_South_Midwest:
  ✅ Training Accuracy: 0.6019
  ✅ Validation Accuracy: 0.5943

Accuracy for NationalChampion:
  ✅ Training Accuracy: 0.4229
  ✅ Validation Accuracy: 0.4166

Predictions saved to predictions.csv
