In [43]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# Load original baseline artifacts
scaler_original = pickle.load(open("../outputs/baseline_scaler.pkl", "rb"))
model_original = pickle.load(open("../outputs/baseline_model.pkl", "rb"))

# Get EXACT features the original baseline used
feature_cols = list(scaler_original.feature_names_in_)
print(f"Using {len(feature_cols)} features from original baseline:")
print(feature_cols)

# Load model input
model_input = pd.read_csv("../data/processed/model_input.csv")

# Use ONLY the 28 features
X = model_input[feature_cols].fillna(0)
y = model_input['home_win']

# Same split as original
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Use the ORIGINAL scaler
X_train_scaled = scaler_original.transform(X_train)
X_test_scaled = scaler_original.transform(X_test)

# Verify original baseline still works
baseline_pred = model_original.predict_proba(X_test_scaled)[:, 1]
baseline_auc = roc_auc_score(y_test, baseline_pred)

print(f"\n‚úÖ Original baseline AUC: {baseline_auc:.4f}")
print(f"‚úÖ Data loaded correctly")
print(f"Training: {X_train.shape}")
print(f"Test: {X_test.shape}")

# Store
models = {}
predictions = {}
auc_scores = {}
scaler = scaler_original


Using 28 features from original baseline:
['home_xg', 'away_xg', 'home_shots', 'away_shots', 'home_penalties_committed', 'away_penalties_committed', 'home_games', 'home_losses', 'home_goal_diff', 'home_first_xg', 'home_first_toi', 'home_first_eff', 'home_second_xg', 'home_second_toi', 'home_second_eff', 'home_offensive_disparity', 'away_games', 'away_wins', 'away_losses', 'away_goal_diff', 'away_win_rate', 'away_first_xg', 'away_first_toi', 'away_first_eff', 'away_second_xg', 'away_second_toi', 'away_second_eff', 'away_offensive_disparity']

‚úÖ Original baseline AUC: 0.6900
‚úÖ Data loaded correctly
Training: (1049, 28)
Test: (263, 28)


In [44]:
print("="*60)
print("MODEL 1: LOGISTIC REGRESSION")
print("="*60)

from sklearn.linear_model import LogisticRegression

# Train
lr = LogisticRegression(
    max_iter=1000,
    C=1.0,  # Regularization strength
    random_state=42
)
lr.fit(X_train_scaled, y_train)

# Predict and evaluate
pred_lr = lr.predict_proba(X_test_scaled)[:, 1]
auc_lr = roc_auc_score(y_test, pred_lr)

# Save
models['Logistic'] = {'model': lr, 'needs_scaling': True}
predictions['Logistic'] = pred_lr
auc_scores['Logistic'] = auc_lr

# Feature importance
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': lr.coef_[0]
}).sort_values('coefficient', key=abs, ascending=False)

print(f"\n‚úÖ AUC: {auc_lr:.4f}")
print(f"\nTop 5 features:")
print(feature_importance.head().to_string(index=False))
print("="*60 + "\n")


MODEL 1: LOGISTIC REGRESSION

‚úÖ AUC: 0.6900

Top 5 features:
                 feature  coefficient
           away_first_xg     0.597675
away_offensive_disparity    -0.574454
              away_shots    -0.546656
             home_losses    -0.520231
          away_second_xg    -0.479761



In [45]:
print("="*60)
print("MODEL 2: RANDOM FOREST")
print("="*60)

from sklearn.ensemble import RandomForestClassifier

# Train
rf = RandomForestClassifier(
    n_estimators=200,      # Number of trees
    max_depth=8,           # Tree depth
    min_samples_split=20,  # Min samples to split
    min_samples_leaf=10,   # Min samples per leaf
    max_features='sqrt',   # Features per split
    random_state=42,
    n_jobs=-1              # Use all CPU cores
)
rf.fit(X_train, y_train)

# Predict and evaluate
pred_rf = rf.predict_proba(X_test)[:, 1]
auc_rf = roc_auc_score(y_test, pred_rf)

# Save
models['RandomForest'] = {'model': rf, 'needs_scaling': False}
predictions['RandomForest'] = pred_rf
auc_scores['RandomForest'] = auc_rf

# Feature importance
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n‚úÖ AUC: {auc_rf:.4f}")
print(f"Improvement over Logistic: {auc_rf - auc_lr:+.4f}")
print(f"\nTop 5 features:")
print(feature_importance.head().to_string(index=False))
print("="*60 + "\n")


MODEL 2: RANDOM FOREST

‚úÖ AUC: 0.6747
Improvement over Logistic: -0.0153

Top 5 features:
                 feature  importance
                 away_xg    0.137498
              away_shots    0.085912
                 home_xg    0.085168
home_penalties_committed    0.069544
              home_shots    0.050195



In [46]:
print("="*60)
print("MODEL 3: XGBOOST")
print("="*60)

from xgboost import XGBClassifier

# Train
xgb = XGBClassifier(
    n_estimators=200,       # Number of boosting rounds
    max_depth=5,            # Tree depth
    learning_rate=0.05,     # Shrinkage (lower = more conservative)
    subsample=0.8,          # Row sampling per tree
    colsample_bytree=0.8,   # Column sampling per tree
    min_child_weight=3,     # Minimum samples per leaf
    gamma=0.1,              # Regularization
    random_state=42,
    eval_metric='logloss'
)
xgb.fit(X_train, y_train)

# Predict and evaluate
pred_xgb = xgb.predict_proba(X_test)[:, 1]
auc_xgb = roc_auc_score(y_test, pred_xgb)

# Save
models['XGBoost'] = {'model': xgb, 'needs_scaling': False}
predictions['XGBoost'] = pred_xgb
auc_scores['XGBoost'] = auc_xgb

# Feature importance
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n‚úÖ AUC: {auc_xgb:.4f}")
print(f"Improvement over Logistic: {auc_xgb - auc_lr:+.4f}")
print(f"Improvement over RandomForest: {auc_xgb - auc_rf:+.4f}")
print(f"\nTop 5 features:")
print(feature_importance.head().to_string(index=False))
print("="*60 + "\n")


MODEL 3: XGBOOST

‚úÖ AUC: 0.6586
Improvement over Logistic: -0.0314
Improvement over RandomForest: -0.0161

Top 5 features:
                 feature  importance
           away_win_rate    0.078207
                 away_xg    0.048013
               away_wins    0.045788
         home_second_eff    0.045492
home_penalties_committed    0.044027



In [47]:
print("="*60)
print("MODEL 4: LIGHTGBM")
print("="*60)

from lightgbm import LGBMClassifier

# Train
lgb = LGBMClassifier(
    n_estimators=200,       # Number of boosting rounds
    max_depth=5,            # Tree depth
    learning_rate=0.05,     # Shrinkage
    num_leaves=31,          # Max leaves per tree
    subsample=0.8,          # Row sampling
    colsample_bytree=0.8,   # Column sampling
    min_child_samples=20,   # Min samples per leaf
    reg_alpha=0.1,          # L1 regularization
    reg_lambda=0.1,         # L2 regularization
    random_state=42,
    verbose=-1              # Suppress output
)
lgb.fit(X_train, y_train)

# Predict and evaluate
pred_lgb = lgb.predict_proba(X_test)[:, 1]
auc_lgb = roc_auc_score(y_test, pred_lgb)

# Save
models['LightGBM'] = {'model': lgb, 'needs_scaling': False}
predictions['LightGBM'] = pred_lgb
auc_scores['LightGBM'] = auc_lgb

# Feature importance
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': lgb.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n‚úÖ AUC: {auc_lgb:.4f}")
print(f"Improvement over Logistic: {auc_lgb - auc_lr:+.4f}")
print(f"Improvement over XGBoost: {auc_lgb - auc_xgb:+.4f}")
print(f"\nTop 5 features:")
print(feature_importance.head().to_string(index=False))
print("="*60 + "\n")


MODEL 4: LIGHTGBM

‚úÖ AUC: 0.6273
Improvement over Logistic: -0.0626
Improvement over XGBoost: -0.0313

Top 5 features:
                 feature  importance
                 away_xg         352
                 home_xg         346
              away_shots         188
              home_shots         183
away_penalties_committed         158



In [48]:
print("="*60)
print("MODEL 5: CATBOOST")
print("="*60)

from catboost import CatBoostClassifier

# Train
cat = CatBoostClassifier(
    iterations=200,         # Number of boosting rounds
    depth=5,                # Tree depth
    learning_rate=0.05,     # Shrinkage
    l2_leaf_reg=3,          # L2 regularization
    random_seed=42,
    verbose=False           # Suppress output
)
cat.fit(X_train, y_train)

# Predict and evaluate
pred_cat = cat.predict_proba(X_test)[:, 1]
auc_cat = roc_auc_score(y_test, pred_cat)

# Save
models['CatBoost'] = {'model': cat, 'needs_scaling': False}
predictions['CatBoost'] = pred_cat
auc_scores['CatBoost'] = auc_cat

# Feature importance
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': cat.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n‚úÖ AUC: {auc_cat:.4f}")
print(f"Improvement over Logistic: {auc_cat - auc_lr:+.4f}")
print(f"Improvement over LightGBM: {auc_cat - auc_lgb:+.4f}")
print(f"\nTop 5 features:")
print(feature_importance.head().to_string(index=False))
print("="*60 + "\n")


MODEL 5: CATBOOST

‚úÖ AUC: 0.6635
Improvement over Logistic: -0.0265
Improvement over LightGBM: +0.0361

Top 5 features:
                 feature  importance
                 away_xg   10.302674
home_penalties_committed    7.918733
              away_shots    7.305247
away_penalties_committed    7.010467
                 home_xg    6.527714



In [49]:
print("\n" + "="*60)
print("MODEL PERFORMANCE COMPARISON")
print("="*60)

results = pd.DataFrame({
    'Model': list(auc_scores.keys()),
    'AUC': list(auc_scores.values())
}).sort_values('AUC', ascending=False)

results['Improvement'] = results['AUC'] - auc_lr
results['Rank'] = range(1, len(results) + 1)

print(results.to_string(index=False))
print("="*60)

best_model = results.iloc[0]['Model']
best_auc = results.iloc[0]['AUC']
print(f"\nüèÜ BEST MODEL: {best_model} (AUC: {best_auc:.4f})")



MODEL PERFORMANCE COMPARISON
       Model      AUC  Improvement  Rank
    Logistic 0.689953     0.000000     1
RandomForest 0.674677    -0.015276     2
    CatBoost 0.663455    -0.026498     3
     XGBoost 0.658578    -0.031375     4
    LightGBM 0.627321    -0.062632     5

üèÜ BEST MODEL: Logistic (AUC: 0.6900)


In [50]:
print("\n" + "="*60)
print("ENSEMBLE CREATION")
print("="*60)

from scipy.optimize import minimize

# Create prediction matrix
pred_matrix = np.column_stack([predictions[m] for m in auc_scores.keys()])

# Optimization function
def ensemble_auc(weights):
    weights = np.abs(weights)
    weights = weights / weights.sum()
    ensemble = pred_matrix @ weights
    return -roc_auc_score(y_test, ensemble)

# Optimize weights
initial_weights = np.ones(len(auc_scores)) / len(auc_scores)
result = minimize(ensemble_auc, initial_weights, method='Nelder-Mead')

# Get optimal weights
optimal_weights = np.abs(result.x)
optimal_weights = optimal_weights / optimal_weights.sum()

print("\nOptimal weights:")
for model_name, weight in zip(auc_scores.keys(), optimal_weights):
    print(f"  {model_name:15} ‚Üí {weight:.3f}")

# Ensemble prediction
ensemble_pred = pred_matrix @ optimal_weights
auc_ensemble = roc_auc_score(y_test, ensemble_pred)

print(f"\n‚úÖ ENSEMBLE AUC: {auc_ensemble:.4f}")
print(f"   Best single: {best_auc:.4f}")
print(f"   Improvement: {auc_ensemble - best_auc:+.4f}")
print("="*60)



ENSEMBLE CREATION

Optimal weights:
  Logistic        ‚Üí 0.340
  RandomForest    ‚Üí 0.370
  XGBoost         ‚Üí 0.006
  LightGBM        ‚Üí 0.001
  CatBoost        ‚Üí 0.283

‚úÖ ENSEMBLE AUC: 0.6848
   Best single: 0.6900
   Improvement: -0.0052


In [51]:
print("FILTERING TO BEST MODELS ONLY")
print("="*60)

# Keep only models better than baseline
baseline_auc = 0.69
best_models = {k: v for k, v in auc_scores.items() if v >= baseline_auc}

print(f"Models above baseline ({baseline_auc:.4f}):")
for model_name, auc in best_models.items():
    print(f"  ‚úÖ {model_name:15} ‚Üí {auc:.4f}")

print(f"\nDropped models:")
for model_name, auc in auc_scores.items():
    if auc < baseline_auc:
        print(f"  ‚ùå {model_name:15} ‚Üí {auc:.4f}")

# Re-optimize with only best models
if len(best_models) > 1:
    best_pred_matrix = np.column_stack([predictions[m] for m in best_models.keys()])
    
    def best_ensemble_auc(weights):
        weights = np.abs(weights)
        weights = weights / weights.sum()
        ensemble = best_pred_matrix @ weights
        return -roc_auc_score(y_test, ensemble)
    
    initial = np.ones(len(best_models)) / len(best_models)
    result = minimize(best_ensemble_auc, initial, method='Nelder-Mead')
    
    best_weights = np.abs(result.x)
    best_weights = best_weights / best_weights.sum()
    
    print("\nOptimal weights (best models only):")
    for model_name, weight in zip(best_models.keys(), best_weights):
        print(f"  {model_name:15} ‚Üí {weight:.3f}")
    
    best_ensemble_pred = best_pred_matrix @ best_weights
    auc_best_ensemble = roc_auc_score(y_test, best_ensemble_pred)
    
    print(f"\n‚úÖ FILTERED ENSEMBLE AUC: {auc_best_ensemble:.4f}")
    print(f"   Original ensemble: {auc_ensemble:.4f}")
    print(f"   Improvement: {auc_best_ensemble - auc_ensemble:+.4f}")
else:
    print("\n‚ö†Ô∏è Only 1 model above baseline - use single best model")


FILTERING TO BEST MODELS ONLY
Models above baseline (0.6900):

Dropped models:
  ‚ùå Logistic        ‚Üí 0.6900
  ‚ùå RandomForest    ‚Üí 0.6747
  ‚ùå XGBoost         ‚Üí 0.6586
  ‚ùå LightGBM        ‚Üí 0.6273
  ‚ùå CatBoost        ‚Üí 0.6635

‚ö†Ô∏è Only 1 model above baseline - use single best model


In [52]:
# Package ensemble
ensemble_package = {
    'models': models,
    'weights': dict(zip(auc_scores.keys(), optimal_weights)),
    'scaler': scaler,
    'feature_cols': feature_cols,
    'auc': auc_ensemble,
    'individual_aucs': auc_scores
}

with open('../outputs/ensemble_model.pkl', 'wb') as f:
    pickle.dump(ensemble_package, f)

print("‚úÖ Saved: outputs/ensemble_model.pkl")
print(f"\nPackage contains:")
print(f"  - {len(models)} trained models")
print(f"  - Optimal weights")
print(f"  - Feature scaler")
print(f"  - Ensemble AUC: {auc_ensemble:.4f}")


‚úÖ Saved: outputs/ensemble_model.pkl

Package contains:
  - 5 trained models
  - Optimal weights
  - Feature scaler
  - Ensemble AUC: 0.6848


In [53]:
print("="*60)
print("INDIVIDUAL MODEL PERFORMANCE:")
print("="*60)

for model_name, auc in auc_scores.items():
    diff = auc - 0.69
    symbol = "‚úÖ" if auc > 0.69 else "‚ö†Ô∏è"
    print(f"{symbol} {model_name:15} AUC: {auc:.4f}  ({diff:+.4f})")

print("="*60)
print(f"Ensemble AUC: {auc_ensemble:.4f}")
print(f"Baseline (Logistic): 0.6900")
print("="*60)


INDIVIDUAL MODEL PERFORMANCE:
‚ö†Ô∏è Logistic        AUC: 0.6900  (-0.0000)
‚ö†Ô∏è RandomForest    AUC: 0.6747  (-0.0153)
‚ö†Ô∏è XGBoost         AUC: 0.6586  (-0.0314)
‚ö†Ô∏è LightGBM        AUC: 0.6273  (-0.0627)
‚ö†Ô∏è CatBoost        AUC: 0.6635  (-0.0265)
Ensemble AUC: 0.6848
Baseline (Logistic): 0.6900
