# 06 - Modeling: Battle Outcome Prediction

**Purpose**: Build predictive models for technical rigor scoring.

**Goal**: Predict battle outcomes based on deck composition alone.

**Models to Try**:
1. Logistic Regression (baseline)
2. Random Forest (feature importance insights)
3. XGBoost (likely best performance)

**Key Metrics**:
- Accuracy
- Precision/Recall
- ROC-AUC
- Feature importance (for insights!)

In [1]:
import sys, os, pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
import xgboost as xgb

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, os.path.join(PROJECT_ROOT, 'src'))

from visualization import setup_presentation_style
setup_presentation_style()

✓ Presentation style configured


In [None]:
# Import GPU detection utilities
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, os.path.join(PROJECT_ROOT, 'src'))

from system_utils import get_xgboost_params, print_cuda_info, configure_environment_for_ml

# Configure ML environment (auto-detects GPU)
ml_config = configure_environment_for_ml(verbose=True)

In [ ]:
# Load engineered features from notebook 05
features_path = os.path.join(PROJECT_ROOT, 'artifacts/model_features.parquet')

if not os.path.exists(features_path):
    print("❌ ERROR: model_features.parquet not found!")
    print("   Please run Notebook 05 first to create features.")
    raise FileNotFoundError(f"Missing: {features_path}")

features = pd.read_parquet(features_path)

print(f"✓ Loaded {len(features):,} battles with {len(features.columns)} features")
print(f"  Memory usage: {features.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

In [2]:
# Load engineered features from notebook 05
features = pd.read_parquet(os.path.join(PROJECT_ROOT, 'artifacts/model_features.parquet'))

print(f"Loaded {len(features):,} battles with {len(features.columns)} features")

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\Danny\\Documents\\GitHub\\HeHeHaHa_DataRoyale\\artifacts/model_features.parquet'

In [ ]:
# Restructure data: each battle becomes 2 rows (one for each player)
# Target variable: 1 if player won, 0 if lost

print("Restructuring data for binary classification...")

# Create winner rows (outcome = 1)
winner_data = features.copy()
winner_data['outcome'] = 1

# Create loser rows (outcome = 0) - need to swap winner/loser columns
loser_data = features.copy()
loser_data['outcome'] = 0

# For simplicity, we'll just use winner's features to predict winner's victory
# Select numeric features only
numeric_cols = features.select_dtypes(include=[np.number]).columns.tolist()

# Define feature columns (exclude target-like columns)
exclude_cols = ['outcome', 'winner.trophyChange', 'loser.trophyChange', 
                'winner.crowns', 'loser.crowns']
feature_cols = [col for col in numeric_cols if not any(ex in col for ex in exclude_cols)]

# Prepare X and y
X = winner_data[feature_cols].fillna(0)
y = winner_data['outcome']

print(f"✓ Data prepared for modeling:")
print(f"  Samples: {len(X):,}")
print(f"  Features: {len(feature_cols)}")
print(f"  Target distribution: {y.value_counts().to_dict()}")

# Show sample features
print(f"\n  Sample features (first 10):")
for col in feature_cols[:10]:
    print(f"    - {col}")

In [None]:
# TODO: Define target variable (1 = winner won, 0 = loser won - always 1 in this dataset!)
# Need to restructure: each battle becomes 2 rows (one for each player)
# with outcome = 1 if that player won, 0 if lost

# Example structure:
# y = features['outcome']  # 1 or 0
# X = features[feature_columns]  # numeric features only

print("TODO: Restructure data and select features")

In [ ]:
# Create train/test split with stratification
print("Creating train/test split (80/20)...")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"✓ Split complete:")
print(f"  Training samples: {len(X_train):,}")
print(f"  Testing samples: {len(X_test):,}")
print(f"  Training class distribution: {y_train.value_counts().to_dict()}")
print(f"  Testing class distribution: {y_test.value_counts().to_dict()}")

In [None]:
# TODO: Split data
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )

print("TODO: Create train/test split")

In [ ]:
# Train logistic regression baseline model
print("Training Logistic Regression (baseline)...")

lr_model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
lr_model.fit(X_train, y_train)

# Predictions
lr_pred = lr_model.predict(X_test)
lr_pred_proba = lr_model.predict_proba(X_test)[:, 1]

# Metrics
lr_acc = accuracy_score(y_test, lr_pred)
lr_auc = roc_auc_score(y_test, lr_pred_proba)

print(f"✓ Logistic Regression trained:")
print(f"  Accuracy: {lr_acc:.4f}")
print(f"  ROC-AUC: {lr_auc:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, lr_pred))

In [None]:
# TODO: Train logistic regression
# lr_model = LogisticRegression(max_iter=1000, random_state=42)
# lr_model.fit(X_train, y_train)
# lr_pred = lr_model.predict(X_test)
# lr_acc = accuracy_score(y_test, lr_pred)
# print(f"Logistic Regression Accuracy: {lr_acc:.4f}")

In [ ]:
# Train random forest for feature importance insights
print("Training Random Forest...")

rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predictions
rf_pred = rf_model.predict(X_test)
rf_pred_proba = rf_model.predict_proba(X_test)[:, 1]

# Metrics
rf_acc = accuracy_score(y_test, rf_pred)
rf_auc = roc_auc_score(y_test, rf_pred_proba)

print(f"✓ Random Forest trained:")
print(f"  Accuracy: {rf_acc:.4f}")
print(f"  ROC-AUC: {rf_auc:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, rf_pred))

In [None]:
# TODO: Train random forest
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
# rf_model.fit(X_train, y_train)
# rf_pred = rf_model.predict(X_test)
# rf_acc = accuracy_score(y_test, rf_pred)
# print(f"Random Forest Accuracy: {rf_acc:.4f}")

In [ ]:
# Train XGBoost with auto GPU detection
print("Training XGBoost (with GPU support if available)...")

# Get optimal XGBoost parameters for this machine
xgb_params = get_xgboost_params()
print(f"  Using configuration: {xgb_params}")

xgb_model = xgb.XGBClassifier(
    **xgb_params,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

xgb_model.fit(X_train, y_train)

# Predictions
xgb_pred = xgb_model.predict(X_test)
xgb_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# Metrics
xgb_acc = accuracy_score(y_test, xgb_pred)
xgb_auc = roc_auc_score(y_test, xgb_pred_proba)

print(f"\n✓ XGBoost trained:")
print(f"  Accuracy: {xgb_acc:.4f}")
print(f"  ROC-AUC: {xgb_auc:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, xgb_pred))

In [None]:
# TODO: Train XGBoost
# xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
# xgb_model.fit(X_train, y_train)
# xgb_pred = xgb_model.predict(X_test)
# xgb_acc = accuracy_score(y_test, xgb_pred)
# print(f"XGBoost Accuracy: {xgb_acc:.4f}")

In [ ]:
# Extract feature importances from Random Forest and XGBoost
print("Analyzing feature importance...")

# Random Forest importances
rf_importances = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

# XGBoost importances
xgb_importances = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# Random Forest top 15
ax = axes[0]
top_rf = rf_importances.head(15)
ax.barh(range(len(top_rf)), top_rf['importance'], color='forestgreen', edgecolor='black', alpha=0.8)
ax.set_yticks(range(len(top_rf)))
ax.set_yticklabels(top_rf['feature'], fontsize=10)
ax.set_xlabel('Feature Importance', fontsize=12)
ax.set_title('Top 15 Features: Random Forest', fontsize=14, fontweight='bold')
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)

# XGBoost top 15
ax = axes[1]
top_xgb = xgb_importances.head(15)
ax.barh(range(len(top_xgb)), top_xgb['importance'], color='steelblue', edgecolor='black', alpha=0.8)
ax.set_yticks(range(len(top_xgb)))
ax.set_yticklabels(top_xgb['feature'], fontsize=10)
ax.set_xlabel('Feature Importance', fontsize=12)
ax.set_title('Top 15 Features: XGBoost', fontsize=14, fontweight='bold')
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(PROJECT_ROOT, 'presentation/figures/fig_feature_importance.png'),
            dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Top 10 Most Important Features (XGBoost):")
for idx, row in xgb_importances.head(10).iterrows():
    print(f"  {row['feature']}: {row['importance']:.4f}")

# Save importances
rf_importances.to_parquet(os.path.join(PROJECT_ROOT, 'artifacts/rf_feature_importance.parquet'), index=False)
xgb_importances.to_parquet(os.path.join(PROJECT_ROOT, 'artifacts/xgb_feature_importance.parquet'), index=False)

In [None]:
# TODO: Extract feature importances from best model
# Plot top 15 most important features
# These tell the story of what matters most for winning!

In [ ]:
# Create summary table and comparison visualization
print("Creating model comparison...")

results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'Accuracy': [lr_acc, rf_acc, xgb_acc],
    'ROC-AUC': [lr_auc, rf_auc, xgb_auc]
})

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Accuracy comparison
ax = axes[0]
colors = ['coral', 'forestgreen', 'steelblue']
ax.bar(results['Model'], results['Accuracy'], color=colors, edgecolor='black', alpha=0.8)
ax.set_ylabel('Accuracy', fontsize=14)
ax.set_title('Model Accuracy Comparison', fontsize=16, fontweight='bold')
ax.set_ylim([0.5, 1.0])
ax.grid(axis='y', alpha=0.3)
for i, (model, acc) in enumerate(zip(results['Model'], results['Accuracy'])):
    ax.text(i, acc + 0.01, f'{acc:.4f}', ha='center', fontsize=12, fontweight='bold')

# ROC-AUC comparison
ax = axes[1]
ax.bar(results['Model'], results['ROC-AUC'], color=colors, edgecolor='black', alpha=0.8)
ax.set_ylabel('ROC-AUC', fontsize=14)
ax.set_title('Model ROC-AUC Comparison', fontsize=16, fontweight='bold')
ax.set_ylim([0.5, 1.0])
ax.grid(axis='y', alpha=0.3)
for i, (model, auc) in enumerate(zip(results['Model'], results['ROC-AUC'])):
    ax.text(i, auc + 0.01, f'{auc:.4f}', ha='center', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig(os.path.join(PROJECT_ROOT, 'presentation/figures/fig4_model_comparison.png'),
            dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Model Comparison Summary:")
print(results.to_string(index=False))

# Identify best model
best_model_idx = results['ROC-AUC'].idxmax()
best_model = results.loc[best_model_idx, 'Model']
print(f"\n✓ Best performing model: {best_model}")
print(f"  Accuracy: {results.loc[best_model_idx, 'Accuracy']:.4f}")
print(f"  ROC-AUC: {results.loc[best_model_idx, 'ROC-AUC']:.4f}")

In [None]:
# Model calibration analysis
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss

print("Analyzing model calibration...")

# Calculate calibration curves
lr_prob_true, lr_prob_pred = calibration_curve(y_test, lr_pred_proba, n_bins=10)
rf_prob_true, rf_prob_pred = calibration_curve(y_test, rf_pred_proba, n_bins=10)
xgb_prob_true, xgb_prob_pred = calibration_curve(y_test, xgb_pred_proba, n_bins=10)

# Calculate Brier scores (lower is better)
lr_brier = brier_score_loss(y_test, lr_pred_proba)
rf_brier = brier_score_loss(y_test, rf_pred_proba)
xgb_brier = brier_score_loss(y_test, xgb_pred_proba)

# Visualization
fig, ax = plt.subplots(figsize=(10, 10))

# Perfect calibration line
ax.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Perfect Calibration')

# Model calibration curves
ax.plot(lr_prob_pred, lr_prob_true, marker='o', linewidth=2, label=f'Logistic Regression (Brier: {lr_brier:.4f})')
ax.plot(rf_prob_pred, rf_prob_true, marker='s', linewidth=2, label=f'Random Forest (Brier: {rf_brier:.4f})')
ax.plot(xgb_prob_pred, xgb_prob_true, marker='^', linewidth=2, label=f'XGBoost (Brier: {xgb_brier:.4f})')

ax.set_xlabel('Mean Predicted Probability', fontsize=14)
ax.set_ylabel('Fraction of Positives', fontsize=14)
ax.set_title('Model Calibration Curves', fontsize=16, fontweight='bold', pad=15)
ax.legend(fontsize=12)
ax.grid(alpha=0.3)
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])

plt.tight_layout()
plt.savefig(os.path.join(PROJECT_ROOT, 'presentation/figures/fig_model_calibration.png'),
            dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Calibration Analysis:")
print(f"  Logistic Regression Brier Score: {lr_brier:.4f}")
print(f"  Random Forest Brier Score: {rf_brier:.4f}")
print(f"  XGBoost Brier Score: {xgb_brier:.4f}")
print(f"\n  Best calibrated model: {['Logistic Regression', 'Random Forest', 'XGBoost'][np.argmin([lr_brier, rf_brier, xgb_brier])]}")

## 10. Save Models & Artifacts

In [None]:
# Save trained models and metrics
import joblib

print("Saving models and artifacts...")

# Save models
models_dir = os.path.join(PROJECT_ROOT, 'artifacts/models')
os.makedirs(models_dir, exist_ok=True)

joblib.dump(lr_model, os.path.join(models_dir, 'logistic_regression.pkl'))
joblib.dump(rf_model, os.path.join(models_dir, 'random_forest.pkl'))
joblib.dump(xgb_model, os.path.join(models_dir, 'xgboost.pkl'))

print(f"✓ Models saved to {models_dir}/")

# Save metrics summary
metrics_summary = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'Accuracy': [lr_acc, rf_acc, xgb_acc],
    'ROC-AUC': [lr_auc, rf_auc, xgb_auc],
    'Brier Score': [lr_brier, rf_brier, xgb_brier]
})

metrics_summary.to_csv(os.path.join(PROJECT_ROOT, 'artifacts/model_metrics_summary.csv'), index=False)
print("✓ Metrics summary saved to artifacts/model_metrics_summary.csv")

## 8. Model Evaluation Summary

In [None]:
# TODO: Create summary table
# results = pd.DataFrame({
#     'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
#     'Accuracy': [lr_acc, rf_acc, xgb_acc],
#     'ROC-AUC': [...]
# })

print("TODO: Summarize model performance")

## Insights for Presentation

**Key Points**:
1. Achieved X% accuracy (compare to 56.94% benchmark)
2. Top 3 most important features are: [list]
3. This means: [actionable insight from feature importance]