# 🤖 Model Training & Evaluation

**Author:** Your Name  
**Date:** 2024  
**Objective:** Train, evaluate, and select the best model for churn prediction

---

## Table of Contents
1. [Setup](#1.-Setup)
2. [Data Preparation](#2.-Data-Preparation)
3. [Baseline Models](#3.-Baseline-Models)
4. [Advanced Models](#4.-Advanced-Models)
5. [Model Comparison](#5.-Model-Comparison)
6. [Hyperparameter Tuning](#6.-Hyperparameter-Tuning)
7. [Final Evaluation](#7.-Final-Evaluation)
8. [Model Interpretability](#8.-Model-Interpretability)
9. [Save Best Model](#9.-Save-Best-Model)

## 1. Setup

In [None]:
# Standard-library imports
import sys
from pathlib import Path

# Put the project root at the front of sys.path before the `src.*` imports
# further down in this cell are executed.
# NOTE(review): assumes the notebook's working directory sits exactly one
# level below the project root — confirm.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# scikit-learn: splitting/CV helpers, metrics, and the estimators compared below
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix, classification_report, average_precision_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# Optional gradient-boosting libraries: each flag records whether the import
# succeeded so later cells can skip the corresponding model.
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False

try:
    from lightgbm import LGBMClassifier
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False

# Project modules (must come after the sys.path insertion above)
from src.data.data_loader import DataLoader
from src.data.preprocessing import DataPreprocessor

# Settings
import warnings
# Silences every warning category for the rest of the session, including
# deprecation and convergence warnings.
warnings.filterwarnings('ignore')

# Single seed shared by the estimators and CV splitters in this notebook;
# also seeds NumPy's global random generator.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("âœ… Libraries loaded!")
print(f"   XGBoost available: {XGBOOST_AVAILABLE}")
print(f"   LightGBM available: {LIGHTGBM_AVAILABLE}")

## 2. Data Preparation

In [None]:
# Read the raw dataset through the project's DataLoader, configured from
# <project_root>/config/config.yaml.
config_file = project_root / 'config' / 'config.yaml'
loader = DataLoader(config_path=str(config_file))
df = loader.load_data()

print(f"Dataset shape: {df.shape}")

In [None]:
# Run the project's preprocessing pipeline; it hands back the train/test
# feature matrices and targets in one call.
preprocessor = DataPreprocessor(config_path=str(project_root / 'config' / 'config.yaml'))
X_train, X_test, y_train, y_test = preprocessor.fit_transform(df)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Report how many training rows fall in each target class (count and share).
print(f"\nTarget distribution (train):")
for class_label, class_desc in ((0, 'No Churn'), (1, 'Churn')):
    in_class = (y_train == class_label)
    print(f"  Class {class_label} ({class_desc}): {in_class.sum()} ({in_class.mean()*100:.1f}%)")

In [None]:
# Ask the fitted preprocessor for the names of the output feature columns
# and show a short preview.
feature_names = preprocessor.get_feature_names()
n_features_total = len(feature_names)
preview = feature_names[:10]
print(f"Number of features: {n_features_total}")
print(f"\nFirst 10 features: {preview}")

## 3. Baseline Models

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model``, score it on the test split, and cross-validate it.

    Args:
        model: Estimator exposing ``fit`` and ``predict`` (``predict_proba``
            is used when present).
        X_train, y_train: Training features and labels; also used for the
            5-fold stratified ROC-AUC cross-validation.
        X_test, y_test: Hold-out features and labels used for the metrics.
        model_name: Label stored under the ``'model'`` key of the metrics.

    Returns:
        Tuple ``(model, metrics, y_pred, y_proba)``. ``metrics`` is a dict
        with keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc``, ``cv_mean`` and ``cv_std``. ``y_proba`` and
        ``metrics['roc_auc']`` are ``None`` when the model has no
        ``predict_proba``.
    """
    model.fit(X_train, y_train)

    # Hard predictions always; positive-class scores only if supported.
    y_pred = model.predict(X_test)
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = None

    # ROC-AUC over shuffled, stratified folds of the training split.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    cv_scores = cross_val_score(model, X_train, y_train, cv=folds, scoring='roc_auc')

    # Assemble the metrics in a fixed key order (it becomes the column order
    # of the results table).
    metrics = {'model': model_name}
    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for key, scorer in label_scorers:
        metrics[key] = scorer(y_test, y_pred)
    metrics['roc_auc'] = None if y_proba is None else roc_auc_score(y_test, y_proba)
    metrics['cv_mean'] = cv_scores.mean()
    metrics['cv_std'] = cv_scores.std()

    return model, metrics, y_pred, y_proba

In [None]:
# Define baseline models: simple, fast estimators that set the bar the
# advanced models have to beat.
baseline_models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000, random_state=RANDOM_STATE, class_weight='balanced'
    ),
    'Decision Tree': DecisionTreeClassifier(
        max_depth=5, random_state=RANDOM_STATE, class_weight='balanced'
    ),
    'Naive Bayes': GaussianNB()
}

# Shared accumulators, reset here so re-running this cell starts from a
# clean slate: per-model metric dicts, fitted estimators, and test-set
# predictions keyed by model name.
results = []
trained_models = {}
predictions = {}

# Train and evaluate
for name, model in baseline_models.items():
    print(f"\nTraining {name}...")
    trained_model, metrics, y_pred, y_proba = evaluate_model(
        model, X_train, X_test, y_train, y_test, name
    )
    results.append(metrics)
    trained_models[name] = trained_model
    predictions[name] = {'y_pred': y_pred, 'y_proba': y_proba}

    # evaluate_model reports roc_auc=None for estimators without
    # predict_proba; formatting None with ':.4f' would raise TypeError.
    roc_auc_text = 'n/a' if metrics['roc_auc'] is None else f"{metrics['roc_auc']:.4f}"
    print(f"  ROC-AUC: {roc_auc_text}")
    print(f"  CV Score: {metrics['cv_mean']:.4f} (+/- {metrics['cv_std']*2:.4f})")

## 4. Advanced Models

In [None]:
# Advanced models: tree ensembles, plus XGBoost / LightGBM when installed.
advanced_models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=200, max_depth=10, random_state=RANDOM_STATE,
        class_weight='balanced', n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=200, max_depth=5, learning_rate=0.1,
        random_state=RANDOM_STATE
    )
}

if XGBOOST_AVAILABLE:
    # NOTE(review): scale_pos_weight=3 is hard-coded while the other models
    # use class_weight='balanced' — confirm it matches the real class ratio.
    advanced_models['XGBoost'] = XGBClassifier(
        n_estimators=200, max_depth=5, learning_rate=0.1,
        scale_pos_weight=3, random_state=RANDOM_STATE,
        use_label_encoder=False, eval_metric='logloss'
    )

if LIGHTGBM_AVAILABLE:
    advanced_models['LightGBM'] = LGBMClassifier(
        n_estimators=200, max_depth=5, learning_rate=0.1,
        class_weight='balanced', random_state=RANDOM_STATE, verbose=-1
    )

# Train and evaluate
for name, model in advanced_models.items():
    print(f"\nTraining {name}...")
    trained_model, metrics, y_pred, y_proba = evaluate_model(
        model, X_train, X_test, y_train, y_test, name
    )

    # Replace any row left by a previous run of this cell (in place, so the
    # shared list object is preserved). Duplicate model names would give the
    # results table a non-unique index and break scalar .loc lookups later.
    results[:] = [row for row in results if row['model'] != name]
    results.append(metrics)
    trained_models[name] = trained_model
    predictions[name] = {'y_pred': y_pred, 'y_proba': y_proba}

    # evaluate_model reports roc_auc=None for estimators without
    # predict_proba; formatting None with ':.4f' would raise TypeError.
    roc_auc_text = 'n/a' if metrics['roc_auc'] is None else f"{metrics['roc_auc']:.4f}"
    print(f"  ROC-AUC: {roc_auc_text}")
    print(f"  CV Score: {metrics['cv_mean']:.4f} (+/- {metrics['cv_std']*2:.4f})")

## 5. Model Comparison

In [None]:
# Collect the per-model metric dicts into one table: indexed by model name,
# rounded to 4 decimals, best test ROC-AUC first.
results_df = (
    pd.DataFrame(results)
    .set_index('model')
    .round(4)
    .sort_values('roc_auc', ascending=False)
)

print("\n" + "="*80, "MODEL COMPARISON RESULTS", "="*80, sep="\n")
# Last expression of the cell: the notebook renders the table.
results_df

In [None]:
# Visualization: grouped bar chart with one bar series per metric and one
# group per model.
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

fig = go.Figure()

for metric in metrics_to_plot:
    scores = results_df[metric]
    # Value labels drawn above each bar, fixed to 3 decimals.
    bar_labels = [f'{score:.3f}' for score in scores]
    bar = go.Bar(
        name=metric.upper(),
        x=results_df.index,
        y=scores,
        text=bar_labels,
        textposition='outside',
    )
    fig.add_trace(bar)

layout_options = dict(
    title='Model Performance Comparison',
    barmode='group',
    xaxis_title='Model',
    yaxis_title='Score',
    height=500,
    legend=dict(orientation='h', y=1.15),
)
fig.update_layout(**layout_options)
fig.show()

In [None]:
# ROC curves: one line per model that produced probability scores, plus the
# chance diagonal for reference.
fig = go.Figure()

# Palette is cycled by model position; the precision-recall cell reuses it.
colors = px.colors.qualitative.Set1

for idx, (name, preds) in enumerate(predictions.items()):
    scores = preds['y_proba']
    if scores is None:
        # No probability output for this model, so no curve to draw.
        continue

    fpr, tpr, _ = roc_curve(y_test, scores)
    auc = roc_auc_score(y_test, scores)
    curve_style = dict(color=colors[idx % len(colors)], width=2)

    fig.add_trace(go.Scatter(
        x=fpr, y=tpr,
        name=f'{name} (AUC={auc:.3f})',
        mode='lines',
        line=curve_style
    ))

# Diagonal line = performance of a random classifier
chance_line = go.Scatter(
    x=[0, 1], y=[0, 1],
    mode='lines',
    line=dict(color='gray', dash='dash'),
    name='Random Classifier'
)
fig.add_trace(chance_line)

fig.update_layout(
    title='ROC Curves Comparison',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    height=500,
    legend=dict(x=0.6, y=0.1)
)
fig.show()

In [None]:
# Precision-recall curves: one line per model that produced probability
# scores, labelled with its average precision.
fig = go.Figure()

for idx, (name, preds) in enumerate(predictions.items()):
    scores = preds['y_proba']
    if scores is None:
        # No probability output for this model, so no curve to draw.
        continue

    precision, recall, _ = precision_recall_curve(y_test, scores)
    ap = average_precision_score(y_test, scores)
    # `colors` is the palette defined by the ROC-curve cell.
    curve_style = dict(color=colors[idx % len(colors)], width=2)

    fig.add_trace(go.Scatter(
        x=recall, y=precision,
        name=f'{name} (AP={ap:.3f})',
        mode='lines',
        line=curve_style
    ))

fig.update_layout(
    title='Precision-Recall Curves Comparison',
    xaxis_title='Recall',
    yaxis_title='Precision',
    height=500
)
fig.show()

## 6. Hyperparameter Tuning

In [None]:
# Pick the model with the highest test ROC-AUC in the comparison table.
best_model_name = results_df['roc_auc'].idxmax()
best_roc_auc = results_df.loc[best_model_name, 'roc_auc']
print(f"Best model: {best_model_name}")
print(f"ROC-AUC: {best_roc_auc:.4f}")

In [None]:
# Optional: Hyperparameter tuning of XGBoost with Optuna.
#
# The search needs both Optuna and XGBoost. When either is missing, the
# already-fitted best model is reused, so `tuned_model` is a fitted estimator
# on every path out of this cell.
try:
    import optuna
    from optuna.samplers import TPESampler
    OPTUNA_AVAILABLE = True
except ImportError:
    # Only the import is guarded: errors raised during the search itself
    # should surface instead of being reported as "Optuna not available".
    OPTUNA_AVAILABLE = False

if OPTUNA_AVAILABLE and XGBOOST_AVAILABLE:
    # Settings that are not searched over; shared by every trial and by the
    # final tuned model.
    fixed_params = {
        'random_state': RANDOM_STATE,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    def objective(trial):
        """Return the mean 5-fold CV ROC-AUC of XGBoost for one trial's parameters."""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10),
            **fixed_params
        }

        model = XGBClassifier(**params)

        # Scored on the training split only; the test split stays untouched
        # during the search.
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
        scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='roc_auc')

        return scores.mean()

    # Run optimization (limited trials for demo)
    print("Running hyperparameter optimization...")
    sampler = TPESampler(seed=RANDOM_STATE)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=20, show_progress_bar=True)

    print(f"\nBest ROC-AUC: {study.best_value:.4f}")
    print(f"Best params: {study.best_params}")

    # Train with best params (searched values plus the fixed settings)
    best_params = {**study.best_params, **fixed_params}

    tuned_model = XGBClassifier(**best_params)
    # Fit the tuned estimator so it can actually be used for prediction.
    tuned_model.fit(X_train, y_train)
    tuned_test_auc = roc_auc_score(y_test, tuned_model.predict_proba(X_test)[:, 1])
    print(f"Tuned XGBoost test ROC-AUC: {tuned_test_auc:.4f}")
else:
    missing_dependency = "Optuna" if not OPTUNA_AVAILABLE else "XGBoost"
    print(f"{missing_dependency} not available. Using default hyperparameters.")
    tuned_model = trained_models[best_model_name]

## 7. Final Evaluation

In [None]:
# Look up the fitted estimator that won the comparison and score the test
# split with it once more.
best_model = trained_models[best_model_name]

# Hard labels for the report; positive-class probabilities kept alongside.
y_pred_final = best_model.predict(X_test)
y_proba_final = best_model.predict_proba(X_test)[:, 1]

# Per-class precision / recall / F1 / support
print("="*60, f"FINAL MODEL EVALUATION: {best_model_name}", "="*60, sep="\n")
print("\nClassification Report:")
final_report = classification_report(
    y_test, y_pred_final, target_names=['No Churn', 'Churn']
)
print(final_report)

In [None]:
# Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 with row/column 0 = "No" and
# 1 = "Yes", even if one class is absent from both y_test and the
# predictions; otherwise the 4-way unpacking below would fail.
cm = confusion_matrix(y_test, y_pred_final, labels=[0, 1])

# Heatmap with the raw counts written into each cell.
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=['Predicted: No', 'Predicted: Yes'],
    y=['Actual: No', 'Actual: Yes'],
    text=cm,
    texttemplate='%{text}',
    textfont={'size': 20},
    colorscale='Blues',
    showscale=False
))

fig.update_layout(
    title=f'Confusion Matrix - {best_model_name}',
    height=400,
    width=500
)
fig.show()

# Print detailed breakdown. Row-major flattening of the 2x2 matrix yields
# (tn, fp, fn, tp): rows are actual classes, columns are predicted classes.
tn, fp, fn, tp = cm.ravel()
print(f"\nConfusion Matrix Breakdown:")
print(f"  True Negatives:  {tn} (correctly identified non-churners)")
print(f"  False Positives: {fp} (incorrectly flagged as churners)")
print(f"  False Negatives: {fn} (missed churners - COSTLY!)")
print(f"  True Positives:  {tp} (correctly identified churners)")

## 8. Model Interpretability

In [None]:
# Feature importance: prefer the model's own importances; otherwise fall
# back to the absolute coefficients of the first coefficient row; otherwise
# there is nothing to plot.
if hasattr(best_model, 'feature_importances_'):
    importance = best_model.feature_importances_
else:
    importance = np.abs(best_model.coef_[0]) if hasattr(best_model, 'coef_') else None

if importance is not None:
    # Pair each feature name with its score and keep the 15 largest.
    all_importances = pd.DataFrame({
        'feature': feature_names,
        'importance': importance
    })
    ranked_importances = all_importances.sort_values('importance', ascending=False)
    importance_df = ranked_importances.head(15)

    # Horizontal bars
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=importance_df['importance'],
        y=importance_df['feature'],
        orientation='h',
        marker_color='steelblue'
    ))

    fig.update_layout(
        title=f'Top 15 Feature Importance - {best_model_name}',
        xaxis_title='Importance',
        yaxis_title='Feature',
        height=500,
        yaxis=dict(categoryorder='total ascending')
    )
    fig.show()

In [None]:
# SHAP Analysis (if available)
try:
    import shap
except ImportError:
    # Only the import is guarded, so errors raised during the analysis are
    # not mis-reported as a missing package.
    print("SHAP not available. Install with: pip install shap")
else:
    # Create explainer: tree explainer for models exposing
    # feature_importances_, linear explainer for models exposing coef_.
    if hasattr(best_model, 'feature_importances_'):
        explainer = shap.TreeExplainer(best_model)
    elif hasattr(best_model, 'coef_'):
        explainer = shap.LinearExplainer(best_model, X_train)
    else:
        # Neither explainer applies to this model type; skip instead of
        # failing inside LinearExplainer.
        explainer = None
        print(f"SHAP analysis skipped: {best_model_name} is neither a tree nor a linear model.")

    if explainer is not None:
        print("Computing SHAP values...")

        # Sample at most 100 test rows for efficiency
        sample_idx = np.random.choice(len(X_test), size=min(100, len(X_test)), replace=False)
        # Select rows by position for both DataFrames and arrays; plain
        # `X_test[sample_idx]` on a DataFrame would select columns instead.
        X_sample = X_test.iloc[sample_idx] if hasattr(X_test, 'iloc') else X_test[sample_idx]

        shap_values = explainer.shap_values(X_sample)

        # Handle the different shap_values formats for classifiers: a list
        # with one array per class, or a single 3-D array whose last axis
        # is the class.
        if isinstance(shap_values, list):
            shap_values = shap_values[1]  # For class 1 (Churn)
        elif getattr(shap_values, 'ndim', 2) == 3:
            shap_values = shap_values[:, :, 1]  # For class 1 (Churn)

        # Summary plot
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X_sample, feature_names=feature_names, show=False)
        plt.title(f'SHAP Feature Importance - {best_model_name}')
        plt.tight_layout()
        plt.show()

## 9. Save Best Model

In [None]:
import joblib
from datetime import datetime
import yaml

# Create models directory (including missing parents) under the project root
models_dir = project_root / 'models'
models_dir.mkdir(parents=True, exist_ok=True)

# Save model
model_path = models_dir / 'best_model.pkl'
joblib.dump(best_model, model_path)
print(f"âœ… Model saved to: {model_path}")

# Save preprocessor
preprocessor.save(str(models_dir / 'preprocessor.pkl'))
print(f"âœ… Preprocessor saved")

# Save metadata. Every value is converted to a built-in type (str / float /
# int / list) so the YAML file contains no Python- or NumPy-specific object
# tags and can be read back with yaml.safe_load.
metric_keys = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'cv_mean', 'cv_std')
feature_name_list = [str(feature) for feature in feature_names]

metadata = {
    'model_name': str(best_model_name),
    'training_date': datetime.now().isoformat(),
    'metrics': {
        key: float(results_df.loc[best_model_name, key]) for key in metric_keys
    },
    'feature_names': feature_name_list,
    'n_features': len(feature_name_list),
    'n_train_samples': int(len(X_train)),
    'n_test_samples': int(len(X_test))
}

metadata_path = models_dir / 'best_model_metadata.yaml'
# Explicit encoding keeps the file identical across platforms;
# safe_dump refuses anything that is not a plain YAML type.
with open(metadata_path, 'w', encoding='utf-8') as f:
    yaml.safe_dump(metadata, f, default_flow_style=False)

print(f"âœ… Metadata saved to: {metadata_path}")

In [None]:
# Final summary: banner, winning model, its headline test metrics, and
# suggested follow-up actions.
print("\n" + "="*60)
print("ðŸŽ‰ MODEL TRAINING COMPLETE!")
print("="*60)
print(f"\nBest Model: {best_model_name}")

# (padded label, metadata key) pairs; the padding aligns the printed values.
print(f"\nPerformance Metrics:")
summary_rows = (
    ("Accuracy:  ", 'accuracy'),
    ("Precision: ", 'precision'),
    ("Recall:    ", 'recall'),
    ("F1 Score:  ", 'f1'),
    ("ROC-AUC:   ", 'roc_auc'),
)
for row_label, metric_key in summary_rows:
    print(f"  â€¢ {row_label}{metadata['metrics'][metric_key]:.4f}")

print(f"\nNext Steps:")
for step_text in (
    "  1. Run the Streamlit app: streamlit run app/streamlit_app.py",
    "  2. Test predictions with new data",
    "  3. Monitor model performance over time",
):
    print(step_text)