In [None]:
# %% [markdown]
# # Model Training & Evaluation - Customer Churn Prediction
# 
# **Objective:** Train and compare ML models for production churn prediction
# 
# **Models:**
# 1. Logistic Regression (Baseline, interpretable)
# 2. XGBoost (Production model)
# 
# **Focus:** Business metrics, not just accuracy!

# %% [markdown]
# ## 1. Setup

# %%
# Setup cell: third-party imports, project path bootstrap, local imports.
# (The explicit `# %%` marker above keeps this code out of the preceding
# markdown cell.)

# Standard library
import os
import sys
import warnings

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Put the parent of the current working directory on sys.path so the `src`
# package can be imported.
# NOTE(review): assumes the kernel's working directory is a direct child of
# the project root (e.g. notebooks/) - confirm for other launch setups.
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Project-local imports: these must stay below the sys.path tweak above.
from src.train import ChurnModelTrainer
from src.evaluate import ChurnModelEvaluator

# Silence library warnings to keep cell output readable (previously this was
# configured twice). NOTE(review): a blanket 'ignore' also hides deprecation
# warnings; consider narrowing it by category.
warnings.filterwarnings('ignore')

# %% [markdown]
# ## 2. Load Engineered Features

# %%
# Read the engineered feature table and print a short size / target summary.
features_csv = '../data/processed/churn_features.csv'
df = pd.read_csv(features_csv)

print(f"Dataset Shape: {df.shape}")
churn_pct = df['Churn'].mean() * 100  # mean of the target column, as a percentage
print(f"Churn Rate: {churn_pct:.2f}%")
n_columns = len(df.columns)
print(f"\nFeatures available: {n_columns}")

# %% [markdown]
# ## 3. Quick Data Check

# %%
# Sanity checks on the loaded frame: total missing cells and target balance.

# Check for missing values
print("\n=== MISSING VALUES CHECK ===")
missing = df.isnull().sum().sum()  # total NaN cells across every column
if missing == 0:
    print("✅ No missing values!")
else:
    print(f"⚠️ {missing} missing values found")

# Check churn distribution (computed once and reused below)
churn_counts = df['Churn'].value_counts()
print("\n=== CHURN DISTRIBUTION ===")
print(churn_counts)

# Look the two classes up by label with a 0 default so an absent class does
# not raise, and guard the ratio against division by zero.
n_retained = churn_counts.get(0, 0)
n_churned = churn_counts.get(1, 0)
if n_churned:
    print(f"\nClass imbalance ratio: {n_retained / n_churned:.2f}:1")
else:
    print("\nClass imbalance ratio: undefined (no churned customers in the data)")

# %% [markdown]
# ## 4. Train Models

# %%
# Build the trainer on the engineered feature file, then run its pipeline.
trainer = ChurnModelTrainer(features_path='../data/processed/churn_features.csv')

# Banner so the training log is easy to spot in the cell output
rule = "=" * 70
print("\n" + rule)
print("STARTING MODEL TRAINING")
print(rule)

# Run training pipeline
trainer.train_pipeline()

# %% [markdown]
# ## 5. Model Performance Comparison

# %%
# Print the headline test metrics for every trained model.
rule = "=" * 70
print("\n" + rule)
print("MODEL PERFORMANCE SUMMARY")
print(rule)

# Iterate over whatever the trainer produced instead of a hard-coded list of
# names: this matches the other cells in this notebook and avoids a KeyError
# if a model is renamed, added or removed in the trainer.
for model_name, results in trainer.results.items():
    print(f"\n{model_name.upper().replace('_', ' ')}:")
    print(f"  ROC-AUC:   {results['test_auc']:.4f}")
    print(f"  Recall:    {results['recall']:.4f}")
    print(f"  Precision: {results['precision']:.4f}")
    print(f"  F1-Score:  {results['f1']:.4f}")

# %% [markdown]
# ## 6. Confusion Matrices Visualization

# %%
# Draw one confusion-matrix heatmap per trained model, side by side.

# Size the grid from the number of models rather than assuming exactly two.
# With two models this yields the same 14x5 figure as before.
n_panels = max(len(trainer.results), 1)
fig, axes = plt.subplots(1, n_panels, figsize=(7 * n_panels, 5), squeeze=False)
class_labels = ['No Churn', 'Churn']

for ax, (model_name, results) in zip(axes.ravel(), trainer.results.items()):
    cm = results['confusion_matrix']

    # fmt='d' renders the cell annotations as integers
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{model_name.replace("_", " ").title()} - Confusion Matrix',
                 fontsize=12, fontweight='bold')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_xticklabels(class_labels)
    ax.set_yticklabels(class_labels)

plt.tight_layout()

# Create the output folder first so a fresh checkout does not fail on save.
fig_path = '../reports/figures/confusion_matrices.png'
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 7. ROC Curves

# %%
# Overlay the test-set ROC curve of every trained model on a single axis.
from sklearn.metrics import roc_curve, auc

fig, ax = plt.subplots(figsize=(10, 8))

for model_name, results in trainer.results.items():
    y_pred_proba = results['y_pred_proba']
    fpr, tpr, _ = roc_curve(trainer.y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)  # area under this model's curve, shown in the legend

    ax.plot(fpr, tpr, lw=2, label=f'{model_name.replace("_", " ").title()} (AUC = {roc_auc:.3f})')

# Plot diagonal line (reference for a random classifier)
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curves - Model Comparison', fontsize=14, fontweight='bold')
ax.legend(loc="lower right", fontsize=10)
ax.grid(alpha=0.3)

plt.tight_layout()

# Create the output folder first so a fresh checkout does not fail on save.
fig_path = '../reports/figures/roc_curves.png'
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 8. Feature Importance (XGBoost)

# %%
# Rank the XGBoost feature importances, plot the top 15 and list the top 10.

# Get XGBoost model
xgb_model = trainer.models['xgboost']
feature_importance = pd.DataFrame({
    'Feature': trainer.feature_cols,
    'Importance': xgb_model.feature_importances_
}).sort_values('Importance', ascending=False)

# Plot top 15 on an explicit figure/axes pair
top_15 = feature_importance.head(15)
fig, ax = plt.subplots(figsize=(10, 8))
# NOTE(review): recent seaborn releases deprecate `palette` without `hue`;
# revisit this call when the pinned seaborn version is upgraded.
sns.barplot(data=top_15, y='Feature', x='Importance', palette='viridis', ax=ax)
ax.set_title('Top 15 Feature Importance (XGBoost)', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance Score')
plt.tight_layout()

# Create the output folder first so a fresh checkout does not fail on save.
fig_path = '../reports/figures/feature_importance.png'
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
plt.show()

print("\n=== TOP 10 MOST IMPORTANT FEATURES ===\n")
print(feature_importance.head(10).to_string(index=False))

# %% [markdown]
# ## 9. Business Cost Analysis (CRITICAL FOR INTERVIEWS!)

# %%
# Translate each model's confusion matrix into a monetary cost and compare it
# with the cost of taking no action at all.
rule = "=" * 70
print("\n" + rule)
print("💰 BUSINESS COST ANALYSIS")
print(rule)

# Define costs
FN_COST = 5000  # Cost of losing a customer
FP_COST = 500   # Cost of retention offer

print(f"\n💵 Cost Assumptions:")
print(f"   False Negative (missed churner): ₹{FN_COST:,}")
print(f"   False Positive (wrong alert): ₹{FP_COST:,}")
print(f"   Rationale: Acquiring new customer costs 10x retention")

# Calculate costs for each model
print(f"\n📊 Total Business Cost per Model:\n")

n_test = len(trainer.y_test)
cost_comparison = []

for model_name, results in trainer.results.items():
    # Flatten the 2x2 matrix in (tn, fp, fn, tp) order and convert to plain
    # Python ints so downstream formatting/serialisation never sees numpy
    # scalars. np.asarray also accepts a nested-list matrix.
    tn, fp, fn, tp = (int(v) for v in np.asarray(results['confusion_matrix']).ravel())

    fn_total = fn * FN_COST
    fp_total = fp * FP_COST
    total_cost = fn_total + fp_total
    cost_per_customer = total_cost / n_test

    cost_comparison.append({
        'Model': model_name.replace('_', ' ').title(),
        'FN': fn,
        'FP': fp,
        'Total Cost': total_cost,
        'Cost/Customer': cost_per_customer
    })

    print(f"{model_name.upper()}:")
    print(f"   False Negatives: {fn} × ₹{FN_COST:,} = ₹{fn_total:,}")
    print(f"   False Positives: {fp} × ₹{FP_COST:,} = ₹{fp_total:,}")
    print(f"   Total Cost: ₹{total_cost:,}")
    print(f"   Cost per Customer: ₹{cost_per_customer:.2f}\n")

# Calculate do-nothing baseline: every actual churner in the test set is
# charged at the false-negative cost.
total_churners = int(trainer.y_test.sum())
do_nothing_cost = total_churners * FN_COST

print(f"DO NOTHING BASELINE:")
print(f"   All churners lost: {total_churners} × ₹{FN_COST:,} = ₹{do_nothing_cost:,}")
print(f"   Cost per Customer: ₹{do_nothing_cost/n_test:.2f}")

# Calculate savings
print(f"\n💡 COST SAVINGS VS DO-NOTHING:\n")
for item in cost_comparison:
    savings = do_nothing_cost - item['Total Cost']
    if do_nothing_cost:
        savings_pct = (savings / do_nothing_cost) * 100
        print(f"   {item['Model']}: ₹{savings:,} ({savings_pct:.1f}% reduction)")
    else:
        # No churners in the test set: a percentage reduction is undefined.
        print(f"   {item['Model']}: ₹{savings:,} (n/a - no churners in test set)")

# %% [markdown]
# ## 10. Cost Visualization

# %%
# Two-panel figure: total cost per model against the do-nothing baseline, and
# the same cost split into false-negative and false-positive components.
cost_df = pd.DataFrame(cost_comparison)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
total_ax, split_ax = axes

# Total cost
total_ax.bar(cost_df['Model'], cost_df['Total Cost'], color=['#3498db', '#2ecc71'])
total_ax.axhline(y=do_nothing_cost, color='red', linestyle='--',
                 label=f'Do Nothing: ₹{do_nothing_cost:,}')
total_ax.set_title('Total Business Cost by Model', fontsize=12, fontweight='bold')
total_ax.set_ylabel('Total Cost (₹)')
total_ax.legend()
total_ax.ticklabel_format(style='plain', axis='y')  # plain numbers, no sci notation

# FN vs FP breakdown
x = np.arange(len(cost_df))
width = 0.35

fn_costs = cost_df['FN'] * FN_COST
fp_costs = cost_df['FP'] * FP_COST

split_ax.bar(x, fn_costs, width, label='False Negative Cost', color='#e74c3c')
# bottom=fn_costs stacks the FP segment on top of the FN segment
split_ax.bar(x, fp_costs, width, bottom=fn_costs, label='False Positive Cost', color='#f39c12')
split_ax.set_title('Cost Breakdown: FN vs FP', fontsize=12, fontweight='bold')
split_ax.set_ylabel('Cost (₹)')
split_ax.set_xticks(x)
split_ax.set_xticklabels(cost_df['Model'])
split_ax.legend()
split_ax.ticklabel_format(style='plain', axis='y')

plt.tight_layout()

# Create the output folder first so a fresh checkout does not fail on save.
fig_path = '../reports/figures/business_cost_analysis.png'
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, dpi=300, bbox_inches='tight')
plt.show()

# %% [markdown]
# ## 11. Prediction Examples

# %%
# Show a few test-set customers from each predicted risk band.
rule = "=" * 70
print("\n" + rule)
print("🎯 SAMPLE PREDICTIONS")
print(rule)

# Use XGBoost model; column 1 of predict_proba is the positive-class score
xgb_model = trainer.models['xgboost']
y_pred_proba = xgb_model.predict_proba(trainer.X_test)[:, 1]

# Create results dataframe.
# include_lowest=True closes the first bin on the left, so a probability of
# exactly 0.0 is labelled 'Low' instead of falling outside every bin (NaN).
results_df = pd.DataFrame({
    'Actual_Churn': trainer.y_test.values,
    'Churn_Probability': y_pred_proba,
    'Risk_Level': pd.cut(y_pred_proba, bins=[0, 0.3, 0.7, 1.0],
                         labels=['Low', 'Medium', 'High'],
                         include_lowest=True)
})

# Show examples from each risk level
print("\nSample customers by risk level:\n")

for risk in ['High', 'Medium', 'Low']:
    print(f"\n{risk.upper()} RISK CUSTOMERS:")
    print("-" * 50)

    # First three rows of the band, in test-set order
    samples = results_df[results_df['Risk_Level'] == risk].head(3)
    for actual_churn, probability in zip(samples['Actual_Churn'], samples['Churn_Probability']):
        actual = "CHURNED ❌" if actual_churn == 1 else "RETAINED ✅"
        print(f"  Probability: {probability:.2%} | Actual: {actual}")

# %% [markdown]
# ## 12. Model Selection Recommendation

# %%
# Pick the model with the lowest total business cost and print a
# recommendation built only from quantities computed in this notebook.
rule = "=" * 70
print("\n" + rule)
print("🏆 MODEL SELECTION RECOMMENDATION")
print(rule)

# Determine best model based on business cost
best_model = min(cost_comparison, key=lambda x: x['Total Cost'])

# Map the display name back to the key used in trainer.results
# (inverse of the `.replace('_', ' ').title()` applied when building
# cost_comparison).
best_key = best_model['Model'].lower().replace(' ', '_')
best_results = trainer.results[best_key]
best_auc = best_results['test_auc']

# Only claim "highest" ROC-AUC when it is actually true: the model is chosen
# on cost, which need not coincide with the best AUC.
top_auc = max(r['test_auc'] for r in trainer.results.values())
auc_label = "Highest ROC-AUC score" if best_auc >= top_auc else "ROC-AUC score"

# Confusion-matrix cells for the chosen model, flattened as (tn, fp, fn, tp).
tn, fp, fn, tp = (int(v) for v in np.asarray(best_results['confusion_matrix']).ravel())
actual_churners = int(trainer.y_test.sum())

# Only correctly flagged churners (TP) can be reached by an intervention;
# missed churners (FN) and false alarms (FP) cannot yield a prevented churn.
# This matches the cost model above, which charges nothing for a TP.
preventable_churns = tp

savings = do_nothing_cost - best_model['Total Cost']
# Savings relative to the residual cost; undefined when that cost is zero.
if best_model['Total Cost']:
    roi_text = f"{savings / best_model['Total Cost'] * 100:.0f}%"
else:
    roi_text = "n/a (residual cost is zero)"

print(f"""
RECOMMENDED MODEL: {best_model['Model'].upper()}

RATIONALE:
1. Lowest Total Business Cost: ₹{best_model['Total Cost']:,}
2. Best balance of False Negatives and False Positives
3. {auc_label}: {best_auc:.4f}
4. Production-ready with strong generalization

DEPLOYMENT CONSIDERATIONS:
- Set probability threshold to optimize cost (default: 0.5)
- Monitor model performance weekly
- Retrain quarterly with new data
- A/B test retention campaigns

EXPECTED IMPACT:
- Identify {tp} of {actual_churners} at-risk customers per cycle
- Prevent up to {preventable_churns} churns with interventions
- Save ₹{savings:,} vs do-nothing
- ROI: {roi_text}

NEXT STEPS:
1. Save model to production: ✅ (already done in models/)
2. Create API endpoint for real-time scoring
3. Build monitoring dashboard
4. Integrate with CRM for automated alerts
""")
print(rule)

# %% [markdown]
# ## 13. Save Final Report

# %%
# Write a plain-text summary of metrics, costs and the recommended model.
report_path = '../reports/modeling_final_report.txt'

# Make sure the target folder exists before opening the file.
os.makedirs(os.path.dirname(report_path), exist_ok=True)

heavy_rule = "=" * 70
light_rule = "-" * 70

# encoding='utf-8' is required: the report contains the rupee sign, which
# the platform default encoding (e.g. cp1252 on Windows) cannot encode.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(heavy_rule + "\n")
    f.write("CHURN PREDICTION - FINAL MODELING REPORT\n")
    f.write(heavy_rule + "\n\n")

    f.write("MODELS TRAINED:\n")
    f.write(light_rule + "\n")
    f.write("1. Logistic Regression (Baseline)\n")
    f.write("2. XGBoost (Production Model)\n\n")

    f.write("PERFORMANCE METRICS:\n")
    f.write(light_rule + "\n")
    for model_name, results in trainer.results.items():
        f.write(f"\n{model_name.upper()}:\n")
        f.write(f"  ROC-AUC: {results['test_auc']:.4f}\n")
        f.write(f"  Recall: {results['recall']:.4f}\n")
        f.write(f"  Precision: {results['precision']:.4f}\n")
        f.write(f"  F1-Score: {results['f1']:.4f}\n")

    f.write("\n" + heavy_rule + "\n")
    f.write("BUSINESS COST ANALYSIS:\n")
    f.write(light_rule + "\n")
    for item in cost_comparison:
        f.write(f"\n{item['Model']}:\n")
        f.write(f"  Total Cost: ₹{item['Total Cost']:,}\n")
        f.write(f"  Cost per Customer: ₹{item['Cost/Customer']:.2f}\n")

    f.write(f"\n\nRECOMMENDED MODEL: {best_model['Model'].upper()}\n")
    f.write(f"Savings vs Do-Nothing: ₹{do_nothing_cost - best_model['Total Cost']:,}\n")

print(f"\n✅ Final report saved to: {report_path}")
print("\n🎉 MODELING COMPLETE! Models are ready for production deployment.")