# Customer Churn Prediction - Model Training and Evaluation

This notebook contains the complete machine learning pipeline for predicting customer churn.

## 1. Import Libraries

In [None]:
import json
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, roc_curve
)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Every figure in this notebook is written with plt.savefig('../results/...').
# savefig does not create missing directories, so make sure the folder exists
# before any plotting cell runs (the models folder is created later, right
# before the model is saved).
os.makedirs('../results', exist_ok=True)

print('Libraries imported successfully!')

## 2. Load Preprocessed Data

In [None]:
# Load features and target.
#
# The features live in training.csv. The target column ('Exited') is taken
# from that same file whenever it is present there: this keeps every label on
# the same row as its features and prevents the label from being used as an
# input feature. Only when training.csv has no 'Exited' column is the target
# read from test.csv, as this cell did originally.
# NOTE(review): confirm which processed file is meant to carry 'Exited'.
target_column = 'Exited'

X = pd.read_csv('../data/processed/training.csv', index_col=0)
if target_column in X.columns:
    # pop() removes the column from X and returns it as the target Series
    y = X.pop(target_column)
else:
    y = pd.read_csv('../data/processed/test.csv', index_col=0)[target_column]

# Fail early with a clear message instead of a shape error deep inside
# train_test_split when the two sources do not describe the same rows.
if len(X) != len(y):
    raise ValueError(
        f"Features ({len(X)} rows) and target ({len(y)} rows) have different lengths"
    )
if not X.index.equals(y.index):
    print("WARNING: feature and target indices differ; rows may be misaligned.")

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nTarget distribution:")
print(y.value_counts())
print(f"\nChurn rate: {y.mean()*100:.2f}%")

In [None]:
# Preview the feature matrix: the leading rows first, then the column names.
preview_sections = (
    ("Features:", X.head()),
    ("\nFeature names:", X.columns.tolist()),
)
for heading, content in preview_sections:
    print(heading)
    print(content)

## 3. Split Data into Train and Validation Sets

In [None]:
# Hold out 20% of the rows for validation. The split is stratified on the
# target so both partitions keep the same churn proportion, and the seed is
# fixed so the partition is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split_parts

train_churn_pct = y_train.mean() * 100
val_churn_pct = y_val.mean() * 100

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"\nTraining set churn rate: {train_churn_pct:.2f}%")
print(f"Validation set churn rate: {val_churn_pct:.2f}%")

## 4. Feature Scaling

In [None]:
# Standardise the features for scale-sensitive models. The scaling statistics
# are learned from the training split only and then applied to both splits,
# so nothing about the validation rows influences the fitted scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled = (scaler.transform(part) for part in (X_train, X_val))

print("Features scaled successfully!")

## 5. Train Multiple Models

In [None]:
# Candidate classifiers, keyed by the display name used in tables and plots.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
}

# results:        model name -> validation metrics
# trained_models: model name -> fitted estimator plus its validation outputs
results = {}
trained_models = {}

print("Training models...\n")

for name, model in models.items():
    print(f"Training {name}...")

    # Logistic Regression is fitted on the standardised features; every other
    # model is fitted on the raw features.
    if 'Logistic' in name:
        fit_features, val_features = X_train_scaled, X_val_scaled
    else:
        fit_features, val_features = X_train, X_val

    model.fit(fit_features, y_train)
    y_pred = model.predict(val_features)
    # Column 1 of predict_proba is the probability of the positive class
    y_pred_proba = model.predict_proba(val_features)[:, 1]

    # Label-based metrics use the hard predictions; ROC-AUC uses probabilities
    scores = {
        'Accuracy': accuracy_score(y_val, y_pred),
        'Precision': precision_score(y_val, y_pred),
        'Recall': recall_score(y_val, y_pred),
        'F1-Score': f1_score(y_val, y_pred),
        'ROC-AUC': roc_auc_score(y_val, y_pred_proba)
    }
    results[name] = scores

    trained_models[name] = dict(
        model=model,
        predictions=y_pred,
        probabilities=y_pred_proba,
    )

    print(f"  Accuracy: {scores['Accuracy']:.4f}, ROC-AUC: {scores['ROC-AUC']:.4f}\n")

print("All models trained successfully!")

## 6. Compare Model Performance

In [None]:
# Tabulate the metrics: one row per model, values rounded to four decimals,
# rows ordered from the highest ROC-AUC to the lowest.
results_df = (
    pd.DataFrame(results)
    .T
    .round(4)
    .sort_values('ROC-AUC', ascending=False)
)

separator = "=" * 70
print("Model Performance Comparison:")
print(separator)
print(results_df)
print("\nBest model by ROC-AUC:", results_df.index[0])

In [None]:
# Visualize model comparison: a grouped bar chart of every metric (left) and
# a horizontal bar chart of ROC-AUC alone (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: All metrics comparison
results_df.plot(kind='bar', ax=axes[0])
axes[0].set_title('Model Performance Comparison - All Metrics', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Model', fontsize=12)
axes[0].set_ylabel('Score', fontsize=12)
axes[0].legend(loc='lower right')
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=45, ha='right')
axes[0].set_ylim([0, 1])
axes[0].grid(True, alpha=0.3)

# Plot 2: ROC-AUC comparison
auc_scores = results_df['ROC-AUC']
auc_scores.plot(kind='barh', ax=axes[1], color='steelblue')
axes[1].set_title('ROC-AUC Score Comparison', fontsize=14, fontweight='bold')
axes[1].set_xlabel('ROC-AUC Score', fontsize=12)
axes[1].set_ylabel('Model', fontsize=12)

# Take the x-limits from the scores instead of a fixed window. The usual
# 0.7-0.9 range is kept as the minimum span and widened with a 0.05 margin
# whenever a score falls outside it, so no bar or value label is cut off.
x_lower = max(0.0, min(0.7, auc_scores.min() - 0.05))
x_upper = max(0.9, auc_scores.max() + 0.05)
axes[1].set_xlim([x_lower, x_upper])
axes[1].grid(True, alpha=0.3)

# Add values on bars (bar i is drawn at y-position i)
for i, v in enumerate(auc_scores):
    axes[1].text(v, i, f' {v:.4f}', va='center')

plt.tight_layout()
plt.savefig('../results/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Model comparison visualization saved!")

## 7. Detailed Analysis of Best Model

In [None]:
# The comparison table is sorted by ROC-AUC, so its first row names the winner.
best_model_name = results_df.index[0]
best_model_info = trained_models[best_model_name]
best_model, best_predictions, best_probabilities = (
    best_model_info[key] for key in ('model', 'predictions', 'probabilities')
)

# Per-class precision / recall / F1 on the validation split
class_report = classification_report(
    y_val,
    best_predictions,
    target_names=['Not Churned', 'Churned'],
)

print(f"Best Model: {best_model_name}")
print("=" * 70)
print("\nDetailed Classification Report:")
print(class_report)

In [None]:
# Confusion matrix of the best model on the validation split
# (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_val, best_predictions)
class_labels = ['Not Churned', 'Churned']

cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=True,
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=cm_ax,
)
cm_ax.set_title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
cm_ax.set_ylabel('Actual', fontsize=12)
cm_ax.set_xlabel('Predicted', fontsize=12)
plt.savefig('../results/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("Confusion matrix visualization saved!")

In [None]:
# Overlay the validation ROC curve of every trained model on one set of axes.
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))

for name in models:
    y_proba = trained_models[name]['probabilities']
    fpr, tpr, _ = roc_curve(y_val, y_proba)
    auc_score = roc_auc_score(y_val, y_proba)
    roc_ax.plot(fpr, tpr, label=f'{name} (AUC = {auc_score:.4f})', linewidth=2)

# Diagonal reference line: the expected curve of a random classifier
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier', linewidth=2)
roc_ax.set_xlabel('False Positive Rate', fontsize=12)
roc_ax.set_ylabel('True Positive Rate', fontsize=12)
roc_ax.set_title('ROC Curves - All Models', fontsize=14, fontweight='bold')
roc_ax.legend(loc='lower right', fontsize=10)
roc_ax.grid(True, alpha=0.3)
plt.savefig('../results/roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print("ROC curves visualization saved!")

## 8. Feature Importance Analysis

In [None]:
# Only estimators exposing `feature_importances_` can be ranked here;
# for any other estimator a short notice is printed instead.
if not hasattr(best_model, 'feature_importances_'):
    print(f"Feature importance not available for {best_model_name}")
else:
    importance_table = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_
    })
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    print("Top 10 Most Important Features:")
    print("=" * 50)
    print(feature_importance.head(10).to_string(index=False))

    # Horizontal bar chart of every feature, most important at the top
    bar_positions = range(len(feature_importance))
    fi_fig, fi_ax = plt.subplots(figsize=(10, 8))
    fi_ax.barh(bar_positions, feature_importance['Importance'])
    fi_ax.set_yticks(bar_positions)
    fi_ax.set_yticklabels(feature_importance['Feature'])
    fi_ax.set_xlabel('Importance', fontsize=12)
    fi_ax.set_ylabel('Feature', fontsize=12)
    fi_ax.set_title(f'Feature Importance - {best_model_name}', fontsize=14, fontweight='bold')
    # barh draws the first row at the bottom; flip so the top-ranked is first
    fi_ax.invert_yaxis()
    fi_ax.grid(True, alpha=0.3, axis='x')
    fi_fig.tight_layout()
    plt.savefig('../results/feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nFeature importance visualization saved!")

## 9. Hyperparameter Tuning for Best Model

In [None]:
# Hyperparameter tuning for the best model.
#
# A grid search (3-fold CV, scored by ROC-AUC) is run for Random Forest,
# XGBoost and Gradient Boosting; any other winner is left untuned. The tuned
# estimator replaces the baseline only if it scores a strictly higher ROC-AUC
# on the validation split.
print(f"Performing hyperparameter tuning for {best_model_name}...\n")

# Both boosting implementations are searched over the same grid.
boosting_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 0.9, 1.0]
}

if 'Random Forest' in best_model_name:
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 15, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    base_model = RandomForestClassifier(random_state=42, n_jobs=-1)

elif 'XGBoost' in best_model_name:
    param_grid = dict(boosting_param_grid)
    base_model = XGBClassifier(random_state=42, eval_metric='logloss')

elif 'Gradient Boosting' in best_model_name:
    param_grid = dict(boosting_param_grid)
    base_model = GradientBoostingClassifier(random_state=42)

else:
    # Define both names on every branch so a re-run never picks up a stale
    # base_model left over from an earlier execution of this cell.
    param_grid = None
    base_model = None
    print("Skipping hyperparameter tuning for this model type.")

if param_grid:
    grid_search = GridSearchCV(
        base_model, param_grid, cv=3, scoring='roc_auc',
        n_jobs=-1, verbose=1
    )
    # The tunable models were all trained on the unscaled features
    grid_search.fit(X_train, y_train)

    print(f"\nBest parameters: {grid_search.best_params_}")
    print(f"Best cross-validation ROC-AUC: {grid_search.best_score_:.4f}")

    # Evaluate tuned model
    tuned_model = grid_search.best_estimator_
    y_pred_tuned = tuned_model.predict(X_val)
    y_proba_tuned = tuned_model.predict_proba(X_val)[:, 1]

    # Same metric set and key names as the entries stored in `results`
    tuned_metrics = {
        'Accuracy': accuracy_score(y_val, y_pred_tuned),
        'Precision': precision_score(y_val, y_pred_tuned),
        'Recall': recall_score(y_val, y_pred_tuned),
        'F1-Score': f1_score(y_val, y_pred_tuned),
        'ROC-AUC': roc_auc_score(y_val, y_proba_tuned)
    }

    print(f"\nTuned Model Performance on Validation Set:")
    print(f"  Accuracy: {tuned_metrics['Accuracy']:.4f}")
    print(f"  ROC-AUC: {tuned_metrics['ROC-AUC']:.4f}")
    print(f"  F1-Score: {tuned_metrics['F1-Score']:.4f}")

    # Compare against the untuned baseline stored at training time. Using the
    # stored probabilities (rather than best_probabilities, which is updated
    # below) keeps this cell's outcome the same when it is re-run.
    baseline_auc = roc_auc_score(
        y_val, trained_models[best_model_name]['probabilities']
    )

    # Update best model if tuned model is better
    if tuned_metrics['ROC-AUC'] > baseline_auc:
        best_model = tuned_model
        # Keep everything derived from the best model in sync with it, so the
        # saved metadata and the final report describe the model that is
        # actually persisted rather than the untuned baseline.
        best_predictions = y_pred_tuned
        best_probabilities = y_proba_tuned
        results[best_model_name] = tuned_metrics
        print("\n✓ Tuned model performs better! Using tuned model as final model.")
    else:
        print("\n✓ Original model performs better. Keeping original model.")

## 10. Save Final Model and Scaler

In [None]:
# Create models directory if it doesn't exist
os.makedirs('../models', exist_ok=True)

# Save the best model together with the fitted scaler
joblib.dump(best_model, '../models/best_churn_model.pkl')
joblib.dump(scaler, '../models/scaler.pkl')

# Save model metadata.
# 'requires_scaling' tells whoever loads the artifacts whether inputs must go
# through scaler.pkl first: in this notebook only Logistic Regression was
# trained on scaled features, every other model saw the raw features.
# 'tuned' is True when best_model is no longer the estimator produced by the
# initial training loop, i.e. it was replaced by the grid-search winner.
model_info = {
    'model_name': best_model_name,
    'features': X.columns.tolist(),
    'performance': results[best_model_name],
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'requires_scaling': 'Logistic' in best_model_name,
    'tuned': best_model is not trained_models[best_model_name]['model']
}

with open('../models/model_info.json', 'w') as f:
    json.dump(model_info, f, indent=4)

print("✓ Model saved successfully!")
print(f"  - Model file: ../models/best_churn_model.pkl")
print(f"  - Scaler file: ../models/scaler.pkl")
print(f"  - Model info: ../models/model_info.json")

## 11. Generate Final Report

In [None]:
# Assemble the report line by line, then write it out in one go.
rule = "=" * 70
train_share = len(X_train) / len(X) * 100
val_share = len(X_val) / len(X) * 100

report_lines = [
    rule,
    "CUSTOMER CHURN PREDICTION - FINAL REPORT",
    rule,
    "\nDataset Information:",
    f"  - Total samples: {len(X)}",
    f"  - Number of features: {X.shape[1]}",
    f"  - Churn rate: {y.mean()*100:.2f}%",
    "\nData Split:",
    f"  - Training samples: {len(X_train)} ({train_share:.1f}%)",
    f"  - Validation samples: {len(X_val)} ({val_share:.1f}%)",
    f"\nModels Evaluated: {len(models)}",
]

# One bullet per evaluated model, in the order they were defined
report_lines.extend(f"  - {model_name}" for model_name in models)

report_lines.append(f"\nBest Model: {best_model_name}")
report_lines.append("\nPerformance Metrics (Validation Set):")
report_lines.extend(
    f"  - {metric}: {value:.4f}"
    for metric, value in results[best_model_name].items()
)

report_lines += [
    "\nFiles Generated:",
    "  - Model: models/best_churn_model.pkl",
    "  - Scaler: models/scaler.pkl",
    "  - Model info: models/model_info.json",
    "  - Visualizations: results/",
    rule,
    "\n✓ Customer Churn Prediction Project Completed Successfully!",
]

print("\n".join(report_lines))