# Telecom Customer Churn Prediction - Model Training and Evaluation

This notebook demonstrates the training and evaluation of machine learning models for predicting customer churn in a telecom company.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add the scripts directory to the path
sys.path.append('../scripts')

# Import our modules
from base_model import BaseModel
from gradient_boosting import XGBoostModel, LightGBMModel
from neural_network import NeuralNetworkModel
from training_pipeline import ModelTrainer, compare_models
from utils import align_features

# Plot styling applied once for the whole notebook: seaborn whitegrid theme,
# then the default figure size and font size in a single rcParams update.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

## 1. Load the Data

In [None]:
# Feature matrices for the three splits.
X_train = pd.read_csv('../data/processed/X_train.csv')
X_val = pd.read_csv('../data/processed/X_val.csv')
X_holdout = pd.read_csv('../data/processed/X_holdout.csv')

# Matching targets. Each target file is reduced to its first column with
# iloc[:, 0] (rather than squeeze) so the result is always a Series.
y_train = pd.read_csv('../data/processed/y_train.csv').iloc[:, 0]
y_val = pd.read_csv('../data/processed/y_val.csv').iloc[:, 0]
y_holdout = pd.read_csv('../data/processed/y_holdout.csv').iloc[:, 0]

# Report the (rows, columns) shape of each feature matrix.
for _split_label, _split_features in (
    ("Training", X_train),
    ("Validation", X_val),
    ("Holdout", X_holdout),
):
    print(f"{_split_label} set: {_split_features.shape}")

## 2. Initialize Models

We'll train and compare three different models:
1. XGBoost
2. LightGBM
3. Neural Network

In [None]:
# Hyperparameters handed to the XGBoost wrapper.
# NOTE(review): 'use_label_encoder' is deprecated in recent xgboost releases;
# whether XGBoostModel forwards it unchanged is not visible here - confirm.
xgb_params = {
    'max_depth': 5,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'use_label_encoder': False,
}

# XGBoost churn model with a fixed seed.
xgb_model = XGBoostModel(
    model_name="XGBoost_Churn_Predictor",
    params=xgb_params,
    random_state=42,
)

# Hyperparameters handed to the LightGBM wrapper.
lgb_params = {
    'num_leaves': 31,
    'learning_rate': 0.1,
    'n_estimators': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary',
    'metric': 'auc',
}

# LightGBM churn model with a fixed seed.
lgb_model = LightGBMModel(
    model_name="LightGBM_Churn_Predictor",
    params=lgb_params,
    random_state=42,
)

# Architecture and optimisation settings for the neural network wrapper.
nn_settings = {
    'hidden_layers': [64, 32, 16],
    'activations': 'relu',
    'dropout_rate': 0.3,
    'learning_rate': 0.001,
    'batch_size': 64,
    'epochs': 100,
    'early_stopping_patience': 10,
    'random_state': 42,
}

# Neural network churn model built from the settings above.
nn_model = NeuralNetworkModel(
    model_name="NeuralNetwork_Churn_Predictor",
    **nn_settings,
)

## 3. Initialize Model Trainers with SMOTE Resampling

We'll use SMOTE (Synthetic Minority Over-sampling Technique) to handle class imbalance.

In [None]:
# All three trainers share one resampling configuration, so it is written
# once and expanded into each ModelTrainer call.
smote_settings = {
    'resampling_strategy': 'smote',
    'resampling_ratio': 0.5,
    'random_state': 42,
}

# One trainer per model, identical apart from the wrapped model.
xgb_trainer = ModelTrainer(model=xgb_model, **smote_settings)
lgb_trainer = ModelTrainer(model=lgb_model, **smote_settings)
nn_trainer = ModelTrainer(model=nn_model, **smote_settings)

## 4. Train and Evaluate XGBoost Model

In [None]:
# Options forwarded to the XGBoost training pipeline: no hyperparameter
# search, threshold tuning on F1, 5-fold cross-validation, model and history
# saved, and every diagnostic plot enabled (top 20 features for importance).
xgb_pipeline_options = {
    'tune_hyperparameters': False,
    'tune_threshold': True,
    'cross_validate': True,
    'cv': 5,
    'save_model': True,
    'save_history': True,
    'plot_cm': True,
    'plot_roc': True,
    'plot_pr': True,
    'plot_prob_dist': True,
    'plot_importance': True,
    'importance_top_n': 20,
    'threshold_metric': 'f1',
}

# Run the pipeline on the training split and keep its results.
xgb_results = xgb_trainer.run_training_pipeline(
    X_train, y_train, **xgb_pipeline_options
)

## 5. Train and Evaluate LightGBM Model

In [None]:
# Options forwarded to the LightGBM training pipeline: no hyperparameter
# search, threshold tuning on F1, 5-fold cross-validation, model and history
# saved, and every diagnostic plot enabled (top 20 features for importance).
lgb_pipeline_options = {
    'tune_hyperparameters': False,
    'tune_threshold': True,
    'cross_validate': True,
    'cv': 5,
    'save_model': True,
    'save_history': True,
    'plot_cm': True,
    'plot_roc': True,
    'plot_pr': True,
    'plot_prob_dist': True,
    'plot_importance': True,
    'importance_top_n': 20,
    'threshold_metric': 'f1',
}

# Run the pipeline on the training split and keep its results.
lgb_results = lgb_trainer.run_training_pipeline(
    X_train, y_train, **lgb_pipeline_options
)

## 6. Train and Evaluate Neural Network Model

In [None]:
# Extra arguments passed through as train_params for the network fit.
nn_train_params = {
    'validation_split': 0.2,
    'verbose': 1,
}

# Options forwarded to the neural network training pipeline; same switches
# as the tree models plus train_params.
nn_pipeline_options = {
    'tune_hyperparameters': False,
    'tune_threshold': True,
    'cross_validate': True,
    'cv': 5,
    'save_model': True,
    'save_history': True,
    'plot_cm': True,
    'plot_roc': True,
    'plot_pr': True,
    'plot_prob_dist': True,
    # Per the original note this will use permutation importance for the
    # network - NOTE(review): confirm in training_pipeline.
    'plot_importance': True,
    'importance_top_n': 20,
    'threshold_metric': 'f1',
    'train_params': nn_train_params,
}

# Run the pipeline on the training split and keep its results.
nn_results = nn_trainer.run_training_pipeline(
    X_train, y_train, **nn_pipeline_options
)

## 7. Compare Models

In [None]:
# Score the three trained models side by side on the validation split.
trainers = [xgb_trainer, lgb_trainer, nn_trainer]
comparison_metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc']

comparison_df = compare_models(
    trainers=trainers,
    X=X_val,
    y=y_val,
    # 0.0 per the original note: use the entire validation set
    test_size=0.0,
    metrics=comparison_metrics,
    plot=True,
    figsize=(14, 10),
)

In [None]:
# Determine the best model based on F1 score: take the 'f1' row of the
# comparison table and pick the column label with the highest value.
best_model_name = comparison_df.loc['f1'].idxmax()
print(f"Best model based on F1 score: {best_model_name}")

# Explicit name -> trainer mapping. Every model is listed by name so that an
# unexpected label fails loudly instead of silently selecting the neural
# network, which would then be evaluated and deployed as the "best" model.
trainers_by_model_name = {
    'XGBoost_Churn_Predictor': xgb_trainer,
    'LightGBM_Churn_Predictor': lgb_trainer,
    'NeuralNetwork_Churn_Predictor': nn_trainer,
}

try:
    best_trainer = trainers_by_model_name[best_model_name]
except KeyError:
    raise ValueError(
        f"Unrecognised best model name {best_model_name!r}; "
        f"expected one of {sorted(trainers_by_model_name)}"
    ) from None

## 8. Evaluate on Holdout Set

Now we'll evaluate the best model on the holdout set. First, we need to align the features between the training and holdout sets.

In [None]:
# Column names of each feature matrix as sets, so the two can be diffed.
train_features = set(X_train.columns)
holdout_features = set(X_holdout.columns)

# Columns present on one side only.
only_in_training = train_features.difference(holdout_features)
only_in_holdout = holdout_features.difference(train_features)

print(f"Number of features in training set: {len(train_features)}")
print(f"Number of features in holdout set: {len(holdout_features)}")
print(f"Features in training but not in holdout: {only_in_training}")
print(f"Features in holdout but not in training: {only_in_holdout}")

In [None]:
# Bring the holdout features in line with the training features using the
# project helper from utils.
# NOTE(review): exact handling of missing/extra columns lives in
# align_features and is not visible here - confirm.
X_holdout_aligned = align_features(X_train, X_holdout)

# Sanity check: the aligned frame should list the same column names, in the
# same order, as the training frame.
aligned_shape = X_holdout_aligned.shape
columns_match = list(X_train.columns) == list(X_holdout_aligned.columns)

print(f"Aligned holdout set shape: {aligned_shape}")
print(f"Features match training set: {columns_match}")

In [None]:
# Evaluate the best model on the holdout set.

# Decision threshold recorded in the trainer's history under
# 'optimal_threshold' -> 'value'; falls back to 0.5 when no entry exists.
# It does not depend on the holdout labels, so it is resolved before the
# label check and is always available to the deployment cell.
optimal_threshold = best_trainer.training_history.get('optimal_threshold', {}).get('value', 0.5)

# Bound on both branches: the deployment cell stores this alongside the
# threshold and would raise NameError if it were only set when labels exist.
holdout_metrics = None

if y_holdout is not None:
    print(f"Using optimal threshold: {optimal_threshold:.4f}")

    # Evaluate on holdout set at the chosen threshold
    holdout_metrics = best_trainer.evaluate_model(X_holdout_aligned, y_holdout, threshold=optimal_threshold)

    # Plot confusion matrix at the same threshold
    best_trainer.plot_confusion_matrix(X_holdout_aligned, y_holdout, threshold=optimal_threshold)

    # Plot ROC curve
    best_trainer.plot_roc_curve(X_holdout_aligned, y_holdout)

    # Plot Precision-Recall curve
    best_trainer.plot_precision_recall_curve(X_holdout_aligned, y_holdout)
else:
    print("Holdout set does not have target labels for evaluation.")

## 9. Save the Best Model for Deployment

In [None]:
# Save the best model, its decision threshold and its holdout metrics for
# deployment.
import json

# Single source of truth for the output location (previously repeated inline).
deployment_dir = '../models/deployment'

best_model_path = best_trainer.model.save_model(model_dir=deployment_dir)
print(f"Best model saved to: {best_model_path}")


def _to_jsonable(value):
    """Convert NumPy scalars/arrays to plain Python values for ``json``.

    Used as the ``default=`` hook, so it is only called for objects the
    encoder cannot handle itself. Anything that is not a NumPy scalar or
    array is rejected with the same ``TypeError`` the encoder would raise.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Save the optimal threshold together with the holdout metrics
threshold_info = {
    'model_name': best_trainer.model.model_name,
    'optimal_threshold': optimal_threshold,
    'metrics': holdout_metrics
}

threshold_path = f"{deployment_dir}/{best_trainer.model.model_name}_threshold.json"

# Serialize before opening the file so a serialization error cannot leave a
# truncated JSON file behind.
threshold_json = json.dumps(threshold_info, indent=4, default=_to_jsonable)

# Harmless if save_model already created the directory.
os.makedirs(deployment_dir, exist_ok=True)
with open(threshold_path, 'w', encoding='utf-8') as f:
    f.write(threshold_json)

print(f"Threshold information saved to: {threshold_path}")