In [None]:
import json
import os
import pickle
from typing import Dict, List, Tuple, Any, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc, confusion_matrix, classification_report, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Directory where trained models are saved.
# Resolved relative to the notebook's working directory (the same convention the
# "../data" input path uses) rather than a user-specific absolute path, so the
# notebook runs on other machines.
# NOTE(review): assumes the notebook lives in a folder next to "models/" —
# set the CHURN_MODELS_DIR environment variable to save somewhere else.
MODELS_DIR = os.path.abspath(
    os.environ.get("CHURN_MODELS_DIR", os.path.join("..", "models"))
)

# Create the models directory if it doesn't exist. The existence check only
# decides whether to report the creation; exist_ok=True keeps makedirs from
# raising if the directory appears between the check and the call.
_models_dir_is_new = not os.path.exists(MODELS_DIR)
os.makedirs(MODELS_DIR, exist_ok=True)
if _models_dir_is_new:
    print(f"Created models directory at {MODELS_DIR}")

In [None]:
# Read the raw training CSV (path is relative to the notebook's working
# directory) and preview its first rows as the cell output.
raw_csv_path = "../data/cell2celltrain.csv"
data = pd.read_csv(raw_csv_path)
data.head()

In [None]:
# Preprocessing
# errors='ignore' keeps this cell re-runnable: on a second run 'CustomerID' is
# already gone and a plain drop would raise KeyError.
data = data.drop(columns='CustomerID', errors='ignore')

# Handle missing values and convert categorical features to numerical features
# in a single pass over the text (object-dtype) columns: missing entries become
# their own 'Unknown' category, then each column is replaced by integer codes.
# Any object-dtype column is encoded, which includes the 'Churn' target if it
# is stored as text.
#
# The fitted encoders are kept, keyed by column name, so the string -> code
# mapping can be re-applied to new data or inverted later; they are rebuilt
# from scratch each time this cell runs on freshly loaded data.
label_encoders = {}
text_columns = [col for col in data.columns if data[col].dtype == 'object']
for col in text_columns:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col].fillna('Unknown'))
    label_encoders[col] = encoder

In [None]:
# Define features and target
X = data.drop('Churn', axis=1)
y = data['Churn']

# Split data into training and testing with stratification.
# The split is done BEFORE fitting the imputer/scaler so that no statistic
# (column means, standard deviations) is learned from the held-out test rows;
# fitting them on the full data would leak test information into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute missing values (column means learned from the training rows only)
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Scale numerical features (mean/std learned from the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep X as the fully preprocessed feature matrix (a NumPy array), produced
# with the train-fitted transformers, so later code that reads X still works.
X = scaler.transform(imputer.transform(X))

In [None]:
# Candidate estimators (five model families), keyed by display name.
# Every estimator receives the same seed so runs are repeatable.
random_seed = 42
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=random_seed, solver='liblinear')),
    ('Random Forest', RandomForestClassifier(random_state=random_seed)),
    ('XGBoost', xgb.XGBClassifier(random_state=random_seed, use_label_encoder=False, eval_metric='logloss')),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=random_seed)),
    ('Neural Network', MLPClassifier(random_state=random_seed, max_iter=1000)),
])

# Hyperparameter search spaces, keyed by the same display names as `models`.
# The tree-based models share the same candidate values; each grid gets its
# own list copies so the grids stay independent objects.
tree_counts = [50, 100, 200]
tree_depths = [3, 5, 7]
step_sizes = [0.01, 0.1, 0.3]

param_grids = {
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10]},
    'Random Forest': {
        'n_estimators': list(tree_counts),
        'max_depth': list(tree_depths),
    },
    'XGBoost': {
        'n_estimators': list(tree_counts),
        'max_depth': list(tree_depths),
        'learning_rate': list(step_sizes),
    },
    'Gradient Boosting': {
        'n_estimators': list(tree_counts),
        'max_depth': list(tree_depths),
        'learning_rate': list(step_sizes),
    },
    'Neural Network': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'alpha': [0.0001, 0.001, 0.01],
    },
}

# Metrics computed for every grid-search candidate
scoring = ['roc_auc', 'f1', 'precision']

In [None]:
# Create a function to perform cross-validation and hyperparameter tuning
def tune_and_evaluate(model, param_grid, X_train, y_train, X_test, y_test, scoring, model_name,
                      n_jobs: Optional[int] = None) -> Tuple[Any, np.ndarray]:
    """Grid-search a model with 5-fold stratified CV, then report test-set results.

    The search is refit on the 'roc_auc' metric, so `scoring` must contain
    'roc_auc'. The refit best estimator is evaluated on the test data and its
    best parameters, ROC AUC and classification report are printed.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to GridSearchCV; it must expose
        ``predict_proba`` because the positive-class probability is used.
    param_grid : dict
        Hyperparameter grid for the search.
    X_train, y_train : array-like
        Data used for the cross-validated search and the final refit.
    X_test, y_test : array-like
        Held-out data used only for the printed evaluation.
    scoring : list of str
        Metrics evaluated for each candidate during the search.
    model_name : str
        Label used in the printed report.
    n_jobs : int or None, default None
        Forwarded to GridSearchCV to run fits in parallel (e.g. -1 for all
        cores). The default keeps the original single-process behaviour.

    Returns
    -------
    (best_model, y_prob)
        The refit best estimator and its predicted probabilities for the
        second class column (index 1) on ``X_test``.
    """
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Train scores are not requested: nothing here reads them, and computing
    # them adds scoring work for every candidate and fold.
    grid_search = GridSearchCV(model, param_grid, scoring=scoring, refit='roc_auc',
                               cv=cv, n_jobs=n_jobs)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    print(f"Model: {model_name}")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_prob)}")
    print(f"Classification Report:\n{classification_report(y_test, y_pred)}")
    print("\n" + "-"*50 + "\n")

    return best_model, y_prob

In [None]:
# Train, evaluate, and save models
trained_models = {}      # display name -> best fitted estimator
model_predictions = {}   # display name -> test-set probabilities returned by tune_and_evaluate

for model_name, model in models.items():
    print(f"Training {model_name}...")
    best_model, y_prob = tune_and_evaluate(model, param_grids[model_name], X_train, y_train, X_test, y_test, scoring, model_name)
    trained_models[model_name] = best_model
    model_predictions[model_name] = y_prob

    # Save each model; the with-block guarantees the file is flushed and
    # closed even if pickling fails (a bare open() left the handle dangling).
    filename = os.path.join(MODELS_DIR, f"{model_name.lower().replace(' ', '_')}_model.pkl")
    with open(filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
    print(f"{model_name} model saved to {filename}")

In [None]:
# Create a blended model using the best weights
# We'll try different weight combinations and find the best one
# NOTE(review): the weights are chosen by their ROC AUC on the test set, so the
# reported "best" AUC is an optimistic estimate — consider selecting on a
# separate validation split.
weights_to_try = [
    {'Logistic Regression': 0.1, 'Random Forest': 0.3, 'XGBoost': 0.3, 'Gradient Boosting': 0.2, 'Neural Network': 0.1},
    {'Logistic Regression': 0.2, 'Random Forest': 0.2, 'XGBoost': 0.3, 'Gradient Boosting': 0.2, 'Neural Network': 0.1},
    {'Logistic Regression': 0.1, 'Random Forest': 0.2, 'XGBoost': 0.4, 'Gradient Boosting': 0.2, 'Neural Network': 0.1},
    {'Logistic Regression': 0.1, 'Random Forest': 0.2, 'XGBoost': 0.3, 'Gradient Boosting': 0.3, 'Neural Network': 0.1}
]

# Start below any attainable score so the first candidate is always accepted.
best_auc = float('-inf')
best_weights = None

for weights in weights_to_try:
    # Ensure weights sum to 1
    total = sum(weights.values())
    weights = {k: v/total for k, v in weights.items()}

    # Calculate weighted predictions
    weighted_pred = np.zeros_like(model_predictions['Logistic Regression'])
    for model_name, weight in weights.items():
        weighted_pred += weight * model_predictions[model_name]

    # Calculate AUC. The result is stored as `blend_auc`, not `auc`, so the
    # `auc` function imported from sklearn.metrics is not overwritten.
    blend_auc = roc_auc_score(y_test, weighted_pred)
    print(f"Weights: {weights}")
    print(f"ROC AUC: {blend_auc}")

    if blend_auc > best_auc:
        best_auc = blend_auc
        best_weights = weights

print(f"\nBest weights: {best_weights}")
print(f"Best ROC AUC: {best_auc}")

In [None]:
# Save the best weights and models for the ensemble
ensemble_data = {
    'weights': best_weights,
    'models': trained_models
}

# Save the ensemble model; the with-block closes the file deterministically
# instead of leaving the handle open until garbage collection.
filename = os.path.join(MODELS_DIR, "ensemble_model.pkl")
with open(filename, 'wb') as ensemble_file:
    pickle.dump(ensemble_data, ensemble_file)
print(f"Ensemble model saved to {filename}")

In [None]:
# Save the preprocessing objects for later use
# NOTE(review): only the imputer and scaler are persisted here; the label
# encoding applied to the text columns during preprocessing is not part of
# this file.
preprocessing = {
    'imputer': imputer,
    'scaler': scaler
}

# The with-block closes the file deterministically instead of leaving the
# handle open until garbage collection.
filename = os.path.join(MODELS_DIR, "preprocessing.pkl")
with open(filename, 'wb') as preprocessing_file:
    pickle.dump(preprocessing, preprocessing_file)
print(f"Preprocessing objects saved to {filename}")

In [None]:
# Create a function to make predictions with the ensemble model
def predict_with_ensemble(X, ensemble_data):
    """Blend the member models' probabilities into one weighted score per row.

    `ensemble_data` is a dict with two entries: 'models' (name -> fitted
    estimator exposing ``predict_proba``) and 'weights' (name -> blend weight).
    Every model in 'models' is queried for column 1 of its ``predict_proba``
    output; the result is the sum of weight * probability over the names in
    'weights'. `X` must expose ``shape`` (its first dimension is the row count).
    """
    member_models = ensemble_data['models']
    blend_weights = ensemble_data['weights']

    # Column-1 probability from each member model
    positive_probs = {
        name: estimator.predict_proba(X)[:, 1]
        for name, estimator in member_models.items()
    }

    # Accumulate the weighted contributions into a zero-initialised vector
    blended = np.zeros(X.shape[0])
    for name, share in blend_weights.items():
        blended += share * positive_probs[name]

    return blended

In [None]:
# Score the blended ensemble on the test set
ensemble_pred = predict_with_ensemble(X_test, ensemble_data)
ensemble_auc = roc_auc_score(y_test, ensemble_pred)
print(f"Ensemble Model ROC AUC: {ensemble_auc}")

# Side-by-side comparison: each individual model's test ROC AUC, then the ensemble's
individual_aucs = {
    name: roc_auc_score(y_test, probs)
    for name, probs in model_predictions.items()
}
print("\nModel Performance Comparison:")
for name, score in individual_aucs.items():
    print(f"{name}: {score}")
print(f"Ensemble: {ensemble_auc}")

In [None]:
# Draw every model's ROC curve, the ensemble's, and the chance diagonal on one figure
fig, ax = plt.subplots(figsize=(10, 8))

# One curve per individual model, labelled with its test ROC AUC
for name, probs in model_predictions.items():
    fpr, tpr, _ = roc_curve(y_test, probs)
    curve_auc = roc_auc_score(y_test, probs)
    ax.plot(fpr, tpr, label=f'{name} (AUC = {curve_auc:.3f})')

# Ensemble curve, drawn thicker and dashed so it stands out
fpr, tpr, _ = roc_curve(y_test, ensemble_pred)
ax.plot(fpr, tpr, label=f'Ensemble (AUC = {ensemble_auc:.3f})', linestyle='--', linewidth=3)

# Diagonal reference line for a random classifier
ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves for All Models')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Save model metrics for dashboard
def _summarize_predictions(y_true, y_prob, threshold=0.5):
    """Return the dashboard metrics for one set of predicted probabilities.

    Probabilities strictly above `threshold` are labelled 1, the rest 0.
    ROC AUC is computed from the raw probabilities; precision, recall and F1
    are the 'weighted avg' values of the classification report for the
    thresholded labels. All values are plain floats so they serialise to JSON.
    """
    y_pred = (y_prob > threshold).astype(int)
    weighted = classification_report(y_true, y_pred, output_dict=True)['weighted avg']
    return {
        'roc_auc': float(roc_auc_score(y_true, y_prob)),
        'precision': float(weighted['precision']),
        'recall': float(weighted['recall']),
        'f1_score': float(weighted['f1-score'])
    }


# One entry per individual model, computed by the shared helper so the
# individual models and the ensemble are measured identically.
model_metrics = {
    model_name: _summarize_predictions(y_test, y_prob)
    for model_name, y_prob in model_predictions.items()
}

# Add ensemble metrics
model_metrics['Ensemble'] = _summarize_predictions(y_test, ensemble_pred)

# Save metrics to JSON file in a 'dashboard' folder next to the models directory
metrics_path = os.path.join(os.path.dirname(MODELS_DIR), 'dashboard', 'model_metrics.json')
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
with open(metrics_path, 'w') as f:
    json.dump(model_metrics, f, indent=4)

print(f"Model metrics saved to {metrics_path}")

In [None]:
# Closing summary: where the artefacts were written and which models exist
print(f"\nAll models have been successfully trained and saved to {MODELS_DIR}")
print("\nModels created:")
for position, name in enumerate(trained_models, start=1):
    print(f"{position}. {name}")
# The ensemble is listed last, numbered one past the individual models
ensemble_position = len(trained_models) + 1
print(f"{ensemble_position}. Ensemble Model (Weighted Blend)")