In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neural_network import MLPClassifier

# Load the training data
train_data = pd.read_csv("../data/cell2celltrain.csv")

# Remove customer_id
train_data = train_data.drop('CustomerID', axis=1)

# Handle missing values BEFORE Label Encoding
for col in train_data.columns:
    if train_data[col].dtype == 'object':
        train_data[col] = train_data[col].fillna('Unknown')

# Convert categorical features to numerical features
for col in train_data.columns:
    if train_data[col].dtype == 'object':
        le = LabelEncoder()
        train_data[col] = le.fit_transform(train_data[col])

# Split the training data into training and validation sets
X = train_data.drop('Churn', axis=1)
y = train_data['Churn']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

# Define the base models
logistic_regression = LogisticRegression(random_state=42)
random_forest = RandomForestClassifier(random_state=42)
xgboost = XGBClassifier(random_state=42)

# Train the base models
logistic_regression.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
xgboost.fit(X_train, y_train)

# Make predictions on the validation set
y_pred_lr = logistic_regression.predict_proba(X_val)[:, 1]
y_pred_rf = random_forest.predict_proba(X_val)[:, 1]
y_pred_xgb = xgboost.predict_proba(X_val)[:, 1]

# Create a dataframe with the predictions
predictions_df = pd.DataFrame({'lr': y_pred_lr, 'rf': y_pred_rf, 'xgb': y_pred_xgb})

# Define the meta-models
meta_model_lr = LogisticRegression(random_state=42)
meta_model_rf = RandomForestClassifier(random_state=42)
meta_model_nn = MLPClassifier(random_state=42)

# Train the meta-models
meta_model_lr.fit(predictions_df, y_val)
meta_model_rf.fit(predictions_df, y_val)
meta_model_nn.fit(predictions_df, y_val)

# Make predictions on the validation set using the stacked model
y_pred_stacked_lr = meta_model_lr.predict(predictions_df)
y_pred_stacked_rf = meta_model_rf.predict(predictions_df)
y_pred_stacked_nn = meta_model_nn.predict(predictions_df)

# Evaluate the stacked models
print("\nStacked Model (Logistic Regression Meta-Model):")
print(f"Accuracy: {accuracy_score(y_val, y_pred_stacked_lr)}")
print(f"ROC AUC: {roc_auc_score(y_val, y_pred_stacked_lr)}")
print(f"Classification Report:\n{classification_report(y_val, y_pred_stacked_lr)}")

print("\nStacked Model (Random Forest Meta-Model):")
print(f"Accuracy: {accuracy_score(y_val, y_pred_stacked_rf)}")
print(f"ROC AUC: {roc_auc_score(y_val, y_pred_stacked_rf)}")
print(f"Classification Report:\n{classification_report(y_val, y_pred_stacked_rf)}")

print("\nStacked Model (Neural Network Meta-Model):")
print(f"Accuracy: {accuracy_score(y_val, y_pred_stacked_nn)}")
print(f"ROC AUC: {roc_auc_score(y_val, y_pred_stacked_nn)}")
print(f"Classification Report:\n{classification_report(y_val, y_pred_stacked_nn)}")


Stacked Model (Logistic Regression Meta-Model):
Accuracy: 0.7255631733594515
ROC AUC: 0.5460033697010573
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.96      0.83      7308
           1       0.58      0.13      0.21      2902

    accuracy                           0.73     10210
   macro avg       0.66      0.55      0.52     10210
weighted avg       0.69      0.73      0.66     10210


Stacked Model (Random Forest Meta-Model):
Accuracy: 1.0
ROC AUC: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7308
           1       1.00      1.00      1.00      2902

    accuracy                           1.00     10210
   macro avg       1.00      1.00      1.00     10210
weighted avg       1.00      1.00      1.00     10210


Stacked Model (Neural Network Meta-Model):
Accuracy: 0.7266405484818805
ROC AUC: 0.5461327088088656
Classification Report:
