In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
import scipy.stats as stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
data = pd.read_csv('cleaned_data_telecom.csv')  

In [3]:
data_no_total = data.drop(['total_charges'], axis=1).reset_index(drop=True)

In [4]:
features = data_no_total.columns.drop('churn')
target = 'churn'
X_train, X_temp, y_train, y_temp = train_test_split(data_no_total[features], data_no_total[target], test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
train_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

val_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

test_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)])

X_train_preprocessed = train_preprocessor.fit_transform(X_train)
X_val_preprocessed = val_preprocessor.fit_transform(X_val)
X_test_preprocessed = test_preprocessor.fit_transform(X_test)

print("Training set shape:", X_train_preprocessed.shape)
print("Validation set shape:", X_val_preprocessed.shape)
print("Test set shape:", X_test_preprocessed.shape)

Training set shape: (4206, 29)
Validation set shape: (1402, 29)
Test set shape: (1402, 29)


In [5]:
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='linear', probability=True, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

model_performance = {}
feature_importance_df = pd.DataFrame()

for model_name, model in models.items():
    print(f"Training and evaluating {model_name}...")
    model.fit(X_train_preprocessed, y_train)
    y_val_pred = model.predict(X_val_preprocessed)
    accuracy = accuracy_score(y_val, y_val_pred)
    model_performance[model_name] = accuracy
    print(f"{model_name} Validation Accuracy: {accuracy * 100:.2f}%")
    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_val, y_val_pred))
    if model_name in ['Random Forest', 'Decision Tree', 'Gradient Boosting']:
        importances = model.feature_importances_
        feature_importance_df[model_name] = importances
if not feature_importance_df.empty:
    feature_names = train_preprocessor.get_feature_names_out()
    feature_importance_df['Feature'] = feature_names
    feature_importance_df.set_index('Feature', inplace=True)

print("\nModel Performance Summary:")
print(model_performance)

if not feature_importance_df.empty:
    print("\nFeature Importances Summary:")
    print(feature_importance_df)

Training and evaluating Random Forest...
Random Forest Validation Accuracy: 79.53%

Random Forest Classification Report:
              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1037
         Yes       0.66      0.44      0.53       365

    accuracy                           0.80      1402
   macro avg       0.74      0.68      0.70      1402
weighted avg       0.78      0.80      0.78      1402

Training and evaluating SVM...
SVM Validation Accuracy: 81.81%

SVM Classification Report:
              precision    recall  f1-score   support

          No       0.85      0.91      0.88      1037
         Yes       0.69      0.55      0.61       365

    accuracy                           0.82      1402
   macro avg       0.77      0.73      0.75      1402
weighted avg       0.81      0.82      0.81      1402

Training and evaluating Logistic Regression...
Logistic Regression Validation Accuracy: 81.95%

Logistic Regression Classification Rep