In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../data/processed/featured_churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group,total_addons,high_value_customer
0,7590-VHVEG,Female,0,Yes,No,1,No,No,DSL,No,...,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,0-12,1,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,One year,No,Mailed check,56.95,1889.5,0,24-48,2,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,Month-to-month,Yes,Mailed check,53.85,108.15,1,0-12,2,0
3,7795-CFOCW,Male,0,No,No,45,No,No,DSL,Yes,...,No,One year,No,Bank transfer (automatic),42.3,1840.75,0,24-48,3,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,Month-to-month,Yes,Electronic check,70.7,151.65,1,0-12,0,1


In [3]:
X = df.drop('Churn', axis=1)
y = df['Churn']

In [4]:
cat_cols = X.select_dtypes(include='object').columns.tolist()
num_cols = X.select_dtypes(exclude='object').columns.tolist()

In [5]:
cat_cols

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'tenure_group']

In [6]:
num_cols

['SeniorCitizen',
 'tenure',
 'MonthlyCharges',
 'TotalCharges',
 'total_addons',
 'high_value_customer']

Categorical and numerical features were processed separately using a preprocessing pipeline.

In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols)
    ]
)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [9]:
from sklearn.ensemble import RandomForestClassifier

rf_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        class_weight="balanced",
        random_state=42
    ))
])

rf_model.fit(X_train, y_train)

In [10]:
from sklearn.metrics import classification_report, roc_auc_score

def evaluate(model, X, y):
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:,1]
    print(classification_report(y, y_pred))
    print("Area of the curve", roc_auc_score(y, y_prob))

In [11]:
print("Train set evaluation:")
evaluate(rf_model, X_train, y_train)

Train set evaluation:
              precision    recall  f1-score   support

           0       0.91      0.75      0.83      4139
           1       0.54      0.80      0.65      1495

    accuracy                           0.77      5634
   macro avg       0.73      0.78      0.74      5634
weighted avg       0.81      0.77      0.78      5634

Area of the curve 0.856834935813265


In [12]:
print("Test set evaluation:")
evaluate(rf_model, X_test, y_test)

Test set evaluation:




              precision    recall  f1-score   support

           0       0.89      0.74      0.81      1035
           1       0.51      0.76      0.61       374

    accuracy                           0.75      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.79      0.75      0.76      1409

Area of the curve 0.8329174093879976


Random Forest outperformed Logistic Regression in recall and ROC-AUC, making it more suitable for identifying churn-prone customers.

In [13]:
import joblib
joblib.dump(rf_model, "../churn_model.pkl")

['../churn_model.pkl']