In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import joblib

In [14]:
# Read processed_data.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="processed_data.csv")

In [15]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Age,Days_Since_Last_Purchase,Last_Interaction_Days,Purchases_Per_Year,Avg_Visits_Per_Month,Customer_Support_Calls,Satisfaction_Score,Complaints_Filed,Gender,Location,Subscription_Type,Company_Type,Auto_Payment,Total_Spend,Churn
0,-0.140618,0.117488,1.380184,-0.309135,-0.732385,-1.226708,1.705168,-1.397908,-0.783364,-1.192227,0.16982,-0.937617,-0.840057,-0.840057,1
1,-0.239528,-0.311732,-0.557397,0.132004,1.164532,-1.595767,0.06128,0.832971,-0.783364,-0.056117,-1.101797,-0.937617,1.190395,1.190395,0
2,-0.140618,0.585727,-0.051095,-0.272373,0.658688,1.356702,-0.349692,1.576597,-0.783364,-1.192227,0.16982,-0.937617,-0.840057,-0.840057,1
3,-0.437349,-0.370262,0.727832,1.308373,-1.111768,-1.595767,-1.582608,0.832971,-0.783364,-1.192227,0.16982,-0.937617,-0.840057,-0.840057,0
4,-0.041707,-1.65792,0.727832,0.940758,1.290993,-0.85765,1.705168,0.832971,-0.783364,-0.056117,0.16982,-0.937617,-0.840057,-0.840057,0


In [16]:
# Separate the feature matrix from the 'Churn' target.
#
# 'Unnamed: 0' is the unnamed first CSV column visible in the df.head()
# output (it looks like a row index written out when the file was saved --
# confirm against the preprocessing step). It is not a customer attribute,
# so it is removed from the features; otherwise the models would train on a
# row counter and the saved model would demand that column at prediction time.
# errors='ignore' keeps this cell working if the file is later loaded without
# that column; the 'Churn' drop stays strict so a missing target still fails.
X = df.drop(columns=['Churn']).drop(columns=['Unnamed: 0'], errors='ignore')
y = df['Churn']

In [17]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Candidate classifiers, keyed by the display name used in the training output
# and in the hyper-parameter grid.
#
# Random Forest and XGBoost are given a fixed seed (the same value used for the
# train/test split) so that repeated runs train the same models and the
# reported accuracies are reproducible. Logistic Regression is left at its
# defaults.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
}


In [20]:
# Hyper-parameter search spaces, keyed by the same display names as `models`.
# A model with no entry here is trained with its constructor settings.
param_grid = {}
param_grid["Random Forest"] = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20],
}
param_grid["XGBoost"] = {
    "n_estimators": [100, 200],
    "learning_rate": [0.01, 0.1],
}

# Running "best so far" state consumed by the training loop.
best_model, best_accuracy = None, 0


In [21]:
# Train every candidate, report its hold-out metrics, and remember the one
# with the highest test accuracy in `best_model` / `best_accuracy`.
#
# The selection state is reset here so this cell is idempotent: re-running it
# on its own must not compare the new models against a `best_accuracy` left
# over from a previous run (which could leave `best_model` pointing at a stale
# estimator).
best_model = None
best_accuracy = 0

for name, model in models.items():
    print(f"Training {name}...")
    if name in param_grid:
        # Tune with 3-fold cross-validation on the training split only, then
        # continue with the best configuration found by the search.
        grid = GridSearchCV(model, param_grid[name], cv=3, scoring='accuracy', n_jobs=-1)
        grid.fit(X_train, y_train)
        model = grid.best_estimator_
    else:
        # No search space defined for this model: fit it as constructed.
        model.fit(X_train, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

    # NOTE(review): the winner is chosen on the same test split that is used
    # to report its accuracy, so the reported figure for the selected model is
    # slightly optimistic. Consider selecting on a validation split or on the
    # cross-validation score instead.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model


Training Logistic Regression...
Logistic Regression Accuracy: 0.6639
              precision    recall  f1-score   support

           0       0.67      0.65      0.66      2904
           1       0.66      0.68      0.67      2918

    accuracy                           0.66      5822
   macro avg       0.66      0.66      0.66      5822
weighted avg       0.66      0.66      0.66      5822

Training Random Forest...
Random Forest Accuracy: 0.7825
              precision    recall  f1-score   support

           0       0.77      0.80      0.79      2904
           1       0.79      0.77      0.78      2918

    accuracy                           0.78      5822
   macro avg       0.78      0.78      0.78      5822
weighted avg       0.78      0.78      0.78      5822

Training XGBoost...
XGBoost Accuracy: 0.7685
              precision    recall  f1-score   support

           0       0.76      0.79      0.77      2904
           1       0.78      0.75      0.76      2918

    accurac

In [22]:
# Persist the selected estimator to disk.
# Guard against running this cell before any model has been selected, which
# would otherwise silently pickle None and report success.
if best_model is None:
    raise RuntimeError("No model has been selected yet; run the training cell before saving.")

# Single source of truth for the output filename (used for both the dump and
# the confirmation message).
model_path = "best_churn_model.pkl"
joblib.dump(best_model, model_path)
print(f"Best model ({type(best_model).__name__}) saved as {model_path}")


Best model (RandomForestClassifier) saved as best_churn_model.pkl
