In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import joblib
import warnings
# Suppress every warning category for the rest of this process.
# NOTE(review): this also hides deprecation and convergence notices from the
# modelling libraries — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

In [None]:
# Read the processed churn CSV, then separate the "Exited" target from the predictors.
df = pd.read_csv("C:/Projects/Customer_churn_project/data/processed/featured_customer_churn.csv")
y = df["Exited"]
X = df.drop(columns="Exited")

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Quick sanity check on the partition sizes.
for label, frame in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(label, frame.shape)

Train shape: (12000, 8)
Test shape: (3000, 8)


In [None]:
# Candidate classifiers, stored as (display name, unfitted estimator) pairs.
# The display name is reused by the training loop for the results table and
# for the file name each fitted model is saved under.
models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    # Seeded: a random forest draws bootstrap samples and feature subsets at
    # random, so without a fixed random_state its metrics (and possibly the
    # final ranking) change on every run. 42 matches the train/test split seed.
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    ("LightGBM", LGBMClassifier()),
    # verbose=0 keeps CatBoost from printing per-iteration training output.
    ("CatBoost", CatBoostClassifier(verbose=0))
]

In [None]:
# Fit every candidate, score it on the held-out split, and persist the fitted estimator.
results = []

for name, model in models:
    model.fit(X_train, y_train)

    # Hard labels feed accuracy / precision / recall; ROC AUC is computed from
    # column 1 of predict_proba (presumably the Exited == 1 class — TODO confirm).
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # One record per model; the list becomes the comparison table later on.
    results.append(
        {
            "Model": name,
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "ROC_AUC": roc_auc_score(y_test, y_prob),
        }
    )

    # The file name is the display name lower-cased with spaces replaced by
    # underscores, e.g. "Random Forest" -> random_forest_model.pkl.
    file_stem = name.replace(' ', '_').lower()
    joblib.dump(model, f"C:/Projects/Customer_churn_project/models/{file_stem}_model.pkl")

[LightGBM] [Info] Number of positive: 2382, number of negative: 9618
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002051 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 576
[LightGBM] [Info] Number of data points in the train set: 12000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.198500 -> initscore=-1.395696
[LightGBM] [Info] Start training from score -1.395696


In [6]:
# Tabulate the per-model metrics, highest ROC AUC first, with a fresh 0..n-1 index.
df_results = (
    pd.DataFrame(results)
    .sort_values(by='ROC_AUC', ascending=False)
    .reset_index(drop=True)
)
df_results

Unnamed: 0,Model,Accuracy,Precision,Recall,ROC_AUC
0,CatBoost,0.899333,0.780115,0.685714,0.932036
1,LightGBM,0.897,0.766791,0.690756,0.930162
2,XGBoost,0.896333,0.766917,0.685714,0.922784
3,Random Forest,0.884,0.726606,0.665546,0.91207
4,Logistic Regression,0.869333,0.732265,0.537815,0.883122


In [None]:
# Keep a separate copy of the top-ranked model.
# df_results is sorted by ROC_AUC in descending order, so its first row is the winner.
best_model_name = df_results["Model"].iloc[0]
# NOTE(review): the message text looks like Uzbek for "Best model" — confirm before changing it.
print("Eng yaxshi model:", best_model_name)

# Reload that model's pickle from disk and write it back out under a fixed file name.
best_model_file = best_model_name.replace(' ', '_').lower()
best_model = joblib.load(f"C:/Projects/Customer_churn_project/models/{best_model_file}_model.pkl")
joblib.dump(best_model, "C:/Projects/Customer_churn_project/models/best_model.pkl")

Eng yaxshi model: CatBoost


['C:/Projects/Customer_churn_project/models/best_model.pkl']