In [11]:
import pickle
import warnings

import joblib
import pandas as pd
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
warnings.filterwarnings("ignore")

In [8]:
# Read the cleaned dataset and separate the feature columns from the
# "Exited" target column.
df = pd.read_csv("C:/Projects/Customer_churn_project/data/processed/cleaned_customer_churn.csv")
y = df["Exited"]
X = df.drop(columns=["Exited"])

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions of "Exited" the same in both parts; the fixed seed makes
# the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Restore the pickled preprocessing pipeline (presumably written by an earlier
# preprocessing step of this project -- TODO confirm).
# NOTE(review): pickle.load can execute code stored in the file, so this must
# only ever be pointed at a trusted, locally produced artifact.
with open("C:/Projects/Customer_churn_project/models/preprocessing_pipeline.pkl", "rb") as pipeline_file:
    preprocessing_pipeline = pickle.load(pipeline_file)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")

Train shape: (12000, 10)
Test shape: (3000, 10)


In [14]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Distinct target labels found in the training split (np.unique returns them sorted).
classes = np.unique(y_train)

# One weight per label, using scikit-learn's "balanced" heuristic so that the
# rarer class receives the larger weight.
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

# Map each label to its weight; the entries follow the order of `classes`.
weights_dict = {label: weight for label, weight in zip(classes, weights)}
print(weights_dict)


{np.float64(0.0): np.float64(0.6238303181534622), np.float64(1.0): np.float64(2.5188916876574305)}


In [15]:
# Balanced class weights, taken from `weights_dict` (computed from y_train in
# the class-weight cell) instead of hand-typed, rounded copies of its printed
# output, so the weights stay correct if the data changes.
# Labels are normalised to plain ints and weights to plain floats.
class_weights = {int(label): float(weight) for label, weight in weights_dict.items()}

# XGBoost and LightGBM take a single positive-class multiplier rather than a
# per-class mapping: weight(class 1) / weight(class 0).
pos_weight_ratio = class_weights[1] / class_weights[0]

# Candidate classifiers, keyed by the display name used in the printed report.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        class_weight=class_weights
    ),

    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        class_weight=class_weights
    ),

    # `use_label_encoder` is intentionally not passed: it is a deprecated
    # XGBoost argument.
    "XGBoost": XGBClassifier(
        eval_metric='logloss',
        random_state=42,
        scale_pos_weight=pos_weight_ratio
    ),

    "LightGBM": LGBMClassifier(
        random_state=42,
        scale_pos_weight=pos_weight_ratio
    ),

    # NOTE(review): this is the only candidate built without any class
    # weighting, so it is not compared on equal terms with the others.
    "Gradient Boosting": GradientBoostingClassifier(
        random_state=42
    ),

    "CatBoost": CatBoostClassifier(
        verbose=0,
        random_state=42,
        class_weights=[class_weights[0], class_weights[1]]
    )
}

# Running record of the best pipeline seen so far.
best_model = None
best_score = 0
best_model_name = None

for name, model in models.items():
    # Every candidate gets its own unfitted copy of the preprocessing step.
    # Sharing one instance would refit the same object on each iteration, so
    # the pipeline saved below would hold a preprocessor last fitted during a
    # different model's run, and the loaded `preprocessing_pipeline` would be
    # mutated. Assumes the unpickled object is a scikit-learn estimator
    # (required by `clone`) -- TODO confirm.
    # Class imbalance is handled through the class weights above; no
    # resampling step is part of the pipeline.
    pipeline = ImbPipeline([
        ("preprocess", clone(preprocessing_pipeline)),
        ("model", model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # NOTE(review): the winner is chosen by accuracy on the same test split
    # that is used for reporting, so the winner's reported score is
    # optimistic, and accuracy favours the majority class. Consider selecting
    # on a validation split / cross-validation with an imbalance-aware metric.
    if acc > best_score:
        best_score = acc
        best_model = pipeline
        best_model_name = name

print(f"\n Eng yaxshi model: {best_model_name} (Accuracy: {best_score:.4f})")


# Persist the complete winning pipeline (preprocessing + classifier).
with open("C:/Projects/Customer_churn_project/models/best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

Logistic Regression Accuracy: 0.8243
              precision    recall  f1-score   support

         0.0       0.95      0.83      0.88      2405
         1.0       0.54      0.81      0.65       595

    accuracy                           0.82      3000
   macro avg       0.74      0.82      0.76      3000
weighted avg       0.86      0.82      0.84      3000

Random Forest Accuracy: 0.8943
              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94      2405
         1.0       0.78      0.64      0.71       595

    accuracy                           0.89      3000
   macro avg       0.85      0.80      0.82      3000
weighted avg       0.89      0.89      0.89      3000

XGBoost Accuracy: 0.8817
              precision    recall  f1-score   support

         0.0       0.94      0.91      0.92      2405
         1.0       0.67      0.79      0.73       595

    accuracy                           0.88      3000
   macro avg       0.81      0.85  