In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src.data_loader import load_data
from src.preprocessing import clean_data
from src.features import build_features

In [2]:
# Name of the label column to predict.
TARGET = "Churn"

# Run the project helpers imported from src as one composed pipeline:
# load_data() -> clean_data() -> build_features().
df = build_features(clean_data(load_data()))

# Labels first, then the predictors: everything except the label and the
# "customerID" column (presumably a row identifier - confirm against the loader).
y = df[TARGET]
X = df.drop(columns=[TARGET, "customerID"])


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is seeded so it is repeatable
# and stratified on y so both parts keep the same class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [4]:
# Trivial reference model: predict label 0 for every test row, then report the
# share of test rows where that constant guess matches the true label.
y_baseline = np.zeros_like(y_test)
baseline_accuracy = (y_test == y_baseline).mean()

print("Baseline accuracy:", baseline_accuracy)


Baseline accuracy: 0.7341862117981521


In [5]:
from sklearn.metrics import roc_auc_score, f1_score

def evaluate(model, X_test, y_test):
    """Print ROC-AUC and F1 for a fitted classifier on a held-out set.

    Parameters
    ----------
    model : object exposing ``predict`` and ``predict_proba``.
    X_test : features passed unchanged to both model methods.
    y_test : true labels the predictions are scored against.

    Returns
    -------
    None. The two metrics are only printed.
    """
    hard_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score that ROC-AUC ranks on.
    positive_scores = model.predict_proba(X_test)[:, 1]

    report = (
        ("ROC-AUC:", roc_auc_score, positive_scores),
        ("F1:", f1_score, hard_labels),
    )
    for label, metric_fn, estimates in report:
        print(label, metric_fn(y_test, estimates))


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# X_train still contains string-valued columns (the X_test.head() output shows
# values such as "Female", "Yes", "Fiber optic"). LogisticRegression cannot fit
# on those directly, so the encoding lives inside the pipeline: it is learned on
# the training split only and re-applied automatically at predict time.
lr_categorical_cols = X_train.select_dtypes(exclude=["number", "bool"]).columns.tolist()
lr_numeric_cols = [col for col in X_train.columns if col not in lr_categorical_cols]

lr_preprocess = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category seen only in the test split is
        # encoded as all zeros instead of raising.
        ("categorical", OneHotEncoder(handle_unknown="ignore"), lr_categorical_cols),
        # Put numeric features on a common scale so the solver converges within max_iter.
        ("numeric", StandardScaler(), lr_numeric_cols),
    ]
)

# `lr` keeps the predict / predict_proba interface that evaluate() and the ROC plot rely on.
lr = Pipeline(
    steps=[
        ("preprocess", lr_preprocess),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

lr.fit(X_train, y_train)

evaluate(lr, X_test, y_test)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# scikit-learn's random forest needs numeric input, but X_train still holds
# string-valued columns (see the X_test.head() output). One-hot encode them
# inside the pipeline so the same transformation is applied at predict time.
rf_categorical_cols = X_train.select_dtypes(exclude=["number", "bool"]).columns.tolist()

rf_preprocess = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": categories unseen during fit encode to all zeros.
        ("categorical", OneHotEncoder(handle_unknown="ignore"), rf_categorical_cols),
    ],
    # Numeric features go through unchanged; tree splits do not need scaling.
    remainder="passthrough",
)

# Same forest hyperparameters as before; only the input encoding is new.
rf = Pipeline(
    steps=[
        ("preprocess", rf_preprocess),
        (
            "model",
            RandomForestClassifier(
                n_estimators=300,
                max_depth=6,
                random_state=42,
            ),
        ),
    ]
)

rf.fit(X_train, y_train)

evaluate(rf, X_test, y_test)


In [8]:
# Preview the first five held-out rows to inspect the feature columns and their values.
X_test.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TotalCharges,avg_monthly_bill,high_charges,is_new_client,is_long_term,tenure_bucket,num_services,is_month_to_month,is_auto_pay,revenue_per_tenure
974,Female,0,Yes,Yes,59,Yes,No,DSL,No,Yes,...,4542.35,75.705833,1,0,1,3,5,0,1,75.705833
619,Female,0,No,No,7,Yes,Yes,Fiber optic,No,Yes,...,522.95,65.36875,1,1,0,0,2,1,1,65.36875
4289,Female,0,No,No,54,Yes,No,No,No internet service,No internet service,...,1079.45,19.626364,0,0,1,3,1,0,0,19.626364
3721,Female,0,No,No,2,Yes,No,No,No internet service,No internet service,...,38.7,12.9,0,1,0,0,1,1,0,12.9
4533,Female,0,Yes,No,71,Yes,Yes,Fiber optic,No,Yes,...,7555.0,104.930556,1,0,1,3,5,0,1,104.930556


In [6]:
from catboost import CatBoostClassifier

# Columns passed to CatBoost as categorical features. CatBoost handles them
# itself, so no manual encoding is done here.
cat_features = [
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "PaperlessBilling",
    "MultipleLines",
    "InternetService",
    "Contract",
    "PaymentMethod",
]

# Model settings collected in one place; verbose=0 turns off the per-iteration log.
catboost_params = dict(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    cat_features=cat_features,
    verbose=0,
)

cat = CatBoostClassifier(**catboost_params)
cat.fit(X_train, y_train)

evaluate(cat, X_test, y_test)


ROC-AUC: 0.8336473901362005
F1: 0.5889046941678521


In [None]:
from sklearn.metrics import RocCurveDisplay

# Draw all three curves on ONE set of axes. Without ax=, each from_estimator
# call opens its own figure, so the models cannot be compared side by side.
roc_fig, roc_ax = plt.subplots()

roc_models = {
    "Logistic regression": lr,
    "Random forest": rf,
    "CatBoost": cat,
}
for roc_label, roc_model in roc_models.items():
    # name= sets the legend entry (AUC is appended by scikit-learn).
    RocCurveDisplay.from_estimator(roc_model, X_test, y_test, name=roc_label, ax=roc_ax)

roc_ax.set_title("ROC curves on the held-out test set")

plt.show()


In [None]:
import pandas as pd

# Pair each CatBoost importance score with its feature name (column order of X),
# then chart the ten highest-scoring features as horizontal bars.
importance_scores = cat.get_feature_importance()
fi = pd.Series(importance_scores, index=X.columns)

top_features = fi.sort_values(ascending=False).head(10)
top_features.plot.barh()
plt.show()


In [None]:
from pathlib import Path

import joblib

# Destination of the serialized CatBoost model, relative to the notebook's working directory.
model_path = Path("../models/churn_model.pkl")

# joblib.dump does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(cat, model_path)
