In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src.data_loader import load_data
from src.preprocessing import clean_data
from src.features import build_features

In [None]:
# Name of the label column the models will predict.
TARGET = "Churn"

# Load -> clean -> feature-engineer, applied as a single nested pipeline.
df = build_features(clean_data(load_data()))

# Feature matrix: every column except the label and "customerID"
# (presumably a row identifier rather than a predictive feature — confirm).
X = df.drop([TARGET, "customerID"], axis="columns")
# Label vector.
y = df[TARGET]


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable across runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
# Majority-class baseline: always predict the most frequent label seen in the
# training split. Deriving it from y_train (instead of hard-coding 0) keeps the
# baseline valid however the target is encoded, and keeps the test labels out
# of the choice of prediction.
majority_class = y_train.mode().iloc[0]
y_baseline = np.full(len(y_test), majority_class)

# Fraction of test rows where the constant prediction matches the true label.
print("Baseline accuracy:", (y_baseline == y_test).mean())


In [None]:
from sklearn.metrics import roc_auc_score, f1_score

def evaluate(model, X_test, y_test):
    """Print ROC-AUC and F1 for a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature rows to score.
        y_test: True labels aligned with ``X_test``.

    Returns:
        None. The two metrics are written to stdout.
    """
    hard_labels = model.predict(X_test)
    # Second column of predict_proba is used as the ranking score for ROC-AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    auc_value = roc_auc_score(y_test, positive_scores)
    print("ROC-AUC:", auc_value)

    f1_value = f1_score(y_test, hard_labels)
    print("F1:", f1_value)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the iteration cap raised to 1000; fit() returns the
# estimator itself, so construction and training are chained.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Report ROC-AUC and F1 on the held-out split.
evaluate(lr, X_test, y_test)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 300 trees, each limited to depth 6, with a fixed seed so the
# fitted model is repeatable. fit() returns the estimator, so it is chained.
rf = RandomForestClassifier(n_estimators=300, max_depth=6, random_state=42).fit(
    X_train, y_train
)

# Report ROC-AUC and F1 on the held-out split.
evaluate(rf, X_test, y_test)


In [None]:
# Preview the first five held-out feature rows (the cell's displayed output).
X_test.head(n=5)

In [None]:
from catboost import CatBoostClassifier

# Columns handed to CatBoost as categorical features.
cat_features = [
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "PaperlessBilling",
    "MultipleLines",
    "InternetService",
    "Contract",
    "PaymentMethod",
]

# 500 boosting iterations, depth-6 trees, learning rate 0.05; verbose=0 turns
# off the per-iteration training log.
cat = CatBoostClassifier(
    iterations=500, depth=6, learning_rate=0.05,
    cat_features=cat_features, verbose=0,
)
cat.fit(X_train, y_train)

# Report ROC-AUC and F1 on the held-out split.
evaluate(cat, X_test, y_test)


In [None]:
from sklearn.metrics import RocCurveDisplay

# Draw all three ROC curves on one shared Axes so the models can be compared
# directly. Without an explicit `ax`, each from_estimator call creates its own
# figure, which scatters the curves across separate plots.
fig, ax = plt.subplots()
for label, model in [
    ("Logistic regression", lr),
    ("Random forest", rf),
    ("CatBoost", cat),
]:
    # `name` sets the legend entry for this curve.
    RocCurveDisplay.from_estimator(model, X_test, y_test, name=label, ax=ax)

ax.set_title("ROC curves on the test set")
plt.show()


In [None]:
import pandas as pd

# Pair CatBoost's importance scores with the feature column names.
fi = pd.Series(cat.get_feature_importance(), index=X.columns)
top_features = fi.sort_values(ascending=False).head(10)

fig, ax = plt.subplots()
top_features.plot(kind="barh", ax=ax)
# barh draws the first row at the bottom; flip the axis so the most important
# feature is at the top of the chart.
ax.invert_yaxis()
ax.set(
    title="Top 10 CatBoost feature importances",
    xlabel="Importance",
    ylabel="Feature",
)
plt.show()


In [None]:
from pathlib import Path

import joblib

# Persist the fitted CatBoost model. The target directory is created first so
# the dump does not fail with FileNotFoundError on a fresh checkout where
# ../models does not exist yet.
model_path = Path("../models/churn_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(cat, model_path)
