In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Pull connection settings from a .env file (if present) into the environment.
load_dotenv()

# Fail fast with an actionable message: os.getenv returns None when the
# variable is missing, and passing None on to create_engine produces an
# error that does not mention DATABASE_URL at all.
database_url = os.getenv("DATABASE_URL")
if not database_url:
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in a .env file."
    )
engine = create_engine(database_url)

# Read the whole modelling table into memory; `df` is the frame every later
# cell works from.
df = pd.read_sql("SELECT * FROM hpce.churn_model_dataset_multi", engine)
print(df.shape)
df.head()

(107194, 14)


Unnamed: 0,customer_id,feature_date,recency_days,f_30,m_30,views_30,events_30,f_90,m_90,views_90,events_90,churn_30d,churn_60d,churn_90d
0,fed5b7561b589c773a4a7d7028d0ae9b,2017-10-17,999,0.0,0.0,13.0,14.0,0.0,0.0,13.0,14.0,0,0,0
1,fed7332ea01f08de619d1f26298d9179,2017-10-17,999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,fed77c9c964215dcfeab228ecb2626f9,2017-10-17,999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
3,fed9258e995bc38766aea4fcdead8676,2017-10-17,999,0.0,0.0,0.0,0.0,0.0,0.0,7.0,7.0,0,0,0
4,fedad553036c744cca336c2ed0722e3c,2017-10-17,999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [11]:
from sklearn.model_selection import train_test_split

# Model inputs: recency plus activity aggregates over 30- and 90-day windows.
# The order here defines the column order of X (and of the fitted coefficients).
FEATURES = [
    "recency_days",
    "f_30",
    "m_30",
    "views_30",
    "events_30",
    "f_90",
    "m_90",
    "views_90",
    "events_90",
]
# Binary label column being predicted.
TARGET = "churn_90d"

# Missing feature values are replaced with 0; the label is cast to int.
X = df.loc[:, FEATURES].fillna(0)
y = df[TARGET].astype(int)

# 80/20 row-level split, stratified on the label, with a fixed seed.
# NOTE(review): the table also carries customer_id and feature_date; if one
# customer appears under several feature dates, a row-level random split can
# place the same customer in both train and test — confirm, and consider a
# group- or time-based split if so.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print("Train:", X_train.shape)
print("Test:", X_test.shape)
print("Target distribution test:\n", y_test.value_counts(normalize=True))

Train: (85755, 9)
Test: (21439, 9)
Target distribution test:
 churn_90d
0    0.998694
1    0.001306
Name: proportion, dtype: float64


In [12]:
from sklearn.preprocessing import StandardScaler

# Learn the standardization statistics from the training split only, then
# apply that same transform to both splits so the test data does not
# influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the standardized training split.
# class_weight="balanced" reweights the classes; max_iter is raised to 2000.
model = LogisticRegression(
    class_weight="balanced",
    max_iter=2000,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

# Last expression: display the fitted estimator.
model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [14]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, average_precision_score

# Probability of the positive class for each test row, and hard labels
# obtained by cutting those probabilities at 0.5.
proba = model.predict_proba(X_test_scaled)[:, 1]
pred = (proba >= 0.5).astype(int)

# The ranking metrics are only computed when y_test contains exactly two
# distinct values; otherwise both are reported as None.
if y_test.nunique() == 2:
    auc = roc_auc_score(y_test, proba)
    ap = average_precision_score(y_test, proba)
else:
    auc = None
    ap = None

print("ROC-AUC:", auc)
print("PR-AUC (Average Precision):", ap)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))

ROC-AUC: 0.9954354237141122
PR-AUC (Average Precision): 0.9646360997096807

Confusion Matrix:
 [[21337    74]
 [    1    27]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     21411
           1       0.27      0.96      0.42        28

    accuracy                           1.00     21439
   macro avg       0.63      0.98      0.71     21439
weighted avg       1.00      1.00      1.00     21439



In [15]:
import numpy as np

# Pair each feature name with its fitted coefficient and rank the features by
# coefficient magnitude, largest first. The signed value is kept alongside so
# the direction of each effect stays visible.
importance = (
    pd.DataFrame({"feature": FEATURES, "coef": model.coef_[0]})
    .assign(abs_coef=lambda frame: np.abs(frame["coef"]))
    .sort_values(by="abs_coef", ascending=False)
)

importance

Unnamed: 0,feature,coef,abs_coef
4,events_30,2.567289,2.567289
7,views_90,-2.248646,2.248646
0,recency_days,-2.144763,2.144763
5,f_90,-1.755103,1.755103
8,events_90,-1.236,1.236
1,f_30,-0.775683,0.775683
6,m_90,-0.715906,0.715906
2,m_30,-0.484969,0.484969
3,views_30,0.468755,0.468755


In [16]:
# Final model: refit on every row (train + test) and score the full table.
#
# A dedicated scaler is used here. Refitting the existing `scaler` in place
# would overwrite the training-split statistics it was fit with earlier, so
# re-running any evaluation cell that transforms data with `scaler` would
# silently use different scaling than the evaluated `model` was trained on.
final_scaler = StandardScaler()
X_all_scaled = final_scaler.fit_transform(X)

# Same hyper-parameters as the evaluated model.
final_model = LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=-1)
final_model.fit(X_all_scaled, y)

# Positive-class probability for every row. These scores are in-sample: the
# rows being scored are the same rows the final model was fit on.
final_proba = final_model.predict_proba(X_all_scaled)[:, 1]
df["churn_probability_90d"] = final_proba

In [17]:
# Keep only the row identifiers and the predicted probability for persistence.
score_df = df.loc[:, ["customer_id", "feature_date", "churn_probability_90d"]].copy()

# Write the scores to hpce.churn_scores_daily.
score_df.to_sql(
    "churn_scores_daily",
    engine,
    schema="hpce",
    if_exists="replace",  # an existing table is dropped and recreated
    index=False,          # the DataFrame index is not written as a column
    chunksize=100_000,    # rows sent per batch
    method="multi",       # several rows per INSERT statement
)
print("Saved to hpce.churn_scores_daily")

Saved to hpce.churn_scores_daily
