In [7]:
import os
import pandas as pd
from sqlalchemy import create_engine
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

load_dotenv()


engine = create_engine(os.getenv("DATABASE_URL"))
df = pd.read_sql("SELECT * FROM hpce.churn_model_dataset_multi", engine)

features = ["recency_days","f_30","m_30","views_30","events_30","f_90","m_90","views_90","events_90"]
label = "churn_90d"

df["feature_date"] = pd.to_datetime(df["feature_date"], errors="coerce")
df = df.dropna(subset=["feature_date"]).copy()

# Use unique sorted timestamps (not .date) to avoid collapsing
unique_dates = sorted(df["feature_date"].unique())

# Pick 3 safe cut points by index (guarantees train and test exist)
# 60%, 75%, 90% positions
cut_idxs = [int(len(unique_dates)*0.60), int(len(unique_dates)*0.75), int(len(unique_dates)*0.90)]
cut_idxs = [min(max(1, i), len(unique_dates)-2) for i in cut_idxs]  # keep inside range
cuts = [unique_dates[i] for i in cut_idxs]
def forward_auc(cut_ts, label=label):
    train = df[df["feature_date"] <= cut_ts]
    test  = df[df["feature_date"] >  cut_ts]

    # Safety: ensure test and train are non-empty
    if len(train) == 0 or len(test) == 0:
        return {"cut": cut_ts, "status": "SKIP_empty_split", "train_n": len(train), "test_n": len(test), "auc": None}

    Xtr, ytr = train[features].fillna(0), train[label].astype(int)
    Xte, yte = test[features].fillna(0),  test[label].astype(int)

    # Safety: AUC undefined if test has one class
    if yte.nunique() < 2:
        return {"cut": cut_ts, "status": "SKIP_single_class_test", "train_n": len(train), "test_n": len(test), "auc": None}

    model = XGBClassifier(
        n_estimators=250,
        max_depth=4,
        learning_rate=0.08,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=42
    )
    model.fit(Xtr, ytr)
    p = model.predict_proba(Xte)[:, 1]
    auc = roc_auc_score(yte, p)

    return {"cut": cut_ts, "status": "OK", "train_n": len(train), "test_n": len(test), "auc": auc}

results = [forward_auc(c) for c in cuts]
results_df = pd.DataFrame(results)
results_df



Unnamed: 0,cut,status,train_n,test_n,auc
0,2017-10-17,SKIP_empty_split,107194,0,
1,2017-10-17,SKIP_empty_split,107194,0,
2,2017-10-17,SKIP_empty_split,107194,0,
