In [3]:
import os
import pandas as pd
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# The connection URL is read from the DATABASE_URL environment variable.
engine = create_engine(os.getenv("DATABASE_URL"))

# Pull the whole segmented churn table into memory; every per-segment model
# below slices this single frame.
df = pd.read_sql(
    sql="SELECT * FROM hpce.churn_model_dataset_segmented",
    con=engine,
)

# Model inputs. NOTE(review): the _30 / _90 suffixes presumably denote 30- and
# 90-day lookback windows -- confirm against the table definition.
features = [
    "recency_days",
    "f_30",
    "m_30",
    "views_30",
    "events_30",
    "f_90",
    "m_90",
    "views_90",
    "events_90",
]

# Target column passed to the classifier.
label = "churn_90d"

def safe_auc(y_true, y_prob):
    """Return the ROC AUC of ``y_prob`` against ``y_true``, or ``None``.

    ROC AUC is undefined when the true labels contain fewer than two distinct
    classes, so ``None`` is returned in that case instead of calling
    ``roc_auc_score``.
    """
    distinct_classes = pd.Series(y_true).nunique()
    return roc_auc_score(y_true, y_prob) if distinct_classes >= 2 else None

def train_segment_model(segment_group):
    """Train and score an XGBoost churn classifier for one segment group.

    Rows of the module-level ``df`` whose ``segment_group`` column equals
    ``segment_group`` are selected, split 70/30 with stratification on
    ``label``, and an ``XGBClassifier`` fitted on the training fold is scored
    by ROC AUC on the held-out fold.

    Args:
        segment_group: Value of the ``segment_group`` column to model.

    Returns:
        dict with keys ``segment_group``, ``status``, ``n`` (number of rows in
        the segment) and ``auc``. ``status`` is one of:

        * ``"OK"`` -- model trained; ``auc`` holds the test-fold ROC AUC.
        * ``"SKIP_small"`` -- fewer than 5000 rows.
        * ``"SKIP_single_class_overall"`` -- the label has one distinct value.
        * ``"SKIP_rare_class"`` -- some class has fewer than 2 rows, so a
          stratified split is impossible.
        * ``"FAIL_single_class_test"`` -- no attempted split put both classes
          in the test fold.

        ``auc`` is ``None`` for every status other than ``"OK"``.
    """
    # `sub` is only read from, so a defensive copy is not needed.
    sub = df[df["segment_group"] == segment_group]
    n_rows = len(sub)

    def result(status, auc=None):
        # Single place that defines the shape of a result row.
        return {"segment_group": segment_group, "status": status, "n": n_rows, "auc": auc}

    # 1) Hard skip tiny sets
    if n_rows < 5000:
        return result("SKIP_small")

    # 2) Skip segments with only one class overall
    if sub[label].nunique() < 2:
        return result("SKIP_single_class_overall")

    X = sub[features].fillna(0)
    y = sub[label].astype(int)

    # 3) train_test_split(stratify=y) raises ValueError when the least
    #    populated class has fewer than 2 members; report that as a skip so a
    #    single degenerate segment cannot abort the whole run.
    if y.value_counts().min() < 2:
        return result("SKIP_rare_class")

    # 4) Stratified split. With an extremely rare class the test fold can still
    #    end up single-class, so try a handful of seeds (42 first) and keep the
    #    first split whose test fold contains both classes.
    for seed in (42, 7, 21, 99, 123):
        Xtr, Xte, ytr, yte = train_test_split(
            X, y, test_size=0.3, random_state=seed, stratify=y
        )
        if yte.nunique() >= 2:
            break
    else:
        # Every seed produced a single-class test fold: AUC cannot be computed.
        return result("FAIL_single_class_test")

    model = XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.08,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=42,
    )
    model.fit(Xtr, ytr)

    # Column 1 of predict_proba is the probability of the positive class.
    positive_prob = model.predict_proba(Xte)[:, 1]
    return result("OK", safe_auc(yte, positive_prob))

# Fit one model per distinct segment group, visiting the groups in sorted order.
segment_groups = sorted(df["segment_group"].unique())
results = [train_segment_model(group) for group in segment_groups]

# One row per segment: status, row count and (when available) test AUC.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values("segment_group")
results_df


Unnamed: 0,segment_group,status,n,auc
0,A_high_value,SKIP_single_class_overall,106966,
1,C_low_or_risky,SKIP_small,224,
2,D_never,SKIP_small,4,
