In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, recall_score
from catboost import CatBoostClassifier

In [40]:
# Read the raw training table from disk.
dataset = pd.read_csv("training_data.csv")

# Features are every column except the first and the last; the last column
# is the label. (The first column is presumably an identifier — confirm.)
n_cols = dataset.shape[1]
X = dataset.iloc[:, 1:n_cols - 1].values
y = dataset.iloc[:, n_cols - 1].values

# Stratified 80/20 hold-out split with a fixed seed so reruns are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [41]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Stage-1 buffers: per-row validation scores for the training split and the
# fold-averaged scores for the hold-out test split.
stage1_oof = np.zeros(len(X_train))
stage1_test = np.zeros(len(X_test))

# Hyper-parameters shared by every fold; the class weight is added per fold.
stage1_params = dict(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    loss_function="Logloss",
    eval_metric="AUC",
    verbose=100,
    random_seed=42,
)

print("=== STAGE 1: TRAINING ===")
for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"\nFold {fold+1}")
    X_tr, y_tr = X_train[tr_idx], y_train[tr_idx]
    X_val, y_val = X_train[val_idx], y_train[val_idx]

    # Negative-to-positive ratio of this fold's training part; max() avoids a
    # division by zero if the fold contains no positives.
    n_neg = (y_tr == 0).sum()
    n_pos = (y_tr == 1).sum()
    model_s1 = CatBoostClassifier(
        scale_pos_weight=n_neg / max(1, n_pos),
        **stage1_params,
    )

    # The validation fold doubles as the eval set used to pick the best iteration.
    model_s1.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True)

    val_scores = model_s1.predict_proba(X_val)[:, 1]
    test_scores = model_s1.predict_proba(X_test)[:, 1]

    stage1_oof[val_idx] = val_scores
    stage1_test += test_scores / n_splits


=== STAGE 1: TRAINING ===

Fold 1
0:	test: 0.5952908	best: 0.5952908 (0)	total: 49.7ms	remaining: 24.8s
100:	test: 0.6342043	best: 0.6342043 (100)	total: 5.05s	remaining: 19.9s
200:	test: 0.6363887	best: 0.6364049 (197)	total: 10.3s	remaining: 15.4s
300:	test: 0.6367479	best: 0.6372572 (280)	total: 15.5s	remaining: 10.2s
400:	test: 0.6355578	best: 0.6372572 (280)	total: 20.6s	remaining: 5.09s
499:	test: 0.6342978	best: 0.6372572 (280)	total: 25.6s	remaining: 0us

bestTest = 0.6372571914
bestIteration = 280

Shrink model to first 281 iterations.

Fold 2
0:	test: 0.5980643	best: 0.5980643 (0)	total: 53.2ms	remaining: 26.5s
100:	test: 0.6362693	best: 0.6362819 (99)	total: 5.01s	remaining: 19.8s
200:	test: 0.6410867	best: 0.6411379 (199)	total: 10.1s	remaining: 15s
300:	test: 0.6427834	best: 0.6427834 (300)	total: 15s	remaining: 9.92s
400:	test: 0.6422757	best: 0.6429597 (310)	total: 20.1s	remaining: 4.95s
499:	test: 0.6400163	best: 0.6429597 (310)	total: 25.3s	remaining: 0us

bestTest = 0

In [42]:
# Score the Stage-1 validation predictions against the training labels and
# report Gini as 2 * AUC - 1.
auc_s1 = roc_auc_score(y_train, stage1_oof)
gini_s1 = auc_s1 * 2 - 1
print(f"\nStage-1 OOF AUC: {auc_s1:.4f} | Gini: {gini_s1:.4f}")



Stage-1 OOF AUC: 0.6377 | Gini: 0.2754


In [43]:
# Candidate Stage-1 cut-offs from 0.20 up to (but excluding) 0.40 in 0.01 steps.
thresholds = np.arange(0.2, 0.4, 0.01)

# Cut-off used by the later cells; it is fixed here and is not updated by the
# sweep below, which only prints diagnostics.
best_thresh = 0.27

for t in thresholds:
    # Rows whose Stage-1 score reaches the candidate threshold.
    mask = stage1_oof >= t
    selected_ratio = mask.mean()
    # Share of true positives that fall inside the selected rows.
    recall = recall_score(y_train, mask.astype(int))
    print(f"Threshold {t:.2f} | Recall {recall:.3f} | Selected {selected_ratio:.2%}")

Threshold 0.20 | Recall 0.999 | Selected 99.79%
Threshold 0.21 | Recall 0.999 | Selected 99.67%
Threshold 0.22 | Recall 0.999 | Selected 99.51%
Threshold 0.23 | Recall 0.998 | Selected 99.27%
Threshold 0.24 | Recall 0.997 | Selected 98.93%
Threshold 0.25 | Recall 0.996 | Selected 98.46%
Threshold 0.26 | Recall 0.993 | Selected 97.88%
Threshold 0.27 | Recall 0.991 | Selected 97.13%
Threshold 0.28 | Recall 0.987 | Selected 96.21%
Threshold 0.29 | Recall 0.982 | Selected 95.13%
Threshold 0.30 | Recall 0.977 | Selected 93.88%
Threshold 0.31 | Recall 0.971 | Selected 92.39%
Threshold 0.32 | Recall 0.963 | Selected 90.76%
Threshold 0.33 | Recall 0.955 | Selected 88.86%
Threshold 0.34 | Recall 0.944 | Selected 86.78%
Threshold 0.35 | Recall 0.930 | Selected 84.45%
Threshold 0.36 | Recall 0.914 | Selected 81.96%
Threshold 0.37 | Recall 0.898 | Selected 79.24%
Threshold 0.38 | Recall 0.883 | Selected 76.40%
Threshold 0.39 | Recall 0.864 | Selected 73.37%


In [45]:
# Class-imbalance weight for Stage-2: negatives per positive in y_s2.
# max() keeps the ratio finite if y_s2 holds no positives.
n_s2 = len(y_s2)
pos = y_s2.sum()
neg = n_s2 - pos
scale_pos_weight_s2 = neg / max(1, pos)
print(f"\nStage-2: {len(y_s2)} samples, scale_pos_weight={scale_pos_weight_s2:.2f}")



Stage-2: 370017 samples, scale_pos_weight=25.90


In [46]:
# Stage-2 buffers, aligned with the full train / test splits; rows that are
# never routed to Stage-2 keep their initial 0.0.
stage2_oof = np.zeros(len(X_train))
stage2_test = np.zeros(len(X_test))

skf_s2 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Positions inside X_train of the rows that make up X_s2. Indexing an arange
# works whether mask_s2 is a boolean mask or an integer index array.
# NOTE(review): assumes X_s2 is X_train[mask_s2] in the same row order —
# confirm against the cell that builds X_s2 / y_s2 / mask_s2.
s2_rows = np.arange(len(X_train))[mask_s2]

# Test rows routed to Stage-2. This does not depend on the fold, so it is
# computed once instead of on every iteration.
mask_test = stage1_test >= best_thresh
X_test_s2 = X_test[mask_test]

print("\n=== STAGE 2: TRAINING ===")
for fold, (tr_idx, val_idx) in enumerate(skf_s2.split(X_s2, y_s2)):
    print(f"\nFold {fold+1}")
    X_tr, X_val = X_s2[tr_idx], X_s2[val_idx]
    y_tr, y_val = y_s2[tr_idx], y_s2[val_idx]

    model_s2 = CatBoostClassifier(
        iterations=800,
        depth=8,
        learning_rate=0.03,
        loss_function="Logloss",
        eval_metric="AUC",
        scale_pos_weight=scale_pos_weight_s2,
        verbose=100,
        random_seed=42
    )

    model_s2.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True)

    # OOF for Stage-2: write only this fold's validation rows, so every row is
    # scored by the one model that was not fitted on it. Scoring all of X_s2
    # here would mix in training rows and let each fold overwrite the last.
    stage2_oof[s2_rows[val_idx]] = model_s2.predict_proba(X_val)[:, 1]

    # Test predictions: fold-averaged scores for the rows selected by Stage-1.
    # Skipped when no test row passes the threshold, to avoid predicting on an
    # empty array.
    if mask_test.any():
        stage2_test[mask_test] += model_s2.predict_proba(X_test_s2)[:, 1] / n_splits



=== STAGE 2: TRAINING ===

Fold 1
0:	test: 0.5943140	best: 0.5943140 (0)	total: 65.3ms	remaining: 52.2s
100:	test: 0.6251597	best: 0.6255888 (77)	total: 6s	remaining: 41.5s
200:	test: 0.6244228	best: 0.6255888 (77)	total: 12.1s	remaining: 36s
300:	test: 0.6241062	best: 0.6255888 (77)	total: 18s	remaining: 29.8s
400:	test: 0.6228161	best: 0.6255888 (77)	total: 23.9s	remaining: 23.8s
500:	test: 0.6191648	best: 0.6255888 (77)	total: 29.9s	remaining: 17.8s
600:	test: 0.6131980	best: 0.6255888 (77)	total: 35.7s	remaining: 11.8s
700:	test: 0.6096385	best: 0.6255888 (77)	total: 41.8s	remaining: 5.9s
799:	test: 0.6048053	best: 0.6255888 (77)	total: 47.7s	remaining: 0us

bestTest = 0.6255887608
bestIteration = 77

Shrink model to first 78 iterations.

Fold 2
0:	test: 0.6057577	best: 0.6057577 (0)	total: 61.8ms	remaining: 49.4s
100:	test: 0.6381159	best: 0.6381984 (99)	total: 5.83s	remaining: 40.4s
200:	test: 0.6409833	best: 0.6410210 (199)	total: 11.6s	remaining: 34.7s
300:	test: 0.6412241	bes

In [47]:
# Rows that passed the Stage-1 threshold get a 60/40 blend of the Stage-1 and
# Stage-2 scores; every other row keeps its Stage-1 score unchanged.
mask_test = stage1_test >= best_thresh
blended = 0.6 * stage1_test + 0.4 * stage2_test
final_probs = np.where(mask_test, blended, stage1_test)

# Evaluate
final_auc = roc_auc_score(y_test, final_probs)
final_gini = final_auc * 2 - 1
print(f"\nFINAL TEST AUC: {final_auc:.4f}")
print(f"FINAL TEST GINI: {final_gini:.4f}")


FINAL TEST AUC: 0.6371
FINAL TEST GINI: 0.2742


In [48]:
# oof_stage2 = np.zeros(len(X_train))
# test_stage2 = np.zeros(len(X_test))

# for fold, (tr_idx, val_idx) in enumerate(skf.split(X_s2, y_s2)):
#     print(f"\nStage-2 | Fold {fold+1}")

#     X_tr, X_val = X_s2[tr_idx], X_s2[val_idx]
#     y_tr, y_val = y_s2[tr_idx], y_s2[val_idx]

#     model_s2 = CatBoostClassifier(
#         iterations=800,
#         depth=8,
#         learning_rate=0.03,
#         loss_function="Logloss",
#         eval_metric="AUC",
#         scale_pos_weight=scale_pos_weight,
#         verbose=False,
#         random_seed=42
#     )

#     model_s2.fit(
#         X_tr, y_tr,
#         eval_set=(X_val, y_val),
#         use_best_model=True
#     )

#     test_stage2 += model_s2.predict_proba(X_test)[:, 1] / 5


In [49]:
# final_probs = np.where(
#     stage1_probs >= THRESHOLD,        # only high-risk from Stage-1
#     0.6 * stage1_probs + 0.4 * test_stage2,  # soft blend
#     stage1_probs                       # keep Stage-1 for low-risk
# )

In [50]:
# final_auc = roc_auc_score(y_test, final_probs)
# final_gini = 2 * final_auc - 1

# print("FINAL AUC:", final_auc)
# print("FINAL GINI:", final_gini)


In [51]:
# print("Stage-1 AUC (full):", roc_auc_score(y_test, stage1_probs))

# mask_val = stage1_probs >= threshold
# print(
#     "Stage-2 AUC (filtered):",
#     roc_auc_score(y_test[mask_val], val_probs_s2[mask_val])
# )
