In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import product
import time
import xgboost as xgb
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, ParameterGrid, cross_val_score, train_test_split, RandomizedSearchCV, ParameterSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.utils.validation import check_is_fitted
from scipy.stats import randint, uniform

import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount('/content/drive')

from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix, precision_recall_curve,
    classification_report, ConfusionMatrixDisplay, fbeta_score)

# Work from the Drive folder so the relative CSV path below resolves.
%cd /content/drive/MyDrive

# Read the dataset; the bare `df` on the last line renders the frame as cell output.
df = pd.read_csv('imputed_data.csv')
df

Mounted at /content/drive
/content/drive/MyDrive


Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,...,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,ORGANIZATION_TYPE_Business Entity/Self-Employed,ORGANIZATION_TYPE_Corporate/Private Industry,ORGANIZATION_TYPE_Education/Healthcare/NonProfit,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Public/Gov & Emergency
0,0,398301.0,0.0,1.0,0.0,0.0,1.0,0.0,157500.0,276813.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,279370.0,0.0,1.0,0.0,1.0,1.0,1.0,292500.0,900000.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2,418778.0,0.0,1.0,1.0,0.0,1.0,0.0,180000.0,450000.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,3,118843.0,0.0,1.0,0.0,0.0,1.0,0.0,135000.0,1078200.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4,132944.0,0.0,1.0,0.0,0.0,1.0,0.0,112500.0,1205896.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49645,49645,280869.0,0.0,1.0,0.0,0.0,1.0,1.0,67500.0,187704.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
49646,49646,157161.0,1.0,1.0,0.0,0.0,0.0,1.0,292500.0,1494486.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
49647,49647,416067.0,1.0,1.0,0.0,0.0,1.0,1.0,157500.0,651600.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
49648,49648,271302.0,0.0,1.0,0.0,1.0,1.0,0.0,126000.0,124722.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Class balance: number of rows per TARGET label.
count = df.loc[:, 'TARGET']
count.value_counts()

Unnamed: 0_level_0,count
TARGET,Unnamed: 1_level_1
0.0,24825
1.0,24825


In [None]:
# Remove the two non-feature columns ('Unnamed: 0' looks like a saved row index,
# 'SK_ID_CURR' like a record identifier – TODO confirm) before modelling.
# errors='ignore' makes the cell idempotent: running it a second time, when the
# columns are already gone, no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'SK_ID_CURR'], errors='ignore')

In [None]:
# Per-column count of missing values.
df.isna().sum()

Unnamed: 0,0
TARGET,0
NAME_CONTRACT_TYPE,0
CODE_GENDER,0
FLAG_OWN_CAR,0
FLAG_OWN_REALTY,0
...,...
ORGANIZATION_TYPE_Business Entity/Self-Employed,0
ORGANIZATION_TYPE_Corporate/Private Industry,0
ORGANIZATION_TYPE_Education/Healthcare/NonProfit,0
ORGANIZATION_TYPE_Other,0


In [None]:
# Separate the label from the feature matrix.
y = df['TARGET']
X = df.drop(columns=['TARGET'])

# Stratified 60 / 20 / 20 split: hold out 20 % for test, then take 25 % of the
# remaining 80 % (= 20 % of all rows) as the validation set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, stratify=y_trainval, random_state=42
)

print(f"Shapes: X_train={X_train.shape}, X_val={X_val.shape}, X_test={X_test.shape}")

# Reference model: a decision tree with default settings and a fixed seed.
dt_baseline = DecisionTreeClassifier(random_state=42)
dt_baseline.fit(X_train, y_train)

# ROC-AUC computed from the positive-class probability (column 1) on each split.
train_auc, val_auc = (
    roc_auc_score(labels, dt_baseline.predict_proba(feats)[:, 1])
    for feats, labels in ((X_train, y_train), (X_val, y_val))
)
print(f"Baseline DT - Train AUC: {train_auc:.4f}, Val AUC: {val_auc:.4f}")

Shapes: X_train=(29790, 105), X_val=(9930, 105), X_test=(9930, 105)
Baseline DT - Train AUC: 1.0000, Val AUC: 0.5831


In [None]:
# ---------------------------------------------------------------------
# 0. split – reuses X_train / X_val / y_train / y_val from the previous cell
# ---------------------------------------------------------------------
print(f"{X_train.shape=}  {X_val.shape=}")

# ---------------------------------------------------------------------
# 1. baseline random forest on every feature column (single fit, no progress bar)
# ---------------------------------------------------------------------
rf_base = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)
rf_base.fit(X_train, y_train)

# Score on the positive-class probability for both splits.
rf_base_train_auc = roc_auc_score(y_train, rf_base.predict_proba(X_train)[:, 1]).round(4)
rf_base_val_auc = roc_auc_score(y_val, rf_base.predict_proba(X_val)[:, 1]).round(4)
print("\n[1] BASE RF  –  Train AUC:", rf_base_train_auc, " | Val AUC:", rf_base_val_auc)

# ---------------------------------------------------------------------
# 2. Randomised hyper-parameter search, written as a manual loop so tqdm
#    can report one tick per sampled configuration
# ---------------------------------------------------------------------
param_dist = {
    "n_estimators": randint(150, 400),     # kept small while tuning; raised for the final refit
    "max_depth": [8, 12, 16, None],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 5),
    "max_features": ["sqrt", "log2", 0.3],
    "class_weight": ["balanced"],
}
n_iter = 25
cv3 = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Materialise the sampled configurations up front so tqdm knows the total.
sampler = list(ParameterSampler(param_dist, n_iter=n_iter, random_state=42))
best_auc, best_params = -np.inf, None

for params in tqdm(sampler, desc="Random-search", ncols=80):
    # The forest itself is single-threaded; parallelism happens across CV folds.
    rf = RandomForestClassifier(random_state=42, n_jobs=1, **params)
    fold_scores = cross_val_score(
        rf, X_train, y_train, cv=cv3, scoring="roc_auc", n_jobs=-1
    )
    auc = fold_scores.mean()
    if auc > best_auc:
        best_auc, best_params = auc, params

print("best params", best_params)

# Final refit with more trees. This updates the winning dict in place, so
# best_params reports n_estimators=800 from here on.
best_params.update(n_estimators=800)
rf_best = RandomForestClassifier(random_state=42, n_jobs=-1, **best_params)
rf_best.fit(X_train, y_train)

rf_best_train_auc = roc_auc_score(y_train, rf_best.predict_proba(X_train)[:, 1]).round(4)
rf_best_val_auc = roc_auc_score(y_val, rf_best.predict_proba(X_val)[:, 1]).round(4)
print("\n[2] TUNED RF  –  Train AUC:", rf_best_train_auc, " | Val AUC:", rf_best_val_auc)
print("       best params →", best_params)

# ---------------------------------------------------------------------
# 3. Keep the k most important features of the tuned forest, refit, compare
# ---------------------------------------------------------------------
k = 30
importances = pd.Series(rf_best.feature_importances_, index=X_train.columns)

# Rank features from most to least important and keep the first k names.
ranked_importances = importances.sort_values(ascending=False)
topk_cols = ranked_importances.head(k).index

X_train_k = X_train[topk_cols]
X_val_k = X_val[topk_cols]

# Same hyper-parameters as the tuned model, fitted on the reduced feature set.
rf_k = RandomForestClassifier(random_state=42, n_jobs=-1, **best_params)
rf_k.fit(X_train_k, y_train)

rf_k_train_auc = roc_auc_score(y_train, rf_k.predict_proba(X_train_k)[:, 1]).round(4)
rf_k_val_auc = roc_auc_score(y_val, rf_k.predict_proba(X_val_k)[:, 1]).round(4)
print(f"\n[3] TUNED RF on top-{k} features",
      " – Train AUC:", rf_k_train_auc,
      " | Val AUC:", rf_k_val_auc)

# ---------------------------------------------------------------------
# 4. Side-by-side validation AUC of the three forests
# ---------------------------------------------------------------------
# Each model is scored on the validation features it was trained to expect.
val_scores = [
    roc_auc_score(y_val, model.predict_proba(feats)[:, 1]).round(4)
    for model, feats in ((rf_base, X_val), (rf_best, X_val), (rf_k, X_val_k))
]

summary = pd.DataFrame(
    {
        "Model": ["RF-base", "RF-tuned", f"RF-tuned-top{k}"],
        "Val AUC": val_scores,
        "n_features": [X_train.shape[1], X_train.shape[1], k],
    }
)
print("\n---------------- AUC comparison ----------------")
print(summary.to_string(index=False))

X_train.shape=(29790, 105)  X_val.shape=(9930, 105)

[1] BASE RF  –  Train AUC: 1.0  | Val AUC: 0.7387


Random-search: 100%|████████████████████████████| 25/25 [25:00<00:00, 60.01s/it]


best params {'class_weight': 'balanced', 'max_depth': 16, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 3, 'n_estimators': 366}

[2] TUNED RF  –  Train AUC: 0.9593  | Val AUC: 0.7418
       best params → {'class_weight': 'balanced', 'max_depth': 16, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 3, 'n_estimators': 800}

[3] TUNED RF on top-30 features  – Train AUC: 0.977  | Val AUC: 0.7386

---------------- AUC comparison ----------------
         Model  Val AUC  n_features
       RF-base   0.7387         105
      RF-tuned   0.7418         105
RF-tuned-top30   0.7386          30


In [None]:
#
!pip install tqdm tqdm_joblib --quiet
from tqdm.auto import tqdm
from tqdm_joblib import tqdm_joblib
from joblib import parallel_backend

# ------------------------------------------------------------------
# 1. sweep k = 20…60 (step 10) with a progress-bar
# ------------------------------------------------------------------
# Every k is scored with 3-fold CV on X_train only. The validation split is not
# used in this sweep, so the table below reports CV-AUC, not validation AUC.
# NOTE(review): `importances` comes from a forest fitted on all of X_train, so
# the feature ranking has already seen every CV fold – scores may be slightly
# optimistic.
aucs = {}
k_values = range(20, 70, 10)             # 20, 30, 40, 50, 60
cv3 = StratifiedKFold(3, shuffle=True, random_state=42)

# The ranking does not depend on k: sort once instead of once per iteration.
ranked_features = importances.sort_values(ascending=False)

print("\n[ sweep top-k features ]")
for k in tqdm(k_values, desc="k-loop"):

    topk_cols = ranked_features.head(k).index

    rf_k = RandomForestClassifier(**best_params, random_state=42, n_jobs=1)  # n_jobs=1 (outer parallelism only)

    # tqdm_joblib creates the bar itself from these keyword arguments. Passing
    # an already-built tqdm object made it wrap that object in a second,
    # unnamed bar (the duplicate "0/3" bars in the previous output).
    with tqdm_joblib(desc=f"  CV folds for k={k}", total=cv3.get_n_splits(), leave=False):
        auc = cross_val_score(
            rf_k, X_train[topk_cols], y_train,
            cv=cv3, scoring="roc_auc", n_jobs=-1
        ).mean()

    aucs[k] = auc

# print summary
print("\nk | CV-AUC")
for k_val, score in aucs.items():
    print(f"{k_val:2d} | {score:.4f}")

best_k = max(aucs, key=aucs.get)
print(f"\nBest k = {best_k}")

# ------------------------------------------------------------------
# 2. refit the best-k model on the full training split
# ------------------------------------------------------------------
topk_cols = ranked_features.head(best_k).index
Xtr_k, Xv_k = X_train[topk_cols], X_val[topk_cols]

rf_best_k = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
print("\n[ fitting best-k RF ]")
rf_best_k.fit(Xtr_k, y_train)            # no per-tree progress is shown (verbose is left at its default)


[ sweep top-k features ]


k-loop:   0%|          | 0/5 [00:00<?, ?it/s]

  CV folds for k=20:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  CV folds for k=30:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  CV folds for k=40:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  CV folds for k=50:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  CV folds for k=60:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]


k | Val-AUC
20 | 0.7300
30 | 0.7322
40 | 0.7338
50 | 0.7340
60 | 0.7338

Best k = 50

[ fitting best-k RF ]


In [None]:
from sklearn.metrics import precision_recall_curve, confusion_matrix

def pick_threshold(model, X_val, y_val, min_precision=0.6):
    """Pick the decision threshold with the highest recall under a precision floor.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    X_val : feature matrix to score
    y_val : true binary labels for ``X_val``
    min_precision : float, default 0.6
        Lowest acceptable precision.

    Returns
    -------
    (threshold, recall, precision)
        Operating point such that predicting positive when
        ``proba >= threshold`` yields exactly the returned recall and
        precision on ``(X_val, y_val)``. If no threshold reaches
        ``min_precision``, the threshold with the highest precision is
        returned instead.
    """
    proba = model.predict_proba(X_val)[:, 1]

    # precision / recall have len = n_thr + 1 ; thr has len = n_thr
    precision, recall, thr = precision_recall_curve(y_val, proba)

    # The extra entry is the LAST one: a synthetic end point (precision=1,
    # recall=0) that has no threshold. Entry i of what remains describes the
    # predictions with score >= thr[i], so the last element – not the first –
    # must be dropped to align the arrays. Dropping the first one paired each
    # threshold with the next threshold's metrics and, because the synthetic
    # precision of 1 always passes the floor, made the fallback unreachable.
    precision = precision[:-1]
    recall    = recall[:-1]

    # now all three arrays have the same length (= n_thresholds)
    mask = precision >= min_precision
    if not mask.any():                       # precision floor never reached
        best = np.argmax(precision)          # fall back to best precision
        return thr[best], recall[best], precision[best]

    best = np.argmax(recall[mask])           # highest recall under constraint
    return thr[mask][best], recall[mask][best], precision[mask][best]


# --- full-feature model (best_params, every feature column) -----------
th_full, rec_full, prec_full = pick_threshold(
    rf_best, X_val, y_val, min_precision=0.6
)

# --- reduced model: rf_best_k on the columns currently held in topk_cols
# (the th_60 / rec_60 / prec_60 names are kept because a later cell reads
# th_60; the actual column count is whatever the sweep selected)
th_60, rec_60, prec_60 = pick_threshold(
    rf_best_k, X_val[topk_cols], y_val, min_precision=0.6
)

# Labels are derived from the data instead of hard-coded "106f" / "60f",
# which did not match the real feature counts.
print(f"FULL {X_val.shape[1]}f →  thresh={th_full:.3f} | recall={rec_full:.3f} | "
      f"precision={prec_full:.3f}")
print(f"TOP-{len(topk_cols)}f   →  thresh={th_60:.3f} | recall={rec_60:.3f} | "
      f"precision={prec_60:.3f}")


FULL 106f →  thresh=0.389 | recall=0.876 | precision=0.600
TOP-60f   →  thresh=0.383 | recall=0.867 | precision=0.600


In [None]:
from sklearn.metrics import recall_score, precision_score, confusion_matrix

# Columns to keep: exactly the top-`best_k` set that rf_best_k was trained on
# and on which the threshold th_60 was tuned. Hard-coding 60 here fitted the
# final model on a different feature set whenever best_k != 60.
final_cols = importances.sort_values(ascending=False).head(best_k).index
top60_cols = final_cols          # legacy name kept for any cell that still reads it

# final refit on TRAIN+VAL (optional but recommended)
# NOTE(review): the threshold was chosen on a model fitted on TRAIN only; the
# refit model's probability scale may differ slightly – confirm this is acceptable.
rf_final = RandomForestClassifier(**best_params, random_state=42, n_jobs=-1)
rf_final.fit(pd.concat([X_train[final_cols], X_val[final_cols]]),
             pd.concat([y_train,             y_val]))

# fixed operating point
THRESH = th_60

# evaluate on hold-out TEST
prob_test = rf_final.predict_proba(X_test[final_cols])[:,1]
pred_test = (prob_test >= THRESH).astype(int)

print("test recall   :", recall_score(y_test, pred_test))
print("test precision:", precision_score(y_test, pred_test))
print("confusion:\n",    confusion_matrix(y_test, pred_test))

test recall   : 0.8668680765357503
test precision: 0.604494382022472
confusion:
 [[2149 2816]
 [ 661 4304]]


In [None]:
pip install xgboost --quiet

In [None]:
# Display the frame again to check which columns remain after the earlier drop.
df

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,ORGANIZATION_TYPE_Business Entity/Self-Employed,ORGANIZATION_TYPE_Corporate/Private Industry,ORGANIZATION_TYPE_Education/Healthcare/NonProfit,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Public/Gov & Emergency
0,0.0,1.0,0.0,0.0,1.0,0.0,157500.0,276813.0,19813.5,256500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,1.0,1.0,292500.0,900000.0,23742.0,900000.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,0.0,1.0,0.0,180000.0,450000.0,24412.5,450000.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,135000.0,1078200.0,31653.0,900000.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,112500.0,1205896.5,35388.0,1053000.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49645,0.0,1.0,0.0,0.0,1.0,1.0,67500.0,187704.0,10903.5,148500.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
49646,1.0,1.0,0.0,0.0,0.0,1.0,292500.0,1494486.0,43825.5,1305000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
49647,1.0,1.0,0.0,0.0,1.0,1.0,157500.0,651600.0,33399.0,562500.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
49648,0.0,1.0,0.0,1.0,1.0,0.0,126000.0,124722.0,12465.0,117000.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


1. Baseline XGB for reference.

2. Randomised hyper-parameter search (25 draws, 5-fold CV) with live
progress bars.

3. Gain-based feature ranking, a quick sweep of k and a comparison of
full-feature vs. top-k model.

4. A fixed threshold that maximises recall while keeping precision ≥ 0.60.

5. Final metrics on the untouched test split.

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import recall_score, precision_score, confusion_matrix, f1_score
from tqdm.auto      import tqdm
!pip install tqdm tqdm_joblib --quiet
from tqdm_joblib    import tqdm_joblib

# ---------------- data split (same seeds and ratios as the earlier split) ----

X, y = df.drop(columns='TARGET'), df['TARGET']

# 20 % hold-out test set, stratified on the label.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y,
    test_size=0.20, stratify=y, random_state=42,
)

# 25 % of the remainder becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval,
    test_size=0.25, stratify=y_trainval, random_state=42,
)

print(f"X_train.shape={X_train.shape}  X_val.shape={X_val.shape}")

# ---------------- 1. baseline XGB -------------------------------------------
# Negative-to-positive ratio of the training labels, passed as scale_pos_weight.
neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()

xgb_base = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=neg_pos_ratio,
)
xgb_base.fit(X_train, y_train)

# AUC on the positive-class probability for the train and validation splits.
xgb_base_train_auc = roc_auc_score(y_train, xgb_base.predict_proba(X_train)[:, 1])
xgb_base_val_auc = roc_auc_score(y_val, xgb_base.predict_proba(X_val)[:, 1])
print("[1] BASE XGB –  Train AUC:",
      f"{xgb_base_train_auc:.4f}",
      "| Val AUC:",
      f"{xgb_base_val_auc:.4f}")

# ---------------- 2. random-search tuning -----------------------------------
param_dist = {
    "n_estimators"     : np.arange(300, 900, 100),
    "learning_rate"    : np.linspace(0.01, 0.3, 10),
    "max_depth"        : np.arange(3, 9),
    "min_child_weight" : [1, 2, 5, 10],
    "gamma"            : [0, 0.1, 0.2, 0.3],
    "subsample"        : [0.6, 0.8, 1.0],
    "colsample_bytree" : [0.6, 0.8, 1.0],
    "reg_alpha"        : [0, 0.1, 1],
    "reg_lambda"       : [1, 5, 10],
}

n_iter_xgb = 25
cv5 = StratifiedKFold(5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum()
    ),
    param_distributions=param_dist,
    n_iter=n_iter_xgb,
    scoring='roc_auc',
    cv=cv5,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

# The search runs one joblib task per (candidate, fold) pair, so the bar total
# is n_iter * n_splits rather than n_iter. tqdm_joblib builds the bar itself
# from keyword arguments; handing it an existing tqdm object produced the
# second, empty bar seen in the previous output.
with tqdm_joblib(desc="Random-search", total=n_iter_xgb * cv5.get_n_splits()):
    search.fit(X_train, y_train)

best_params = search.best_params_
print("\n[2] TUNED XGB best params →", best_params)

# best_estimator_ is the winning configuration as returned by the fitted search.
xgb_tuned = search.best_estimator_
print("[2] TUNED XGB –  Train AUC:",
      f"{roc_auc_score(y_train, xgb_tuned.predict_proba(X_train)[:,1]):.4f}",
      "| Val AUC:",
      f"{roc_auc_score(y_val,   xgb_tuned.predict_proba(X_val)[:,1]):.4f}")

# ---------------- 3. gain-based feature importance --------------------------
# Read the booster's 'gain' importance scores and rank features from highest
# to lowest.
gain_scores = xgb_tuned.get_booster().get_score(importance_type='gain')
gain_imp = pd.Series(gain_scores).sort_values(ascending=False)

# Keep only entries whose name is an actual column of X_train.
# NOTE(review): get_score presumably omits features the booster never used,
# so this series can be shorter than the column list – confirm against the
# XGBoost docs.
known_cols = gain_imp.index.intersection(X_train.columns)
gain_imp = gain_imp[known_cols]

# ---------------- 4. sweep k = 20…60 ----------------------------------------
def quick_auc(cols, model):
    """Fit `model` on the given columns of X_train and return its ROC-AUC on X_val.

    `cols` selects the feature columns; `model` is fitted in place. Reads the
    notebook-level X_train / y_train / X_val / y_val.
    """
    train_subset = X_train[cols]
    model.fit(train_subset, y_train)
    val_proba = model.predict_proba(X_val[cols])[:, 1]
    return roc_auc_score(y_val, val_proba)

# Validation AUC for each candidate feature count, keyed by k.
aucs = {}
for k in tqdm(range(20,70,10), desc="k-loop"):
    topk_cols = gain_imp.head(k).index
    # Fresh, unfitted model per k with the tuned hyper-parameters.
    candidate = xgb.XGBClassifier(
        **best_params,
        random_state=42,
        eval_metric='auc',
        scale_pos_weight=(y_train==0).sum()/(y_train==1).sum(),
    )
    aucs[k] = quick_auc(topk_cols, candidate)

# The k with the highest validation AUC wins (first one on ties).
best_k = max(aucs, key=aucs.get)
print("\nVal-AUC by k:", aucs, "→ best k =", best_k)

# ---------------- 5. refit best-k model -------------------------------------
# Select the best_k highest-gain feature names and train the tuned
# configuration on just those columns.
topk_cols = gain_imp.head(best_k).index
bestk_pos_weight = (y_train==0).sum()/(y_train==1).sum()
xgb_bestk = xgb.XGBClassifier(
    **best_params,
    random_state=42,
    eval_metric='auc',
    scale_pos_weight=bestk_pos_weight,
)
xgb_bestk.fit(X_train[topk_cols], y_train)

# ---------------- 6. pick threshold (≥ 60 % precision) ----------------------
def pick_threshold(model, Xv, yv, p_floor=0.6):
    """Return (threshold, recall, precision) maximising recall with precision >= p_floor.

    `model` must expose predict_proba; `Xv` / `yv` are the features and true
    binary labels to evaluate on. Predicting positive when proba >= threshold
    reproduces the returned recall and precision on (Xv, yv). If no threshold
    reaches `p_floor`, the threshold with the highest precision is returned.
    """
    proba = model.predict_proba(Xv)[:,1]
    prec, rec, thr = precision_recall_curve(yv, proba)

    # prec / rec carry one extra trailing entry (precision=1, recall=0) that
    # has no threshold; entry i of the rest corresponds to score >= thr[i].
    # Drop the LAST element to align. Dropping the first one shifted every
    # threshold onto its neighbour's metrics and, since the trailing precision
    # of 1 always passes the floor, made the fallback below unreachable.
    prec, rec = prec[:-1], rec[:-1]

    ok = prec >= p_floor
    if not ok.any():                        # never reaches floor
        idx = np.argmax(prec)
        return thr[idx], rec[idx], prec[idx]

    idx = np.argmax(rec[ok])                # highest recall among admissible thresholds
    return thr[ok][idx], rec[ok][idx], prec[ok][idx]

# Operating point for the full-feature model and for the reduced best-k model.
th_full, rec_full, prec_full = pick_threshold(xgb_tuned,  X_val,            y_val)
th_k,    rec_k,    prec_k    = pick_threshold(xgb_bestk, X_val[topk_cols], y_val)

# The full-model label is derived from the data; the hard-coded "106f" did not
# match the actual number of feature columns.
print(f"\nFULL {X_val.shape[1]}f →  thr={th_full:.3f} | recall={rec_full:.3f} | precision={prec_full:.3f}")
print(f"TOP-{best_k:02d}f →  thr={th_k:.3f} | recall={rec_k:.3f} | precision={prec_k:.3f}")

# choose winner: the reduced model is preferred when its recall at the
# precision floor is at least as high as the full model's
winner, win_cols, win_thr = (xgb_bestk, topk_cols, th_k) if rec_k>=rec_full else (xgb_tuned, X_train.columns, th_full)

# ---------------- 7. final evaluation on TEST -------------------------------
# Score the held-out split with the winning model, restricted to its columns,
# and binarise the probabilities at the chosen threshold.
X_win_test = X_test[win_cols]
test_proba = winner.predict_proba(X_win_test)[:, 1]
test_pred = (test_proba >= win_thr).astype(int)

print("\n=== TEST-SET RESULTS ===")
for label, metric in (
    ("Recall   :", recall_score),
    ("Precision:", precision_score),
    ("F1 score :", f1_score),
    ("ConfMat:\n", confusion_matrix),
):
    print(label, metric(y_test, test_pred))

X_train.shape=(29790, 105)  X_val.shape=(9930, 105)
[1] BASE XGB –  Train AUC: 0.9282 | Val AUC: 0.7379


Random-search:   0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]


[2] TUNED XGB best params → {'subsample': 0.6, 'reg_lambda': 5, 'reg_alpha': 0, 'n_estimators': np.int64(800), 'min_child_weight': 1, 'max_depth': np.int64(7), 'learning_rate': np.float64(0.01), 'gamma': 0.3, 'colsample_bytree': 1.0}
[2] TUNED XGB –  Train AUC: 0.8575 | Val AUC: 0.7475


k-loop:   0%|          | 0/5 [00:00<?, ?it/s]


Val-AUC by k: {20: np.float64(0.7353461541972053), 30: np.float64(0.7450480655626648), 40: np.float64(0.7457293095982046), 50: np.float64(0.7460817464446492), 60: np.float64(0.7459357496432732)} → best k = 50

FULL 106f →  thr=0.339 | recall=0.877 | precision=0.600
TOP-50f →  thr=0.342 | recall=0.867 | precision=0.600

=== TEST-SET RESULTS ===
Recall   : 0.8745216515609265
Precision: 0.6057477678571429
F1 score : 0.7157339487348554
ConfMat:
 [[2139 2826]
 [ 623 4342]]
