In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as py

In [None]:
# Read the training table from a CSV file located in the working directory.
csv_path = 'training_data.csv'
dataset = pd.read_csv(csv_path)

In [None]:
# Report the table dimensions as a (rows, columns) tuple.
n_rows, n_cols = dataset.shape
print((n_rows, n_cols))

(476169, 52)


In [None]:
# Features are every column except the first and the last; the last column is the target.
# NOTE(review): the first column is presumably a row identifier — confirm against the data.
feature_frame = dataset.iloc[:, 1:-1]
target_column = dataset.iloc[:, -1]

X = feature_frame.values
y = target_column.values

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split; the fixed seed keeps the partition reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)
X_train, X_test, y_train, y_test = split_parts

STAGE 1

In [None]:
import lightgbm as lgb
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# Stage 1: a small gradient-boosted binary classifier (depth 4, up to 16 leaves,
# 300 trees). class_weight='balanced' reweights the two classes, so the scores
# it produces should be read as a ranking rather than as calibrated probabilities.
stage1_params = dict(
    objective='binary',
    boosting_type='gbdt',
    max_depth=4,
    num_leaves=16,
    n_estimators=300,
    learning_rate=0.05,
    class_weight='balanced',
    min_data_in_leaf=100,
    random_state=42,
)
stage1 = LGBMClassifier(**stage1_params)


In [None]:
# Fit the Stage-1 model on the training split; the fitted estimator is the cell's value.
stage1.fit(X=X_train, y=y_train)

[LightGBM] [Info] Number of positive: 13884, number of negative: 367051
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.116862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1305
[LightGBM] [Info] Number of data points in the train set: 380935, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [None]:
# Positive-class scores for the test rows. These are not real probabilities but
# closer to risk scores: the value does not tell us the percentage chance that
# a given customer will buy the insurance.
positive_col = 1
stage1_probs = stage1.predict_proba(X_test)[:, positive_col]
print(stage1_probs)



[0.56569319 0.47020377 0.49917364 ... 0.52032931 0.49710659 0.52618692]


In [None]:
from sklearn.metrics import recall_score, roc_auc_score

# Positive-class recall when every row scoring at or above the cut-off is kept.
low_cutoffs = (0.05, 0.10, 0.15, 0.20, 0.25)
for t in low_cutoffs:
    passes_filter = stage1_probs >= t
    preds = passes_filter.astype(int)
    recall = recall_score(y_test, preds)
    print(f"Threshold {t:.2f} → Recall: {recall:.3f}")




Threshold 0.05 → Recall: 1.000
Threshold 0.10 → Recall: 1.000
Threshold 0.15 → Recall: 1.000
Threshold 0.20 → Recall: 0.999
Threshold 0.25 → Recall: 0.989


In [None]:
# Share of test rows that would pass the Stage-1 filter at each cut-off.
for t in (0.05, 0.10, 0.15, 0.5):
    passes_filter = stage1_probs >= t
    selected_ratio = passes_filter.mean()
    print(f"Threshold {t}: selected {selected_ratio:.2%}")


Threshold 0.05: selected 100.00%
Threshold 0.1: selected 100.00%
Threshold 0.15: selected 99.97%
Threshold 0.5: selected 36.02%


In [None]:
# Recall versus pass-through rate over a coarser grid of cut-offs.
for t in (0.3, 0.4, 0.5, 0.6):
    passes_filter = stage1_probs >= t
    preds = passes_filter.astype(int)
    recall = recall_score(y_test, preds)
    selected = passes_filter.mean()
    print(f"t={t}: recall={recall:.3f}, selected={selected:.2%}")


t=0.3: recall=0.966, selected=92.15%
t=0.4: recall=0.833, selected=69.25%
t=0.5: recall=0.547, selected=36.02%
t=0.6: recall=0.253, selected=12.40%


In [None]:
# Recall at the candidate Stage-1 cut-off.
# The cut-off evaluated here is 0.27, not 0.5: the printed label previously said
# "0.5" and mislabelled the result. The `_05` variable names are kept unchanged
# so that any other cell referring to them keeps working.
candidate_threshold = 0.27
preds_05 = (stage1_probs >= candidate_threshold).astype(int)
recall_05 = recall_score(y_test, preds_05)
print(f"Recall at {candidate_threshold}:", recall_05)


Recall at 0.5: 0.9798329011812158


In [None]:
# Fine-grained sweep: prints cut-off, positive-class recall, and pass-through rate.
for t in (0.25, 0.26, 0.27):
    passes_filter = stage1_probs >= t
    preds = passes_filter.astype(int)
    print(t, recall_score(y_test, preds), passes_filter.mean())


0.25 0.9890521463555172 0.9758279606023059
0.26 0.9847306251800634 0.9682991368628852
0.27 0.9798329011812158 0.9591007413318773


In [None]:
# Threshold-free ranking quality of the Stage-1 scores on the held-out split.
auc = roc_auc_score(y_true=y_test, y_score=stage1_probs)
print("Stage-1 AUC:", auc)


Stage-1 AUC: 0.6357153357618772


In [None]:
threshold = 0.27

# Score the training rows with Stage 1. `train_probs` was previously used
# without being defined anywhere in the notebook, so this cell failed on a
# fresh kernel (Restart & Run All).
# NOTE: these are in-sample scores (Stage 1 was fitted on X_train), so the
# filter is somewhat optimistic on the training rows; out-of-fold scores would
# be a stricter alternative.
train_probs = stage1.predict_proba(X_train)[:, 1]

# Keep only the training rows that pass the Stage-1 cut-off for Stage-2 training.
mask_train = train_probs >= threshold
X_train_s2 = X_train[mask_train]
y_train_s2 = y_train[mask_train]

In [None]:
# Negative-to-positive ratio among the rows kept for Stage 2
# (fed to scale_pos_weight when the Stage-2 model is built).
n_kept = len(y_train_s2)
pos = y_train_s2.sum()
neg = n_kept - pos
ratio = neg / pos

print(ratio)


25.50210449927431


In [None]:
# Stage 2: a larger model trained only on the rows that passed the Stage-1 filter.
# scale_pos_weight compensates for the class imbalance measured in `ratio`.
stage2_model = LGBMClassifier(
    n_estimators=800,
    learning_rate=0.02,
    max_depth=-1,
    num_leaves=63,
    min_data_in_leaf=100,
    subsample=0.8,
    # Row subsampling is only active when subsample_freq > 0; with the default
    # of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    scale_pos_weight=ratio,
    objective="binary",
    metric="auc",
    random_state=42,
    n_jobs=-1
)


In [None]:
# Hold out part of the Stage-2 training pool for early stopping.
# Early stopping previously monitored (X_test, y_test), the same rows used for
# the reported AUC/Gini, so the number of trees was tuned on the test set and
# the reported metrics were optimistically biased. The stratified split keeps
# the class ratio (and therefore scale_pos_weight) representative.
X_fit_s2, X_val_s2, y_fit_s2, y_val_s2 = train_test_split(
    X_train_s2,
    y_train_s2,
    test_size=0.2,
    stratify=y_train_s2,
    random_state=1,
)

stage2_model.fit(
    X_fit_s2,
    y_fit_s2,
    eval_set=[(X_val_s2, y_val_s2)],
    eval_metric="auc",
    callbacks=[
        # Stop once validation AUC has not improved for 100 consecutive rounds.
        early_stopping(stopping_rounds=100),
        # Log the validation metric every 10 rounds.
        log_evaluation(period=10),
    ],
)

[LightGBM] [Info] Number of positive: 13780, number of negative: 351419
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.090318 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1304
[LightGBM] [Info] Number of data points in the train set: 365199, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037733 -> initscore=-3.238761
[LightGBM] [Info] Start training from score -3.238761
Training until validation scores don't improve for 100 rounds
[10]	valid_0's auc: 0.623301
[20]	valid_0's auc: 0.625921
[30]	valid_0's auc: 0.6273
[40]	valid_0's auc: 0.628367
[50]	valid_0's auc: 0.628951
[60]	valid_0's auc: 0.629049
[70]	valid_0's auc: 0.629418
[80]	valid_0's auc: 0.62975
[90]	valid_0's auc: 0.629884
[100]	valid_0's auc: 0.630247
[110]	valid_0's auc: 0.63012
[120]	valid_0's auc: 0.630543
[130]	valid_0's auc: 0.631066
[

In [None]:
from sklearn.metrics import roc_auc_score

# Stage-2 scores for every test row (no Stage-1 filtering is applied here).
val_probs_s2 = stage2_model.predict_proba(X_test)[:, 1]

auc_s2 = roc_auc_score(y_test, val_probs_s2)
gini_s2 = 2 * auc_s2 - 1  # Gini coefficient derived from AUC

for label, value in (("Stage-2 AUC:", auc_s2), ("Stage-2 Gini:", gini_s2)):
    print(label, value)




Stage-2 AUC: 0.6330062977455926
Stage-2 Gini: 0.26601259549118517


In [None]:
# Score the test rows with Stage 1 and apply the Stage-1 cut-off.
test_stage1_probs = stage1.predict_proba(X_test)[:, 1]
mask_test = test_stage1_probs >= threshold

# Every row starts at a fixed low score of 0.001; rows that pass the Stage-1
# filter are then overwritten with their Stage-2 score, so Stage 2 only runs
# where it is needed.
final_probs = np.full(len(X_test), 0.001)
final_probs[mask_test] = stage2_model.predict_proba(X_test[mask_test])[:, 1]

# Ranking quality of the combined two-stage score.
final_auc = roc_auc_score(y_test, final_probs)
final_gini = 2 * final_auc - 1

print("FINAL AUC:", final_auc)
print("FINAL GINI:", final_gini)







FINAL AUC: 0.6328638278409472
FINAL GINI: 0.2657276556818944


Sanity check: Stage-1 AUC on the full test set vs. Stage-2 AUC on the Stage-1-filtered subset


In [None]:
# Stage-1 ranking quality over the whole held-out split.
auc_s1_full = roc_auc_score(y_test, stage1_probs)

# Stage-2 ranking quality restricted to the rows that pass the Stage-1 cut-off.
mask_val = stage1_probs >= threshold
y_passed = y_test[mask_val]
s2_passed = val_probs_s2[mask_val]
auc_s2_filtered = roc_auc_score(y_passed, s2_passed)

print("Stage-1 AUC (full):", auc_s1_full)
print("Stage-2 AUC (filtered):", auc_s2_filtered)


Stage-1 AUC (full): 0.6357153357618772
Stage-2 AUC (filtered): 0.6300393345344906
