In [1]:
import pandas as pd

train = pd.read_csv(r"C:\Users\Anannya\demo-project\data\training_data.csv")
test = pd.read_csv(r"C:\Users\Anannya\demo-project\data\test_data.csv")

In [5]:
# Separate target
y = train['target']

# Drop target from training features
X = train.drop(columns=['target'])

# Handle ID column consistently (if present)
if 'id' in X.columns:
    X = X.drop(columns=['id'])

if 'id' in test.columns:
    X_test = test.drop(columns=['id'])
else:
    X_test = test.copy()


In [2]:
from sklearn.model_selection import train_test_split

X = train.drop(columns=['target', 'id'])
y = train['target']

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print(y_train.value_counts(normalize=True))
print(y_val.value_counts(normalize=True))


target
0    0.963553
1    0.036447
Name: proportion, dtype: float64
target
0    0.963553
1    0.036447
Name: proportion, dtype: float64


In [3]:
import sys
!{sys.executable} -m pip install xgboost




In [6]:
# Calculate scale_pos_weight
neg_count = (y == 0).sum()
pos_count = (y == 1).sum()

pos_weight = neg_count / pos_count

print("scale_pos_weight:", pos_weight)


scale_pos_weight: 26.436992221261885


In [7]:
import xgboost as xgb

# Stage-1 model: very sensitive detector
stage1_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,              # shallow → broad detector
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=pos_weight * 1.2,  # extra weight for positives
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    n_jobs=-1
)

stage1_model.fit(X_train, y_train)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False


In [8]:
import numpy as np

# 1. Stage-1 predictions for entire training data
stage1_probs = stage1_model.predict_proba(X_train)[:, 1]

# 2. Decide selection cutoff (top 30%)
cutoff_percent = 0.30  

# Calculate threshold value for top 30%
threshold = np.quantile(stage1_probs, 1 - cutoff_percent)

# 3. Create mask for risky population
risky_mask = stage1_probs >= threshold

# 4. Extract the risky subset
X_risky = X_train[risky_mask]
y_risky = y_train[risky_mask]

print("Total samples:", len(X_train))
print("Selected risky samples:", len(X_risky))
print("Positive targets in risky subset:", sum(y_risky))


Total samples: 380935
Selected risky samples: 114281
Positive targets in risky subset: 7289


In [9]:
stage2_model = xgb.XGBClassifier(
    n_estimators=600,
    learning_rate=0.03,
    max_depth=7,               
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=(sum(y_risky==0) / sum(y_risky==1)),  # balanced only inside risky subset
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    n_jobs=-1
)

stage2_model.fit(X_risky, y_risky)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [10]:
import numpy as np

# 1. Stage-1 predictions for full training data
p1 = stage1_model.predict_proba(X_train)[:, 1]

# 2. Stage-2 predictions ONLY for risky subset
p2_risky = stage2_model.predict_proba(X_risky)[:, 1]

# 3. Create final array (initially equal to Stage-1 predictions)
final_safe_pred = np.array(p1)

# 4. Replace predictions for risky customers with Stage-2 predictions
final_safe_pred[risky_mask] = p2_risky


In [11]:
from sklearn.metrics import roc_auc_score

auc_safe = roc_auc_score(y_train, final_safe_pred)
gini_safe = 2*auc_safe - 1

print("SAFE AUC:", auc_safe)
print("SAFE Gini:", gini_safe)


SAFE AUC: 0.815409506438918
SAFE Gini: 0.6308190128778359


In [12]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

safe_oof = np.zeros(len(X_train))  # final OOF predictions for SAFE model


In [13]:
import xgboost as xgb

stage1_oof = np.zeros(len(X_train))  # store Stage-1 model predictions for each fold

stage1_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 6,
    'eta': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42
}

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"\n---- Fold {fold+1} ----")

    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

    dtr = xgb.DMatrix(X_tr, y_tr)
    dval = xgb.DMatrix(X_val, y_val)

    # train Stage-1 on fold training data
    model1 = xgb.train(stage1_params, dtr, num_boost_round=400)

    # Stage-1 predictions for fold validation
    stage1_oof[val_idx] = model1.predict(dval)

print("\nStage-1 OOF collection complete.")



---- Fold 1 ----

---- Fold 2 ----

---- Fold 3 ----

---- Fold 4 ----

---- Fold 5 ----

Stage-1 OOF collection complete.


In [14]:
# mark risky samples for each fold
risky_mask = np.zeros(len(X_train), dtype=bool)

risk_fraction = 0.30  # top 30% as risky

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"\n---- Fold {fold+1} ----")
    
    # stage-1 predictions for this fold's validation set
    fold_preds = stage1_oof[val_idx]

    # number of risky samples within this fold
    k = int(len(val_idx) * risk_fraction)

    # indices (within fold) of highest predicted risk
    top_k_idx = np.argsort(fold_preds)[-k:]

    # mark these global indices as risky
    risky_mask[val_idx[top_k_idx]] = True

print("\nRisky mask for all folds created.")
print("Total risky samples:", risky_mask.sum())



---- Fold 1 ----

---- Fold 2 ----

---- Fold 3 ----

---- Fold 4 ----

---- Fold 5 ----

Risky mask for all folds created.
Total risky samples: 114280


In [15]:
stage2_oof = np.zeros(len(X_train))

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"\nTraining Stage-2, Fold {fold+1}...")

    # risky mask for the TRAIN part of this fold
    risky_tr_idx = tr_idx[risky_mask[tr_idx]]

    X_tr_risky = X_train.iloc[risky_tr_idx]
    y_tr_risky = y_train.iloc[risky_tr_idx]

    X_val = X_train.iloc[val_idx]

    # train Stage-2 XGBoost model
    stage2_model = xgb.XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.9,
        colsample_bytree=0.9,
        scale_pos_weight=pos_weight,
        objective='binary:logistic',
        eval_metric='auc',
        random_state=42,
        n_jobs=-1
    )

    stage2_model.fit(X_tr_risky, y_tr_risky)

    # prediction on validation fold
    stage2_oof[val_idx] = stage2_model.predict_proba(X_val)[:, 1]

print("\nStage-2 model training completed.")



Training Stage-2, Fold 1...

Training Stage-2, Fold 2...

Training Stage-2, Fold 3...

Training Stage-2, Fold 4...

Training Stage-2, Fold 5...

Stage-2 model training completed.


In [16]:
from sklearn.metrics import roc_auc_score

# Scale outputs to be roughly comparable (optional but helps stability)
s1 = (stage1_oof - stage1_oof.min()) / (stage1_oof.max() - stage1_oof.min())
s2 = (stage2_oof - stage2_oof.min()) / (stage2_oof.max() - stage2_oof.min())

# SAFE final risk score
SAFE_SCORE = 0.3 * s1 + 0.7 * s2

# compute SAFE AUC
safe_auc = roc_auc_score(y_train, SAFE_SCORE)
safe_gini = 2 * safe_auc - 1

print("SAFE AUC:", safe_auc)
print("SAFE Gini:", safe_gini)


SAFE AUC: 0.619166774982063
SAFE Gini: 0.23833354996412592
