In [1]:
# Cell 1: imports
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")


In [2]:
# Cell 2: load processed data
TRAIN_PATH = "/kaggle/input/processed-financial-risk/train_processed.csv"
TEST_PATH  = "/kaggle/input/processed-financial-risk/test_processed.csv"
SAMPLE_PATH = "/kaggle/input/financial-risk-profiling/sample_submission_updated.csv"

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample = pd.read_csv(SAMPLE_PATH)

print("Train shape:", train.shape)
print("Test shape :", test.shape)


Train shape: (204277, 27)
Test shape : (51070, 26)


In [3]:
# Cell 3: prepare X, y, X_test (drop ProfileID)
IDCOL = "ProfileID"
TARGET = "RiskFlag"

X = train.drop(columns=[IDCOL, TARGET])
y = train[TARGET].astype(int)

X_test = test.drop(columns=[IDCOL])
test_ids = test[IDCOL].copy()

print("X shape:", X.shape, "y shape:", y.shape, "X_test shape:", X_test.shape)


X shape: (204277, 25) y shape: (204277,) X_test shape: (51070, 25)


In [4]:
# Cell 4: stacking containers
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

oof_lr = np.zeros(len(X))
oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))

test_lr = np.zeros(len(X_test))
test_lgb = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))
test_cat = np.zeros(len(X_test))


In [5]:
# Cell 5: base models definitions

# Logistic Regression (base)
lr_model_template = LogisticRegression(
    max_iter=2000,
    solver="lbfgs",
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

# LightGBM
lgb_params = dict(
    n_estimators=800,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)
lgb_model_template = lgb.LGBMClassifier(**lgb_params)

# XGBoost
xgb_params = dict(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)
xgb_model_template = XGBClassifier(**xgb_params)

# CatBoost
cat_model_template = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=0
)


In [6]:
# Cell 6: helper - scaler to use only for logistic regression (not for tree models)
scaler = RobustScaler()


In [7]:
# =============================
# Cell 7 â€” Corrected OOF loop
# =============================

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), 1):
    print(f"\n=== FOLD {fold} ===")

    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    # ---------------------------------------
    # 1. Logistic Regression (scaled)
    # ---------------------------------------
    X_tr_scaled = scaler.fit_transform(X_tr)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    lr = LogisticRegression(**lr_model_template.get_params())
    lr.fit(X_tr_scaled, y_tr)

    oof_lr[valid_idx] = lr.predict_proba(X_val_scaled)[:, 1]
    test_lr += lr.predict_proba(X_test_scaled)[:, 1] / NFOLDS

    print("LR fold AUC:", roc_auc_score(y_val, oof_lr[valid_idx]))

    # ---------------------------------------
    # 2. LightGBM (corrected early stopping)
    # ---------------------------------------
    lgbm = lgb.LGBMClassifier(**lgb_params)

    # use callbacks instead of early_stopping_rounds
    lgbm.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

    oof_lgb[valid_idx] = lgbm.predict_proba(X_val)[:, 1]
    test_lgb += lgbm.predict_proba(X_test)[:, 1] / NFOLDS

    print("LGB fold AUC:", roc_auc_score(y_val, oof_lgb[valid_idx]))

    # ---------------------------------------
    # 3. XGBoost (works normally)
    # ---------------------------------------
    xgb = XGBClassifier(**xgb_params)

    xgb.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        verbose=False
    )

    oof_xgb[valid_idx] = xgb.predict_proba(X_val)[:, 1]
    test_xgb += xgb.predict_proba(X_test)[:, 1] / NFOLDS

    print("XGB fold AUC:", roc_auc_score(y_val, oof_xgb[valid_idx]))

    # ---------------------------------------
    # 4. CatBoost (correct syntax)
    # ---------------------------------------
    cat = CatBoostClassifier(**cat_model_template.get_params())

    cat.fit(
        X_tr, y_tr,
        eval_set=(X_val, y_val),
        early_stopping_rounds=50,
        verbose=False
    )

    oof_cat[valid_idx] = cat.predict_proba(X_val)[:, 1]
    test_cat += cat.predict_proba(X_test)[:, 1] / NFOLDS

    print("CAT fold AUC:", roc_auc_score(y_val, oof_cat[valid_idx]))



=== FOLD 1 ===
LR fold AUC: 0.7443443241116541
LGB fold AUC: 0.7475389055177383
XGB fold AUC: 0.7487547392617031
CAT fold AUC: 0.7513943416675826

=== FOLD 2 ===
LR fold AUC: 0.7511436148207896
LGB fold AUC: 0.7552423449121577
XGB fold AUC: 0.7558598743929901
CAT fold AUC: 0.7586210261630262

=== FOLD 3 ===
LR fold AUC: 0.7479316950608914
LGB fold AUC: 0.7538761337202173
XGB fold AUC: 0.7549067980083254
CAT fold AUC: 0.755196299569129

=== FOLD 4 ===
LR fold AUC: 0.7459982057085267
LGB fold AUC: 0.7526599672017501
XGB fold AUC: 0.7531485912190403
CAT fold AUC: 0.7552943890524918

=== FOLD 5 ===
LR fold AUC: 0.7426032675470849
LGB fold AUC: 0.7478347727655505
XGB fold AUC: 0.7483161072869786
CAT fold AUC: 0.7514457904480737


In [8]:
# Cell 8: OOF AUC scores
print("OOF LR AUC :", roc_auc_score(y, oof_lr))
print("OOF LGB AUC:", roc_auc_score(y, oof_lgb))
print("OOF XGB AUC:", roc_auc_score(y, oof_xgb))
print("OOF CAT AUC:", roc_auc_score(y, oof_cat))


OOF LR AUC : 0.7463822888109548
OOF LGB AUC: 0.7513675497582692
OOF XGB AUC: 0.752176948864755
OOF CAT AUC: 0.754361586419632


In [9]:
# Cell 9: stacking datasets
stack_train = pd.DataFrame({
    "lr": oof_lr,
    "lgb": oof_lgb,
    "xgb": oof_xgb,
    "cat": oof_cat
})

stack_test = pd.DataFrame({
    "lr": test_lr,
    "lgb": test_lgb,
    "xgb": test_xgb,
    "cat": test_cat
})

print("stack_train shape:", stack_train.shape)
print("stack_test shape :", stack_test.shape)
stack_train.head()


stack_train shape: (204277, 4)
stack_test shape : (51070, 4)


Unnamed: 0,lr,lgb,xgb,cat
0,0.710151,0.1918,0.233761,0.282925
1,0.605584,0.142981,0.116648,0.118602
2,0.798523,0.30077,0.304511,0.304084
3,0.391644,0.054863,0.090129,0.058845
4,0.466101,0.146956,0.124037,0.144942


In [10]:
# Cell 10: meta-model training
meta = LogisticRegression(max_iter=2000, solver="lbfgs", random_state=42)
meta.fit(stack_train, y)

meta_oof = meta.predict_proba(stack_train)[:, 1]
print("META OOF AUC:", roc_auc_score(y, meta_oof))


META OOF AUC: 0.7526451521732984


In [11]:
# Cell 11: meta predict on test, then threshold to 0/1
final_probs = meta.predict_proba(stack_test)[:, 1]
final_preds = (final_probs >= 0.5).astype(int)

# Quick sanity
print("Final probs sample:", final_probs[:6])
print("Final preds sample:", final_preds[:6])


Final probs sample: [0.03785287 0.04712548 0.07432494 0.09492885 0.12076972 0.1415704 ]
Final preds sample: [0 0 0 0 0 0]


In [12]:
# Cell 12: make submission (0/1)
submission = sample.copy()
submission["RiskFlag"] = final_preds
OUT_PATH = "logreg_lgb_xgb_cat_stacked.csv"
submission.to_csv(OUT_PATH, index=False)
OUT_PATH


'logreg_lgb_xgb_cat_stacked.csv'