In [1]:
# Cell 1: imports
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")


In [2]:
# Cell 2: load processed data
TRAIN_PATH = "/kaggle/input/processed-financial-risk/train_processed.csv"
TEST_PATH  = "/kaggle/input/processed-financial-risk/test_processed.csv"
SAMPLE_PATH = "/kaggle/input/financial-risk-profiling/sample_submission_updated.csv"

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample = pd.read_csv(SAMPLE_PATH)

print("Train shape:", train.shape)
print("Test shape :", test.shape)


Train shape: (204277, 27)
Test shape : (51070, 26)


In [3]:
# Cell 3: prepare X, y, X_test (drop ProfileID)
IDCOL = "ProfileID"
TARGET = "RiskFlag"

X = train.drop(columns=[IDCOL, TARGET])
y = train[TARGET].astype(int)

X_test = test.drop(columns=[IDCOL])
test_ids = test[IDCOL].copy()

print("X shape:", X.shape, "y shape:", y.shape, "X_test shape:", X_test.shape)


X shape: (204277, 25) y shape: (204277,) X_test shape: (51070, 25)


In [4]:
# ===============================
# Cell A: Add Safe Clustering Features
# ===============================

from sklearn.cluster import MiniBatchKMeans

X_cluster = X.copy()
X_test_cluster = X_test.copy()

# Use MiniBatchKMeans instead of KMeans
K = 12
kmeans = MiniBatchKMeans(n_clusters=K, batch_size=4096, random_state=42)
kmeans.fit(X)

# Add only cluster labels (low memory)
X_cluster["cluster_label"] = kmeans.labels_
X_test_cluster["cluster_label"] = kmeans.predict(X_test)

print("Cluster labels added safely!")
print("New train shape:", X_cluster.shape)
print("New test shape:", X_test_cluster.shape)


Cluster labels added safely!
New train shape: (204277, 26)
New test shape: (51070, 26)


In [5]:
# ===============================
# Cell B: Replace X & X_test with clustered versions
# ===============================

X = X_cluster
X_test = X_test_cluster

print("New X shape:", X.shape)
print("New X_test shape:", X_test.shape)


New X shape: (204277, 26)
New X_test shape: (51070, 26)


In [6]:
# Cell 4: stacking containers
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

oof_lr = np.zeros(len(X))
oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))

test_lr = np.zeros(len(X_test))
test_lgb = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))
test_cat = np.zeros(len(X_test))


In [7]:
# Cell 5: base models definitions

# Logistic Regression (base)
lr_model_template = LogisticRegression(
    max_iter=2000,
    solver="lbfgs",
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

# LightGBM
lgb_params = dict(
    n_estimators=800,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)
lgb_model_template = lgb.LGBMClassifier(**lgb_params)

# XGBoost
xgb_params = dict(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)
xgb_model_template = XGBClassifier(**xgb_params)

# CatBoost
cat_model_template = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=0
)


In [8]:
# Cell 6: helper - scaler to use only for logistic regression (not for tree models)
scaler = RobustScaler()


In [9]:
# ===============================
# Cell 7: OOF Stacking (LR + LGB + XGB + Cat + PyTorch MLP)
# ===============================

import numpy as np
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim

oof_lr  = np.zeros(len(X))
oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))
oof_mlp = np.zeros(len(X))

test_lr  = np.zeros(len(X_test))
test_lgb = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))
test_cat = np.zeros(len(X_test))
test_mlp = np.zeros(len(X_test))

# ----------------------
# PyTorch MLP definition
# ----------------------
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

def train_mlp(X_tr, y_tr, X_val):
    X_tr = torch.tensor(X_tr.values, dtype=torch.float32)
    y_tr = torch.tensor(y_tr.values.reshape(-1, 1), dtype=torch.float32)
    X_val = torch.tensor(X_val.values, dtype=torch.float32)

    model = MLP(X_tr.shape[1])
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.BCELoss()

    for epoch in range(15):
        optimizer.zero_grad()
        preds = model(X_tr)
        loss = loss_fn(preds, y_tr)
        loss.backward()
        optimizer.step()

    with torch.no_grad():
        val_preds = model(X_val).numpy().flatten()

    return model, val_preds

# ----------------------
# K-Fold Loop
# ----------------------
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), 1):
    print(f"\n=== FOLD {fold} ===")

    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    # ------- Logistic Regression -------
    X_tr_s = scaler.fit_transform(X_tr)
    X_val_s = scaler.transform(X_val)
    lr = LogisticRegression(**lr_model_template.get_params())
    lr.fit(X_tr_s, y_tr)

    oof_lr[valid_idx] = lr.predict_proba(X_val_s)[:, 1]
    test_lr += lr.predict_proba(scaler.transform(X_test))[:, 1] / NFOLDS

    print("LR AUC:", roc_auc_score(y_val, oof_lr[valid_idx]))

    # ------- LightGBM -------
    lgbm = lgb.LGBMClassifier(**lgb_params)
    lgbm.fit(X_tr, y_tr)

    oof_lgb[valid_idx] = lgbm.predict_proba(X_val)[:, 1]
    test_lgb += lgbm.predict_proba(X_test)[:, 1] / NFOLDS

    print("LGB AUC:", roc_auc_score(y_val, oof_lgb[valid_idx]))

    # ------- XGBoost -------
    xgb = XGBClassifier(**xgb_params)
    xgb.fit(X_tr, y_tr)

    oof_xgb[valid_idx] = xgb.predict_proba(X_val)[:, 1]
    test_xgb += xgb.predict_proba(X_test)[:, 1] / NFOLDS

    print("XGB AUC:", roc_auc_score(y_val, oof_xgb[valid_idx]))

    # ------- CatBoost -------
    cat = CatBoostClassifier(**cat_model_template.get_params())
    cat.fit(X_tr, y_tr, verbose=False)

    oof_cat[valid_idx] = cat.predict_proba(X_val)[:, 1]
    test_cat += cat.predict_proba(X_test)[:, 1] / NFOLDS

    print("CAT AUC:", roc_auc_score(y_val, oof_cat[valid_idx]))

    # ------- PyTorch MLP -------
    mlp_model, val_mlp = train_mlp(X_tr, y_tr, X_val)
    oof_mlp[valid_idx] = val_mlp

    with torch.no_grad():
        test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
        test_mlp += mlp_model(test_tensor).numpy().flatten() / NFOLDS

    print("MLP AUC:", roc_auc_score(y_val, oof_mlp[valid_idx]))



=== FOLD 1 ===
LR AUC: 0.7443546349340318
LGB AUC: 0.7404007562220264
XGB AUC: 0.742687057922404
CAT AUC: 0.7492672810266917
MLP AUC: 0.4999144821898916

=== FOLD 2 ===
LR AUC: 0.7511320461009093
LGB AUC: 0.7460351141400655
XGB AUC: 0.7460942991252144
CAT AUC: 0.7558973570454017
MLP AUC: 0.5001922135486111

=== FOLD 3 ===
LR AUC: 0.7479313217438136
LGB AUC: 0.7472323205139905
XGB AUC: 0.7452407468170812
CAT AUC: 0.7541278310938082
MLP AUC: 0.5007726671888283

=== FOLD 4 ===
LR AUC: 0.7460048410916049
LGB AUC: 0.7437238597073791
XGB AUC: 0.7427711928834206
CAT AUC: 0.7531678682011355
MLP AUC: 0.5005620501527769

=== FOLD 5 ===
LR AUC: 0.7426054807136085
LGB AUC: 0.7399036481895369
XGB AUC: 0.7388392785974078
CAT AUC: 0.7488934897318358
MLP AUC: 0.49943879294108023


In [10]:
# Cell 8: OOF AUC scores
print("OOF LR AUC :", roc_auc_score(y, oof_lr))
print("OOF LGB AUC:", roc_auc_score(y, oof_lgb))
print("OOF XGB AUC:", roc_auc_score(y, oof_xgb))
print("OOF CAT AUC:", roc_auc_score(y, oof_cat))


OOF LR AUC : 0.7463827619934068
OOF LGB AUC: 0.7434312856332332
OOF XGB AUC: 0.7430965556904268
OOF CAT AUC: 0.7522499319524455


In [11]:
# Cell 9: stacking datasets
stack_train = pd.DataFrame({
    "lr": oof_lr,
    "lgb": oof_lgb,
    "xgb": oof_xgb,
    "cat": oof_cat
})

stack_test = pd.DataFrame({
    "lr": test_lr,
    "lgb": test_lgb,
    "xgb": test_xgb,
    "cat": test_cat
})

print("stack_train shape:", stack_train.shape)
print("stack_test shape :", stack_test.shape)
stack_train.head()


stack_train shape: (204277, 4)
stack_test shape : (51070, 4)


Unnamed: 0,lr,lgb,xgb,cat
0,0.712108,0.106217,0.134024,0.101285
1,0.607268,0.073963,0.082985,0.066967
2,0.798746,0.266051,0.200388,0.201732
3,0.393385,0.092593,0.07005,0.126933
4,0.467192,0.115299,0.201585,0.232701


In [12]:
# Cell 10: meta-model training
meta = LogisticRegression(max_iter=2000, solver="lbfgs", random_state=42)
meta.fit(stack_train, y)

meta_oof = meta.predict_proba(stack_train)[:, 1]
print("META OOF AUC:", roc_auc_score(y, meta_oof))


META OOF AUC: 0.7517334496914092


In [13]:
# Cell 11: meta predict on test, then threshold to 0/1
final_probs = meta.predict_proba(stack_test)[:, 1]
final_preds = (final_probs >= 0.5).astype(int)

# Quick sanity
print("Final probs sample:", final_probs[:6])
print("Final preds sample:", final_preds[:6])


Final probs sample: [0.0372279  0.0462225  0.07584912 0.08948229 0.12274721 0.1585751 ]
Final preds sample: [0 0 0 0 0 0]


In [14]:
# Cell 12: make submission (0/1)
submission = sample.copy()
submission["RiskFlag"] = final_preds
OUT_PATH = "logreg_nn_lgb_xgb_cat_stacked.csv"
submission.to_csv(OUT_PATH, index=False)
OUT_PATH


'logreg_nn_lgb_xgb_cat_stacked.csv'