In [1]:
# ===============================================================
# Cell 1: Imports
# ===============================================================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from sklearn.cluster import MiniBatchKMeans
from sklearn.mixture import GaussianMixture

from sklearn.linear_model import LogisticRegression

import lightgbm as lgb
from xgboost import XGBClassifier, callback as xgb_callback
from catboost import CatBoostClassifier

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks, regularizers

print("Imports done.")


2025-11-23 11:47:11.918770: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763898432.171304      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763898432.241585      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Imports done.


In [2]:
# ===============================================================
# Cell 2: Load data
# ===============================================================
TRAIN_PATH = "/kaggle/input/processed-financial-risk/train_processed.csv"
TEST_PATH  = "/kaggle/input/processed-financial-risk/test_processed.csv"
SAMPLE_PATH = "/kaggle/input/financial-risk-profiling/sample_submission_updated.csv"

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample = pd.read_csv(SAMPLE_PATH)

IDCOL = "ProfileID"
TARGET = "RiskFlag"

print("Train shape:", train.shape)
print("Test shape: ", test.shape)

test_ids = test[IDCOL].copy()

X_full = train.drop(columns=[IDCOL, TARGET]).copy()
y_full = train[TARGET].astype(int).copy()
X_test_full = test.drop(columns=[IDCOL]).copy()

print("Feature matrix shapes:", X_full.shape, X_test_full.shape)


Train shape: (204277, 27)
Test shape:  (51070, 26)
Feature matrix shapes: (204277, 25) (51070, 25)


In [3]:
# ===============================================================
# Cell 3: Clustering Features
# ===============================================================
print("Adding clustering features...")

X = X_full.copy()
X_test = X_test_full.copy()

# MiniBatchKMeans
K = 12
kmeans = MiniBatchKMeans(n_clusters=K, batch_size=4096, random_state=42)
kmeans.fit(X)
X["kmeans_label"] = kmeans.labels_
X_test["kmeans_label"] = kmeans.predict(X_test)

# GaussianMixture (diag)
G = 6
gmm = GaussianMixture(n_components=G, covariance_type="diag", random_state=42)
gmm.fit(X)

gmm_train_proba = gmm.predict_proba(X)
gmm_test_proba  = gmm.predict_proba(X_test)

for i in range(G):
    X[f"gmm_prob_{i}"] = gmm_train_proba[:, i]
    X_test[f"gmm_prob_{i}"] = gmm_test_proba[:, i]

print("Added clustering features. New shapes:", X.shape, X_test.shape)


Adding clustering features...
Added clustering features. New shapes: (204277, 32) (51070, 32)


In [4]:
# ===============================================================
# Cell 4: 20% Dataset for NN
# ===============================================================
RND = 42
frac = 0.20

nn_sample = train.sample(frac=frac, random_state=RND).reset_index(drop=True)
print("NN sample shape (20%):", nn_sample.shape)

nn_ids = nn_sample[IDCOL].tolist()
X_nn = X.loc[train[IDCOL].isin(nn_ids)].reset_index(drop=True)
y_nn = train.loc[train[IDCOL].isin(nn_ids), TARGET].reset_index(drop=True)

X_nn_train, X_nn_val, y_nn_train, y_nn_val = train_test_split(
    X_nn, y_nn, test_size=0.2, random_state=RND, stratify=y_nn
)

scaler_nn = StandardScaler()
X_nn_train_s = scaler_nn.fit_transform(X_nn_train)
X_nn_val_s   = scaler_nn.transform(X_nn_val)
X_test_s_nn  = scaler_nn.transform(X_test)

print("NN shapes:", X_nn_train_s.shape, X_nn_val_s.shape, X_test_s_nn.shape)


NN sample shape (20%): (40855, 27)
NN shapes: (32684, 32) (8171, 32) (51070, 32)


In [5]:
# ===============================================================
# Cell 5: NN Training (20%)
# ===============================================================
def build_nn(input_dim, lr=1e-3, l2=1e-5, dropout=0.3):
    inputs = layers.Input(shape=(input_dim,))
    x = layers.BatchNormalization()(inputs)
    x = layers.Dense(256, activation="relu", kernel_regularizer=regularizers.l2(l2))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(128, activation="relu", kernel_regularizer=regularizers.l2(l2))(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(dropout*0.5)(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(dropout*0.3)(x)
    out = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs=inputs, outputs=out)
    model.compile(
        optimizer=keras.optimizers.Adam(lr),
        loss="binary_crossentropy",
        metrics=[keras.metrics.AUC(name="auc")]
    )
    return model

nn_model = build_nn(X_nn_train_s.shape[1])

early = callbacks.EarlyStopping(monitor="val_auc", mode="max",
                                patience=6, restore_best_weights=True)
reduce_lr = callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max",
                                        factor=0.5, patience=3)

history = nn_model.fit(
    X_nn_train_s, y_nn_train,
    validation_data=(X_nn_val_s, y_nn_val),
    epochs=50,
    batch_size=1024,
    callbacks=[early, reduce_lr],
    verbose=2
)

nn_val_probs = nn_model.predict(X_nn_val_s, batch_size=2048).ravel()
nn_val_auc = roc_auc_score(y_nn_val, nn_val_probs)
print("NN validation AUC:", nn_val_auc)

nn_test_probs = nn_model.predict(X_test_s_nn, batch_size=2048).ravel()


2025-11-23 11:47:49.239436: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/50
32/32 - 5s - 161ms/step - auc: 0.5891 - loss: 0.7318 - val_auc: 0.6480 - val_loss: 0.5791 - learning_rate: 0.0010
Epoch 2/50
32/32 - 1s - 25ms/step - auc: 0.6195 - loss: 0.5754 - val_auc: 0.6850 - val_loss: 0.4868 - learning_rate: 0.0010
Epoch 3/50
32/32 - 1s - 39ms/step - auc: 0.6510 - loss: 0.4828 - val_auc: 0.6920 - val_loss: 0.4178 - learning_rate: 0.0010
Epoch 4/50
32/32 - 1s - 24ms/step - auc: 0.6650 - loss: 0.4184 - val_auc: 0.6929 - val_loss: 0.3704 - learning_rate: 0.0010
Epoch 5/50
32/32 - 1s - 24ms/step - auc: 0.6796 - loss: 0.3754 - val_auc: 0.7083 - val_loss: 0.3436 - learning_rate: 0.0010
Epoch 6/50
32/32 - 1s - 24ms/step - auc: 0.6990 - loss: 0.3513 - val_auc: 0.7136 - val_loss: 0.3334 - learning_rate: 0.0010
Epoch 7/50
32/32 - 1s - 24ms/step - auc: 0.7071 - loss: 0.3415 - val_auc: 0.7186 - val_loss: 0.3308 - learning_rate: 0.0010
Epoch 8/50
32/32 - 1s - 24ms/step - auc: 0.7226 - loss: 0.3340 - val_auc: 0.7224 - val_loss: 0.3289 - learning_rate: 0.0010
Epoch 9

In [6]:
# ===============================================================
# Cell 6: OOF for Boosters + Logistic Regression
# ===============================================================
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=RND)

# storage
oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))
oof_lr  = np.zeros(len(X))

test_lgb = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))
test_cat = np.zeros(len(X_test))
test_lr  = np.zeros(len(X_test))

# LR parameters
lr_model = LogisticRegression(
    solver="lbfgs",
    max_iter=2000,
    n_jobs=-1
)

lgb_params = dict(
    n_estimators=500, learning_rate=0.05,
    num_leaves=64, subsample=0.8,
    colsample_bytree=0.8, random_state=42
)

xgb_params = dict(
    n_estimators=500, learning_rate=0.05,
    max_depth=6, subsample=0.8, colsample_bytree=0.8,
    use_label_encoder=False, eval_metric="auc", random_state=42
)

cat_params = dict(
    iterations=500, learning_rate=0.05,
    depth=6, random_seed=42, verbose=0
)


for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y_full), 1):
    print(f"\n--- FOLD {fold} ---")
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y_full.iloc[tr_idx], y_full.iloc[val_idx]

    # LightGBM
    lgbm = lgb.LGBMClassifier(**lgb_params)
    lgbm.fit(X_tr, y_tr,
             eval_set=[(X_val, y_val)],
             callbacks=[lgb.early_stopping(50, verbose=False)])
    oof_lgb[val_idx] = lgbm.predict_proba(X_val)[:,1]
    test_lgb += lgbm.predict_proba(X_test)[:,1] / NFOLDS
    print("LGB AUC:", roc_auc_score(y_val, oof_lgb[val_idx]))

    # XGB
    xgbm = XGBClassifier(**xgb_params)
    xgbm.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[xgb_callback.EarlyStopping(rounds=50, save_best=True, maximize=True)]
    )
    oof_xgb[val_idx] = xgbm.predict_proba(X_val)[:,1]
    test_xgb += xgbm.predict_proba(X_test)[:,1] / NFOLDS
    print("XGB AUC:", roc_auc_score(y_val, oof_xgb[val_idx]))

    # CatBoost
    cat = CatBoostClassifier(**cat_params)
    cat.fit(X_tr, y_tr, eval_set=(X_val, y_val), verbose=False)
    oof_cat[val_idx] = cat.predict_proba(X_val)[:,1]
    test_cat += cat.predict_proba(X_test)[:,1] / NFOLDS
    print("CAT AUC:", roc_auc_score(y_val, oof_cat[val_idx]))

    # Logistic Regression
    lr_model.fit(X_tr, y_tr)
    oof_lr[val_idx] = lr_model.predict_proba(X_val)[:,1]
    test_lr += lr_model.predict_proba(X_test)[:,1] / NFOLDS
    print("LR AUC:", roc_auc_score(y_val, oof_lr[val_idx]))



--- FOLD 1 ---
[LightGBM] [Info] Number of positive: 18970, number of negative: 144451
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008578 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3113
[LightGBM] [Info] Number of data points in the train set: 163421, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.116081 -> initscore=-2.030082
[LightGBM] [Info] Start training from score -2.030082
LGB AUC: 0.7472727135839764
[0]	validation_0-auc:0.69790
[1]	validation_0-auc:0.70220
[2]	validation_0-auc:0.71922
[3]	validation_0-auc:0.72150
[4]	validation_0-auc:0.72426
[5]	validation_0-auc:0.72635
[6]	validation_0-auc:0.72718
[7]	validation_0-auc:0.72725
[8]	validation_0-auc:0.72739
[9]	validation_0-auc:0.72729
[10]	validation_0-auc:0.72857
[11]	validation_0-auc:0.72849
[12]	validation_0-auc:0.72945
[13]	validation_

In [7]:
# ===============================================================
# Cell 7: OOF AUC Summary
# ===============================================================
print("OOF LGB:", roc_auc_score(y_full, oof_lgb))
print("OOF XGB:", roc_auc_score(y_full, oof_xgb))
print("OOF CAT:", roc_auc_score(y_full, oof_cat))
print("OOF LR :", roc_auc_score(y_full, oof_lr))
print("NN validation AUC:", nn_val_auc)


OOF LGB: 0.7513913338812572
OOF XGB: 0.752254797872534
OOF CAT: 0.7542116318922092
OOF LR : 0.5043915580619966
NN validation AUC: 0.727500355601366


In [8]:
# ===============================================================
# Cell 8: Compute Blend Weights
# ===============================================================
eps = 1e-6
auc_lgb = roc_auc_score(y_full, oof_lgb) + eps
auc_xgb = roc_auc_score(y_full, oof_xgb) + eps
auc_cat = roc_auc_score(y_full, oof_cat) + eps
auc_lr  = roc_auc_score(y_full, oof_lr)  + eps
auc_nn  = nn_val_auc + eps

raw = np.array([auc_lgb, auc_xgb, auc_cat, auc_lr, auc_nn])
weights = raw / raw.sum()

print("Model AUCs:", raw)
print("Blend Weights (LGB, XGB, CAT, LR, NN):", weights)

test_blend = (
    weights[0]*test_lgb +
    weights[1]*test_xgb +
    weights[2]*test_cat +
    weights[3]*test_lr +
    weights[4]*nn_test_probs
)


Model AUCs: [0.75139233 0.7522558  0.75421263 0.50439256 0.72750136]
Blend Weights (LGB, XGB, CAT, LR, NN): [0.2153138  0.21556123 0.21612196 0.14453525 0.20846776]


In [9]:
# ===============================================================
# Cell 9: Submission
# ===============================================================
final_probs = test_blend
final_preds = (final_probs >= 0.5).astype(int)

submission = pd.DataFrame({
    "ProfileID": test_ids,
    "RiskFlag": final_preds
})

OUT_PATH = "blend_lgb_xgb_cat_lr_nn.csv"
submission.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)

print("Final positive predictions:", final_preds.sum())


Saved: blend_lgb_xgb_cat_lr_nn.csv
Final positive predictions: 438
