In [1]:
# =============================
# 1. IMPORTS & CONFIG
# =============================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.calibration import CalibratedClassifierCV

import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

pd.set_option('display.max_columns', 200)


In [2]:
# =============================
# 2. FILES & LOAD
# =============================
TRAIN_PATH = "/kaggle/input/processed-financial-risk/train_processed.csv"
TEST_PATH  = "/kaggle/input/processed-financial-risk/test_processed.csv"
SAMPLE_SUB = "/kaggle/input/financial-risk-profiling/sample_submission_updated.csv"

for p in [TRAIN_PATH, TEST_PATH, SAMPLE_SUB]:
    assert os.path.exists(p), f"{p} not found"

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample = pd.read_csv(SAMPLE_SUB)

print(train.shape, test.shape, sample.shape)
display(train.head(3))
display(test.head(3))


(204277, 27) (51070, 26) (51070, 2)


Unnamed: 0,ProfileID,orig_index,ApplicantYears,AnnualEarnings,RequestedSum,TrustMetric,WorkDuration,ActiveAccounts,OfferRate,RepayPeriod,DebtFactor,QualificationLevel_High School,QualificationLevel_Master's,QualificationLevel_PhD,WorkCategory_Part-time,WorkCategory_Self-employed,WorkCategory_Unemployed,RelationshipStatus_Married,RelationshipStatus_Single,OwnsProperty_Yes,FamilyObligation_Yes,FundUseCase_Business,FundUseCase_Education,FundUseCase_Home,FundUseCase_Other,JointApplicant_Yes,RiskFlag
0,DRIRC89L0T,0,-1.699838,1.413785,1.151487,1.711544,-0.967182,-0.44953,-0.454811,1.41572,1.339989,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0
1,TS0FIUNHNU,1,0.23412,-0.649831,-1.715866,1.094714,-0.851727,-0.44953,0.939092,-0.000645,0.993538,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
2,I0YR284A1V,2,-1.166333,0.04677,-0.458437,-0.762072,-1.515594,-0.44953,1.621727,-1.41701,-0.219039,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0


Unnamed: 0,ProfileID,orig_index,ApplicantYears,AnnualEarnings,RequestedSum,TrustMetric,WorkDuration,ActiveAccounts,OfferRate,RepayPeriod,DebtFactor,QualificationLevel_High School,QualificationLevel_Master's,QualificationLevel_PhD,WorkCategory_Part-time,WorkCategory_Self-employed,WorkCategory_Unemployed,RelationshipStatus_Married,RelationshipStatus_Single,OwnsProperty_Yes,FamilyObligation_Yes,FundUseCase_Business,FundUseCase_Education,FundUseCase_Home,FundUseCase_Other,JointApplicant_Yes
0,CKV34LU7V7,0,0.767625,0.774024,-0.496148,0.043584,1.543966,-0.44953,1.514736,-0.000645,-1.518228,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1,62KTYNH93J,1,0.834313,0.232665,0.056842,0.421235,-0.158996,-1.344869,0.256456,-1.41701,-0.305651,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
2,JGFUSOIUH7,2,-1.166333,-0.111348,-0.735736,-0.031946,1.313056,0.445809,0.682915,-1.41701,-0.91194,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0


In [3]:
# =============================
# 3. BASIC CHECKS
# =============================
TARGET = "RiskFlag"
IDCOL = "ProfileID"

print(train[TARGET].value_counts())
print("orig_index in train?", "orig_index" in train.columns)
print("orig_index in test ?", "orig_index" in test.columns)


RiskFlag
0    180524
1     23753
Name: count, dtype: int64
orig_index in train? True
orig_index in test ? True


In [4]:
# =============================
# 4. BUILD X, y
# =============================
drop_cols = [IDCOL, "orig_index"] if "orig_index" in train.columns else [IDCOL]

X = train.drop(columns=drop_cols + [TARGET])
y = train[TARGET].values

X_test = test.drop(columns=drop_cols)

print("X:", X.shape)
print("X_test:", X_test.shape)


X: (204277, 24)
X_test: (51070, 24)


In [5]:
# =============================
# 5. FEATURE ENGINEERING
# =============================
def add_features(df):
    df["req_over_annual"]         = df["RequestedSum"] / (df["AnnualEarnings"] + 1e-9)
    df["req_over_annual_plus1"]   = df["RequestedSum"] / (df["AnnualEarnings"] + 1)
    df["offer_times_debt"]        = df["OfferRate"] * df["DebtFactor"]
    df["trust_over_workdur"]      = df["TrustMetric"] / (df["WorkDuration"] + 1e-9)

    df["appyears_bin"] = pd.qcut(df["ApplicantYears"].rank(method="first"), q=6, labels=False, duplicates="drop")
    df["annual_bin"]   = pd.qcut(df["AnnualEarnings"].rank(method="first"), q=6, labels=False, duplicates="drop")
    return df

X      = add_features(X)
X_test = add_features(X_test)

numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric columns:", len(numeric_cols))


Numeric columns: 30


In [6]:
# =============================
# 6. SAFE K-FOLD TARGET ENCODING
# =============================
cat_groups = ["QualificationLevel", "WorkCategory", "FundUseCase"]

# Detect OHE columns for each group
group_columns = {
    g: [c for c in X.columns if c.startswith(g + "_")]
    for g in cat_groups
}
group_columns = {g: cols for g, cols in group_columns.items() if len(cols) > 0}

print("Detected groups:", group_columns)


def kfold_target_encode_safe(X, X_test, y, group_cols_map):
    X = X.copy()
    X_test = X_test.copy()
    KF = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    global_mean = y.mean()

    for group, cols in group_cols_map.items():
        print(f"Processing TE for:", group)

        # reconstruct category
        X[group+"_recon"]      = X[cols].idxmax(axis=1).str.replace(group + "_", "")
        X_test[group+"_recon"] = X_test[cols].idxmax(axis=1).str.replace(group + "_", "")

        oof_te = np.zeros(len(X))

        for tr_idx, val_idx in KF.split(X, y):
            tr_slice = X.iloc[tr_idx].copy()
            val_slice = X.iloc[val_idx].copy()

            # attach target into dataframe → prevents KeyError
            tr_slice["__target__"] = y[tr_idx]

            # mapping category → mean
            mapping = tr_slice.groupby(group+"_recon")["__target__"].mean()

            oof_te[val_idx] = val_slice[group+"_recon"].map(mapping).fillna(global_mean)

        # final mapping using full training set
        temp = X.copy()
        temp["__target__"] = y
        final_map = temp.groupby(group+"_recon")["__target__"].mean()

        test_te = X_test[group+"_recon"].map(final_map).fillna(global_mean)

        # add TE columns
        X[group+"_te"]      = oof_te
        X_test[group+"_te"] = test_te

    return X, X_test


# RUN TE
X, X_test = kfold_target_encode_safe(X, X_test, y, group_columns)

print("X shape after TE:", X.shape)
print("X_test shape after TE:", X_test.shape)


Detected groups: {'QualificationLevel': ['QualificationLevel_High School', "QualificationLevel_Master's", 'QualificationLevel_PhD'], 'WorkCategory': ['WorkCategory_Part-time', 'WorkCategory_Self-employed', 'WorkCategory_Unemployed'], 'FundUseCase': ['FundUseCase_Business', 'FundUseCase_Education', 'FundUseCase_Home', 'FundUseCase_Other']}
Processing TE for: QualificationLevel
Processing TE for: WorkCategory
Processing TE for: FundUseCase
X shape after TE: (204277, 36)
X_test shape after TE: (51070, 36)


In [7]:
# =============================
# 6B. DROP _recon COLUMNS (critical for LightGBM)
# =============================
recon_cols = [c for c in X.columns if c.endswith("_recon")]
print("Dropping recon columns:", recon_cols)

X = X.drop(columns=recon_cols)
X_test = X_test.drop(columns=recon_cols)


Dropping recon columns: ['QualificationLevel_recon', 'WorkCategory_recon', 'FundUseCase_recon']


In [8]:
# =============================
# 7. FINAL FEATURES & SCALING
# =============================
FEATURES = X.columns.tolist()
X = X[FEATURES].copy()
X_test = X_test[FEATURES].copy()

scaler = StandardScaler()
X_scaled = X.copy()
X_test_scaled = X_test.copy()

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
X_scaled[num_cols] = scaler.fit_transform(X_scaled[num_cols])
X_test_scaled[num_cols] = scaler.transform(X_test_scaled[num_cols])

print("Final feature count:", len(FEATURES))


Final feature count: 33


In [9]:
# =============================
# 8. CV SETUP
# =============================
SEED = 42
NFOLD = 5

skf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=SEED)

oof_lgb = np.zeros(len(X))
preds_lgb = np.zeros(len(X_test))

oof_cb  = np.zeros(len(X))
preds_cb = np.zeros(len(X_test))

feature_importance = pd.DataFrame()
models_lgb = []
models_cb  = []


In [10]:
# =============================
# 8B. DEFINE LIGHTGBM PARAMS   ← ADD THIS
# =============================
lgb_params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "metric": "auc",
    "learning_rate": 0.03,
    "num_leaves": 64,
    "min_data_in_leaf": 60,
    "subsample": 0.8,
    "colsample_bytree": 0.7,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    "seed": SEED,
    "is_unbalance": True,
    "verbose": -1
}


In [11]:
# =============================
# LIGHTGBM TRAINING (K-FOLD) — robust version
# =============================
from sklearn.utils import check_array

# sanity checks
assert "X_scaled" in globals(), "X_scaled not found — run preprocessing cells first."
assert "X_test_scaled" in globals(), "X_test_scaled not found — run preprocessing cells first."
assert X_scaled.shape[1] == X_test_scaled.shape[1], f"Feature count mismatch: train {X_scaled.shape[1]} vs test {X_test_scaled.shape[1]}"

oof_lgb = np.zeros(len(X_scaled))
preds_lgb = np.zeros(len(X_test_scaled))
feature_importance = []   # collect per-fold DataFrames
models_lgb = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_scaled, y), 1):
    print(f"\n--- LGB Fold {fold} ---")
    X_tr, X_val = X_scaled.iloc[tr_idx], X_scaled.iloc[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    # ensure all dtypes numeric (no object columns)
    bad_cols = [c for c in X_tr.columns if X_tr[c].dtype == "object"]
    if bad_cols:
        raise ValueError(f"Found non-numeric columns in X_tr: {bad_cols}. Convert or drop them before training.")

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dvalid = lgb.Dataset(X_val, label=y_val, reference=dtrain)

    clf = lgb.train(
        params=lgb_params,
        train_set=dtrain,
        num_boost_round=3000,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
        callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=200)]
    )

    models_lgb.append(clf)

    # use best_iteration for predict calls
    best_it = clf.best_iteration if hasattr(clf, "best_iteration") else None
    if best_it is None or best_it <= 0:
        # fallback
        pred_val = clf.predict(X_val)
        pred_test = clf.predict(X_test_scaled)
    else:
        pred_val = clf.predict(X_val, num_iteration=best_it)
        pred_test = clf.predict(X_test_scaled, num_iteration=best_it)

    oof_lgb[val_idx] = pred_val
    preds_lgb += pred_test / NFOLD

    # fold metrics
    fold_auc = roc_auc_score(y_val, pred_val)
    print(f"Fold {fold} AUC: {fold_auc:.6f}")

    # importance for the fold
    fold_imp = pd.DataFrame({
        "feature": X_scaled.columns,
        "gain": clf.feature_importance(importance_type="gain"),
        "split": clf.feature_importance(importance_type="split"),
        "fold": fold
    })
    feature_importance.append(fold_imp)

# overall
oof_auc = roc_auc_score(y, oof_lgb)
print("\nOOF AUC (LGB):", oof_auc)

# merge importance
feature_importance = pd.concat(feature_importance, axis=0).reset_index(drop=True)
print("Feature importance collected for", feature_importance['feature'].nunique(), "features.")



--- LGB Fold 1 ---
Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.802144	valid's auc: 0.754735
Early stopping, best iteration is:
[215]	train's auc: 0.804879	valid's auc: 0.754781
Fold 1 AUC: 0.754781

--- LGB Fold 2 ---
Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.803681	valid's auc: 0.746862
Early stopping, best iteration is:
[194]	train's auc: 0.802653	valid's auc: 0.746911
Fold 2 AUC: 0.746911

--- LGB Fold 3 ---
Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.802049	valid's auc: 0.753822
Early stopping, best iteration is:
[169]	train's auc: 0.796267	valid's auc: 0.753963
Fold 3 AUC: 0.753963

--- LGB Fold 4 ---
Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.803592	valid's auc: 0.749247
Early stopping, best iteration is:
[208]	train's auc: 0.805055	valid's auc: 0.749351
Fold 4 AUC: 0.749351

--- LGB Fold 5 ---
Training until validation sc

In [12]:
# =============================
# 10. CATBOOST TRAINING
# =============================
cb_params = {
    "iterations": 2000,
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 3,
    "eval_metric": "AUC",
    "random_seed": SEED,
    "early_stopping_rounds": 150,
    "verbose": 200
}

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"--- CAT Fold {fold} ---")

    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    train_pool = Pool(X_tr, y_tr)
    val_pool   = Pool(X_val, y_val)

    cb = CatBoostClassifier(**cb_params)
    cb.fit(train_pool, eval_set=val_pool, use_best_model=True)

    models_cb.append(cb)

    oof_cb[val_idx] = cb.predict_proba(X_val)[:,1]
    preds_cb += cb.predict_proba(X_test)[:,1] / NFOLD

    print("Fold AUC:", roc_auc_score(y_val, oof_cb[val_idx]))

print("OOF CAT AUC:", roc_auc_score(y, oof_cb))


--- CAT Fold 1 ---
0:	test: 0.7029307	best: 0.7029307 (0)	total: 86.2ms	remaining: 2m 52s
200:	test: 0.7552208	best: 0.7552208 (200)	total: 5.29s	remaining: 47.4s
400:	test: 0.7575336	best: 0.7575336 (400)	total: 10.5s	remaining: 41.8s
600:	test: 0.7577382	best: 0.7577590 (585)	total: 15.7s	remaining: 36.5s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.7577590106
bestIteration = 585

Shrink model to first 586 iterations.
Fold AUC: 0.757759010552112
--- CAT Fold 2 ---
0:	test: 0.6949064	best: 0.6949064 (0)	total: 32ms	remaining: 1m 3s
200:	test: 0.7474210	best: 0.7474210 (200)	total: 5.28s	remaining: 47.2s
400:	test: 0.7493072	best: 0.7493169 (398)	total: 10.5s	remaining: 41.8s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.7495081568
bestIteration = 441

Shrink model to first 442 iterations.
Fold AUC: 0.7495081568116287
--- CAT Fold 3 ---
0:	test: 0.7044255	best: 0.7044255 (0)	total: 32.1ms	remaining: 1m 4s
200:	test: 0.7558903	best: 0.75589

In [13]:
# =============================
# 11. DIAGNOSTICS
# =============================
print("LGB AUC:", roc_auc_score(y, oof_lgb))
print("CAT AUC:", roc_auc_score(y, oof_cb))


LGB AUC: 0.7513008412937727
CAT AUC: 0.7542709112289636


In [14]:
# =============================
# 12. FEATURE IMPORTANCE
# =============================
imp_summary = feature_importance.groupby("feature")[["gain","split"]].mean().sort_values("gain", ascending=False)
display(imp_summary.head(30))


Unnamed: 0_level_0,gain,split
feature,Unnamed: 1_level_1,Unnamed: 2_level_1
OfferRate,226317.651962,1311.0
ApplicantYears,223924.568099,912.0
AnnualEarnings,165605.732113,1196.8
WorkDuration,135904.222294,1124.2
appyears_bin,113658.877778,200.0
RequestedSum,92750.861369,1145.0
req_over_annual,49556.059662,820.0
TrustMetric,46176.669527,941.0
req_over_annual_plus1,44828.203537,647.4
trust_over_workdur,27300.566448,639.4


In [15]:
# =============================
# 13. ENSEMBLE
# =============================
auc_lgb = roc_auc_score(y, oof_lgb)
auc_cb  = roc_auc_score(y, oof_cb)

w_lgb = auc_lgb / (auc_lgb + auc_cb)
w_cb  = auc_cb  / (auc_lgb + auc_cb)

preds_ensemble = w_lgb * preds_lgb + w_cb * preds_cb
oof_ensemble   = w_lgb * oof_lgb + w_cb * oof_cb

print("Ensemble AUC:", roc_auc_score(y, oof_ensemble))


Ensemble AUC: 0.7528430317575163


In [16]:
# =============================
# FIX: Convert probabilities → binary labels for metrics
# =============================

# If you want the optimal threshold, compute it here:
precision, recall, thresholds = precision_recall_curve(y, oof_ensemble)
f1_scores = 2 * precision * recall / (precision + recall + 1e-12)

best_idx = np.argmax(f1_scores[:-1])
best_thr = thresholds[best_idx]

print("Best threshold for classification metrics:", best_thr)

# Convert OOF predictions to hard labels using this threshold
oof_labels = (oof_ensemble >= best_thr).astype(int)

# Safely compute classification metrics (NO ERROR ANYMORE)
print("\nClassification Report (Ensemble):")
print(classification_report(y, oof_labels))

print("\nConfusion Matrix (Ensemble):")
print(confusion_matrix(y, oof_labels))


Best threshold for classification metrics: 0.3971020161308677

Classification Report (Ensemble):
              precision    recall  f1-score   support

           0       0.93      0.85      0.89    180524
           1       0.30      0.48      0.37     23753

    accuracy                           0.81    204277
   macro avg       0.61      0.67      0.63    204277
weighted avg       0.85      0.81      0.83    204277


Confusion Matrix (Ensemble):
[[153228  27296]
 [ 12299  11454]]


In [17]:
# # =============================
# # 14. CALIBRATION
# # =============================
# lr = LogisticRegression(max_iter=200)
# lr.fit(oof_ensemble.reshape(-1,1), y)

# cal_test_proba = lr.predict_proba(preds_ensemble.reshape(-1,1))[:,1]


In [18]:
# =============================
# 15. SAVE SUBMISSION (BINARY OUTPUT)
# =============================

# Recreate sample format
sample = pd.DataFrame({
    "ProfileID": test["ProfileID"],
    "RiskFlag": 0
})

# Convert ensemble probability → binary label
# Use threshold 0.5 or best_thr if you computed it earlier
threshold = 0.5
# or: threshold = best_thr

binary_preds = (preds_ensemble >= threshold).astype(int)

# Create submission
sub_ens = sample.copy()
sub_ens["RiskFlag"] = binary_preds

# Save
sub_ens.to_csv("final_submission.csv", index=False)

print("Saved final_submission.csv ✔ (BINARY OUTPUT REQUIRED BY KAGGLE)")


Saved final_submission.csv ✔ (BINARY OUTPUT REQUIRED BY KAGGLE)
