In [1]:
import os, re, gc, warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

# ---------- 0) Repro & basic knobs ----------
seed     = 42
cores    = 10
metric   = "auc"
verbose  = 500          # log every N rounds (0 = silent)
stopping = 300
num_folds = 5
shuffle   = True

rng = np.random.RandomState(seed)

In [3]:
# ---------- 1) Load & align ----------
train = pd.read_csv("train_full_cor.csv")
test  = pd.read_csv("test_full_cor.csv")
y     = pd.read_csv("y_full_cor.csv").sort_values("SK_ID_CURR")["TARGET"]

# keep original IDs
train = train.sort_values("SK_ID_CURR").reset_index(drop=True)
test  = test.sort_values("SK_ID_CURR").reset_index(drop=True)

excluded_feats = ["SK_ID_CURR"]
features = [c for c in train.columns if c not in excluded_feats]

print(train[features].shape, test[features].shape)


(307511, 1333) (48744, 1333)


In [4]:
# ---------- 2) Make column names LightGBM-safe ----------
def sanitize_columns(cols):
    used = {}
    out = []
    for c in cols:
        base = re.sub(r"\W+", "_", str(c)).strip("_") or "col"
        # ensure uniqueness
        k = used.get(base, 0)
        name = base if k == 0 else f"{base}__{k}"
        used[base] = k + 1
        out.append(name)
    return {orig: new for orig, new in zip(cols, out)}

col_map = sanitize_columns(train.columns)
rev_map = {v: k for k, v in col_map.items()}

train = train.rename(columns=col_map)
test  = test.rename(columns=col_map)
features = [col_map[c] for c in features]  # keep same variable name

# safety: make sure test has all train columns (rarely needed, but cheap)
test = test.reindex(columns=train.columns, fill_value=0)


In [5]:
folds = StratifiedKFold(n_splits=num_folds, shuffle=shuffle, random_state=seed)

valid_aucs_cv   = np.zeros(num_folds, dtype=float)
test_preds_cv   = np.zeros(len(test), dtype=float)
best_iterations = []
imp_rows        = []  # accumulate per-fold importances

# LightGBM params (instantiate model per fold to avoid state bleed)
gbm_params = dict(
    n_estimators=10_000,
    learning_rate=0.005,
    num_leaves=70,
    colsample_bytree=0.8,
    subsample=0.9,
    max_depth=7,
    reg_alpha=0.1,
    reg_lambda=0.1,
    min_split_gain=0.01,
    min_child_weight=2,
    objective="binary",
    n_jobs=cores,
    random_state=seed,
)

In [6]:

# ---------- 4) Cross-Validation ----------
for fold_id, (trn_idx, val_idx) in enumerate(folds.split(train, y), start=1):
    trn_x, trn_y = train.loc[trn_idx, features], y.iloc[trn_idx]
    val_x, val_y = train.loc[val_idx, features], y.iloc[val_idx]

    # callbacks: early stop + logging
    cb = [
        lgb.early_stopping(stopping, first_metric_only=True),
        lgb.log_evaluation(period=int(verbose) if isinstance(verbose, int) and verbose > 0 else 0),
    ]

    # fresh model each fold
    gbm = lgb.LGBMClassifier(**gbm_params)
    gbm.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        eval_metric=metric,
        callbacks=cb
    )

    best_round = gbm.best_iteration_ or gbm_params["n_estimators"]
    best_iterations.append(best_round)

    # validation predictions & AUC
    val_pred = gbm.predict_proba(val_x, num_iteration=best_round)[:, 1]
    valid_aucs_cv[fold_id - 1] = roc_auc_score(val_y, val_pred)

    # test bagging (per-fold)
    test_preds_cv += gbm.predict_proba(test[features], num_iteration=best_round)[:, 1] / num_folds

    # gain-based importance (less common in templates)
    gain_imp = gbm.booster_.feature_importance(importance_type="gain")
    for f, g in zip(features, gain_imp):
        imp_rows.append({"Feature_safe": f, "ImportanceGain": g, "Fold": fold_id})

    print(f"---- Fold {fold_id:02d} | AUC: {valid_aucs_cv[fold_id-1]:.6f} ----")

    del trn_x, trn_y, val_x, val_y
    gc.collect()

auc = float(valid_aucs_cv.mean())
print(f"CV AUC (mean over folds): {auc:.6f}")
print("Best iterations per fold:", best_iterations)

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.046194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122695
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 1274
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 300 rounds
[500]	valid_0's auc: 0.76677	valid_0's binary_logloss: 0.24445
[1000]	valid_0's auc: 0.777344	valid_0's binary_logloss: 0.240203


[1500]	valid_0's auc: 0.781374	valid_0's binary_logloss: 0.238749






[2000]	valid_0's auc: 0.783453	valid_0's binary_logloss: 0.238042




[2500]	valid_0's auc: 0.784365	valid_0's binary_logloss: 0.237729






[3000]	valid_0's auc: 0.784743	valid_0's binary_logloss: 0.237608






[3500]	valid_0's auc: 0.784939	valid_0's binary_logloss: 0.237528






[4000]	valid_0's auc: 0.785192	valid_0's binary_logloss: 0.237471




[4500]	valid_0's auc: 0.785207	valid_0's binary_logloss: 0.237483


Early stopping, best iteration is:
[4293]	valid_0's auc: 0.785294	valid_0's binary_logloss: 0.237454
Evaluated only: auc
---- Fold 01 | AUC: 0.785294 ----
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.569640 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122653
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 1272
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 300 rounds
[500]	valid_0's auc: 0.774359	valid_0's binary_logloss: 0.242869
[1000]	valid_0's auc: 0.78623	valid_0's binary_logloss: 0.237873


[1500]	valid_0's auc: 0.790999	valid_0's binary_logloss: 0.235955




[2000]	valid_0's auc: 0.79306	valid_0's binary_logloss: 0.23509




[2500]	valid_0's auc: 0.793985	valid_0's binary_logloss: 0.234732






[3000]	valid_0's auc: 0.794608	valid_0's binary_logloss: 0.234484






[3500]	valid_0's auc: 0.79499	valid_0's binary_logloss: 0.234333




[4000]	valid_0's auc: 0.795395	valid_0's binary_logloss: 0.234183






[4500]	valid_0's auc: 0.79558	valid_0's binary_logloss: 0.234122






[5000]	valid_0's auc: 0.795571	valid_0's binary_logloss: 0.234143


Early stopping, best iteration is:
[4796]	valid_0's auc: 0.795595	valid_0's binary_logloss: 0.234122
Evaluated only: auc
---- Fold 02 | AUC: 0.795595 ----
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.075603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122611
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 1269
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 300 rounds
[500]	valid_0's auc: 0.768	valid_0's binary_logloss: 0.244641
[1000]	valid_0's auc: 0.778467	valid_0's binary_logloss: 0.24044


[1500]	valid_0's auc: 0.782986	valid_0's binary_logloss: 0.238835




[2000]	valid_0's auc: 0.785354	valid_0's binary_logloss: 0.238002




[2500]	valid_0's auc: 0.786631	valid_0's binary_logloss: 0.237559






[3000]	valid_0's auc: 0.787614	valid_0's binary_logloss: 0.237234






[3500]	valid_0's auc: 0.788098	valid_0's binary_logloss: 0.237078






[4000]	valid_0's auc: 0.788595	valid_0's binary_logloss: 0.23695






[4500]	valid_0's auc: 0.788757	valid_0's binary_logloss: 0.236912




[5000]	valid_0's auc: 0.788859	valid_0's binary_logloss: 0.236877




Early stopping, best iteration is:
[5063]	valid_0's auc: 0.788876	valid_0's binary_logloss: 0.236876
Evaluated only: auc
---- Fold 03 | AUC: 0.788876 ----
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.764458 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122720
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 1268
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 300 rounds
[500]	valid_0's auc: 0.77463	valid_0's binary_logloss: 0.242973
[1000]	valid_0's auc: 0.785177	valid_0's binary_logloss: 0.238277




[1500]	valid_0's auc: 0.789767	valid_0's binary_logloss: 0.236505




[2000]	valid_0's auc: 0.791836	valid_0's binary_logloss: 0.235708






[2500]	valid_0's auc: 0.793098	valid_0's binary_logloss: 0.235236






[3000]	valid_0's auc: 0.793724	valid_0's binary_logloss: 0.234988






[3500]	valid_0's auc: 0.794267	valid_0's binary_logloss: 0.234792






[4000]	valid_0's auc: 0.794606	valid_0's binary_logloss: 0.234654






[4500]	valid_0's auc: 0.794709	valid_0's binary_logloss: 0.234614




[5000]	valid_0's auc: 0.794764	valid_0's binary_logloss: 0.234597


Early stopping, best iteration is:
[4836]	valid_0's auc: 0.794822	valid_0's binary_logloss: 0.234578
Evaluated only: auc
---- Fold 04 | AUC: 0.794822 ----
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.020822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 122772
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 1270
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 300 rounds
[500]	valid_0's auc: 0.763064	valid_0's binary_logloss: 0.246
[1000]	valid_0's auc: 0.775734	valid_0's binary_logloss: 0.241378


[1500]	valid_0's auc: 0.780453	valid_0's binary_logloss: 0.239718




[2000]	valid_0's auc: 0.7829	valid_0's binary_logloss: 0.23886




[2500]	valid_0's auc: 0.784308	valid_0's binary_logloss: 0.238405






[3000]	valid_0's auc: 0.785195	valid_0's binary_logloss: 0.238112






[3500]	valid_0's auc: 0.785776	valid_0's binary_logloss: 0.23792






[4000]	valid_0's auc: 0.786161	valid_0's binary_logloss: 0.237813






[4500]	valid_0's auc: 0.786338	valid_0's binary_logloss: 0.237778






[5000]	valid_0's auc: 0.786395	valid_0's binary_logloss: 0.237782
Early stopping, best iteration is:
[4843]	valid_0's auc: 0.786425	valid_0's binary_logloss: 0.237763
Evaluated only: auc
---- Fold 05 | AUC: 0.786425 ----
CV AUC (mean over folds): 0.790202
Best iterations per fold: [4293, 4796, 5063, 4836, 4843]


In [7]:
# ---------- 5) Variable importance (aggregate & plot) ----------
feature_importance_df = pd.DataFrame(imp_rows)
# map pretty/original names for display
feature_importance_df["Feature"] = feature_importance_df["Feature_safe"].map(lambda s: rev_map.get(s, s))

# mean gain per feature (safe key), then display original name
imp_mean = (feature_importance_df
            .groupby(["Feature_safe", "Feature"])["ImportanceGain"]
            .mean()
            .reset_index()
            .sort_values("ImportanceGain", ascending=False))

top_feats = 50
top_block = imp_mean.head(top_feats)

plt.figure(figsize=(10, 10))
plt.barh(
    y=np.arange(len(top_block))[::-1],
    width=top_block["ImportanceGain"][::-1].values
)
plt.yticks(np.arange(len(top_block))[::-1], top_block["Feature"][::-1].values)
plt.title("LightGBM Feature Importance (mean gain over CV folds)")
plt.xlabel("Gain")
plt.tight_layout()
plt.savefig("var_importance.pdf")
plt.close()

In [8]:
# ---------- 6) Feature selection (top K) ----------
top = 500
# choose by safe key to keep alignment with train/test
top_safe = (imp_mean
            .sort_values("ImportanceGain", ascending=False)["Feature_safe"]
            .drop_duplicates()
            .head(top)
            .tolist())

# ensure dedup & existence
features = [f for f in top_safe if f in train.columns]
print("Selected feature matrix shapes:", train[features].shape, test[features].shape)


Selected feature matrix shapes: (307511, 500) (48744, 500)


In [9]:
# ---------- 7) Re-run CV on selected features & train final model ----------
valid_aucs_cv = np.zeros(num_folds, dtype=float)   # reset metrics
test_preds_cv = np.zeros(len(test), dtype=float)   # reset bagging
best_iterations = []

for fold_id, (trn_idx, val_idx) in enumerate(folds.split(train, y), start=1):
    trn_x, trn_y = train.loc[trn_idx, features], y.iloc[trn_idx]
    val_x, val_y = train.loc[val_idx, features], y.iloc[val_idx]

    cb = [
        lgb.early_stopping(stopping, first_metric_only=True),
        lgb.log_evaluation(period=int(verbose) if isinstance(verbose, int) and verbose > 0 else 0),
    ]

    gbm = lgb.LGBMClassifier(**gbm_params)
    gbm.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], eval_metric=metric, callbacks=cb)

    best_round = gbm.best_iteration_ or gbm_params["n_estimators"]
    best_iterations.append(best_round)

    val_pred = gbm.predict_proba(val_x, num_iteration=best_round)[:, 1]
    valid_aucs_cv[fold_id - 1] = roc_auc_score(val_y, val_pred)

    test_preds_cv += gbm.predict_proba(test[features], num_iteration=best_round)[:, 1] / num_folds

    print(f"[Selected] Fold {fold_id:02d} | AUC: {valid_aucs_cv[fold_id-1]:.6f}")

auc = float(valid_aucs_cv.mean())
print(f"[Selected] CV AUC (mean): {auc:.6f}")
best_round_final = int(np.median(best_iterations))  # median is a bit more conservative than mean
print(f"Final n_estimators for full-data fit: {best_round_final}")

# ----- Fit on ALL data with frozen rounds -----
final_params = dict(gbm_params)
final_params["n_estimators"] = best_round_final

gbm_final = lgb.LGBMClassifier(**final_params)
cb_final = [lgb.log_evaluation(period=int(verbose) if isinstance(verbose, int) and verbose > 0 else 0)]
gbm_final.fit(train[features], y, callbacks=cb_final)

test_preds_final = gbm_final.predict_proba(test[features])[:, 1]
print("Final model ready. Example preds:", test_preds_final[:5])


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.379425 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100293
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 300 rounds
[500]	valid_0's auc: 0.767008	valid_0's binary_logloss: 0.244403
[1000]	valid_0's auc: 0.777696	valid_0's binary_logloss: 0.240083


[1500]	valid_0's auc: 0.781726	valid_0's binary_logloss: 0.238614




[2000]	valid_0's auc: 0.783497	valid_0's binary_logloss: 0.238026




[2500]	valid_0's auc: 0.784501	valid_0's binary_logloss: 0.237699




[3000]	valid_0's auc: 0.785193	valid_0's binary_logloss: 0.237496




[3500]	valid_0's auc: 0.785538	valid_0's binary_logloss: 0.237394




[4000]	valid_0's auc: 0.785624	valid_0's binary_logloss: 0.237383
Early stopping, best iteration is:
[3862]	valid_0's auc: 0.785664	valid_0's binary_logloss: 0.23736
Evaluated only: auc
[Selected] Fold 01 | AUC: 0.785664
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.285909 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100310
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 500


[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 300 rounds
[500]	valid_0's auc: 0.774327	valid_0's binary_logloss: 0.242856
[1000]	valid_0's auc: 0.786268	valid_0's binary_logloss: 0.23782
[1500]	valid_0's auc: 0.790921	valid_0's binary_logloss: 0.235908


[2000]	valid_0's auc: 0.793371	valid_0's binary_logloss: 0.23494




[2500]	valid_0's auc: 0.79432	valid_0's binary_logloss: 0.234545






[3000]	valid_0's auc: 0.795121	valid_0's binary_logloss: 0.234238




[3500]	valid_0's auc: 0.795417	valid_0's binary_logloss: 0.234076




[4000]	valid_0's auc: 0.795576	valid_0's binary_logloss: 0.234016




Early stopping, best iteration is:
[4021]	valid_0's auc: 0.795604	valid_0's binary_logloss: 0.234008
Evaluated only: auc
[Selected] Fold 02 | AUC: 0.795604
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.226193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100295
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 300 rounds
[500]	valid_0's auc: 0.76808	valid_0's binary_logloss: 0.244646
[1000]	valid_0's auc: 0.778389	valid_0's binary_logloss: 0.24046


[1500]	valid_0's auc: 0.78294	valid_0's binary_logloss: 0.238854


[2000]	valid_0's auc: 0.785205	valid_0's binary_logloss: 0.238037




[2500]	valid_0's auc: 0.786449	valid_0's binary_logloss: 0.237621




[3000]	valid_0's auc: 0.787272	valid_0's binary_logloss: 0.237356




[3500]	valid_0's auc: 0.787701	valid_0's binary_logloss: 0.237231




[4000]	valid_0's auc: 0.787895	valid_0's binary_logloss: 0.237176




[4500]	valid_0's auc: 0.788034	valid_0's binary_logloss: 0.237149




[5000]	valid_0's auc: 0.788157	valid_0's binary_logloss: 0.237142
Early stopping, best iteration is:
[4740]	valid_0's auc: 0.788178	valid_0's binary_logloss: 0.237118
Evaluated only: auc
[Selected] Fold 03 | AUC: 0.788178
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.293224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100328
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 300 rounds
[500]	valid_0's auc: 0.774999	valid_0's binary_logloss: 0.242899
[1000]	valid_0's auc: 0.785386	valid_0's binary_logloss: 0.238192


[1500]	valid_0's auc: 0.789898	valid_0's binary_logloss: 0.236414




[2000]	valid_0's auc: 0.792065	valid_0's binary_logloss: 0.235596




[2500]	valid_0's auc: 0.793171	valid_0's binary_logloss: 0.235169




[3000]	valid_0's auc: 0.793983	valid_0's binary_logloss: 0.234867




[3500]	valid_0's auc: 0.794369	valid_0's binary_logloss: 0.234713




[4000]	valid_0's auc: 0.79457	valid_0's binary_logloss: 0.234626




[4500]	valid_0's auc: 0.794614	valid_0's binary_logloss: 0.234603
Early stopping, best iteration is:
[4350]	valid_0's auc: 0.794652	valid_0's binary_logloss: 0.234585
Evaluated only: auc
[Selected] Fold 04 | AUC: 0.794652
[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.283002 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100282
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 300 rounds
[500]	valid_0's auc: 0.763312	valid_0's binary_logloss: 0.245956


[1000]	valid_0's auc: 0.776028	valid_0's binary_logloss: 0.2413


[1500]	valid_0's auc: 0.780768	valid_0's binary_logloss: 0.239629


[2000]	valid_0's auc: 0.783137	valid_0's binary_logloss: 0.238789




[2500]	valid_0's auc: 0.784475	valid_0's binary_logloss: 0.238334




[3000]	valid_0's auc: 0.785162	valid_0's binary_logloss: 0.238085




[3500]	valid_0's auc: 0.785613	valid_0's binary_logloss: 0.237922




[4000]	valid_0's auc: 0.785913	valid_0's binary_logloss: 0.237835




[4500]	valid_0's auc: 0.785991	valid_0's binary_logloss: 0.237839




[5000]	valid_0's auc: 0.786117	valid_0's binary_logloss: 0.237837




[5500]	valid_0's auc: 0.786065	valid_0's binary_logloss: 0.237906
Early stopping, best iteration is:
[5368]	valid_0's auc: 0.786178	valid_0's binary_logloss: 0.237853
Evaluated only: auc
[Selected] Fold 05 | AUC: 0.786178
[Selected] CV AUC (mean): 0.790055
Final n_estimators for full-data fit: 4350
[LightGBM] [Info] Number of positive: 24825, number of negative: 282686
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.499567 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100340
[LightGBM] [Info] Number of data points in the train set: 307511, number of used features: 500
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486




















Final model ready. Example preds: [0.02760042 0.15918596 0.04049115 0.03814783 0.16531582]


In [10]:
# ---------- 8) Save submission (current folder) ----------
preds = test_preds_final if "test_preds_final" in locals() else test_preds_cv
assert "SK_ID_CURR" in test.columns, "Missing SK_ID_CURR in test."
assert len(preds) == len(test), "Prediction length mismatch."
preds = np.clip(preds, 0, 1)

subm = pd.DataFrame({"SK_ID_CURR": test["SK_ID_CURR"].values, "TARGET": preds})
auc_tag = str(round(auc, 6))[2:8] if "auc" in locals() else "NA"
top_tag = str(top)
out_path = f"./auc_{auc_tag}_bag_lgb_top{top_tag}.csv"
subm.to_csv(out_path, index=False, float_format="%.8f")
print(f"Saved submission to: {out_path}")

Saved submission to: ./auc_790055_bag_lgb_top500.csv
