In [40]:
import numpy as np
import pandas as pd
import re
import joblib
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from lightgbm import LGBMClassifier, early_stopping, log_evaluation

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# XGBoost
import xgboost as xgb


# Optuna (only for LGBM)
import optuna

ART_DIR = Path("artifacts_models")
ART_DIR.mkdir(exist_ok=True)

def save_artifact(obj, name):
    path = ART_DIR / name
    joblib.dump(obj, path)
    print(f"Saved: {path}")
    return path

def load_artifact(name):
    path = ART_DIR / name
    if not path.exists():
        raise FileNotFoundError(f"Not found: {path}")
    obj = joblib.load(path)
    print(f"Loaded: {path}")
    return obj

def sanitize_columns(cols):
    cleaned = []
    for c in cols:
        c = str(c).replace("\n","_").replace("\r","_").replace("\t","_")
        c = re.sub(r'[{}\[\]":,\\]', "_", c)
        c = re.sub(r"\s+","_", c)
        c = re.sub(r"_+","_", c).strip("_")
        cleaned.append(c if c else "feat")

    seen = {}
    out = []
    for c in cleaned:
        if c not in seen:
            seen[c] = 0
            out.append(c)
        else:
            seen[c] += 1
            out.append(f"{c}__{seen[c]}")
    return out

def prepare_X_y_from_fs_df(fs_df: pd.DataFrame, target_col="TARGET"):
    df = fs_df.copy()
    if target_col not in df.columns:
        raise ValueError(f"{target_col} not found")

    y = df[target_col].astype(int)
    X = df.drop(columns=[target_col], errors="ignore")

    # fill
    cat_cols = X.select_dtypes(include="object").columns.tolist()
    num_cols = X.select_dtypes(exclude="object").columns.tolist()

    X[cat_cols] = X[cat_cols].fillna("Unknown")
    for c in num_cols:
        X[c] = X[c].fillna(X[c].median())

    # one-hot
    X = pd.get_dummies(X, drop_first=True)

    # bool -> int
    for c in X.columns:
        if X[c].dtype == bool:
            X[c] = X[c].astype(np.int8)

    # sanitize + inf cleanup
    X.columns = sanitize_columns(X.columns)
    X = X.replace([np.inf, -np.inf], np.nan)
    for c in X.columns:
        if X[c].isna().any():
            X[c] = X[c].fillna(X[c].median())

    return X, y

def drop_correlated_features_train_only(X, y, threshold=0.95):
    X = X.copy()

    corr = X.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

    # target association (rank corr)
    y_rank = pd.Series(y.values).rank().values
    assoc = {}
    for c in X.columns:
        xr = pd.Series(X[c].values).rank().values
        v = np.corrcoef(xr, y_rank)[0, 1]
        assoc[c] = 0.0 if np.isnan(v) else abs(v)
    assoc = pd.Series(assoc)

    to_drop = set()
    for col in upper.columns:
        high = upper.index[upper[col] > threshold].tolist()
        for row in high:
            if row in to_drop or col in to_drop:
                continue
            drop = row if assoc[row] < assoc[col] else col
            to_drop.add(drop)

    keep_cols = [c for c in X.columns if c not in to_drop]
    return X[keep_cols].copy(), sorted(list(to_drop))

def lgbm_cv_oof_auc(X, y, params, n_splits=5, seed=42, verbose_eval=0):
    X = X.copy()
    y = y.copy()

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof = np.zeros(len(y), dtype=float)
    models = []

    for fold, (tr, va) in enumerate(skf.split(X, y), start=1):
        X_tr, X_va = X.iloc[tr], X.iloc[va]
        y_tr, y_va = y.iloc[tr], y.iloc[va]

        model = LGBMClassifier(**params, random_state=seed+fold, n_jobs=-1)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="auc",
            callbacks=[early_stopping(200, first_metric_only=True),
                       log_evaluation(verbose_eval)]
        )
        oof[va] = model.predict_proba(X_va)[:, 1]
        models.append(model)

    return roc_auc_score(y, oof), oof, models

def mean_gain_importance(lgbm_models):
    gains = np.mean([m.booster_.feature_importance(importance_type="gain") for m in lgbm_models], axis=0)
    names = lgbm_models[0].booster_.feature_name()
    return pd.Series(gains, index=names).sort_values(ascending=False)

def xgb_train_cv_oof_auc_native(
    X, y,
    params,
    num_boost_round=20000,
    n_splits=5,
    seed=42,
    early_stopping_rounds=200,
    verbose_eval=200
):
    """
    Native XGBoost training (xgb.train) with early stopping.
    Works even when sklearn wrapper doesn't support early stopping/callbacks.
    """
    X = X.copy()
    y = y.copy()

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof = np.zeros(len(y), dtype=float)
    boosters = []
    best_iters = []

    # Make sure params include required fields
    xgb_params = dict(params)
    xgb_params.setdefault("objective", "binary:logistic")
    xgb_params.setdefault("eval_metric", "auc")
    xgb_params.setdefault("tree_method", "hist")
    xgb_params.setdefault("seed", seed)

    for fold, (tr, va) in enumerate(skf.split(X, y), start=1):
        print(f"\n[XGB-NATIVE] Starting fold {fold}/{n_splits} ...")

        X_tr, X_va = X.iloc[tr], X.iloc[va]
        y_tr, y_va = y.iloc[tr], y.iloc[va]

        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dvalid = xgb.DMatrix(X_va, label=y_va)

        booster = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            evals=[(dvalid, "valid")],
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval
        )

        # predict using best_iteration
        p_va = booster.predict(dvalid, iteration_range=(0, booster.best_iteration + 1))
        oof[va] = p_va

        fold_auc = roc_auc_score(y_va, p_va)
        print(f"[XGB-NATIVE] Fold {fold} AUC: {fold_auc:.5f} | best_iter: {booster.best_iteration}")

        boosters.append(booster)
        best_iters.append(booster.best_iteration)

    auc = roc_auc_score(y, oof)
    print(f"\n[XGB-NATIVE] OOF AUC: {auc:.5f}")

    return auc, oof, boosters, best_iters


def cv_auc_for_model_factory(X, y, model_factory, n_splits=5, seed=42):
    X = X.copy()
    y = y.copy()
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof = np.zeros(len(y), dtype=float)

    for fold, (tr, va) in enumerate(skf.split(X, y), start=1):
        model = model_factory()
        model.fit(X.iloc[tr], y.iloc[tr])
        oof[va] = model.predict_proba(X.iloc[va])[:, 1]

    return roc_auc_score(y, oof), oof

def fit_full_and_save_model(algo_key, X, y, model, params_dict=None):
    params_dict = params_dict or {}
    bundle = {
        "algo": algo_key,
        "params": params_dict,
        "feature_names": X.columns.tolist(),
        "model": model
    }
    save_artifact(bundle, f"model_{algo_key}.pkl")
    return bundle


In [32]:
fs_df = pd.read_csv("fs_df.csv")
X_all, y_all = prepare_X_y_from_fs_df(fs_df, target_col="TARGET")
print("X_all:", X_all.shape, "pos_rate:", y_all.mean())

# Baseline LGBM for importance 
baseline_lgbm_params = dict(
    n_estimators=10000,
    learning_rate=0.02,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    min_child_samples=20,
    class_weight="balanced",
    max_depth=-1,
)

auc_base_imp, oof_imp, models_imp = lgbm_cv_oof_auc(
    X_all, y_all, baseline_lgbm_params,
    n_splits=5, seed=42, verbose_eval=200
)
print("Baseline (importance run) OOF AUC:", auc_base_imp)

fi = mean_gain_importance(models_imp)
top40 = fi.head(40).index.tolist()

X_top40 = X_all[top40].copy()
X_final, dropped_corr = drop_correlated_features_train_only(X_top40, y_all, threshold=0.95)

print("Top40:", X_top40.shape, "| After corr drop:", X_final.shape, "| Dropped:", len(dropped_corr))

# Save artifacts to resume later
save_artifact({"baseline_lgbm_params": baseline_lgbm_params}, "params_baseline_lgbm.pkl")
save_artifact({"importance_gain": fi, "top40_before_corr": top40, "dropped_corr": dropped_corr}, "feature_selection_info.pkl")
save_artifact({"X_final": X_final, "y": y_all}, "data_final.pkl")


X_all: (307511, 239) pos_rate: 0.08072881945686496
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.104199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18446
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 233
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 200 rounds
[200]	valid_0's auc: 0.770111	valid_0's binary_logloss: 0.552999
[400]	valid_0's auc: 0.776544	valid_0's binary_logloss: 0.526421
[600]	valid_0's auc: 0.777815	valid_0's binary_logloss: 0.508851
[800]	valid_0's auc: 0.777904	valid_0's binary_logloss: 0.493714
Early stopping, best iteration is:
[677]	valid_0's auc: 0.778103	valid_0's binary_logloss: 0.502713
Evaluated only: auc
[LightGBM

WindowsPath('artifacts_models/data_final.pkl')

In [33]:
data = load_artifact("data_final.pkl")
X_final = data["X_final"]
y = data["y"]

params = load_artifact("params_baseline_lgbm.pkl")["baseline_lgbm_params"]

auc_lgbm_base, oof_lgbm_base, _ = lgbm_cv_oof_auc(X_final, y, params, n_splits=5, seed=42, verbose_eval=200)
print("LGBM BASELINE OOF AUC:", auc_lgbm_base)

# Fit ONE model on full data + save
lgbm_base_full = LGBMClassifier(**params, random_state=42, n_jobs=-1)
lgbm_base_full.fit(X_final, y)

bundle_lgbm_base = fit_full_and_save_model("lgbm_base", X_final, y, lgbm_base_full, params_dict=params)
save_artifact({"auc": auc_lgbm_base}, "score_lgbm_base.pkl")


Loaded: artifacts_models\data_final.pkl
Loaded: artifacts_models\params_baseline_lgbm.pkl
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020652 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8966
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 200 rounds
[200]	valid_0's auc: 0.767963	valid_0's binary_logloss: 0.556827
[400]	valid_0's auc: 0.773999	valid_0's binary_logloss: 0.533266
[600]	valid_0's auc: 0.774585	valid_0's binary_logloss: 0.517307
[800]	valid_0's auc: 0.774543	valid_0's binary_logloss: 0.503629
Early stopping, best iteration is:
[771]	valid_0's auc: 0.774638	valid_0's binary_logloss: 0

WindowsPath('artifacts_models/score_lgbm_base.pkl')

In [34]:
data = load_artifact("data_final.pkl")
X_final = data["X_final"]
y = data["y"]

def optuna_tune_lgbm_fast(X, y, n_trials=5, n_splits=3, seed=42):
    def objective(trial):
        params = dict(
            n_estimators=15000,
            learning_rate=trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
            num_leaves=trial.suggest_int("num_leaves", 16, 128),
            min_child_samples=trial.suggest_int("min_child_samples", 5, 120),
            subsample=trial.suggest_float("subsample", 0.6, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
            reg_lambda=trial.suggest_float("reg_lambda", 1e-3, 20.0, log=True),
            reg_alpha=trial.suggest_float("reg_alpha", 1e-3, 20.0, log=True),
            class_weight="balanced",
            max_depth=-1,
        )
        auc, _, _ = lgbm_cv_oof_auc(X, y, params, n_splits=n_splits, seed=seed, verbose_eval=0)
        return auc

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    best_params = dict(
        n_estimators=25000,
        class_weight="balanced",
        max_depth=-1,
        **study.best_params
    )
    return study, best_params

study, best_lgbm_params = optuna_tune_lgbm_fast(X_final, y, n_trials=5, n_splits=3, seed=42)
print("Optuna best (3-fold) AUC:", study.best_value)
print("Best params:", best_lgbm_params)

# Final eval 5-fold
auc_lgbm_opt, oof_lgbm_opt, _ = lgbm_cv_oof_auc(X_final, y, best_lgbm_params, n_splits=5, seed=42, verbose_eval=200)
print("LGBM OPT OOF AUC:", auc_lgbm_opt)

# Fit ONE model on full data + save
lgbm_opt_full = LGBMClassifier(**best_lgbm_params, random_state=42, n_jobs=-1)
lgbm_opt_full.fit(X_final, y)

bundle_lgbm_opt = fit_full_and_save_model("lgbm_opt", X_final, y, lgbm_opt_full, params_dict=best_lgbm_params)

save_artifact({"best_params": best_lgbm_params}, "params_lgbm_opt.pkl")
save_artifact({"auc": auc_lgbm_opt}, "score_lgbm_opt.pkl")


[I 2025-12-21 22:21:09,731] A new study created in memory with name: no-name-51f7654a-3f5f-4efe-8e74-04f7824f00ba


Loaded: artifacts_models\data_final.pkl
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017406 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8967
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2396]	valid_0's auc: 0.780257	valid_0's binary_logloss: 0.493304
Evaluated only: auc
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8968
[LightGBM] [Inf

[I 2025-12-21 22:24:08,765] Trial 0 finished with value: 0.7787195390693188 and parameters: {'learning_rate': 0.005857924562213654, 'num_leaves': 84, 'min_child_samples': 49, 'subsample': 0.7448512995296056, 'colsample_bytree': 0.6458990912301835, 'reg_lambda': 1.9555965111371516, 'reg_alpha': 10.089532017033568}. Best is trial 0 with value: 0.7787195390693188.


[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016809 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8967
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[400]	valid_0's auc: 0.777114	valid_0's binary_logloss: 0.502141
Evaluated only: auc
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017643 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8968
[LightGBM] [Info] Number of data points in the train set

[I 2025-12-21 22:24:43,180] Trial 1 finished with value: 0.7761269486221013 and parameters: {'learning_rate': 0.02742058420972762, 'num_leaves': 79, 'min_child_samples': 20, 'subsample': 0.8928167564854739, 'colsample_bytree': 0.6835246869646625, 'reg_lambda': 0.22246412247983582, 'reg_alpha': 0.0029679152685582974}. Best is trial 0 with value: 0.7787195390693188.


[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023911 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8967
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[3290]	valid_0's auc: 0.779527	valid_0's binary_logloss: 0.515793
Evaluated only: auc
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8968
[LightGBM] [Info] Number of data points in the train se

[I 2025-12-21 22:26:48,173] Trial 2 finished with value: 0.7781942956802982 and parameters: {'learning_rate': 0.00812870775431524, 'num_leaves': 28, 'min_child_samples': 80, 'subsample': 0.9333832041694925, 'colsample_bytree': 0.6608378687602444, 'reg_lambda': 0.05770604041590449, 'reg_alpha': 4.099812828082873}. Best is trial 0 with value: 0.7787195390693188.


[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8967
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[800]	valid_0's auc: 0.777451	valid_0's binary_logloss: 0.491077
Evaluated only: auc
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8968
[LightGBM] [Info] Number of data points in the train set

[I 2025-12-21 22:28:10,164] Trial 3 finished with value: 0.7762354541403673 and parameters: {'learning_rate': 0.010440690620179154, 'num_leaves': 120, 'min_child_samples': 115, 'subsample': 0.9588937976584218, 'colsample_bytree': 0.9655217578665335, 'reg_lambda': 0.020213338905380462, 'reg_alpha': 1.1323203178433514}. Best is trial 0 with value: 0.7787195390693188.


[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016966 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8967
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2472]	valid_0's auc: 0.778216	valid_0's binary_logloss: 0.519605
Evaluated only: auc
[LightGBM] [Info] Number of positive: 16550, number of negative: 188457
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023466 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8968
[LightGBM] [Info] Number of data points in the train se

[I 2025-12-21 22:30:27,452] Trial 4 finished with value: 0.777042850892623 and parameters: {'learning_rate': 0.005932161628550322, 'num_leaves': 42, 'min_child_samples': 85, 'subsample': 0.7825608847549775, 'colsample_bytree': 0.8770945096147125, 'reg_lambda': 0.3977796049095145, 'reg_alpha': 0.00277632954633953}. Best is trial 0 with value: 0.7787195390693188.


Optuna best (3-fold) AUC: 0.7787195390693188
Best params: {'n_estimators': 25000, 'class_weight': 'balanced', 'max_depth': -1, 'learning_rate': 0.005857924562213654, 'num_leaves': 84, 'min_child_samples': 49, 'subsample': 0.7448512995296056, 'colsample_bytree': 0.6458990912301835, 'reg_lambda': 1.9555965111371516, 'reg_alpha': 10.089532017033568}
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019651 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8966
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 200 rounds
[200]	valid_0's auc: 0.755647	valid_0's binary_logloss: 0.597997
[400]	valid_0's auc: 0.76241

WindowsPath('artifacts_models/score_lgbm_opt.pkl')

In [41]:
data = load_artifact("data_final.pkl")
X_final = data["X_final"]
y = data["y"]

# Baseline XGB params (native)
xgb_params_native = dict(
    eta=0.03,                 # learning_rate
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    lambda_=1.0,              # reg_lambda in native
    alpha=0.0                 # reg_alpha in native
)

auc_xgb_base, oof_xgb_base, xgb_boosters, best_iters = xgb_train_cv_oof_auc_native(
    X_final, y,
    params=xgb_params_native,
    num_boost_round=20000,
    n_splits=5,
    seed=42,
    early_stopping_rounds=200,
    verbose_eval=200
)

print("XGB BASELINE (NATIVE) OOF AUC:", auc_xgb_base)

# Save score + best iters
save_artifact({"auc": auc_xgb_base, "best_iters_cv": best_iters, "params_native": xgb_params_native}, "score_xgb_base.pkl")

# Fit ONE FULL booster with n_estimators = median(best_iter)+1
n_rounds_full = int(np.median(best_iters)) + 1
print("XGB full-fit num_boost_round:", n_rounds_full)

dall = xgb.DMatrix(X_final, label=y)

booster_full = xgb.train(
    params={**xgb_params_native, "objective": "binary:logistic", "eval_metric": "auc", "tree_method": "hist", "seed": 42},
    dtrain=dall,
    num_boost_round=n_rounds_full,
    evals=[]
)

# Save full model bundle (note: booster is saved directly)
bundle_xgb = {
    "algo": "xgb_base_native",
    "params": {**xgb_params_native, "num_boost_round": n_rounds_full},
    "feature_names": X_final.columns.tolist(),
    "model": booster_full
}
save_artifact(bundle_xgb, "model_xgb_base.pkl")


Loaded: artifacts_models\data_final.pkl

[XGB-NATIVE] Starting fold 1/5 ...
[0]	valid-auc:0.70893
[200]	valid-auc:0.76441
[400]	valid-auc:0.77043
[600]	valid-auc:0.77353
[800]	valid-auc:0.77507
[1000]	valid-auc:0.77598
[1200]	valid-auc:0.77649
[1400]	valid-auc:0.77709
[1600]	valid-auc:0.77702
[1674]	valid-auc:0.77706
[XGB-NATIVE] Fold 1 AUC: 0.77714 | best_iter: 1474

[XGB-NATIVE] Starting fold 2/5 ...
[0]	valid-auc:0.71830
[200]	valid-auc:0.77244
[400]	valid-auc:0.77900
[600]	valid-auc:0.78218
[800]	valid-auc:0.78371
[1000]	valid-auc:0.78472
[1200]	valid-auc:0.78536
[1400]	valid-auc:0.78577
[1527]	valid-auc:0.78574
[XGB-NATIVE] Fold 2 AUC: 0.78587 | best_iter: 1327

[XGB-NATIVE] Starting fold 3/5 ...
[0]	valid-auc:0.70677
[200]	valid-auc:0.76307
[400]	valid-auc:0.76958
[600]	valid-auc:0.77239
[800]	valid-auc:0.77381
[1000]	valid-auc:0.77471
[1200]	valid-auc:0.77516
[1400]	valid-auc:0.77539
[1600]	valid-auc:0.77525
[1621]	valid-auc:0.77528
[XGB-NATIVE] Fold 3 AUC: 0.77543 | best_iter: 

WindowsPath('artifacts_models/model_xgb_base.pkl')

In [42]:
data = load_artifact("data_final.pkl")
X_final = data["X_final"]
y = data["y"]

scores = {}

# Logistic Regression
auc_logreg, _ = cv_auc_for_model_factory(
    X_final, y,
    model_factory=lambda: LogisticRegression(max_iter=2000, n_jobs=-1, class_weight="balanced", random_state=42),
    n_splits=5, seed=42
)
print("LogReg OOF AUC:", auc_logreg)
logreg_full = LogisticRegression(max_iter=2000, n_jobs=-1, class_weight="balanced", random_state=42)
logreg_full.fit(X_final, y)
fit_full_and_save_model("logreg", X_final, y, logreg_full, params_dict={})
save_artifact({"auc": auc_logreg}, "score_logreg.pkl")
scores["logreg"] = auc_logreg

# GaussianNB
auc_gnb, _ = cv_auc_for_model_factory(
    X_final, y,
    model_factory=lambda: GaussianNB(),
    n_splits=5, seed=42
)
print("GaussianNB OOF AUC:", auc_gnb)
gnb_full = GaussianNB()
gnb_full.fit(X_final, y)
fit_full_and_save_model("gnb", X_final, y, gnb_full, params_dict={})
save_artifact({"auc": auc_gnb}, "score_gnb.pkl")
scores["gnb"] = auc_gnb

# AdaBoost
auc_ada, _ = cv_auc_for_model_factory(
    X_final, y,
    model_factory=lambda: AdaBoostClassifier(n_estimators=300, learning_rate=0.05, random_state=42),
    n_splits=5, seed=42
)
print("AdaBoost OOF AUC:", auc_ada)
ada_full = AdaBoostClassifier(n_estimators=300, learning_rate=0.05, random_state=42)
ada_full.fit(X_final, y)
fit_full_and_save_model("ada", X_final, y, ada_full, params_dict={})
save_artifact({"auc": auc_ada}, "score_ada.pkl")
scores["ada"] = auc_ada

# RandomForest
#auc_rf, _ = cv_auc_for_model_factory(
#    X_final, y,
#    model_factory=lambda: RandomForestClassifier(
#        n_estimators=600, max_depth=None,
#        min_samples_leaf=2, n_jobs=-1,
#        class_weight="balanced_subsample",
#        random_state=42
#    ),
#    n_splits=5, seed=42
#)
#print("RandomForest OOF AUC:", auc_rf)
#rf_full = RandomForestClassifier(
#    n_estimators=600, max_depth=None,
#    min_samples_leaf=2, n_jobs=-1,
#    class_weight="balanced_subsample",
#    random_state=42
#)
#rf_full.fit(X_final, y)
#fit_full_and_save_model("rf", X_final, y, rf_full, params_dict={})
#save_artifact({"auc": auc_rf}, "score_rf.pkl")
#scores["rf"] = auc_rf

save_artifact({"scores_partial": scores}, "scores_other_models.pkl")


Loaded: artifacts_models\data_final.pkl
LogReg OOF AUC: 0.5941517914335778
Saved: artifacts_models\model_logreg.pkl
Saved: artifacts_models\score_logreg.pkl
GaussianNB OOF AUC: 0.6120036678218704
Saved: artifacts_models\model_gnb.pkl
Saved: artifacts_models\score_gnb.pkl
AdaBoost OOF AUC: 0.7412783426380111
Saved: artifacts_models\model_ada.pkl
Saved: artifacts_models\score_ada.pkl


MemoryError: Unable to allocate 1.88 MiB for an array with shape (246009,) and data type int64

In [43]:
save_artifact({"scores_partial": scores}, "scores_other_models.pkl")

Saved: artifacts_models\scores_other_models.pkl


WindowsPath('artifacts_models/scores_other_models.pkl')

In [44]:
# Load scores
scores = {}
scores["lgbm_base"] = load_artifact("score_lgbm_base.pkl")["auc"]
scores["lgbm_opt"]  = load_artifact("score_lgbm_opt.pkl")["auc"]
scores["xgb_base"]  = load_artifact("score_xgb_base.pkl")["auc"]

other = load_artifact("scores_other_models.pkl")["scores_partial"]
scores.update(other)

# Rank
ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
print("=== AUC RANKING ===")
for k, v in ranked:
    print(f"{k:10s}: {v:.6f}")

champion_key = ranked[0][0]
print("\nCHAMPION:", champion_key)

# Load champion bundle and save as champion_model.pkl (copy)
champ_bundle = load_artifact(f"model_{champion_key}.pkl")
save_artifact(champ_bundle, "champion_model.pkl")

# Also save an index of all models saved
all_models_saved = [p.name for p in ART_DIR.glob("model_*.pkl")]
save_artifact({"scores": scores, "champion": champion_key, "model_files": all_models_saved}, "model_registry.pkl")

print("\nSaved champion_model.pkl and model_registry.pkl in artifacts_models/")


Loaded: artifacts_models\score_lgbm_base.pkl
Loaded: artifacts_models\score_lgbm_opt.pkl
Loaded: artifacts_models\score_xgb_base.pkl
Loaded: artifacts_models\scores_other_models.pkl
=== AUC RANKING ===
xgb_base  : 0.779610
lgbm_opt  : 0.779446
lgbm_base : 0.777054
ada       : 0.741278
gnb       : 0.612004
logreg    : 0.594152

CHAMPION: xgb_base
Loaded: artifacts_models\model_xgb_base.pkl
Saved: artifacts_models\champion_model.pkl
Saved: artifacts_models\model_registry.pkl

Saved champion_model.pkl and model_registry.pkl in artifacts_models/
