In [2]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------- ------------------------- 0.5/1.5 MB 2.0 MB/s eta 0:00:01
   ------------------------------------ --- 1.3/1.5 MB 3.0 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 2.9 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [3]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, matthews_corrcoef

from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier, BalancedBaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

try:
    from xgboost import XGBClassifier
except Exception as e:
    raise RuntimeError("Please install xgboost (e.g., `pip install xgboost`).") from e

try:
    from lightgbm import LGBMClassifier
except Exception as e:
    raise RuntimeError("Please install lightgbm (e.g., `pip install lightgbm`).") from e

TRAIN_CSV = "DIA_trainingset_RDKit_descriptors.csv"
TEST_CSV  = "DIA_testset_RDKit_descriptors.csv"

assert os.path.exists(TRAIN_CSV) and os.path.exists(TEST_CSV), "Place DIA_*_RDKit_descriptors.csv in the working directory."

train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

label_map = {"positive":1,"pos":1,"1":1,"true":1,"yes":1,"negative":0,"neg":0,"0":0,"false":0,"no":0}
Ytrain = train_df.iloc[:, 0].astype(str).str.lower().map(label_map).astype(int)
Ytest  = test_df.iloc[:, 0].astype(str).str.lower().map(label_map).astype(int)

Xtrain_all = train_df.iloc[:, 2:].select_dtypes(include=[np.number]).copy()
Xtest_all  = test_df.iloc[:, 2:].select_dtypes(include=[np.number]).copy()

common = [c for c in Xtrain_all.columns if c in Xtest_all.columns]
Xtrain_all = Xtrain_all[common].copy()
Xtest_all  = Xtest_all[common].copy()

print("Train shape:", Xtrain_all.shape, "Test shape:", Xtest_all.shape)
print("Train positive rate:", Ytrain.mean())
print("Test positive rate:", Ytest.mean())


Train shape: (477, 196) Test shape: (120, 196)
Train positive rate: 0.24737945492662475
Test positive rate: 0.25


In [4]:
import json

GA_TXT = "RDKit_GA_65_features.txt"   
GA_CSV = "RDKit_GA_65.csv"            
OUT_TXT = "RDKit_GA_65_features.txt"  
TARGET_K = 65

def load_ga65_from_files(cols):
    if os.path.exists(GA_TXT):
        feats = [l.strip() for l in open(GA_TXT, "r", encoding="utf-8") if l.strip()]
        feats = [f for f in feats if f in cols]
        if len(feats) == TARGET_K:
            return feats
    if os.path.exists(GA_CSV):
        df = pd.read_csv(GA_CSV)
        col = "feature" if "feature" in df.columns else df.columns[0]
        feats = df[col].astype(str).tolist()
        feats = [f for f in feats if f in cols]
        if len(feats) == TARGET_K:
            return feats
    return None

selected_features = load_ga65_from_files(Xtrain_all.columns)

if selected_features is None:
    from sklearn.model_selection import KFold, cross_validate
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import matthews_corrcoef
    import random

    rng_seed = 1
    random.seed(rng_seed)
    np.random.seed(rng_seed)

    X = Xtrain_all.values
    cols = Xtrain_all.columns.tolist()
    n_vars = X.shape[1]

    POP, GENS, CX, MUT = 20, 10, 0.5, 0.2

    kf = KFold(n_splits=5, shuffle=True, random_state=rng_seed)
    def eval_mcc(mask_bits):
        idx = [i for i, b in enumerate(mask_bits) if b == 1]
        if len(idx) == 0:
            return -1.0
        Xsel = X[:, idx]
        rf = RandomForestClassifier(n_estimators=200, random_state=rng_seed, class_weight="balanced")
        scores = cross_validate(rf, Xsel, Ytrain, cv=kf, scoring="matthews_corrcoef", n_jobs=-1)
        return float(scores["test_score"].mean())

    population = []
    for _ in range(POP):
        ones = set(random.sample(range(n_vars), TARGET_K))
        population.append([1 if i in ones else 0 for i in range(n_vars)])

    fitness = [eval_mcc(ind) for ind in population]

    for g in range(GENS):
        newpop = []
        while len(newpop) < POP:
            def tournament():
                t = random.sample(range(POP), 3)
                best_i = max(t, key=lambda i: fitness[i])
                return population[best_i]

            p1, p2 = tournament(), tournament()

            if random.random() < CX:
                cx_point = random.randint(1, n_vars - 1)
                c1 = p1[:cx_point] + p2[cx_point:]
                c2 = p2[:cx_point] + p1[cx_point:]
            else:
                c1, c2 = p1[:], p2[:]

            def mutate(child):
                rate = MUT / n_vars
                for i in range(n_vars):
                    if random.random() < rate:
                        child[i] = 1 - child[i]
                ones_idx = [i for i, b in enumerate(child) if b == 1]
                if len(ones_idx) != TARGET_K:
                    rf = RandomForestClassifier(n_estimators=200, random_state=rng_seed, class_weight="balanced")
                    rf.fit(X, Ytrain)
                    ranking = np.argsort(rf.feature_importances_)[::-1]
                    desired = set(ranking[:TARGET_K])
                    child = [1 if i in desired else 0 for i in range(n_vars)]
                return child

            c1, c2 = mutate(c1), mutate(c2)
            newpop.extend([c1, c2])

        population = newpop[:POP]
        fitness = [eval_mcc(ind) for ind in population]

    best = population[int(np.argmax(fitness))]
    idx = [i for i, b in enumerate(best) if b == 1]
    selected_features = [cols[i] for i in idx]

with open(OUT_TXT, "w", encoding="utf-8") as f:
    for feat in selected_features:
        f.write(f"{feat}\n")

print(f"RDKit_GA_65 features ready: {len(selected_features)} features")
print("Saved to:", OUT_TXT)


RDKit_GA_65 features ready: 65 features
Saved to: RDKit_GA_65_features.txt


In [5]:
def compute_metrics(y_true, y_prob, y_pred):
    auc = roc_auc_score(y_true, y_prob)
    acc = accuracy_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sen = tp / (tp + fn) if (tp+fn) > 0 else 0.0
    spe = tn / (tn + fp) if (tn+fp) > 0 else 0.0
    mcc = matthews_corrcoef(y_true, y_pred)
    return dict(AUC=auc, ACC=acc, SEN=sen, SPE=spe, MCC=mcc)

def run_oof_and_external(build_fn, Xtr, ytr, Xte, yte, threshold=0.5, n_splits=10, seed=42):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_prob = np.zeros(len(Xtr), dtype=float)
    oof_pred = np.zeros(len(Xtr), dtype=int)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(Xtr, ytr), 1):
        X_tr, X_va = Xtr.iloc[tr_idx], Xtr.iloc[va_idx]
        y_tr, y_va = ytr.iloc[tr_idx], ytr.iloc[va_idx]
        model = build_fn(X_tr, y_tr)   
        model.fit(X_tr, y_tr)
        prob = model.predict_proba(X_va)[:, 1]
        oof_prob[va_idx] = prob
        oof_pred[va_idx] = (prob >= threshold).astype(int)

    oof = compute_metrics(ytr.values, oof_prob, oof_pred)

    model_full = build_fn(Xtr, ytr)
    model_full.fit(Xtr, ytr)
    te_prob = model_full.predict_proba(Xte)[:, 1]
    te_pred = (te_prob >= threshold).astype(int)
    ext = compute_metrics(yte.values, te_prob, te_pred)

    return oof, ext


In [6]:
Xtr = Xtrain_all[selected_features].copy().reset_index(drop=True)
Xte = Xtest_all[selected_features].copy().reset_index(drop=True)

print("Final shapes:", Xtr.shape, Xte.shape)


Final shapes: (477, 65) (120, 65)


In [11]:
from packaging import version
import sklearn

BRF_PARAMS = dict(
    n_estimators=154,
    criterion="gini",
    max_depth=15,
    max_features=48,         
    bootstrap=True,
    replacement=False,
    sampling_strategy='auto',
    random_state=1,
    verbose=False,
    n_jobs=-1
)

def build_brf(X, y):
    return BalancedRandomForestClassifier(**BRF_PARAMS)


sk_ver = sklearn.__version__
ada_algorithm = "SAMME.R"
if version.parse(sk_ver) >= version.parse("1.4"):
    ada_algorithm = "SAMME"

EEC_OUTER = dict(n_estimators=10, random_state=1, n_jobs=-1)
EEC_ADA = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=7, random_state=1),
    n_estimators=178,
    learning_rate=0.92,
    algorithm=ada_algorithm,
    random_state=1
)

def build_eec(X, y):
    return EasyEnsembleClassifier(estimator=EEC_ADA, **EEC_OUTER)


def _scale_pos_weight(y):
    pos = int((y == 1).sum())
    neg = int((y == 0).sum())
    return float(neg) / float(pos)


def build_bbc_xgb(X, y):
    base = XGBClassifier(
        n_estimators=172,
        learning_rate=0.73,
        booster='dart',
        colsample_bytree=0.3,
        colsample_bynode=1.0,
        gamma=0.036296772856035525,
        reg_lambda=0.06781903189364931,
        min_child_weight=1.0,
        max_depth=18,
        subsample=0.9,
        scale_pos_weight=_scale_pos_weight(y),
        random_state=1,
        verbosity=0,
        n_jobs=-1,
        eval_metric="logloss"
    )
    return BalancedBaggingClassifier(estimator=base, n_estimators=10, random_state=1)


GBC_BASE = GradientBoostingClassifier(
    n_estimators=107,
    learning_rate=0.24,
    criterion='friedman_mse',
    max_depth=5,
    max_features=4,
    subsample=0.99,
    random_state=1,
    verbose=False
)

def build_bbc_gbdt(X, y):
    return BalancedBaggingClassifier(estimator=GBC_BASE, n_estimators=10, random_state=1)


LGBM_BASE = LGBMClassifier(
    boosting_type='gbdt',
    n_estimators=112,
    learning_rate=0.83,
    max_depth=14,
    num_leaves=85,
    colsample_bytree=0.55,
    subsample=0.83,
    min_child_samples=2,
    reg_alpha=0.011600450241817575,
    reg_lambda=0.12670847895140583,
    class_weight='balanced',
    random_state=1,
    n_jobs=-1
)

def build_bbc_lgbm(X, y):
    return BalancedBaggingClassifier(estimator=LGBM_BASE, n_estimators=10, random_state=1)


In [12]:
order = ["BRF","EEC","BBC+XGBoost","BBC+GBDT","BBC+LightGBM"]
builders = {
    "BRF": build_brf,
    "EEC": build_eec,
    "BBC+XGBoost": build_bbc_xgb,
    "BBC+GBDT": build_bbc_gbdt,
    "BBC+LightGBM": build_bbc_lgbm
}

rows = []
for name in order:
    print(f"Running {name} ...")
    oof_m, ext_m = run_oof_and_external(builders[name], Xtr, Ytrain, Xte, Ytest, threshold=0.5, n_splits=10, seed=42)
    rows.append([name,
                 oof_m["AUC"], oof_m["ACC"], oof_m["SEN"], oof_m["SPE"], oof_m["MCC"],
                 ext_m["AUC"], ext_m["ACC"], ext_m["SEN"], ext_m["SPE"], ext_m["MCC"]])

cols = ["Model",
        "OOF_AUC","OOF_ACC","OOF_SEN","OOF_SPE","OOF_MCC",
        "EXT_AUC","EXT_ACC","EXT_SEN","EXT_SPE","EXT_MCC"]

results_df = pd.DataFrame(rows, columns=cols)
results_df


Running BRF ...
Running EEC ...
Running BBC+XGBoost ...
Running BBC+GBDT ...
Running BBC+LightGBM ...
[LightGBM] [Info] Number of positive: 113, number of negative: 113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1134
[LightGBM] [Info] Number of data points in the train set: 226, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 98, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 993
[LightGBM] [Info] Number of data points in the train set: 196, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.



[LightGBM] [Info] Number of positive: 88, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000270 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 945
[LightGBM] [Info] Number of data points in the train set: 176, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 103, number of negative: 103
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1032
[LightGBM] [Info] Number of data points in the train set: 206, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 102, number of negative: 102
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead



[LightGBM] [Info] Number of positive: 96, number of negative: 96
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1045
[LightGBM] [Info] Number of data points in the train set: 192, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 104, number of negative: 104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1109
[LightGBM] [Info] Number of data points in the train set: 208, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 107, number of negative: 107
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhea



[LightGBM] [Info] Number of positive: 119, number of negative: 119
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000340 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1180
[LightGBM] [Info] Number of data points in the train set: 238, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 103, number of negative: 103
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1066
[LightGBM] [Info] Number of data points in the train set: 206, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 91, number of negative: 91
[Ligh



[LightGBM] [Info] Number of positive: 117, number of negative: 117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000321 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1170
[LightGBM] [Info] Number of data points in the train set: 234, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 105, number of negative: 105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1101
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 107, number of negative: 107
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh



[LightGBM] [Info] Number of positive: 95, number of negative: 95
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 190, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 97, number of negative: 97
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000214 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1006
[LightGBM] [Info] Number of data points in the train set: 194, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 93, number of negative: 93
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of



[LightGBM] [Info] Number of positive: 118, number of negative: 118
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1157
[LightGBM] [Info] Number of data points in the train set: 236, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 98, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1028
[LightGBM] [Info] Number of data points in the train set: 196, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 119, number of negative: 119
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhea



[LightGBM] [Info] Number of positive: 111, number of negative: 111
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1172
[LightGBM] [Info] Number of data points in the train set: 222, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 105, number of negative: 105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 210, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 98, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhea



[LightGBM] [Info] Number of positive: 114, number of negative: 114
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000305 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1124
[LightGBM] [Info] Number of data points in the train set: 228, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 107, number of negative: 107
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 214, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 98, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhea



[LightGBM] [Info] Number of positive: 98, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1032
[LightGBM] [Info] Number of data points in the train set: 196, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 126, number of negative: 126
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000223 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1243
[LightGBM] [Info] Number of data points in the train set: 252, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 121, number of negative: 121
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhea



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000821 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 975
[LightGBM] [Info] Number of data points in the train set: 186, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 127, number of negative: 127
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009903 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1264
[LightGBM] [Info] Number of data points in the train set: 254, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 109, number of negative: 109
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000389 seconds.
You can set `force_col_wise=tru



Unnamed: 0,Model,OOF_AUC,OOF_ACC,OOF_SEN,OOF_SPE,OOF_MCC,EXT_AUC,EXT_ACC,EXT_SEN,EXT_SPE,EXT_MCC
0,BRF,0.812509,0.721174,0.70339,0.727019,0.382723,0.82463,0.716667,0.666667,0.733333,0.359425
1,EEC,0.815826,0.719078,0.70339,0.724234,0.379842,0.841852,0.758333,0.7,0.777778,0.436217
2,BBC+XGBoost,0.747651,0.735849,0.652542,0.763231,0.378819,0.820741,0.725,0.733333,0.722222,0.404122
3,BBC+GBDT,0.786035,0.773585,0.627119,0.821727,0.427116,0.792963,0.766667,0.533333,0.844444,0.377778
4,BBC+LightGBM,0.785185,0.773585,0.59322,0.832869,0.412907,0.816296,0.758333,0.6,0.811111,0.39165


In [13]:
def pct(x):
    try: return f"{x*100:.2f} %"
    except: return x

pretty = results_df.copy()
for c in ["OOF_ACC","OOF_SEN","OOF_SPE","EXT_ACC","EXT_SEN","EXT_SPE"]:
    pretty[c] = pretty[c].apply(pct)

display(pretty)

out_csv = "reproduced_table5_results.csv"
results_df.to_csv(out_csv, index=False)
print("Saved:", out_csv)


Unnamed: 0,Model,OOF_AUC,OOF_ACC,OOF_SEN,OOF_SPE,OOF_MCC,EXT_AUC,EXT_ACC,EXT_SEN,EXT_SPE,EXT_MCC
0,BRF,0.812509,72.12 %,70.34 %,72.70 %,0.382723,0.82463,71.67 %,66.67 %,73.33 %,0.359425
1,EEC,0.815826,71.91 %,70.34 %,72.42 %,0.379842,0.841852,75.83 %,70.00 %,77.78 %,0.436217
2,BBC+XGBoost,0.747651,73.58 %,65.25 %,76.32 %,0.378819,0.820741,72.50 %,73.33 %,72.22 %,0.404122
3,BBC+GBDT,0.786035,77.36 %,62.71 %,82.17 %,0.427116,0.792963,76.67 %,53.33 %,84.44 %,0.377778
4,BBC+LightGBM,0.785185,77.36 %,59.32 %,83.29 %,0.412907,0.816296,75.83 %,60.00 %,81.11 %,0.39165


Saved: reproduced_table5_results.csv


In [14]:
from sklearn.linear_model import LogisticRegression

def run_stacking(Xtr, Ytrain, Xte, Ytest, base_builders, threshold=0.5, n_splits=10, seed=42):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    oof_meta_X = np.zeros((len(Xtr), len(base_builders)))
    test_meta_X = np.zeros((len(Xte), len(base_builders)))

    for j, (name, build_fn) in enumerate(base_builders.items()):
        print(f"Stacking: generating meta-features for {name}")
        oof_prob = np.zeros(len(Xtr))
        test_fold_probs = []

        for tr_idx, va_idx in skf.split(Xtr, Ytrain):
            X_tr, X_va = Xtr.iloc[tr_idx], Xtr.iloc[va_idx]
            y_tr = Ytrain.iloc[tr_idx]

            model = build_fn(X_tr, y_tr)
            model.fit(X_tr, y_tr)

            oof_prob[va_idx] = model.predict_proba(X_va)[:,1]
            test_fold_probs.append(model.predict_proba(Xte)[:,1])

        oof_meta_X[:, j] = oof_prob
        test_meta_X[:, j] = np.mean(test_fold_probs, axis=0)

    meta = LogisticRegression(max_iter=5000, random_state=seed)
    meta.fit(oof_meta_X, Ytrain)

    oof_prob = meta.predict_proba(oof_meta_X)[:,1]
    oof_pred = (oof_prob >= threshold).astype(int)
    oof_metrics = compute_metrics(Ytrain.values, oof_prob, oof_pred)

    test_prob = meta.predict_proba(test_meta_X)[:,1]
    test_pred = (test_prob >= threshold).astype(int)
    ext_metrics = compute_metrics(Ytest.values, test_prob, test_pred)

    return oof_metrics, ext_metrics

stack_builders = {
    "BRF": build_brf,
    "EEC": build_eec,
    "BBC+XGB": build_bbc_xgb,
    "BBC+GBDT": build_bbc_gbdt,
    "BBC+LGBM": build_bbc_lgbm
}

oof_m, ext_m = run_stacking(Xtr, Ytrain, Xte, Ytest, stack_builders)
print("Stacking OOF metrics:", oof_m)
print("Stacking External metrics:", ext_m)


Stacking: generating meta-features for BRF
Stacking: generating meta-features for EEC
Stacking: generating meta-features for BBC+XGB
Stacking: generating meta-features for BBC+GBDT
Stacking: generating meta-features for BBC+LGBM
[LightGBM] [Info] Number of positive: 113, number of negative: 113
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000848 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1134
[LightGBM] [Info] Number of data points in the train set: 226, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 98, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 993
[LightGBM] [Info] Number of data points in the train set: 196, number of used 



[LightGBM] [Info] Number of positive: 88, number of negative: 88
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 945
[LightGBM] [Info] Number of data points in the train set: 176, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 103, number of negative: 103
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1032
[LightGBM] [Info] Number of data points in the train set: 206, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 102, number of negative: 102
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead



[LightGBM] [Info] Number of positive: 96, number of negative: 96
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1045
[LightGBM] [Info] Number of data points in the train set: 192, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 104, number of negative: 104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1109
[LightGBM] [Info] Number of data points in the train set: 208, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 107, number of negative: 107
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhea



[LightGBM] [Info] Number of positive: 119, number of negative: 119
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000356 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1180
[LightGBM] [Info] Number of data points in the train set: 238, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 103, number of negative: 103
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1066
[LightGBM] [Info] Number of data points in the train set: 206, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 91, number of negative: 91
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhea



[LightGBM] [Info] Number of positive: 109, number of negative: 109
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000292 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1091
[LightGBM] [Info] Number of data points in the train set: 218, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 117, number of negative: 117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1170
[LightGBM] [Info] Number of data points in the train set: 234, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 105, number of negative: 105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh



[LightGBM] [Info] Number of positive: 118, number of negative: 118
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1167
[LightGBM] [Info] Number of data points in the train set: 236, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 95, number of negative: 95
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 190, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 97, number of negative: 97
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead 



[LightGBM] [Info] Number of positive: 98, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000270 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1028
[LightGBM] [Info] Number of data points in the train set: 196, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 119, number of negative: 119
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1188
[LightGBM] [Info] Number of data points in the train set: 238, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 96, number of negative: 96
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead 



[LightGBM] [Info] Number of positive: 109, number of negative: 109
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 218, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 122, number of negative: 122
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000566 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1208
[LightGBM] [Info] Number of data points in the train set: 244, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 119, number of negative: 119
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh



[LightGBM] [Info] Number of positive: 107, number of negative: 107
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000293 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 214, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 98, number of negative: 98
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1040
[LightGBM] [Info] Number of data points in the train set: 196, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 101, number of negative: 101
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhea



[LightGBM] [Info] Number of positive: 104, number of negative: 104
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1088
[LightGBM] [Info] Number of data points in the train set: 208, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 99, number of negative: 99
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1032
[LightGBM] [Info] Number of data points in the train set: 198, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 105, number of negative: 105
[Ligh

