In [1]:
import os
import warnings
warnings.filterwarnings("ignore", category=UserWarning)  
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import VarianceThreshold

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, Pool

from tqdm import tqdm
from sklearn.base import clone



In [2]:
def has_torch_cuda():
    try:
        import torch
        return torch.cuda.is_available()
    except Exception:
        return False

def get_nvidia_gpu_free_memory_mb():
    try:
        import pynvml
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        free_mb = info.free // (1024 * 1024)
        total_mb = info.total // (1024 * 1024)
        pynvml.nvmlShutdown()
        return free_mb, total_mb
    except Exception:
        return None, None


In [3]:
GPU_AVAILABLE = has_torch_cuda()
FREE_GPU_MB, TOTAL_GPU_MB = get_nvidia_gpu_free_memory_mb()
if FREE_GPU_MB is not None:
    print(f"[GPU] Free memory detected: {FREE_GPU_MB} MB / {TOTAL_GPU_MB} MB")
else:
    print(f"[GPU] Could not detect GPU memory via pynvml. Torch reports CUDA available: {GPU_AVAILABLE}")

CATBOOST_GPU_MIN_MB = 6000  
CATBOOST_USE_GPU = False
if GPU_AVAILABLE and FREE_GPU_MB is not None and FREE_GPU_MB >= CATBOOST_GPU_MIN_MB:
    CATBOOST_USE_GPU = True
elif GPU_AVAILABLE and FREE_GPU_MB is None:
    CATBOOST_USE_GPU = True

USE_GPU_FOR_LIGHTGBM = GPU_AVAILABLE
USE_GPU_FOR_XGBOOST = GPU_AVAILABLE

print(f"CATBOOST_USE_GPU={CATBOOST_USE_GPU}, LIGHTGBM_GPU={USE_GPU_FOR_LIGHTGBM}, XGBOOST_GPU={USE_GPU_FOR_XGBOOST}")


  import pynvml  # type: ignore[import]


[GPU] Free memory detected: 7948 MB / 8188 MB
CATBOOST_USE_GPU=True, LIGHTGBM_GPU=True, XGBOOST_GPU=True


In [4]:
CV_SPLITS = 5
RANDOM_STATE = 42

# Keep features manageable
TOP_K_FEATURES = 2000

LGB_PARAMS = {
    'n_estimators': 400,
    'learning_rate': 0.03,
    'max_depth': 10,
    'n_jobs': -1,
    'verbose': -1,
    'device_type': 'gpu' if USE_GPU_FOR_LIGHTGBM else 'cpu',
}

XGB_PARAMS = {
    'n_estimators': 400,
    'learning_rate': 0.03,
    'max_depth': 8,
    'tree_method': 'hist',
    'verbosity': 0,
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
    'device': 'cuda' if USE_GPU_FOR_XGBOOST else 'cpu'
}

CAT_PARAMS = {
    'iterations': 500,
    'learning_rate': 0.03,
    'depth': 8,
    'verbose': 100,
}

if not CATBOOST_USE_GPU:
    CAT_PARAMS['iterations'] = 250


In [5]:
train_df = pd.read_csv('../data/processed/train_processed.csv')
test_df = pd.read_csv('../data/processed/test_processed.csv')

print("Loaded train/test")


Loaded train/test


In [6]:
features = [c for c in test_df.columns if c not in ['id', 'SMILES']]
X = train_df[features].copy()
y = train_df['Tm'].copy()

X = X.copy()
X.replace([np.inf, -np.inf], np.nan, inplace=True)
numeric_cols = X.select_dtypes(include=[np.number]).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].mean())

X.columns = [c.strip().replace(' ', '_') for c in X.columns]

print(f"Initial numeric feature count: {X.shape[1]}")


Initial numeric feature count: 322


In [7]:
vt = VarianceThreshold(threshold=1e-8)
try:
    X = pd.DataFrame(vt.fit_transform(X), columns=[c for i,c in enumerate(X.columns) if vt.get_support()[i]])
    print(f"After variance thresholding: {X.shape[1]} features remain")
except Exception:
    X = X.select_dtypes(include=[np.number])
    print(f"After forcing numeric selection: {X.shape[1]} features remain")

def select_top_k_by_corr(X_df, y_series, k=TOP_K_FEATURES):
    numeric = X_df.select_dtypes(include=[np.number]).columns
    cors = {}
    for c in numeric:
        col = X_df[c].values
        if np.nanstd(col) == 0:
            continue
        corr = np.corrcoef(col, y_series.values)[0,1]
        if np.isfinite(corr):
            cors[c] = abs(corr)
    if not cors:
        return X_df
    sel = sorted(cors.items(), key=lambda x: x[1], reverse=True)[:k]
    cols = [c for c, _ in sel]
    return X_df[cols]

if X.shape[1] > TOP_K_FEATURES:
    X = select_top_k_by_corr(X, y, k=TOP_K_FEATURES)
    print(f"Selected top {X.shape[1]} features by correlation")

print(f"Using {X.shape[1]} features for modeling")


After variance thresholding: 307 features remain
Using 307 features for modeling


In [None]:
# Reset indices for safe iloc slicing
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

print(" TARGET: Using pre-transformed 'y' for training.")


 TARGET: Using pre-transformed 'y' for training.


In [None]:
kf = KFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)
results = {}

def evaluate_model_cv(name, model, scale=False, lgb_params=None, xgb_params=None, cat_params=None):
    """
    Evaluates a model using cross-validation.
    Assumes the target 'y' is already log-transformed.
    Calculates MAE on the original (Kelvin) scale.
    """
    mae_list = []
    steps = []
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('model', model))
    pipe = Pipeline(steps)
    
    it = tqdm(kf.split(X, y), total=kf.get_n_splits(), desc=f'Training {name}', leave=True)
    for fold_idx, (tr_idx, val_idx) in enumerate(it):
        model = clone(model)
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        if name == 'LightGBM' and isinstance(model, lgb.LGBMRegressor):
            model.set_params(**(lgb_params or {}))
            model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
            y_pred_log = model.predict(X_val, num_iteration=model.best_iteration_)
        elif name == 'XGBoost' and isinstance(model, xgb.XGBRegressor):
            model.set_params(**(xgb_params or {}))
            model.fit(X_tr, y_tr)
            y_pred_log = model.predict(X_val)
        elif name == 'CatBoost' and isinstance(model, CatBoostRegressor):
            model.set_params(**(cat_params or {}))
            model.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True, verbose=cat_params.get('verbose', False))
            y_pred_log = model.predict(X_val)
        else:
            pipe.fit(X_tr, y_tr)
            y_pred_log = pipe.predict(X_val)

        if np.any(np.isnan(y_pred_log)):
            y_pred_log = np.nan_to_num(y_pred_log, nan=np.nanmedian(y_pred_log))
            
        # Convert true values and predictions back to original Kelvin scale for MAE calculation
        true_kelvin = np.exp(y_val.values)
        pred_kelvin = np.exp(y_pred_log)

        mae = mean_absolute_error(true_kelvin, pred_kelvin)
        mae_list.append(mae)
        it.set_postfix_str(f"fold {fold_idx+1} mae {mae:.3f} K")
        print(f"{name} Fold {fold_idx+1} MAE: {mae:.3f} K")

    mean_mae = np.mean(mae_list)
    std_mae = np.std(mae_list)
    results[name] = {'mean_mae': mean_mae, 'std_mae': std_mae}
    print(f"{name} - Average MAE: {mean_mae:.3f} +/- {std_mae:.3f} K\n")

In [10]:
models = [
    ('Ridge', Ridge(alpha=1.0, random_state=RANDOM_STATE), True, {}),
    ('ElasticNet', ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=RANDOM_STATE), True, {}),
    ('RandomForest', RandomForestRegressor(n_estimators=200, max_depth=12, random_state=RANDOM_STATE, n_jobs=-1), False, {}),
    ('HistGB', HistGradientBoostingRegressor(max_iter=300, max_depth=10, random_state=RANDOM_STATE), False, {}),

    ('LightGBM', lgb.LGBMRegressor(**LGB_PARAMS), False, LGB_PARAMS),

    ('XGBoost', xgb.XGBRegressor(**XGB_PARAMS), False, XGB_PARAMS),

    ('CatBoost', CatBoostRegressor(task_type='GPU' if CATBOOST_USE_GPU else 'CPU', devices='0' if CATBOOST_USE_GPU else None,
                                  iterations=CAT_PARAMS['iterations'],
                                  learning_rate=CAT_PARAMS['learning_rate'],
                                  depth=CAT_PARAMS['depth'],
                                  random_seed=RANDOM_STATE,
                                  verbose=CAT_PARAMS.get('verbose', 100),
                                  allow_writing_files=False),
     False, CAT_PARAMS),

    ('KNN', KNeighborsRegressor(n_neighbors=5, n_jobs=-1), True, {}),
]


In [11]:
for name, model, scale, extra_params in models:
    print(f"=== Evaluating: {name} ===")
    try:
        if name == 'LightGBM':
            evaluate_model_cv(name, model, scale=scale, lgb_params=extra_params)
        elif name == 'XGBoost':
            evaluate_model_cv(name, model, scale=scale, xgb_params=extra_params)
        elif name == 'CatBoost':
            evaluate_model_cv(name, model, scale=scale, cat_params=extra_params)
        else:
            evaluate_model_cv(name, model, scale=scale)
    except Exception as e:
        print(f"[Error] {name} failed: {e}")
        continue

print("Final results summary:")
for k, v in results.items():
    print(f"{k}: mean MAE = {v['mean_mae']:.3f} K, std = {v['std_mae']:.3f} K")

=== Evaluating: Ridge ===


Training Ridge:   0%|          | 0/5 [00:00<?, ?it/s, fold 3 mae 32.669 K]

Ridge Fold 1 MAE: 35.860 K
Ridge Fold 2 MAE: 32.662 K
Ridge Fold 3 MAE: 32.669 K


Training Ridge: 100%|██████████| 5/5 [00:00<00:00, 46.84it/s, fold 5 mae 36.343 K]


Ridge Fold 4 MAE: 35.291 K
Ridge Fold 5 MAE: 36.343 K
Ridge - Average MAE: 34.565 +/- 1.586 K

=== Evaluating: ElasticNet ===


Training ElasticNet:   0%|          | 0/5 [00:00<?, ?it/s, fold 1 mae 49.524 K]

ElasticNet Fold 1 MAE: 49.524 K


Training ElasticNet:   0%|          | 0/5 [00:00<?, ?it/s, fold 2 mae 45.071 K]

ElasticNet Fold 2 MAE: 45.071 K


Training ElasticNet:   0%|          | 0/5 [00:00<?, ?it/s, fold 3 mae 46.303 K]

ElasticNet Fold 3 MAE: 46.303 K


Training ElasticNet: 100%|██████████| 5/5 [00:00<00:00, 44.17it/s, fold 5 mae 45.734 K]


ElasticNet Fold 4 MAE: 49.057 K
ElasticNet Fold 5 MAE: 45.734 K
ElasticNet - Average MAE: 47.138 +/- 1.806 K

=== Evaluating: RandomForest ===


Training RandomForest:  20%|██        | 1/5 [00:01<00:05,  1.38s/it, fold 1 mae 31.415 K]

RandomForest Fold 1 MAE: 31.415 K


Training RandomForest:  40%|████      | 2/5 [00:02<00:04,  1.43s/it, fold 2 mae 30.536 K]

RandomForest Fold 2 MAE: 30.536 K


Training RandomForest:  60%|██████    | 3/5 [00:04<00:02,  1.37s/it, fold 3 mae 30.230 K]

RandomForest Fold 3 MAE: 30.230 K


Training RandomForest:  80%|████████  | 4/5 [00:05<00:01,  1.36s/it, fold 4 mae 32.172 K]

RandomForest Fold 4 MAE: 32.172 K


Training RandomForest: 100%|██████████| 5/5 [00:06<00:00,  1.38s/it, fold 5 mae 30.621 K]


RandomForest Fold 5 MAE: 30.621 K
RandomForest - Average MAE: 30.995 +/- 0.707 K

=== Evaluating: HistGB ===


Training HistGB:  20%|██        | 1/5 [00:03<00:14,  3.67s/it, fold 1 mae 28.870 K]

HistGB Fold 1 MAE: 28.870 K


Training HistGB:  40%|████      | 2/5 [00:05<00:08,  2.84s/it, fold 2 mae 27.767 K]

HistGB Fold 2 MAE: 27.767 K


Training HistGB:  60%|██████    | 3/5 [00:08<00:05,  2.60s/it, fold 3 mae 27.929 K]

HistGB Fold 3 MAE: 27.929 K


Training HistGB:  80%|████████  | 4/5 [00:10<00:02,  2.46s/it, fold 4 mae 28.529 K]

HistGB Fold 4 MAE: 28.529 K


Training HistGB: 100%|██████████| 5/5 [00:12<00:00,  2.56s/it, fold 5 mae 28.281 K]


HistGB Fold 5 MAE: 28.281 K
HistGB - Average MAE: 28.275 +/- 0.399 K

=== Evaluating: LightGBM ===


Training LightGBM:  20%|██        | 1/5 [00:03<00:12,  3.21s/it, fold 1 mae 28.640 K]

LightGBM Fold 1 MAE: 28.640 K


Training LightGBM:  40%|████      | 2/5 [00:05<00:08,  2.75s/it, fold 2 mae 27.884 K]

LightGBM Fold 2 MAE: 27.884 K


Training LightGBM:  60%|██████    | 3/5 [00:08<00:05,  2.60s/it, fold 3 mae 27.855 K]

LightGBM Fold 3 MAE: 27.855 K


Training LightGBM:  80%|████████  | 4/5 [00:10<00:02,  2.60s/it, fold 4 mae 28.967 K]

LightGBM Fold 4 MAE: 28.967 K


Training LightGBM: 100%|██████████| 5/5 [00:13<00:00,  2.62s/it, fold 5 mae 27.591 K]


LightGBM Fold 5 MAE: 27.591 K
LightGBM - Average MAE: 28.187 +/- 0.524 K

=== Evaluating: XGBoost ===


Training XGBoost:  20%|██        | 1/5 [00:02<00:11,  2.98s/it, fold 1 mae 30.007 K]

XGBoost Fold 1 MAE: 30.007 K


Training XGBoost:  40%|████      | 2/5 [00:05<00:08,  2.88s/it, fold 2 mae 29.149 K]

XGBoost Fold 2 MAE: 29.149 K


Training XGBoost:  60%|██████    | 3/5 [00:08<00:05,  2.87s/it, fold 3 mae 28.807 K]

XGBoost Fold 3 MAE: 28.807 K


Training XGBoost:  80%|████████  | 4/5 [00:11<00:02,  2.86s/it, fold 4 mae 29.399 K]

XGBoost Fold 4 MAE: 29.399 K


Training XGBoost: 100%|██████████| 5/5 [00:14<00:00,  2.86s/it, fold 5 mae 28.837 K]


XGBoost Fold 5 MAE: 28.837 K
XGBoost - Average MAE: 29.240 +/- 0.441 K

=== Evaluating: CatBoost ===


Training CatBoost:   0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.3023334	test: 0.3079584	best: 0.3079584 (0)	total: 48ms	remaining: 24s
100:	learn: 0.1468339	test: 0.1726140	best: 0.1726140 (100)	total: 2.35s	remaining: 9.28s
200:	learn: 0.1275699	test: 0.1607006	best: 0.1607006 (200)	total: 4.55s	remaining: 6.77s
300:	learn: 0.1213853	test: 0.1578338	best: 0.1578338 (300)	total: 6.72s	remaining: 4.44s
400:	learn: 0.1152299	test: 0.1554998	best: 0.1554959 (399)	total: 8.93s	remaining: 2.21s


Training CatBoost:  20%|██        | 1/5 [00:11<00:46, 11.57s/it, fold 1 mae 31.034 K]

499:	learn: 0.1116463	test: 0.1544331	best: 0.1544317 (498)	total: 11.1s	remaining: 0us
bestTest = 0.1544317437
bestIteration = 498
Shrink model to first 499 iterations.
CatBoost Fold 1 MAE: 31.034 K
0:	learn: 0.3028670	test: 0.3065518	best: 0.3065518 (0)	total: 23.5ms	remaining: 11.7s
100:	learn: 0.1474291	test: 0.1648084	best: 0.1648084 (100)	total: 2.32s	remaining: 9.16s
200:	learn: 0.1266338	test: 0.1527948	best: 0.1527948 (200)	total: 4.56s	remaining: 6.79s
300:	learn: 0.1179519	test: 0.1490288	best: 0.1490288 (300)	total: 6.77s	remaining: 4.48s
400:	learn: 0.1118001	test: 0.1470492	best: 0.1470390 (399)	total: 8.97s	remaining: 2.21s


Training CatBoost:  40%|████      | 2/5 [00:22<00:34, 11.47s/it, fold 2 mae 29.778 K]

499:	learn: 0.1082566	test: 0.1460604	best: 0.1460422 (495)	total: 11.1s	remaining: 0us
bestTest = 0.1460422453
bestIteration = 495
Shrink model to first 496 iterations.
CatBoost Fold 2 MAE: 29.778 K
0:	learn: 0.3035551	test: 0.3034214	best: 0.3034214 (0)	total: 23.3ms	remaining: 11.6s
100:	learn: 0.1468892	test: 0.1688843	best: 0.1688843 (100)	total: 2.33s	remaining: 9.19s
200:	learn: 0.1267725	test: 0.1598783	best: 0.1598783 (200)	total: 4.56s	remaining: 6.78s
300:	learn: 0.1185225	test: 0.1579531	best: 0.1579043 (290)	total: 6.75s	remaining: 4.46s
400:	learn: 0.1122379	test: 0.1566049	best: 0.1565990 (399)	total: 8.94s	remaining: 2.21s


Training CatBoost:  60%|██████    | 3/5 [00:34<00:22, 11.43s/it, fold 3 mae 29.950 K]

499:	learn: 0.1080775	test: 0.1559940	best: 0.1559888 (497)	total: 11.1s	remaining: 0us
bestTest = 0.1559887588
bestIteration = 497
Shrink model to first 498 iterations.
CatBoost Fold 3 MAE: 29.950 K
0:	learn: 0.3026603	test: 0.3066872	best: 0.3066872 (0)	total: 24ms	remaining: 12s
100:	learn: 0.1494117	test: 0.1673973	best: 0.1673973 (100)	total: 2.31s	remaining: 9.14s
200:	learn: 0.1307333	test: 0.1533625	best: 0.1533625 (200)	total: 4.53s	remaining: 6.74s
300:	learn: 0.1223821	test: 0.1485985	best: 0.1485985 (300)	total: 6.69s	remaining: 4.42s
400:	learn: 0.1178869	test: 0.1462069	best: 0.1462069 (400)	total: 8.89s	remaining: 2.19s


Training CatBoost:  80%|████████  | 4/5 [00:45<00:11, 11.40s/it, fold 4 mae 30.622 K]

499:	learn: 0.1143064	test: 0.1443914	best: 0.1443914 (499)	total: 11.1s	remaining: 0us
bestTest = 0.1443914192
bestIteration = 499
CatBoost Fold 4 MAE: 30.622 K
0:	learn: 0.3058040	test: 0.2943333	best: 0.2943333 (0)	total: 24.2ms	remaining: 12.1s
100:	learn: 0.1480444	test: 0.1676858	best: 0.1676858 (100)	total: 2.31s	remaining: 9.12s
200:	learn: 0.1279824	test: 0.1554998	best: 0.1554998 (200)	total: 4.51s	remaining: 6.71s
300:	learn: 0.1183413	test: 0.1504963	best: 0.1504963 (300)	total: 6.72s	remaining: 4.44s
400:	learn: 0.1119353	test: 0.1473385	best: 0.1473385 (400)	total: 8.91s	remaining: 2.2s


Training CatBoost: 100%|██████████| 5/5 [00:57<00:00, 11.42s/it, fold 5 mae 29.259 K]


499:	learn: 0.1070211	test: 0.1454555	best: 0.1454555 (499)	total: 11.1s	remaining: 0us
bestTest = 0.1454555242
bestIteration = 499
CatBoost Fold 5 MAE: 29.259 K
CatBoost - Average MAE: 30.129 +/- 0.629 K

=== Evaluating: KNN ===


Training KNN:  40%|████      | 2/5 [00:00<00:00, 17.34it/s, fold 2 mae 38.134 K]

KNN Fold 1 MAE: 38.184 K
KNN Fold 2 MAE: 38.134 K


Training KNN: 100%|██████████| 5/5 [00:00<00:00, 24.93it/s, fold 5 mae 40.062 K]

KNN Fold 3 MAE: 36.023 K
KNN Fold 4 MAE: 36.780 K
KNN Fold 5 MAE: 40.062 K
KNN - Average MAE: 37.837 +/- 1.383 K

Final results summary:
Ridge: mean MAE = 34.565 K, std = 1.586 K
ElasticNet: mean MAE = 47.138 K, std = 1.806 K
RandomForest: mean MAE = 30.995 K, std = 0.707 K
HistGB: mean MAE = 28.275 K, std = 0.399 K
LightGBM: mean MAE = 28.187 K, std = 0.524 K
XGBoost: mean MAE = 29.240 K, std = 0.441 K
CatBoost: mean MAE = 30.129 K, std = 0.629 K
KNN: mean MAE = 37.837 K, std = 1.383 K





In [12]:
from sklearn.base import clone
from xgboost import callback
def ensemble_top3(models, X, y, kf):
    oof_preds = np.zeros(len(y))
    mae_list = []

    it = tqdm(kf.split(X, y), total=kf.get_n_splits(), desc="Ensemble Top 3", leave=True)
    for fold_idx, (tr_idx, val_idx) in enumerate(it):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        fold_preds = []
        for name, model, params in models:
            m = clone(model)
            
            if name == 'LightGBM':
                m.set_params(**params)
                m.fit(
                    X_tr, y_tr,
                    eval_set=[(X_val, y_val)],
                    callbacks=[
                        lgb.early_stopping(stopping_rounds=50),
                        lgb.log_evaluation(0)
                    ]
                )
                fold_preds.append(m.predict(X_val, num_iteration=m.best_iteration_))
            
            elif name == 'XGBoost':
                m.set_params(**params)
                m.fit(X_tr, y_tr)
                fold_preds.append(m.predict(X_val))
            
            else:  # HistGradientBoosting
                m.fit(X_tr, y_tr)
                fold_preds.append(m.predict(X_val))

        y_pred = np.mean(fold_preds, axis=0)
        oof_preds[val_idx] = y_pred

        mae = mean_absolute_error(np.exp(y_val), np.exp(y_pred))
        mae_list.append(mae)
        print(f"Ensemble Fold {fold_idx+1} MAE: {mae:.3f} K")

    mean_mae = np.mean(mae_list)
    std_mae = np.std(mae_list)
    print(f"Ensemble - Average MAE: {mean_mae:.3f} +/- {std_mae:.3f} K")
    return oof_preds, mean_mae, std_mae


top3_models = [
    ('LightGBM', lgb.LGBMRegressor(**LGB_PARAMS), LGB_PARAMS),
    ('HistGB', HistGradientBoostingRegressor(max_iter=300, max_depth=10, random_state=RANDOM_STATE), {}),
    ('XGBoost', xgb.XGBRegressor(**XGB_PARAMS), XGB_PARAMS),
]

ensemble_oof, ens_mean, ens_std = ensemble_top3(top3_models, X, y, kf)

results['EnsembleTop3'] = {'mean_mae': ens_mean, 'std_mae': ens_std}

print("\n=== Final Results with Ensemble ===")
for k, v in results.items():
    print(f"{k}: mean MAE = {v['mean_mae']:.5f}, std = {v['std_mae']:.5f}")



Ensemble Top 3:   0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[390]	valid_0's l2: 0.0222472


Ensemble Top 3:  20%|██        | 1/5 [00:07<00:29,  7.39s/it]

Ensemble Fold 1 MAE: 28.503 K
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[396]	valid_0's l2: 0.0189527


Ensemble Top 3:  40%|████      | 2/5 [00:14<00:22,  7.36s/it]

Ensemble Fold 2 MAE: 27.611 K
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[399]	valid_0's l2: 0.0217702


Ensemble Top 3:  60%|██████    | 3/5 [00:22<00:14,  7.41s/it]

Ensemble Fold 3 MAE: 27.387 K
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[400]	valid_0's l2: 0.018964


Ensemble Top 3:  80%|████████  | 4/5 [00:29<00:07,  7.52s/it]

Ensemble Fold 4 MAE: 28.434 K
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[399]	valid_0's l2: 0.0187805


Ensemble Top 3: 100%|██████████| 5/5 [00:37<00:00,  7.48s/it]

Ensemble Fold 5 MAE: 27.638 K
Ensemble - Average MAE: 27.915 +/- 0.461 K

=== Final Results with Ensemble ===
Ridge: mean MAE = 34.56499, std = 1.58619
ElasticNet: mean MAE = 47.13790, std = 1.80632
RandomForest: mean MAE = 30.99477, std = 0.70658
HistGB: mean MAE = 28.27533, std = 0.39901
LightGBM: mean MAE = 28.18726, std = 0.52371
XGBoost: mean MAE = 29.23981, std = 0.44053
CatBoost: mean MAE = 30.12858, std = 0.62853
KNN: mean MAE = 37.83654, std = 1.38321
EnsembleTop3: mean MAE = 27.91452, std = 0.46087





In [13]:
# Feature Importance
print("\n=== Feature Importances (LightGBM) ===")
lgb_model = lgb.LGBMRegressor(**LGB_PARAMS)
lgb_model.fit(X, y)
lgb_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': lgb_model.feature_importances_
}).sort_values(by='importance', ascending=False)
print(lgb_importance.head(15))

print("\n=== Feature Importances (XGBoost) ===")
xgb_model = xgb.XGBRegressor(**XGB_PARAMS)
xgb_model.fit(X, y)
xgb_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
}).sort_values(by='importance', ascending=False)
print(xgb_importance.head(15))



=== Feature Importances (LightGBM) ===
              feature  importance
163         PEOE_VSA7         261
121  FpDensityMorgan2         259
132          BalabanJ         244
117  MinPartialCharge         243
205       VSA_EState8         240
129       BCUT2D_MRHI         234
122  FpDensityMorgan3         231
127     BCUT2D_LOGPHI         221
128    BCUT2D_LOGPLOW         215
186              TPSA         214
133           BertzCT         209
130      BCUT2D_MRLOW         206
120  FpDensityMorgan1         203
204       VSA_EState7         176
150            Kappa3         176

=== Feature Importances (XGBoost) ===
                    feature  importance
133                 BertzCT    0.120085
209               NHOHCount    0.106629
230               RingCount    0.055161
113              ExactMolWt    0.042661
221              NumHDonors    0.040694
186                    TPSA    0.040424
112          HeavyAtomMolWt    0.028706
210                 NOCount    0.021037
55               