In [None]:
import os
import warnings
warnings.filterwarnings("ignore", category=UserWarning)  
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import VarianceThreshold

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, Pool

from tqdm import tqdm


In [None]:
def has_torch_cuda():
    try:
        import torch
        return torch.cuda.is_available()
    except Exception:
        return False

def get_nvidia_gpu_free_memory_mb():
    try:
        import pynvml
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        free_mb = info.free // (1024 * 1024)
        total_mb = info.total // (1024 * 1024)
        pynvml.nvmlShutdown()
        return free_mb, total_mb
    except Exception:
        return None, None


In [None]:
GPU_AVAILABLE = has_torch_cuda()
FREE_GPU_MB, TOTAL_GPU_MB = get_nvidia_gpu_free_memory_mb()
if FREE_GPU_MB is not None:
    print(f"[GPU] Free memory detected: {FREE_GPU_MB} MB / {TOTAL_GPU_MB} MB")
else:
    print(f"[GPU] Could not detect GPU memory via pynvml. Torch reports CUDA available: {GPU_AVAILABLE}")

CATBOOST_GPU_MIN_MB = 6000  
CATBOOST_USE_GPU = False
if GPU_AVAILABLE and FREE_GPU_MB is not None and FREE_GPU_MB >= CATBOOST_GPU_MIN_MB:
    CATBOOST_USE_GPU = True
elif GPU_AVAILABLE and FREE_GPU_MB is None:
    CATBOOST_USE_GPU = True

USE_GPU_FOR_LIGHTGBM = GPU_AVAILABLE
USE_GPU_FOR_XGBOOST = GPU_AVAILABLE

print(f"CATBOOST_USE_GPU={CATBOOST_USE_GPU}, LIGHTGBM_GPU={USE_GPU_FOR_LIGHTGBM}, XGBOOST_GPU={USE_GPU_FOR_XGBOOST}")


  import pynvml  # type: ignore[import]


[GPU] Free memory detected: 7776 MB / 8188 MB
CATBOOST_USE_GPU=True, LIGHTGBM_GPU=True, XGBOOST_GPU=True


In [None]:
CV_SPLITS = 5
RANDOM_STATE = 42

# Keep features manageable
TOP_K_FEATURES = 2000

LGB_PARAMS = {
    'n_estimators': 400,
    'learning_rate': 0.03,
    'max_depth': 10,
    'n_jobs': -1,
    'verbose': -1,
    'device_type': 'gpu' if USE_GPU_FOR_LIGHTGBM else 'cpu',
}

XGB_PARAMS = {
    'n_estimators': 400,
    'learning_rate': 0.03,
    'max_depth': 8,
    'tree_method': 'hist',
    'verbosity': 0,
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
    'device': 'cuda' if USE_GPU_FOR_XGBOOST else 'cpu'
}

CAT_PARAMS = {
    'iterations': 500,
    'learning_rate': 0.03,
    'depth': 8,
    'verbose': 100,
    'random_state': RANDOM_STATE,
}

if not CATBOOST_USE_GPU:
    CAT_PARAMS['iterations'] = 250


In [6]:
train_df = pd.read_csv('../data/processed/train_processed.csv')
test_df = pd.read_csv('../data/processed/test_processed.csv')

print("Loaded train/test")


Loaded train/test


In [None]:
features = [c for c in test_df.columns if c not in ['id', 'SMILES']]
X = train_df[features].copy()
y = train_df['Tm'].copy()

X = X.copy()
X.replace([np.inf, -np.inf], np.nan, inplace=True)
numeric_cols = X.select_dtypes(include=[np.number]).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].mean())

X.columns = [c.strip().replace(' ', '_') for c in X.columns]

print(f"Initial numeric feature count: {X.shape[1]}")


Initial numeric feature count: 322


In [None]:
vt = VarianceThreshold(threshold=1e-8)
try:
    X = pd.DataFrame(vt.fit_transform(X), columns=[c for i,c in enumerate(X.columns) if vt.get_support()[i]])
    print(f"After variance thresholding: {X.shape[1]} features remain")
except Exception:
    X = X.select_dtypes(include=[np.number])
    print(f"After forcing numeric selection: {X.shape[1]} features remain")

def select_top_k_by_corr(X_df, y_series, k=TOP_K_FEATURES):
    numeric = X_df.select_dtypes(include=[np.number]).columns
    cors = {}
    for c in numeric:
        col = X_df[c].values
        if np.nanstd(col) == 0:
            continue
        corr = np.corrcoef(col, y_series.values)[0,1]
        if np.isfinite(corr):
            cors[c] = abs(corr)
    if not cors:
        return X_df
    sel = sorted(cors.items(), key=lambda x: x[1], reverse=True)[:k]
    cols = [c for c, _ in sel]
    return X_df[cols]

if X.shape[1] > TOP_K_FEATURES:
    X = select_top_k_by_corr(X, y, k=TOP_K_FEATURES)
    print(f"Selected top {X.shape[1]} features by correlation")

print(f"Using {X.shape[1]} features for modeling")


After variance thresholding: 307 features remain
Using 307 features for modeling


In [None]:
# Reset indices for safe iloc slicing
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

USE_LOG_TARGET = False
if (y > 0).all():
    USE_LOG_TARGET = True
    print("TARGET: all positive -> log-transform ENABLED")
else:
    print("TARGET: contains non-positive values -> log-transform DISABLED (to avoid -inf/nan)")


TARGET: all positive -> log-transform ENABLED


In [None]:
kf = KFold(n_splits=CV_SPLITS, shuffle=True, random_state=RANDOM_STATE)
results = {}

def evaluate_model_cv(name, model, scale=False, use_log=USE_LOG_TARGET, lgb_params=None, xgb_params=None, cat_params=None):
    mae_list = []
    steps = []
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('model', model))
    pipe = Pipeline(steps)
    
    it = tqdm(kf.split(X, y), total=kf.get_n_splits(), desc=f'Training {name}', leave=True)
    for fold_idx, (tr_idx, val_idx) in enumerate(it):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        if use_log:
            y_tr_fit = np.log(y_tr)
            pipe.fit(X_tr, y_tr_fit)
            y_pred_log = pipe.predict(X_val)
            y_pred = np.exp(y_pred_log)
        else:
            if name == 'LightGBM' and isinstance(model, lgb.LGBMRegressor):
                model.set_params(**(lgb_params or {}))
                model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)
                y_pred = model.predict(X_val, num_iteration=model.best_iteration_)
            elif name == 'XGBoost' and isinstance(model, xgb.XGBRegressor):
                model.set_params(**(xgb_params or {}))
                model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)
                y_pred = model.predict(X_val)
            elif name == 'CatBoost' and isinstance(model, CatBoostRegressor):
                model.set_params(**(cat_params or {}))
                model.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True, verbose=cat_params.get('verbose', False))
                y_pred = model.predict(X_val)
            else:
                pipe.fit(X_tr, y_tr)
                y_pred = pipe.predict(X_val)

        if np.any(np.isnan(y_pred)):
            y_pred = np.nan_to_num(y_pred, nan=np.nanmedian(y_pred))
        mae = mean_absolute_error(y_val.values, y_pred)
        mae_list.append(mae)
        it.set_postfix_str(f"fold {fold_idx+1} mae {mae:.5f}")
        print(f"{name} Fold {fold_idx+1} MAE: {mae:.5f}")

    mean_mae = np.mean(mae_list)
    std_mae = np.std(mae_list)
    results[name] = {'mean_mae': mean_mae, 'std_mae': std_mae}
    print(f"{name} - Average MAE: {mean_mae:.5f} +/- {std_mae:.5f}\n")


In [None]:
models = [
    ('Ridge', Ridge(alpha=1.0, random_state=RANDOM_STATE), True, {}),
    ('ElasticNet', ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=RANDOM_STATE), True, {}),
    ('RandomForest', RandomForestRegressor(n_estimators=200, max_depth=12, random_state=RANDOM_STATE, n_jobs=-1), False, {}),
    ('HistGB', HistGradientBoostingRegressor(max_iter=300, max_depth=10, random_state=RANDOM_STATE), False, {}),

    ('LightGBM', lgb.LGBMRegressor(**LGB_PARAMS), False, LGB_PARAMS),

    ('XGBoost', xgb.XGBRegressor(**XGB_PARAMS), False, XGB_PARAMS),

    ('CatBoost', CatBoostRegressor(task_type='GPU' if CATBOOST_USE_GPU else 'CPU', devices='0' if CATBOOST_USE_GPU else None,
                                  iterations=CAT_PARAMS['iterations'],
                                  learning_rate=CAT_PARAMS['learning_rate'],
                                  depth=CAT_PARAMS['depth'],
                                  random_seed=RANDOM_STATE,
                                  verbose=CAT_PARAMS.get('verbose', 100),
                                  allow_writing_files=False),
     False, CAT_PARAMS),

    ('KNN', KNeighborsRegressor(n_neighbors=5, n_jobs=-1), True, {}),
]


In [None]:
for name, model, scale, extra_params in models:
    print(f"=== Evaluating: {name} ===")
    try:
        if name == 'LightGBM':
            evaluate_model_cv(name, model, scale=scale, use_log=USE_LOG_TARGET, lgb_params=extra_params)
        elif name == 'XGBoost':
            evaluate_model_cv(name, model, scale=scale, use_log=USE_LOG_TARGET, xgb_params=extra_params)
        elif name == 'CatBoost':
            evaluate_model_cv(name, model, scale=scale, use_log=USE_LOG_TARGET, cat_params=extra_params)
        else:
            evaluate_model_cv(name, model, scale=scale, use_log=USE_LOG_TARGET)
    except Exception as e:
        print(f"[Error] {name} failed: {e}")
        continue

print("Final results summary:")
for k, v in results.items():
    print(f"{k}: mean MAE = {v['mean_mae']:.5f}, std = {v['std_mae']:.5f}")


=== Evaluating: Ridge ===


Training Ridge:   0%|          | 0/5 [00:00<?, ?it/s, fold 3 mae 0.11809]

Ridge Fold 1 MAE: 0.12490
Ridge Fold 2 MAE: 0.11782
Ridge Fold 3 MAE: 0.11809


Training Ridge: 100%|██████████| 5/5 [00:00<00:00, 46.58it/s, fold 5 mae 0.13041]


Ridge Fold 4 MAE: 0.12626
Ridge Fold 5 MAE: 0.13041
Ridge - Average MAE: 0.12349 +/- 0.00487

=== Evaluating: ElasticNet ===


Training ElasticNet:   0%|          | 0/5 [00:00<?, ?it/s, fold 1 mae 0.24677]

ElasticNet Fold 1 MAE: 0.24677


Training ElasticNet:   0%|          | 0/5 [00:00<?, ?it/s, fold 3 mae 0.24954]

ElasticNet Fold 2 MAE: 0.24221
ElasticNet Fold 3 MAE: 0.24954


Training ElasticNet: 100%|██████████| 5/5 [00:00<00:00, 63.47it/s, fold 5 mae 0.23988]


ElasticNet Fold 4 MAE: 0.24671
ElasticNet Fold 5 MAE: 0.23988
ElasticNet - Average MAE: 0.24502 +/- 0.00348

=== Evaluating: RandomForest ===


Training RandomForest:  20%|██        | 1/5 [00:01<00:05,  1.30s/it, fold 1 mae 0.11223]

RandomForest Fold 1 MAE: 0.11223


Training RandomForest:  40%|████      | 2/5 [00:02<00:03,  1.31s/it, fold 2 mae 0.11017]

RandomForest Fold 2 MAE: 0.11017


Training RandomForest:  60%|██████    | 3/5 [00:04<00:02,  1.38s/it, fold 3 mae 0.11057]

RandomForest Fold 3 MAE: 0.11057


Training RandomForest:  80%|████████  | 4/5 [00:05<00:01,  1.37s/it, fold 4 mae 0.11464]

RandomForest Fold 4 MAE: 0.11464


Training RandomForest: 100%|██████████| 5/5 [00:06<00:00,  1.35s/it, fold 5 mae 0.10780]


RandomForest Fold 5 MAE: 0.10780
RandomForest - Average MAE: 0.11108 +/- 0.00227

=== Evaluating: HistGB ===


Training HistGB:  20%|██        | 1/5 [00:03<00:14,  3.66s/it, fold 1 mae 0.10391]

HistGB Fold 1 MAE: 0.10391


Training HistGB:  40%|████      | 2/5 [00:06<00:08,  2.97s/it, fold 2 mae 0.09870]

HistGB Fold 2 MAE: 0.09870


Training HistGB:  60%|██████    | 3/5 [00:08<00:05,  2.72s/it, fold 3 mae 0.10059]

HistGB Fold 3 MAE: 0.10059


Training HistGB:  80%|████████  | 4/5 [00:10<00:02,  2.57s/it, fold 4 mae 0.10470]

HistGB Fold 4 MAE: 0.10470


Training HistGB: 100%|██████████| 5/5 [00:13<00:00,  2.66s/it, fold 5 mae 0.09869]


HistGB Fold 5 MAE: 0.09869
HistGB - Average MAE: 0.10132 +/- 0.00255

=== Evaluating: LightGBM ===


Training LightGBM:  20%|██        | 1/5 [00:05<00:23,  5.97s/it, fold 1 mae 0.10301]

LightGBM Fold 1 MAE: 0.10301


Training LightGBM:  40%|████      | 2/5 [00:08<00:11,  3.97s/it, fold 2 mae 0.10060]

LightGBM Fold 2 MAE: 0.10060


Training LightGBM:  60%|██████    | 3/5 [00:10<00:06,  3.24s/it, fold 3 mae 0.10047]

LightGBM Fold 3 MAE: 0.10047


Training LightGBM:  80%|████████  | 4/5 [00:13<00:02,  2.86s/it, fold 4 mae 0.10397]

LightGBM Fold 4 MAE: 0.10397


Training LightGBM: 100%|██████████| 5/5 [00:15<00:00,  3.11s/it, fold 5 mae 0.09621]


LightGBM Fold 5 MAE: 0.09621
LightGBM - Average MAE: 0.10085 +/- 0.00269

=== Evaluating: XGBoost ===


Training XGBoost:  20%|██        | 1/5 [00:02<00:11,  2.99s/it, fold 1 mae 0.10913]

XGBoost Fold 1 MAE: 0.10913


Training XGBoost:  40%|████      | 2/5 [00:05<00:08,  2.87s/it, fold 2 mae 0.10410]

XGBoost Fold 2 MAE: 0.10410


Training XGBoost:  60%|██████    | 3/5 [00:08<00:05,  2.86s/it, fold 3 mae 0.10467]

XGBoost Fold 3 MAE: 0.10467


Training XGBoost:  80%|████████  | 4/5 [00:11<00:02,  2.84s/it, fold 4 mae 0.10699]

XGBoost Fold 4 MAE: 0.10699


Training XGBoost: 100%|██████████| 5/5 [00:14<00:00,  2.84s/it, fold 5 mae 0.09960]


XGBoost Fold 5 MAE: 0.09960
XGBoost - Average MAE: 0.10490 +/- 0.00319

=== Evaluating: CatBoost ===


Training CatBoost:   0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.0548703	total: 120ms	remaining: 59.7s
100:	learn: 0.0263770	total: 2.38s	remaining: 9.41s
200:	learn: 0.0228651	total: 4.58s	remaining: 6.81s
300:	learn: 0.0216748	total: 6.74s	remaining: 4.45s
400:	learn: 0.0206742	total: 8.9s	remaining: 2.2s


Training CatBoost:  20%|██        | 1/5 [00:11<00:46, 11.52s/it, fold 1 mae 0.11081]

499:	learn: 0.0200660	total: 11s	remaining: 0us
CatBoost Fold 1 MAE: 0.11081
0:	learn: 0.0549402	total: 22.8ms	remaining: 11.4s
100:	learn: 0.0265992	total: 2.27s	remaining: 8.98s
200:	learn: 0.0227784	total: 4.45s	remaining: 6.62s
300:	learn: 0.0211853	total: 6.61s	remaining: 4.37s
400:	learn: 0.0201536	total: 8.77s	remaining: 2.16s


Training CatBoost:  40%|████      | 2/5 [00:22<00:33, 11.31s/it, fold 2 mae 0.10754]

499:	learn: 0.0194226	total: 10.9s	remaining: 0us
CatBoost Fold 2 MAE: 0.10754
0:	learn: 0.0549899	total: 23.3ms	remaining: 11.6s
100:	learn: 0.0265745	total: 2.3s	remaining: 9.1s
200:	learn: 0.0228932	total: 4.5s	remaining: 6.7s
300:	learn: 0.0213469	total: 6.67s	remaining: 4.41s
400:	learn: 0.0202327	total: 8.82s	remaining: 2.18s


Training CatBoost:  60%|██████    | 3/5 [00:33<00:22, 11.25s/it, fold 3 mae 0.10816]

499:	learn: 0.0195059	total: 10.9s	remaining: 0us
CatBoost Fold 3 MAE: 0.10816
0:	learn: 0.0549488	total: 22.4ms	remaining: 11.2s
100:	learn: 0.0269808	total: 2.25s	remaining: 8.88s
200:	learn: 0.0234270	total: 4.42s	remaining: 6.58s
300:	learn: 0.0221162	total: 6.57s	remaining: 4.34s
400:	learn: 0.0210502	total: 8.71s	remaining: 2.15s


Training CatBoost:  80%|████████  | 4/5 [00:44<00:11, 11.18s/it, fold 4 mae 0.10951]

499:	learn: 0.0203029	total: 10.8s	remaining: 0us
CatBoost Fold 4 MAE: 0.10951
0:	learn: 0.0555841	total: 23.8ms	remaining: 11.9s
100:	learn: 0.0268140	total: 2.3s	remaining: 9.09s
200:	learn: 0.0229414	total: 4.48s	remaining: 6.66s
300:	learn: 0.0211402	total: 6.65s	remaining: 4.4s
400:	learn: 0.0198930	total: 8.81s	remaining: 2.17s


Training CatBoost: 100%|██████████| 5/5 [00:56<00:00, 11.23s/it, fold 5 mae 0.10294]


499:	learn: 0.0190845	total: 10.9s	remaining: 0us
CatBoost Fold 5 MAE: 0.10294
CatBoost - Average MAE: 0.10779 +/- 0.00268

=== Evaluating: KNN ===


Training KNN:  40%|████      | 2/5 [00:00<00:00, 16.95it/s, fold 2 mae 0.14316]

KNN Fold 1 MAE: 0.13920
KNN Fold 2 MAE: 0.14316


Training KNN: 100%|██████████| 5/5 [00:00<00:00, 23.98it/s, fold 5 mae 0.14622]

KNN Fold 3 MAE: 0.13404
KNN Fold 4 MAE: 0.13499
KNN Fold 5 MAE: 0.14622
KNN - Average MAE: 0.13952 +/- 0.00467

Final results summary:
Ridge: mean MAE = 0.12349, std = 0.00487
ElasticNet: mean MAE = 0.24502, std = 0.00348
RandomForest: mean MAE = 0.11108, std = 0.00227
HistGB: mean MAE = 0.10132, std = 0.00255
LightGBM: mean MAE = 0.10085, std = 0.00269
XGBoost: mean MAE = 0.10490, std = 0.00319
CatBoost: mean MAE = 0.10779, std = 0.00268
KNN: mean MAE = 0.13952, std = 0.00467





In [None]:
from sklearn.base import clone
from xgboost import callback
def ensemble_top3(models, X, y, kf):
    oof_preds = np.zeros(len(y))
    mae_list = []

    it = tqdm(kf.split(X, y), total=kf.get_n_splits(), desc="Ensemble Top 3", leave=True)
    for fold_idx, (tr_idx, val_idx) in enumerate(it):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        fold_preds = []
        for name, model, params in models:
            m = clone(model)
            
            if name == 'LightGBM':
                m.set_params(**params)
                m.fit(
                    X_tr, y_tr,
                    eval_set=[(X_val, y_val)],
                    callbacks=[
                        lgb.early_stopping(stopping_rounds=50),
                        lgb.log_evaluation(0)
                    ]
                )
                fold_preds.append(m.predict(X_val, num_iteration=m.best_iteration_))
            
            elif name == 'XGBoost':
                m.set_params(**params)
                m.fit(X_tr, y_tr)
                fold_preds.append(m.predict(X_val))
            
            else:  # HistGradientBoosting
                m.fit(X_tr, y_tr)
                fold_preds.append(m.predict(X_val))

        y_pred = np.mean(fold_preds, axis=0)
        oof_preds[val_idx] = y_pred

        mae = mean_absolute_error(y_val, y_pred)
        mae_list.append(mae)
        print(f"Ensemble Fold {fold_idx+1} MAE: {mae:.5f}")

    mean_mae = np.mean(mae_list)
    std_mae = np.std(mae_list)
    print(f"Ensemble - Average MAE: {mean_mae:.5f} +/- {std_mae:.5f}")
    return oof_preds, mean_mae, std_mae


top3_models = [
    ('LightGBM', lgb.LGBMRegressor(**LGB_PARAMS), LGB_PARAMS),
    ('HistGB', HistGradientBoostingRegressor(max_iter=300, max_depth=10, random_state=RANDOM_STATE), {}),
    ('XGBoost', xgb.XGBRegressor(**XGB_PARAMS), XGB_PARAMS),
]

ensemble_oof, ens_mean, ens_std = ensemble_top3(top3_models, X, y, kf)

results['EnsembleTop3'] = {'mean_mae': ens_mean, 'std_mae': ens_std}

print("\n=== Final Results with Ensemble ===")
for k, v in results.items():
    print(f"{k}: mean MAE = {v['mean_mae']:.5f}, std = {v['std_mae']:.5f}")



Ensemble Top 3:   0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[390]	valid_0's l2: 0.0222541
Ensemble Fold 1 MAE: 0.10209
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[396]	valid_0's l2: 0.0189421
Ensemble Fold 2 MAE: 0.09902
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[399]	valid_0's l2: 0.0217741
Ensemble Fold 3 MAE: 0.09920
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[400]	valid_0's l2: 0.0189577
Ensemble Fold 4 MAE: 0.10172
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[399]	valid_0's l2: 0.0187803
Ensemble Fold 5 MAE: 0.09687
Ensemble - Average MAE: 0.09978 +/- 0.00192

=== Final Results with Ensemble ===
Ridge: mean MAE = 0.12349, std = 0.00487
ElasticNet: mean MAE = 0.24502, std = 

In [None]:
# Feature Importance
print("\n=== Feature Importances (LightGBM) ===")
lgb_model = lgb.LGBMRegressor(**LGB_PARAMS)
lgb_model.fit(X, y)
lgb_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': lgb_model.feature_importances_
}).sort_values(by='importance', ascending=False)
print(lgb_importance.head(15))

print("\n=== Feature Importances (XGBoost) ===")
xgb_model = xgb.XGBRegressor(**XGB_PARAMS)
xgb_model.fit(X, y)
xgb_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
}).sort_values(by='importance', ascending=False)
print(xgb_importance.head(15))



=== Feature Importances (LightGBM) ===
              feature  importance
163         PEOE_VSA7         261
121  FpDensityMorgan2         260
117  MinPartialCharge         247
132          BalabanJ         244
205       VSA_EState8         240
129       BCUT2D_MRHI         234
122  FpDensityMorgan3         231
127     BCUT2D_LOGPHI         221
186              TPSA         216
128    BCUT2D_LOGPLOW         215
133           BertzCT         209
130      BCUT2D_MRLOW         207
120  FpDensityMorgan1         202
204       VSA_EState7         176
150            Kappa3         176

=== Feature Importances (XGBoost) ===
                    feature  importance
133                 BertzCT    0.120085
209               NHOHCount    0.106629
230               RingCount    0.055161
113              ExactMolWt    0.042661
221              NumHDonors    0.040694
186                    TPSA    0.040424
112          HeavyAtomMolWt    0.028706
210                 NOCount    0.021037
55               