In [None]:
%pip install -q transformers datasets scikit-learn accelerate optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m337.9/400.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import joblib
import pandas as pd

In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
import joblib

class RandomForestTrainer:
    def __init__(self, param_grid=None, n_splits=3, random_state=42):
        """
        param_grid: dictionary for hyperparameter tuning
        n_splits: number of folds for K-Fold CV
        """
        # Use estimator__ prefix for MultiOutputClassifier
        self.param_grid = param_grid if param_grid else {
            'estimator__n_estimators': [100, 200],
            'estimator__max_depth': [None, 10, 20],
            'estimator__min_samples_split': [2, 5]
        }
        self.n_splits = n_splits
        self.random_state = random_state
        self.best_model = None

    def train(self, X_train, y_train, X_val=None, y_val=None, save_path="rf_model.pkl"):
        """
        X_train, y_train: training features and labels (multi-label one-hot)
        """
        rf = RandomForestClassifier(random_state=self.random_state, class_weight='balanced_subsample', n_jobs=1)
        multi_rf = MultiOutputClassifier(rf, n_jobs=-1)

        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)

        random_search = RandomizedSearchCV(
            multi_rf,
            param_distributions=self.param_grid,
            n_iter=1,
            cv=kf,
            scoring='f1_weighted',
            verbose=3,
            n_jobs=-1,
            random_state=self.random_state
        )

        random_search.fit(X_train, y_train)

        print(f"Best hyperparameters: {random_search.best_params_}")

        self.best_model = random_search.best_estimator_

        if X_val is not None and y_val is not None:
            y_pred = self.best_model.predict(X_val)
            print("Validation Metrics:")
            print(classification_report(y_val, y_pred, zero_division=0))

        joblib.dump(self.best_model, save_path)
        print(f"Final model saved to {save_path}")

        return self.best_model

In [51]:
train_df = pd.read_csv("./train_fp1024.csv")
val_df   = pd.read_csv("./valid_fp1024.csv")
test_df  = pd.read_csv("./test_fp1024.csv")

label_cols = train_df.columns[1:13]
feature_cols = train_df.columns[13:]

y_train = train_df[label_cols].values
y_val   = val_df[label_cols].values
y_test  = test_df[label_cols].values

X_train = train_df[feature_cols].values
X_val   = val_df[feature_cols].values
X_test  = test_df[feature_cols].values

param_grid = {
    'estimator__n_estimators': [400],
    'estimator__max_depth': [40],
    'estimator__min_samples_split': [5],
    'estimator__min_samples_leaf': [1],
    'estimator__max_features': ['sqrt'],
    'estimator__class_weight': ['balanced_subsample']
}

In [52]:
rf_trainer = RandomForestTrainer(param_grid=param_grid, n_splits=3)
rf_model = rf_trainer.train(X_train, y_train, X_val, y_val, save_path="toxicity_rf_model.pkl")

Fitting 3 folds for each of 1 candidates, totalling 3 fits




Best hyperparameters: {'estimator__n_estimators': 400, 'estimator__min_samples_split': 5, 'estimator__min_samples_leaf': 1, 'estimator__max_features': 'sqrt', 'estimator__max_depth': 40, 'estimator__class_weight': 'balanced_subsample'}
Validation Metrics:
              precision    recall  f1-score   support

           0       0.82      0.45      0.58        31
           1       0.65      0.44      0.52        25
           2       0.64      0.18      0.29        87
           3       1.00      0.11      0.20        45
           4       0.57      0.17      0.27        75
           5       0.73      0.28      0.40        29
           6       0.00      0.00      0.00        32
           7       0.67      0.09      0.17       106
           8       0.00      0.00      0.00        35
           9       0.67      0.05      0.09        44
          10       0.56      0.09      0.16       111
          11       0.67      0.08      0.14        75

   micro avg       0.65      0.14      0

In [55]:
import joblib
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score


class LightGBMTrainer:
    def __init__(self, param_grid=None, n_iter=20, cv=3, scoring="f1_macro", random_state=42, n_jobs=-1, verbose=2):
        """
        Multi-label LightGBM trainer using MultiOutputClassifier + RandomizedSearchCV.
        """

        self.base_estimator = LGBMClassifier(
            objective="binary",
            boosting_type="gbdt",
            random_state=random_state,
            n_jobs=-1
        )

        self.model = MultiOutputClassifier(self.base_estimator, n_jobs=-1)

        if param_grid is None:
            self.param_grid = {
                "estimator__num_leaves": [63],
                "estimator__n_estimators": [400],
                "estimator__learning_rate": [0.1, 0.05, 0.01],
                "estimator__max_depth": [15],
                "estimator__min_child_samples": [50],
                "estimator__subsample": [0.9],
            }
        else:
            self.param_grid = param_grid

        self.n_iter = n_iter
        self.cv = cv
        self.scoring = scoring
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.search = None
        self.best_model = None

    def fit(self, X, y):
        """
        Fit model with hyperparameter search.
        """
        self.search = RandomizedSearchCV(
            self.model,
            param_distributions=self.param_grid,
            n_iter=self.n_iter,
            cv=self.cv,
            scoring=self.scoring,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
            verbose=self.verbose
        )

        self.search.fit(X, y)
        self.best_model = self.search.best_estimator_
        return self.best_model

    def evaluate(self, X, y, label_names=None):
        """
        Evaluate model on test/validation data.
        """
        if self.best_model is None:
            raise ValueError("Model not trained yet. Call fit() first.")

        y_pred = self.best_model.predict(X)

        report = classification_report(y, y_pred, target_names=label_names, zero_division=0)
        macro_f1 = f1_score(y, y_pred, average="macro", zero_division=0)
        micro_f1 = f1_score(y, y_pred, average="micro", zero_division=0)

        return {
            "report": report,
            "macro_f1": macro_f1,
            "micro_f1": micro_f1
        }

    def save(self, path="lgbm_multioutput.pkl"):
        """
        Save trained model to disk.
        """
        if self.best_model is None:
            raise ValueError("No trained model to save.")
        joblib.dump(self.best_model, path)

    def load(self, path="lgbm_multioutput.pkl"):
        """
        Load trained model from disk.
        """
        self.best_model = joblib.load(path)
        return self.best_model

In [56]:
train_df = pd.read_csv("./train_fp1024.csv")
val_df   = pd.read_csv("./valid_fp1024.csv")
test_df  = pd.read_csv("./test_fp1024.csv")

label_cols = train_df.columns[1:13]
feature_cols = train_df.columns[13:]

X_train, y_train = train_df[feature_cols].values, train_df[label_cols].values
X_val,   y_val   = val_df[feature_cols].values, val_df[label_cols].values
X_test,  y_test  = test_df[feature_cols].values, test_df[label_cols].values


trainer = LightGBMTrainer(n_iter=20, cv=3, n_jobs=-1)

# Train
trainer.fit(X_train, y_train)

# Evaluate
results = trainer.evaluate(X_test, y_test, label_names=label_cols)
print(results["report"])
print("Macro F1:", results["macro_f1"])
print("Micro F1:", results["micro_f1"])

# Save
trainer.save("tox21_lightgbm.pkl")

Fitting 3 folds for each of 3 candidates, totalling 9 fits




               precision    recall  f1-score   support

        NR-AR       0.58      0.26      0.36        27
    NR-AR-LBD       0.56      0.26      0.36        19
       NR-AhR       0.36      0.23      0.28        92
 NR-Aromatase       0.45      0.11      0.17        47
        NR-ER       0.44      0.11      0.18        70
    NR-ER-LBD       0.22      0.10      0.13        21
NR-PPAR-gamma       0.00      0.00      0.00        22
       SR-ARE       0.26      0.04      0.07       118
     SR-ATAD5       0.17      0.03      0.05        33
       SR-HSE       0.67      0.09      0.15        47
       SR-MMP       0.40      0.10      0.17        96
       SR-p53       0.78      0.10      0.17        72

    micro avg       0.41      0.11      0.18       664
    macro avg       0.41      0.12      0.17       664
 weighted avg       0.42      0.11      0.17       664
  samples avg       0.06      0.04      0.04       664

Macro F1: 0.17459792752883316
Micro F1: 0.1770956316410862


In [61]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import classification_report, precision_recall_curve, f1_score
import joblib

class LightGBMTrainer:
    def __init__(self, params=None, num_boost_round=500, early_stopping_rounds=50, random_state=42):
        """
        params: dictionary of LightGBM hyperparameters
        num_boost_round: max boosting rounds
        early_stopping_rounds: early stopping for validation
        """
        self.params = params if params else {
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'is_unbalance': True,
            'metric': 'binary_logloss',
            'random_state': random_state,
            'n_jobs': -1
        }
        self.num_boost_round = num_boost_round
        self.early_stopping_rounds = early_stopping_rounds
        self.models = []
        self.thresholds = []

    def train(self, X_train, y_train, X_val=None, y_val=None, save_path="lgbm_model.pkl"):
        """
        X_train, y_train: training features and labels (multi-label one-hot)
        X_val, y_val: optional validation set for metrics
        """
        n_tasks = y_train.shape[1]
        self.models = []
        self.thresholds = []

        for task_idx in range(n_tasks):
            print(f"\nTraining task {task_idx+1}/{n_tasks}")
            train_data = lgb.Dataset(X_train, label=y_train[:, task_idx])

            valid_data = lgb.Dataset(X_val, label=y_val[:, task_idx]) if X_val is not None else None

            callbacks = []
            if X_val is not None and y_val is not None:
              callbacks.append(lgb.early_stopping(stopping_rounds=self.early_stopping_rounds, verbose=True))

            model = lgb.train(
              self.params,
              train_data,
              num_boost_round=self.num_boost_round,
              valid_sets=[valid_data] if valid_data is not None else None,
              callbacks=callbacks
            )
            self.models.append(model)

            # threshold tuning on validation set
            if X_val is not None and y_val is not None:
                probs = model.predict(X_val)
                precision, recall, thres = precision_recall_curve(y_val[:, task_idx], probs)
                f1 = 2*precision*recall/(precision+recall+1e-6)
                best_thres = thres[f1.argmax()] if len(thres) > 0 else 0.5
                self.thresholds.append(best_thres)
            else:
                self.thresholds.append(0.5)

        joblib.dump({'models': self.models, 'thresholds': self.thresholds}, save_path)
        print(f"\nAll models saved to {save_path}")

        if X_val is not None and y_val is not None:
            y_pred = self.predict(X_val)
            print("\nValidation Metrics:")
            print(classification_report(y_val, y_pred, zero_division=0))

        return self

    def predict(self, X):
        """
        Returns predictions using tuned thresholds
        """
        n_tasks = len(self.models)
        y_pred = np.zeros((X.shape[0], n_tasks))

        for i, model in enumerate(self.models):
            probs = model.predict(X)
            y_pred[:, i] = (probs >= self.thresholds[i]).astype(int)

        return y_pred

    def predict_proba(self, X):
        """
        Returns predicted probabilities for each task
        """
        n_tasks = len(self.models)
        y_proba = np.zeros((X.shape[0], n_tasks))

        for i, model in enumerate(self.models):
            y_proba[:, i] = model.predict(X)

        return y_proba

In [62]:
# Load your data
train_df = pd.read_csv("./train_fp1024.csv")
val_df   = pd.read_csv("./valid_fp1024.csv")
test_df  = pd.read_csv("./test_fp1024.csv")

label_cols = train_df.columns[1:13]
feature_cols = train_df.columns[13:]

X_train, y_train = train_df[feature_cols].values, train_df[label_cols].values
X_val, y_val     = val_df[feature_cols].values, val_df[label_cols].values
X_test, y_test   = test_df[feature_cols].values, test_df[label_cols].values

# Initialize and train
lgb_trainer = LightGBMTrainer(num_boost_round=1000, early_stopping_rounds=50)
lgb_trainer.train(X_train, y_train, X_val, y_val, save_path="toxicity_lgbm.pkl")

# Test set predictions
y_test_pred = lgb_trainer.predict(X_test)
y_test_proba = lgb_trainer.predict_proba(X_test)

# Evaluation
from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred, zero_division=0))


Training task 1/12
[LightGBM] [Info] Number of positive: 250, number of negative: 6008
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2004
[LightGBM] [Info] Number of data points in the train set: 6258, number of used features: 1002
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039949 -> initscore=-3.179386
[LightGBM] [Info] Start training from score -3.179386
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.162613

Training task 2/12
[LightGBM] [Info] Number of positive: 193, number of negative: 6065
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can se

In [17]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import early_stopping, log_evaluation
import optuna

class ESOLTrainer:
    def __init__(self, data_dir=".", target_col="measured log solubility in mols per litre"):
        self.data_dir = data_dir
        self.target_col = target_col
        self.models = {}
        self.best_params = None

        # Load data
        self.train = pd.read_csv(os.path.join(data_dir, "train_with_desc.csv"))
        self.valid = pd.read_csv(os.path.join(data_dir, "valid_with_desc.csv"))
        self.test = pd.read_csv(os.path.join(data_dir, "test_with_desc.csv"))

        # Split X, y
        self.X_train, self.y_train = self._split_xy(self.train)
        self.X_valid, self.y_valid = self._split_xy(self.valid)
        self.X_test, self.y_test = self._split_xy(self.test)

    def _split_xy(self, df):
        X = df.drop(columns=[self.target_col, "smiles"], errors="ignore")
        y = df[self.target_col]
        return X, y

    def _objective(self, trial):
        params = {
            "objective": "regression",
            "metric": "rmse",
            "boosting_type": "gbdt",
            "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 16, 256),
            "max_depth": trial.suggest_int("max_depth", 3, 15),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 50),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
            "verbosity": -1,
            "seed": 42,
        }

        dtrain = lgb.Dataset(self.X_train, self.y_train)
        dvalid = lgb.Dataset(self.X_valid, self.y_valid, reference=dtrain)

        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dvalid],
            num_boost_round=5000,
            callbacks=[early_stopping(100), log_evaluation(200)]
        )

        preds = model.predict(self.X_valid, num_iteration=model.best_iteration)
        rmse = mean_squared_error(self.y_valid, preds)
        return rmse

    def tune(self, n_trials=30):
        """Run Optuna tuning and save best params."""
        study = optuna.create_study(direction="minimize")
        study.optimize(self._objective, n_trials=n_trials)
        self.best_params = study.best_params
        print("Best params:", self.best_params)
        return self.best_params

    def trainlgb(self, tuned=True):
        """Train LightGBM with either default or tuned params."""
        if tuned and self.best_params is None:
            print("⚠️ No tuned params found, running tuning first...")
            self.tune(n_trials=30)

        params = self.best_params if (tuned and self.best_params) else {
            "objective": "regression",
            "metric": "rmse",
            "boosting_type": "gbdt",
            "learning_rate": 0.01,
            "num_leaves": 64,
            "feature_fraction": 0.9,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
            "seed": 42,
        }

        dtrain = lgb.Dataset(self.X_train, self.y_train)
        dvalid = lgb.Dataset(self.X_valid, self.y_valid, reference=dtrain)

        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dvalid],
            num_boost_round=5000,
            callbacks=[early_stopping(100), log_evaluation(200)]
        )

        self.models["final"] = model

        # Validation metrics
        val_preds = model.predict(self.X_valid, num_iteration=model.best_iteration)
        val_rmse = mean_squared_error(self.y_valid, val_preds)
        val_r2 = r2_score(self.y_valid, val_preds)
        print(f"✅ Validation RMSE: {val_rmse:.4f}, R²: {val_r2:.4f}")

        # Test metrics
        test_preds = model.predict(self.X_test, num_iteration=model.best_iteration)
        test_rmse = mean_squared_error(self.y_test, test_preds)
        test_r2 = r2_score(self.y_test, test_preds)
        print(f"✅ Test RMSE: {test_rmse:.4f}, R²: {test_r2:.4f}")

        return model, (val_rmse, val_r2), (test_rmse, test_r2)

    def save_model(self, filepath="esol_model.txt"):
        """Save the final trained model to disk."""
        if "final" not in self.models:
            raise ValueError("No trained model found. Train a model first with trainlgb().")
        self.models["final"].save_model(filepath)
        print(f"💾 Model saved to {filepath}")

    def load_model(self, filepath="esol_model.txt"):
        """Load a trained model from disk into self.models['final']."""
        self.models["final"] = lgb.Booster(model_file=filepath)
        print(f"📂 Model loaded from {filepath}")
        return self.models["final"]

In [19]:
trainer = ESOLTrainer(data_dir=".", target_col="p_np")

# Train with best params
model, val_metrics, test_metrics = trainer.trainlgb(tuned=True)

# Save it
trainer.save_model("esol_model.txt")

# Later (new session or notebook)
trainer2 = ESOLTrainer(data_dir=".", target_col="p_np")
loaded_model = trainer2.load_model("esol_model.txt")

# Inference on new data
new_df = pd.read_csv("new_data_with_desc.csv")
X_new = new_df.drop(columns=["smiles", "exp"], errors="ignore")
preds = loaded_model.predict(X_new)

[I 2025-10-07 13:33:57,689] A new study created in memory with name: no-name-150fdbda-4613-44a6-8ab7-2a67db3d5cc9


⚠️ No tuned params found, running tuning first...
Training until validation scores don't improve for 100 rounds


[I 2025-10-07 13:33:58,496] Trial 0 finished with value: 0.05925424002053358 and parameters: {'learning_rate': 0.04080030836352462, 'num_leaves': 173, 'max_depth': 7, 'min_data_in_leaf': 16, 'feature_fraction': 0.7307288480795568, 'bagging_fraction': 0.6273036457175115, 'bagging_freq': 8, 'lambda_l1': 1.754166876021954e-08, 'lambda_l2': 0.08323975954686595}. Best is trial 0 with value: 0.05925424002053358.


Early stopping, best iteration is:
[93]	valid_0's rmse: 0.243422
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.239279


[I 2025-10-07 13:34:00,485] Trial 1 finished with value: 0.05710865064934991 and parameters: {'learning_rate': 0.02632747127126337, 'num_leaves': 69, 'max_depth': 11, 'min_data_in_leaf': 30, 'feature_fraction': 0.6911060563298117, 'bagging_fraction': 0.8784526526603424, 'bagging_freq': 1, 'lambda_l1': 6.284803768963424e-07, 'lambda_l2': 0.0780632710071345}. Best is trial 1 with value: 0.05710865064934991.


Early stopping, best iteration is:
[195]	valid_0's rmse: 0.238974
Training until validation scores don't improve for 100 rounds


[I 2025-10-07 13:34:02,379] Trial 2 finished with value: 0.056346433971335444 and parameters: {'learning_rate': 0.035777803614116054, 'num_leaves': 94, 'max_depth': 11, 'min_data_in_leaf': 17, 'feature_fraction': 0.6160799647639202, 'bagging_fraction': 0.7777987107983574, 'bagging_freq': 1, 'lambda_l1': 0.0006456961429973911, 'lambda_l2': 1.8128993884784538e-07}. Best is trial 2 with value: 0.056346433971335444.


[200]	valid_0's rmse: 0.238573
Early stopping, best iteration is:
[113]	valid_0's rmse: 0.237374
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.287993
[400]	valid_0's rmse: 0.247328
[600]	valid_0's rmse: 0.243183
[800]	valid_0's rmse: 0.241516


[I 2025-10-07 13:34:07,570] Trial 3 finished with value: 0.058287874323639834 and parameters: {'learning_rate': 0.007328991971123967, 'num_leaves': 91, 'max_depth': 10, 'min_data_in_leaf': 27, 'feature_fraction': 0.8164665806274516, 'bagging_fraction': 0.9347553791684166, 'bagging_freq': 5, 'lambda_l1': 1.0560147930376167e-08, 'lambda_l2': 4.239179303635072e-05}. Best is trial 2 with value: 0.056346433971335444.


Early stopping, best iteration is:
[826]	valid_0's rmse: 0.241429
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.244087


[I 2025-10-07 13:34:08,891] Trial 4 finished with value: 0.058211291268631456 and parameters: {'learning_rate': 0.04566364939947758, 'num_leaves': 245, 'max_depth': 11, 'min_data_in_leaf': 39, 'feature_fraction': 0.8278290453796879, 'bagging_fraction': 0.92565676739069, 'bagging_freq': 8, 'lambda_l1': 0.04996389870521953, 'lambda_l2': 0.22531692348880805}. Best is trial 2 with value: 0.056346433971335444.


Early stopping, best iteration is:
[143]	valid_0's rmse: 0.24127
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.358573
[400]	valid_0's rmse: 0.278792
[600]	valid_0's rmse: 0.25417
[800]	valid_0's rmse: 0.246773
[1000]	valid_0's rmse: 0.242536
[1200]	valid_0's rmse: 0.241573
[1400]	valid_0's rmse: 0.239911


[I 2025-10-07 13:34:15,294] Trial 5 finished with value: 0.0572975130497897 and parameters: {'learning_rate': 0.005078082657210433, 'num_leaves': 90, 'max_depth': 9, 'min_data_in_leaf': 43, 'feature_fraction': 0.9162516960654926, 'bagging_fraction': 0.8499455791926723, 'bagging_freq': 5, 'lambda_l1': 2.4617839316772487, 'lambda_l2': 0.0003720519138324713}. Best is trial 2 with value: 0.056346433971335444.


Early stopping, best iteration is:
[1485]	valid_0's rmse: 0.239369
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.254451


[I 2025-10-07 13:34:16,453] Trial 6 finished with value: 0.06246523460494778 and parameters: {'learning_rate': 0.020950273678173273, 'num_leaves': 128, 'max_depth': 8, 'min_data_in_leaf': 16, 'feature_fraction': 0.7879221982530066, 'bagging_fraction': 0.7450916929873377, 'bagging_freq': 10, 'lambda_l1': 4.680010791485529, 'lambda_l2': 0.002051145497965467}. Best is trial 2 with value: 0.056346433971335444.


[400]	valid_0's rmse: 0.250691
Early stopping, best iteration is:
[370]	valid_0's rmse: 0.24993
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.275368
[400]	valid_0's rmse: 0.243373
[600]	valid_0's rmse: 0.239822


[I 2025-10-07 13:34:21,170] Trial 7 finished with value: 0.057430193024359734 and parameters: {'learning_rate': 0.008689636154250885, 'num_leaves': 167, 'max_depth': 15, 'min_data_in_leaf': 29, 'feature_fraction': 0.7074492925925563, 'bagging_fraction': 0.9388441516425692, 'bagging_freq': 5, 'lambda_l1': 3.253409233879348e-05, 'lambda_l2': 9.948747791291135e-07}. Best is trial 2 with value: 0.056346433971335444.


Early stopping, best iteration is:
[603]	valid_0's rmse: 0.239646
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.248378


[I 2025-10-07 13:34:21,908] Trial 8 finished with value: 0.0605527024753598 and parameters: {'learning_rate': 0.022247594710456762, 'num_leaves': 49, 'max_depth': 3, 'min_data_in_leaf': 31, 'feature_fraction': 0.8152896070158933, 'bagging_fraction': 0.8265503788105535, 'bagging_freq': 10, 'lambda_l1': 2.5233538393738657, 'lambda_l2': 0.009388687811896074}. Best is trial 2 with value: 0.056346433971335444.


[400]	valid_0's rmse: 0.247358
Early stopping, best iteration is:
[360]	valid_0's rmse: 0.246075
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.264314
[400]	valid_0's rmse: 0.253735
[600]	valid_0's rmse: 0.251901


[I 2025-10-07 13:34:23,224] Trial 9 finished with value: 0.06328645004679091 and parameters: {'learning_rate': 0.02010271891872505, 'num_leaves': 88, 'max_depth': 9, 'min_data_in_leaf': 20, 'feature_fraction': 0.6870819999877861, 'bagging_fraction': 0.6164312578448112, 'bagging_freq': 3, 'lambda_l1': 5.251621291255731, 'lambda_l2': 0.0020118568371399193}. Best is trial 2 with value: 0.056346433971335444.


Early stopping, best iteration is:
[666]	valid_0's rmse: 0.251568
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.253598
[400]	valid_0's rmse: 0.244234


[I 2025-10-07 13:34:26,947] Trial 10 finished with value: 0.059556791713176724 and parameters: {'learning_rate': 0.012648193890219523, 'num_leaves': 26, 'max_depth': 14, 'min_data_in_leaf': 5, 'feature_fraction': 0.6128095211674864, 'bagging_fraction': 0.7473033622887405, 'bagging_freq': 1, 'lambda_l1': 0.0015477779102656326, 'lambda_l2': 1.778076135869884e-08}. Best is trial 2 with value: 0.056346433971335444.


Early stopping, best iteration is:
[332]	valid_0's rmse: 0.244043
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.238816


[I 2025-10-07 13:34:29,722] Trial 11 finished with value: 0.057032992410974405 and parameters: {'learning_rate': 0.03216297199416629, 'num_leaves': 50, 'max_depth': 12, 'min_data_in_leaf': 6, 'feature_fraction': 0.6019897529892858, 'bagging_fraction': 0.7374859501373847, 'bagging_freq': 1, 'lambda_l1': 6.138328113928863e-06, 'lambda_l2': 6.596922622499779}. Best is trial 2 with value: 0.056346433971335444.


Early stopping, best iteration is:
[200]	valid_0's rmse: 0.238816
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.24621


[I 2025-10-07 13:34:30,819] Trial 12 finished with value: 0.05990185728055453 and parameters: {'learning_rate': 0.03360338390225604, 'num_leaves': 16, 'max_depth': 13, 'min_data_in_leaf': 5, 'feature_fraction': 0.6165047601430819, 'bagging_fraction': 0.7260253123059303, 'bagging_freq': 3, 'lambda_l1': 3.440669413702188e-05, 'lambda_l2': 4.315686736567804e-06}. Best is trial 2 with value: 0.056346433971335444.


Early stopping, best iteration is:
[178]	valid_0's rmse: 0.244749
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.238508
[400]	valid_0's rmse: 0.238732
Early stopping, best iteration is:
[322]	valid_0's rmse: 0.237412


[I 2025-10-07 13:34:34,156] Trial 13 finished with value: 0.05636461664350138 and parameters: {'learning_rate': 0.03120244658140354, 'num_leaves': 124, 'max_depth': 12, 'min_data_in_leaf': 11, 'feature_fraction': 0.6091706170230895, 'bagging_fraction': 0.6838226628494538, 'bagging_freq': 3, 'lambda_l1': 0.000979502819910046, 'lambda_l2': 5.800327598994665}. Best is trial 2 with value: 0.056346433971335444.


Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.24557


[I 2025-10-07 13:34:36,366] Trial 14 finished with value: 0.05591230701332577 and parameters: {'learning_rate': 0.013365775645645738, 'num_leaves': 135, 'max_depth': 4, 'min_data_in_leaf': 16, 'feature_fraction': 0.9982114685027492, 'bagging_fraction': 0.6821130233464215, 'bagging_freq': 3, 'lambda_l1': 0.0024075683948095923, 'lambda_l2': 2.864623228001242e-08}. Best is trial 14 with value: 0.05591230701332577.


[400]	valid_0's rmse: 0.238077
Early stopping, best iteration is:
[333]	valid_0's rmse: 0.236458
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.248624
[400]	valid_0's rmse: 0.238986


[I 2025-10-07 13:34:38,435] Trial 15 finished with value: 0.05669000568989172 and parameters: {'learning_rate': 0.012829084653025971, 'num_leaves': 180, 'max_depth': 5, 'min_data_in_leaf': 22, 'feature_fraction': 0.9860854552334561, 'bagging_fraction': 0.672263460512831, 'bagging_freq': 3, 'lambda_l1': 0.017548117191260294, 'lambda_l2': 3.811631905324517e-08}. Best is trial 14 with value: 0.05591230701332577.


[600]	valid_0's rmse: 0.238893
Early stopping, best iteration is:
[537]	valid_0's rmse: 0.238097
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.246366
[400]	valid_0's rmse: 0.242117


[I 2025-10-07 13:34:39,991] Trial 16 finished with value: 0.058401144141127175 and parameters: {'learning_rate': 0.01493374875148112, 'num_leaves': 218, 'max_depth': 6, 'min_data_in_leaf': 50, 'feature_fraction': 0.8881240169180005, 'bagging_fraction': 0.792041891559476, 'bagging_freq': 2, 'lambda_l1': 0.033947184192330046, 'lambda_l2': 3.8719707107812205e-07}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[391]	valid_0's rmse: 0.241663
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.270887
[400]	valid_0's rmse: 0.242645


[I 2025-10-07 13:34:41,182] Trial 17 finished with value: 0.0583643306848945 and parameters: {'learning_rate': 0.009650496315969119, 'num_leaves': 146, 'max_depth': 3, 'min_data_in_leaf': 13, 'feature_fraction': 0.9932082312124646, 'bagging_fraction': 0.7810843217448361, 'bagging_freq': 4, 'lambda_l1': 0.004744103773913055, 'lambda_l2': 1.6645024034376588e-05}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[459]	valid_0's rmse: 0.241587
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.240797


[I 2025-10-07 13:34:42,209] Trial 18 finished with value: 0.056789356381044855 and parameters: {'learning_rate': 0.0173793519150575, 'num_leaves': 106, 'max_depth': 5, 'min_data_in_leaf': 23, 'feature_fraction': 0.9218898321666922, 'bagging_fraction': 0.6779344372104937, 'bagging_freq': 2, 'lambda_l1': 0.00021311427686568192, 'lambda_l2': 1.5302874996802269e-07}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[228]	valid_0's rmse: 0.238305
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.31424
[400]	valid_0's rmse: 0.254027
[600]	valid_0's rmse: 0.2434
[800]	valid_0's rmse: 0.240851
[1000]	valid_0's rmse: 0.240544


[I 2025-10-07 13:34:49,850] Trial 19 finished with value: 0.057752588978930534 and parameters: {'learning_rate': 0.00581517552164164, 'num_leaves': 198, 'max_depth': 7, 'min_data_in_leaf': 11, 'feature_fraction': 0.7740421912884066, 'bagging_fraction': 0.985939045942724, 'bagging_freq': 7, 'lambda_l1': 0.2445240815117757, 'lambda_l2': 1.2405435254432184e-08}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[1036]	valid_0's rmse: 0.240318
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.257655
[400]	valid_0's rmse: 0.240717


[I 2025-10-07 13:34:50,995] Trial 20 finished with value: 0.05789424759769522 and parameters: {'learning_rate': 0.011184336483613746, 'num_leaves': 145, 'max_depth': 4, 'min_data_in_leaf': 34, 'feature_fraction': 0.8718730808226087, 'bagging_fraction': 0.6470450548001623, 'bagging_freq': 2, 'lambda_l1': 0.00016760165717659355, 'lambda_l2': 1.6174784995172909e-06}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[446]	valid_0's rmse: 0.240612
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.241957


[I 2025-10-07 13:34:53,933] Trial 21 finished with value: 0.05801615189461197 and parameters: {'learning_rate': 0.02578755317259994, 'num_leaves': 123, 'max_depth': 12, 'min_data_in_leaf': 10, 'feature_fraction': 0.6488199473799976, 'bagging_fraction': 0.6897081035840184, 'bagging_freq': 4, 'lambda_l1': 0.002187169492822042, 'lambda_l2': 1.0112447314005157e-07}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[224]	valid_0's rmse: 0.240865
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.244526


[I 2025-10-07 13:34:55,846] Trial 22 finished with value: 0.059644103554850336 and parameters: {'learning_rate': 0.037057978071431304, 'num_leaves': 114, 'max_depth': 13, 'min_data_in_leaf': 17, 'feature_fraction': 0.6448837526487453, 'bagging_fraction': 0.7092964312151666, 'bagging_freq': 4, 'lambda_l1': 0.0005041160169338643, 'lambda_l2': 8.203872240101532}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[185]	valid_0's rmse: 0.244221
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.245218


[I 2025-10-07 13:34:59,626] Trial 23 finished with value: 0.05930775160941522 and parameters: {'learning_rate': 0.047690506893427874, 'num_leaves': 147, 'max_depth': 10, 'min_data_in_leaf': 10, 'feature_fraction': 0.7463703458216299, 'bagging_fraction': 0.7752267234812945, 'bagging_freq': 2, 'lambda_l1': 2.6326656752380505e-06, 'lambda_l2': 8.665691856209887e-05}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[286]	valid_0's rmse: 0.243532
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.248767


[I 2025-10-07 13:35:00,855] Trial 24 finished with value: 0.06087369908815352 and parameters: {'learning_rate': 0.03030951936521498, 'num_leaves': 73, 'max_depth': 11, 'min_data_in_leaf': 24, 'feature_fraction': 0.6597841513597256, 'bagging_fraction': 0.6010447977461073, 'bagging_freq': 3, 'lambda_l1': 0.14688325220926626, 'lambda_l2': 5.692312687087916e-06}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[168]	valid_0's rmse: 0.246726
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.241956
[400]	valid_0's rmse: 0.240481
Early stopping, best iteration is:
[322]	valid_0's rmse: 0.238882


[I 2025-10-07 13:35:04,449] Trial 25 finished with value: 0.05706478668193402 and parameters: {'learning_rate': 0.01649284414242595, 'num_leaves': 104, 'max_depth': 15, 'min_data_in_leaf': 20, 'feature_fraction': 0.9493707099925303, 'bagging_fraction': 0.6479236074399097, 'bagging_freq': 1, 'lambda_l1': 0.007835084222188947, 'lambda_l2': 0.759390897380507}. Best is trial 14 with value: 0.05591230701332577.


Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.238127


[I 2025-10-07 13:35:06,697] Trial 26 finished with value: 0.05605909612215715 and parameters: {'learning_rate': 0.02950610068515548, 'num_leaves': 157, 'max_depth': 13, 'min_data_in_leaf': 14, 'feature_fraction': 0.8496920299397761, 'bagging_fraction': 0.7048516260354084, 'bagging_freq': 4, 'lambda_l1': 4.41009404391169e-05, 'lambda_l2': 1.1342894494018081e-07}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[140]	valid_0's rmse: 0.236768
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.247005


[I 2025-10-07 13:35:09,161] Trial 27 finished with value: 0.05982851954586857 and parameters: {'learning_rate': 0.02748070146924849, 'num_leaves': 162, 'max_depth': 14, 'min_data_in_leaf': 15, 'feature_fraction': 0.8748036923054335, 'bagging_fraction': 0.7663145556313192, 'bagging_freq': 6, 'lambda_l1': 6.291899676502158e-05, 'lambda_l2': 1.0013163876898029e-07}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[144]	valid_0's rmse: 0.244599
Training until validation scores don't improve for 100 rounds
[200]	valid_0's rmse: 0.243059


[I 2025-10-07 13:35:11,852] Trial 28 finished with value: 0.05714235405884513 and parameters: {'learning_rate': 0.0412237101364282, 'num_leaves': 194, 'max_depth': 10, 'min_data_in_leaf': 19, 'feature_fraction': 0.8469949611679061, 'bagging_fraction': 0.8178130258141662, 'bagging_freq': 6, 'lambda_l1': 4.1202276361011383e-07, 'lambda_l2': 5.370542627558154e-07}. Best is trial 14 with value: 0.05591230701332577.


Early stopping, best iteration is:
[135]	valid_0's rmse: 0.239045
Training until validation scores don't improve for 100 rounds


[I 2025-10-07 13:35:12,847] Trial 29 finished with value: 0.05828249072993983 and parameters: {'learning_rate': 0.03973961965593817, 'num_leaves': 160, 'max_depth': 8, 'min_data_in_leaf': 25, 'feature_fraction': 0.7504046482943139, 'bagging_fraction': 0.7072454450625336, 'bagging_freq': 4, 'lambda_l1': 6.055133277234501e-06, 'lambda_l2': 4.5806318809058775e-08}. Best is trial 14 with value: 0.05591230701332577.


[200]	valid_0's rmse: 0.244653
Early stopping, best iteration is:
[143]	valid_0's rmse: 0.241418
Best params: {'learning_rate': 0.013365775645645738, 'num_leaves': 135, 'max_depth': 4, 'min_data_in_leaf': 16, 'feature_fraction': 0.9982114685027492, 'bagging_fraction': 0.6821130233464215, 'bagging_freq': 3, 'lambda_l1': 0.0024075683948095923, 'lambda_l2': 2.864623228001242e-08}
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l2: 0.06502
Early stopping, best iteration is:
[270]	valid_0's l2: 0.0621196
✅ Validation RMSE: 0.0621, R²: 0.7491
✅ Test RMSE: 0.2862, R²: -0.1474
💾 Model saved to esol_model.txt


KeyError: 'exp'

In [None]:
Best params: {'learning_rate': 0.014896238632781483, 'num_leaves': 256, 'max_depth': 11, 'min_data_in_leaf': 29, 'feature_fraction': 0.7642821811125154, 'bagging_fraction': 0.834450819278106, 'bagging_freq': 9, 'lambda_l1': 0.00873099190852502, 'lambda_l2': 0.0013897750455788976}

In [5]:
# Best params from your tuning
best_params = {
    'learning_rate': 0.014896238632781483,
    'num_leaves': 256,
    'max_depth': 11,
    'min_data_in_leaf': 29,
    'feature_fraction': 0.7642821811125154,
    'bagging_fraction': 0.834450819278106,
    'bagging_freq': 9,
    'lambda_l1': 0.00873099190852502,
    'lambda_l2': 0.0013897750455788976,
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'seed': 42
}

# Prepare datasets
dtrain = lgb.Dataset(trainer.X_train, trainer.y_train)
dvalid = lgb.Dataset(trainer.X_valid, trainer.y_valid, reference=dtrain)

# Train final model
final_model = lgb.train(
    best_params,
    dtrain,
    valid_sets=[dvalid],
    num_boost_round=5000,
    callbacks=[early_stopping(100), log_evaluation(200)]
)

# Save
trainer.models["final"] = final_model

# Evaluate
val_preds = final_model.predict(trainer.X_valid, num_iteration=final_model.best_iteration)
val_rmse = mean_squared_error(trainer.y_valid, val_preds, squared=False)
val_r2 = r2_score(trainer.y_valid, val_preds)

test_preds = final_model.predict(trainer.X_test, num_iteration=final_model.best_iteration)
test_rmse = mean_squared_error(trainer.y_test, test_preds, squared=False)
test_r2 = r2_score(trainer.y_test, test_preds)

print(f"✅ Validation RMSE: {val_rmse:.4f}, R²: {val_r2:.4f}")
print(f"✅ Test RMSE: {test_rmse:.4f}, R²: {test_r2:.4f}")

NameError: name 'trainer' is not defined

In [22]:
import os
import joblib
import optuna
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

class BBBPExtraTrees:
    def __init__(self, data_dir=".", target_col="p_np", random_state=42):
        """
        Extra Trees model for BBBP classification with Optuna tuning & persistence.
        - Assumes train/valid/test CSVs already prepared with descriptors/fingerprints.
        """
        self.data_dir = data_dir
        self.target_col = target_col
        self.random_state = random_state
        self.model = None
        self.best_params = None

        # Load data
        self.train = pd.read_csv(os.path.join(data_dir, "train_with_desc.csv"))
        self.valid = pd.read_csv(os.path.join(data_dir, "valid_with_desc.csv"))
        self.test = pd.read_csv(os.path.join(data_dir, "test_with_desc.csv"))

        # Split X, y
        self.X_train, self.y_train = self._split_xy(self.train)
        self.X_valid, self.y_valid = self._split_xy(self.valid)
        self.X_test, self.y_test = self._split_xy(self.test)

    def _split_xy(self, df):
        """Drop SMILES + target, keep descriptors/fingerprints."""
        X = df.drop(columns=[self.target_col, "smiles"], errors="ignore")
        y = df[self.target_col]
        return X, y

    def _objective(self, trial):
        """Optuna objective: maximize ROC-AUC on validation set."""
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 5, 50),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        }

        model = ExtraTreesClassifier(
            **params,
            random_state=self.random_state,
            n_jobs=-1
        )
        model.fit(self.X_train, self.y_train)
        preds = model.predict_proba(self.X_valid)[:, 1]
        return roc_auc_score(self.y_valid, preds)

    def tune(self, n_trials=30):
        """Run Optuna tuning, store best params."""
        study = optuna.create_study(direction="maximize")
        study.optimize(self._objective, n_trials=n_trials)

        self.best_params = study.best_params
        print("Best Params:", self.best_params)
        print("Best ROC-AUC:", study.best_value)
        return self.best_params

    def train_extratrees(self, tuned=True):
        """Train Extra Trees using either tuned or default params."""
        if tuned and self.best_params is None:
            print("⚠️ No tuned params found, running tuning first...")
            self.tune(n_trials=30)

        params = self.best_params if (tuned and self.best_params) else {
            "n_estimators": 300,
            "max_depth": None,
            "max_features": "sqrt",
            "min_samples_split": 2,
            "min_samples_leaf": 1,
        }

        self.model = ExtraTreesClassifier(
            **params,
            random_state=self.random_state,
            n_jobs=-1
        )
        # train on train + valid together for final model
        X_final = pd.concat([self.X_train, self.X_valid], axis=0)
        y_final = pd.concat([self.y_train, self.y_valid], axis=0)
        self.model.fit(X_final, y_final)

        # Validation metrics
        val_preds = self.model.predict(self.X_valid)
        val_probs = self.model.predict_proba(self.X_valid)[:, 1]
        val_acc = accuracy_score(self.y_valid, val_preds)
        val_roc = roc_auc_score(self.y_valid, val_probs)
        print(f"✅ Validation ACC: {val_acc:.4f}, ROC-AUC: {val_roc:.4f}")

        # Test metrics
        test_preds = self.model.predict(self.X_test)
        test_probs = self.model.predict_proba(self.X_test)[:, 1]
        test_acc = accuracy_score(self.y_test, test_preds)
        test_roc = roc_auc_score(self.y_test, test_probs)
        print(f"✅ Test ACC: {test_acc:.4f}, ROC-AUC: {test_roc:.4f}")

        return self.model, (val_acc, val_roc), (test_acc, test_roc)

    def predict(self, X_new):
        if self.model is None:
            raise ValueError("Model not trained yet.")
        return self.model.predict(X_new)

    def predict_proba(self, X_new):
        if self.model is None:
            raise ValueError("Model not trained yet.")
        return self.model.predict_proba(X_new)

    def save_model(self, filepath="bbbp_extratrees.pkl"):
        if self.model is None:
            raise ValueError("No model trained yet.")
        joblib.dump({"model": self.model, "params": self.best_params}, filepath)
        print(f"💾 Model saved to {filepath}")

    def load_model(self, filepath="bbbp_extratrees.pkl"):
        data = joblib.load(filepath)
        self.model = data["model"]
        self.best_params = data.get("params", None)
        print(f"📂 Model loaded from {filepath}")
        return self.model

In [23]:
bbbp_trainer = BBBPExtraTrees(data_dir=".", target_col="p_np", random_state=42)

In [27]:
bbbp_trainer.tune(n_trials=30)  # can increase n_trials for better search



[I 2025-10-07 13:43:15,308] A new study created in memory with name: no-name-8f1419f9-0426-435f-ae2e-25d5db68c688
[I 2025-10-07 13:43:16,295] Trial 0 finished with value: 0.9744759316770186 and parameters: {'n_estimators': 466, 'max_depth': 26, 'max_features': 'log2', 'min_samples_split': 13, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.9744759316770186.
[I 2025-10-07 13:43:23,396] Trial 1 finished with value: 0.968361801242236 and parameters: {'n_estimators': 538, 'max_depth': 14, 'max_features': None, 'min_samples_split': 19, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.9744759316770186.
[I 2025-10-07 13:43:35,087] Trial 2 finished with value: 0.9691381987577639 and parameters: {'n_estimators': 881, 'max_depth': 15, 'max_features': None, 'min_samples_split': 6, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.9744759316770186.
[I 2025-10-07 13:43:41,762] Trial 3 finished with value: 0.9712732919254659 and parameters: {'n_estimators': 605, 'max_depth': 14, 'max_

Best Params: {'n_estimators': 145, 'max_depth': 49, 'max_features': 'sqrt', 'min_samples_split': 15, 'min_samples_leaf': 1}
Best ROC-AUC: 0.9760287267080745


{'n_estimators': 145,
 'max_depth': 49,
 'max_features': 'sqrt',
 'min_samples_split': 15,
 'min_samples_leaf': 1}

In [26]:
import numpy as np

def clean_extremes(df, threshold=1e6):
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.clip(-threshold, threshold)
    df = df.fillna(df.median())
    return df

bbbp_trainer.X_train = clean_extremes(bbbp_trainer.X_train)
bbbp_trainer.X_valid = clean_extremes(bbbp_trainer.X_valid)
bbbp_trainer.X_test  = clean_extremes(bbbp_trainer.X_test)

In [28]:
# Train with tuned params if available
model, val_metrics, test_metrics = bbbp_trainer.train_extratrees(tuned=True)
print("Validation metrics (ACC, ROC-AUC):", val_metrics)
print("Test metrics (ACC, ROC-AUC):", test_metrics)

✅ Validation ACC: 0.9902, ROC-AUC: 0.9997
✅ Test ACC: 0.5686, ROC-AUC: 0.7605
Validation metrics (ACC, ROC-AUC): (0.9901960784313726, np.float64(0.9997088509316769))
Test metrics (ACC, ROC-AUC): (0.5686274509803921, np.float64(0.760477888043164))


In [29]:
# Save trained model
bbbp_trainer.save_model("bbbp_extratrees.pkl")


💾 Model saved to bbbp_extratrees.pkl


In [38]:
import os
import joblib
import optuna
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.impute import SimpleImputer

class KlipoSVR:
    def __init__(self, data_dir="data", target_col="logP", random_state=42):
        self.data_dir = data_dir
        self.target_col = target_col
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy="median")  # fill NaNs with median
        self.model = None
        self.best_params = None

        # Load CSVs
        self.train = pd.read_csv(os.path.join(data_dir, "train_with_desc.csv"))
        self.valid = pd.read_csv(os.path.join(data_dir, "valid_with_desc.csv"))
        self.test = pd.read_csv(os.path.join(data_dir, "test_with_desc.csv"))

        # Split X, y
        self.X_train, self.y_train = self._split_xy(self.train)
        self.X_valid, self.y_valid = self._split_xy(self.valid)
        self.X_test, self.y_test = self._split_xy(self.test)

        # Impute missing values
        self.X_train = self.imputer.fit_transform(self.X_train)
        self.X_valid = self.imputer.transform(self.X_valid)
        self.X_test = self.imputer.transform(self.X_test)

        # Scale features
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_valid = self.scaler.transform(self.X_valid)
        self.X_test = self.scaler.transform(self.X_test)

    def _split_xy(self, df):
        """Drop SMILES + target, keep descriptors only"""
        X = df.drop(columns=[self.target_col, "smiles"], errors="ignore")
        y = df[self.target_col]
        return X, y

    def _objective(self, trial):
        """Optuna objective: minimize RMSE on validation set"""
        params = {
            "C": trial.suggest_loguniform("C", 1e-2, 1e3),
            "epsilon": trial.suggest_loguniform("epsilon", 1e-3, 1.0),
            "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
            "kernel": trial.suggest_categorical("kernel", ["rbf", "poly", "sigmoid"]),
        }

        model = SVR(**params)
        model.fit(self.X_train, self.y_train)
        preds = model.predict(self.X_valid)
        rmse = np.sqrt(mean_squared_error(self.y_valid, preds))
        return rmse

    def tune(self, n_trials=50):
        """Run Optuna tuning"""
        study = optuna.create_study(direction="minimize")
        study.optimize(self._objective, n_trials=n_trials)
        self.best_params = study.best_params
        print("Best Params:", self.best_params)
        print("Best RMSE:", study.best_value)
        return self.best_params

    def train_svr(self, tuned=True):
        """Train SVR using tuned or default params"""
        if tuned and self.best_params is None:
            print("⚠️ No tuned params found, running tuning first...")
            self.tune(n_trials=50)

        params = self.best_params if (tuned and self.best_params) else {
            "C": 1.0,
            "epsilon": 0.1,
            "gamma": "scale",
            "kernel": "rbf"
        }

        self.model = SVR(**params)
        X_final = pd.concat([pd.DataFrame(self.X_train), pd.DataFrame(self.X_valid)], axis=0).values
        y_final = pd.concat([self.y_train, self.y_valid], axis=0).values
        self.model.fit(X_final, y_final)

        # Validation metrics
        val_preds = self.model.predict(self.X_valid)
        val_rmse = np.sqrt(mean_squared_error(self.y_valid, val_preds))
        val_r2 = r2_score(self.y_valid, val_preds)
        print(f"✅ Validation RMSE: {val_rmse:.4f}, R²: {val_r2:.4f}")

        # Test metrics
        test_preds = self.model.predict(self.X_test)
        test_rmse = np.sqrt(mean_squared_error(self.y_valid, test_preds))
        test_r2 = r2_score(self.y_test, test_preds)
        print(f"✅ Test RMSE: {test_rmse:.4f}, R²: {test_r2:.4f}")

        return self.model, (val_rmse, val_r2), (test_rmse, test_r2)

    def predict(self, X_new):
        X_new_scaled = self.scaler.transform(X_new)
        return self.model.predict(X_new_scaled)

    def save_model(self, filepath="klipo_svr.pkl"):
        if self.model is None:
            raise ValueError("No model trained yet.")
        joblib.dump({"model": self.model, "scaler": self.scaler, "params": self.best_params}, filepath)
        print(f"💾 Model saved to {filepath}")

    def load_model(self, filepath="klipo_svr.pkl"):
        data = joblib.load(filepath)
        self.model = data["model"]
        self.scaler = data["scaler"]
        self.best_params = data.get("params", None)
        print(f"📂 Model loaded from {filepath}")
        return self.model

In [39]:
klipo = KlipoSVR(data_dir=".", target_col="exp")

klipo.tune(n_trials=30)  # reduce/increase n_trials as needed

klipo.train_svr(tuned=True)
klipo.save_model("klipo_svr.pkl")

[I 2025-10-07 14:19:44,877] A new study created in memory with name: no-name-af317a38-29d7-4714-897d-81da708bdf3a
  "C": trial.suggest_loguniform("C", 1e-2, 1e3),
  "epsilon": trial.suggest_loguniform("epsilon", 1e-3, 1.0),
[I 2025-10-07 14:19:47,289] Trial 0 finished with value: 0.6518016585983064 and parameters: {'C': 57.14793390273132, 'epsilon': 0.023664596154148884, 'gamma': 'scale', 'kernel': 'rbf'}. Best is trial 0 with value: 0.6518016585983064.
  "C": trial.suggest_loguniform("C", 1e-2, 1e3),
  "epsilon": trial.suggest_loguniform("epsilon", 1e-3, 1.0),
[I 2025-10-07 14:19:49,931] Trial 1 finished with value: 11.25532250368084 and parameters: {'C': 94.2562150612123, 'epsilon': 0.03685804451659977, 'gamma': 'auto', 'kernel': 'poly'}. Best is trial 0 with value: 0.6518016585983064.
  "C": trial.suggest_loguniform("C", 1e-2, 1e3),
  "epsilon": trial.suggest_loguniform("epsilon", 1e-3, 1.0),
[I 2025-10-07 14:19:50,273] Trial 2 finished with value: 0.8601725617397932 and parameters:

Best Params: {'C': 2.631454426892058, 'epsilon': 0.012308759548259534, 'gamma': 'auto', 'kernel': 'rbf'}
Best RMSE: 0.6181263147148874
✅ Validation RMSE: 0.2436, R²: 0.9413
✅ Test RMSE: 1.2590, R²: 0.6046
💾 Model saved to klipo_svr.pkl


In [34]:
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from src.featuriser import DescriptorFeaturizer
import optuna

class KlipoSKLearn:
    def __init__(self, data_dir="data", target_col="logP", random_state=42):
        self.data_dir = data_dir
        self.target_col = target_col
        self.random_state = random_state
        self.imputer = SimpleImputer(strategy="median")
        self.featurizer = DescriptorFeaturizer()
        self.model = None
        self.best_params = None

        # Load raw CSVs
        self.train = pd.read_csv(os.path.join(data_dir, "train.csv"))
        self.valid = pd.read_csv(os.path.join(data_dir, "valid.csv"))
        self.test = pd.read_csv(os.path.join(data_dir, "test.csv"))

        # Featurize descriptors
        self.X_train = self.featurize_df(self.train)
        self.X_valid = self.featurize_df(self.valid)
        self.X_test = self.featurize_df(self.test)

        self.y_train = self.train[self.target_col].values
        self.y_valid = self.valid[self.target_col].values
        self.y_test = self.test[self.target_col].values

        # Impute missing values
        self.X_train = self.imputer.fit_transform(self.X_train)
        self.X_valid = self.imputer.transform(self.X_valid)
        self.X_test = self.imputer.transform(self.X_test)

    def featurize_df(self, df):
        """Featurize a dataframe of SMILES"""
        features = []
        for smi in df["smiles"]:
            desc = self.featurizer.featurize_smiles(smi)
            features.append(desc)
        return np.array(features, dtype=np.float32)

    def _objective(self, trial):
        """Optuna objective to minimize RMSE"""
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "random_state": self.random_state,
        }

        model = GradientBoostingRegressor(**params)
        model.fit(self.X_train, self.y_train)
        preds = model.predict(self.X_valid)
        rmse = np.sqrt(mean_squared_error(self.y_valid, preds))
        return rmse

    def tune(self, n_trials=30):
        """Run Optuna hyperparameter tuning"""
        study = optuna.create_study(direction="minimize")
        study.optimize(self._objective, n_trials=n_trials)
        self.best_params = study.best_params
        print("✅ Best Params:", self.best_params)
        print("✅ Best RMSE:", study.best_value)
        return self.best_params

    def p_train(self, tuned=True):
        """Train final model on train + valid"""
        if tuned and self.best_params is None:
            print("⚠️ No tuned params found, running tuning first...")
            self.tune(n_trials=30)

        params = self.best_params if (tuned and self.best_params) else {
            "n_estimators": 500,
            "max_depth": 6,
            "learning_rate": 0.05,
            "subsample": 0.8,
            "random_state": self.random_state,
        }

        self.model = GradientBoostingRegressor(**params)
        X_full = np.vstack([self.X_train, self.X_valid])
        y_full = np.concatenate([self.y_train, self.y_valid])
        self.model.fit(X_full, y_full)

        # Metrics
        val_preds = self.model.predict(self.X_valid)
        val_rmse = np.sqrt(mean_squared_error(self.y_valid, val_preds))
        val_r2 = r2_score(self.y_valid, val_preds)
        print(f"✅ Validation RMSE: {val_rmse:.4f}, R²: {val_r2:.4f}")

        test_preds = self.model.predict(self.X_test)
        test_rmse = np.sqrt(mean_squared_error(self.y_test, test_preds))
        test_r2 = r2_score(self.y_test, test_preds)
        print(f"✅ Test RMSE: {test_rmse:.4f}, R²: {test_r2:.4f}")

        return self.model, (val_rmse, val_r2), (test_rmse, test_r2)

    def predict(self, X_new):
        """Predict on new featurized data"""
        return self.model.predict(X_new)

    def save_model(self, filepath="./models/klipo_skl.pkl"):
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        joblib.dump({"model": self.model, "imputer": self.imputer}, filepath)
        print(f"💾 Model saved to {filepath}")

    def load_model(self, filepath="./models/klipo_skl.pkl"):
        data = joblib.load(filepath)
        self.model = data["model"]
        self.imputer = data["imputer"]
        print(f"📂 Model loaded from {filepath}")
        return self.model

In [35]:
trainer = KlipoSKLearn(
    data_dir="data/lipo",
    target_col="target",
    random_state=42
)

# Train final model with best parameters
best_params = {
    'n_estimators': 872,
    'max_depth': 5,
    'learning_rate': 0.13318585500919636,
    'subsample': 0.68744377756399,
    'min_samples_split': 6,
    'min_samples_leaf': 7,
    'random_state': 42
}

trainer.p_train(tuned=False)
trainer.model.set_params(**best_params)
trainer.p_train(tuned=False)
trainer.save_model(filepath="./models/klipo_skl_best.pkl")

2025-10-08 18:55:48,214 - INFO - DescriptorFeaturizer initialized with 217 descriptors


✅ Validation RMSE: 0.1060, R²: 0.9889
✅ Test RMSE: 0.5718, R²: 0.6065
✅ Validation RMSE: 0.1060, R²: 0.9889
✅ Test RMSE: 0.5718, R²: 0.6065
💾 Model saved to ./models/klipo_skl_best.pkl


In [42]:
import os
import joblib
import numpy as np
import pandas as pd
import optuna
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, accuracy_score
from src.featuriser import DescriptorFeaturizer

class BBBPExtraTrees:
    def __init__(self, data_dir="data/bbbp", target_col="p_np", random_state=42):
        self.data_dir = data_dir
        self.target_col = target_col
        self.random_state = random_state
        self.model = None
        self.best_params = None
        self.scaler = StandardScaler()
        self.featurizer = DescriptorFeaturizer()

        # Load raw CSVs
        self.train = pd.read_csv(os.path.join(data_dir, "train.csv"))
        self.valid = pd.read_csv(os.path.join(data_dir, "valid.csv"))
        self.test = pd.read_csv(os.path.join(data_dir, "test.csv"))

        # Featurize descriptors
        self.X_train = self.featurize_df(self.train)
        self.X_valid = self.featurize_df(self.valid)
        self.X_test = self.featurize_df(self.test)

        # Normalise / scale descriptors
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_valid = self.scaler.transform(self.X_valid)
        self.X_test = self.scaler.transform(self.X_test)

        # Targets
        self.y_train = self.train[self.target_col].values
        self.y_valid = self.valid[self.target_col].values
        self.y_test = self.test[self.target_col].values

        # Compute class weights for imbalance
        classes = np.unique(self.y_train)
        self.class_weights = dict(zip(
            classes,
            compute_class_weight(class_weight="balanced", classes=classes, y=self.y_train)
        ))

    def featurize_df(self, df):
        """Generate descriptors from SMILES and handle large/infinite values"""
        features = []
        for smi in df["smiles"]:
            desc = self.featurizer.featurize_smiles(smi)
            desc = np.nan_to_num(desc, nan=0.0, posinf=np.finfo(np.float32).max, neginf=-np.finfo(np.float32).max)
            features.append(desc)
        X = np.array(features, dtype=np.float32)

        # Replace extremely large values with median of that column
        for i in range(X.shape[1]):
            col = X[:, i]
            max_val = np.finfo(np.float32).max
            median_val = np.median(col[np.isfinite(col)])
            col[col > max_val] = median_val
            col[col < -max_val] = median_val
            X[:, i] = col
        return X

    def _objective(self, trial):
        """Optuna objective: maximize ROC-AUC on validation set"""
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 5, 50),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        }

        model = ExtraTreesClassifier(
            **params,
            random_state=self.random_state,
            class_weight=self.class_weights,
            n_jobs=-1
        )
        model.fit(self.X_train, self.y_train)
        preds = model.predict_proba(self.X_valid)[:, 1]
        return roc_auc_score(self.y_valid, preds)

    def tune(self, n_trials=30):
        """Run Optuna tuning, store best params."""
        study = optuna.create_study(direction="maximize")
        study.optimize(self._objective, n_trials=n_trials)
        self.best_params = study.best_params
        print("✅ Best Params:", self.best_params)
        print("✅ Best ROC-AUC:", study.best_value)
        return self.best_params

    def train_extratrees(self, tuned=True):
        """Train Extra Trees using tuned or default params"""
        if tuned and self.best_params is None:
            print("⚠️ No tuned params found, running tuning first...")
            self.tune(n_trials=30)

        params = self.best_params if (tuned and self.best_params) else {
            "n_estimators": 300,
            "max_depth": None,
            "max_features": "sqrt",
            "min_samples_split": 2,
            "min_samples_leaf": 1,
        }

        self.model = ExtraTreesClassifier(
            **params,
            random_state=self.random_state,
            class_weight=self.class_weights,
            n_jobs=-1
        )
        # Train on train + valid together for final model
        X_final = np.vstack([self.X_train, self.X_valid])
        y_final = np.concatenate([self.y_train, self.y_valid])
        self.model.fit(X_final, y_final)

        # Validation metrics
        val_preds = self.model.predict(self.X_valid)
        val_probs = self.model.predict_proba(self.X_valid)[:, 1]
        val_acc = accuracy_score(self.y_valid, val_preds)
        val_roc = roc_auc_score(self.y_valid, val_probs)
        print(f"✅ Validation ACC: {val_acc:.4f}, ROC-AUC: {val_roc:.4f}")

        # Test metrics
        test_preds = self.model.predict(self.X_test)
        test_probs = self.model.predict_proba(self.X_test)[:, 1]
        test_acc = accuracy_score(self.y_test, test_preds)
        test_roc = roc_auc_score(self.y_test, test_probs)
        print(f"✅ Test ACC: {test_acc:.4f}, ROC-AUC: {test_roc:.4f}")

        return self.model, (val_acc, val_roc), (test_acc, test_roc)

    def predict(self, X_new):
        """Predict class labels from featurized descriptors"""
        X_new_scaled = self.scaler.transform(X_new)
        return self.model.predict(X_new_scaled)

    def predict_proba(self, X_new):
        X_new_scaled = self.scaler.transform(X_new)
        return self.model.predict_proba(X_new_scaled)

    def save_model(self, filepath="./models/bbbp_extratrees.pkl"):
        if self.model is None:
            raise ValueError("No model trained yet.")
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        joblib.dump({
            "model": self.model,
            "scaler": self.scaler,
            "best_params": self.best_params
        }, filepath)
        print(f"💾 Model saved to {filepath}")

    def load_model(self, filepath="./models/bbbp_extratrees.pkl"):
        data = joblib.load(filepath)
        self.model = data["model"]
        self.scaler = data["scaler"]
        self.best_params = data.get("best_params", None)
        print(f"📂 Model loaded from {filepath}")
        return self.model

In [43]:
trainer = BBBPExtraTrees(
    data_dir="data/bbbp",   # Folder containing train.csv, valid.csv, test.csv
    target_col="p_np",      # Column name for BBB permeability
    random_state=42
)

# %% [code]
# Step 3: (Optional) Hyperparameter tuning using Optuna
# This may take a while. If you want to skip, set tuned=False in train_extratrees.
tuned_params = trainer.tune(n_trials=30)
print("Tuned parameters:", tuned_params)

# %% [code]
# Step 4: Train final ExtraTrees model (using tuned parameters)
model, val_metrics, test_metrics = trainer.train_extratrees(tuned=True)
print("Validation Metrics (ACC, ROC-AUC):", val_metrics)
print("Test Metrics (ACC, ROC-AUC):", test_metrics)

# %% [code]
# Step 5: Save the trained model
trainer.save_model(filepath="./models/bbbp_extratrees_best.pkl")


2025-10-08 19:14:00,264 - INFO - DescriptorFeaturizer initialized with 217 descriptors
  X = np.array(features, dtype=np.float32)
[I 2025-10-08 19:14:13,620] A new study created in memory with name: no-name-eb7d183f-9d01-4a2b-8c12-421c490ae9c9
[I 2025-10-08 19:14:14,036] Trial 0 finished with value: 0.970496894409938 and parameters: {'n_estimators': 640, 'max_depth': 24, 'max_features': 'log2', 'min_samples_split': 13, 'min_samples_leaf': 16}. Best is trial 0 with value: 0.970496894409938.
[I 2025-10-08 19:14:14,405] Trial 1 finished with value: 0.9736024844720497 and parameters: {'n_estimators': 192, 'max_depth': 29, 'max_features': None, 'min_samples_split': 7, 'min_samples_leaf': 13}. Best is trial 1 with value: 0.9736024844720497.
[I 2025-10-08 19:14:14,940] Trial 2 finished with value: 0.9716614906832299 and parameters: {'n_estimators': 868, 'max_depth': 43, 'max_features': 'sqrt', 'min_samples_split': 17, 'min_samples_leaf': 20}. Best is trial 1 with value: 0.9736024844720497.
[I

✅ Best Params: {'n_estimators': 484, 'max_depth': 32, 'max_features': None, 'min_samples_split': 3, 'min_samples_leaf': 3}
✅ Best ROC-AUC: 0.9762228260869564
Tuned parameters: {'n_estimators': 484, 'max_depth': 32, 'max_features': None, 'min_samples_split': 3, 'min_samples_leaf': 3}
✅ Validation ACC: 0.9951, ROC-AUC: 1.0000
✅ Test ACC: 0.6029, ROC-AUC: 0.7454
Validation Metrics (ACC, ROC-AUC): (0.9950980392156863, 1.0)
Test Metrics (ACC, ROC-AUC): (0.6029411764705882, 0.7453511899026881)
💾 Model saved to ./models/bbbp_extratrees_best.pkl
