In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import clone
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer 

def run_stratified_cv(
    pipeline: Pipeline,
    features: "np.ndarray | pd.DataFrame",
    labels: np.ndarray,
    n_splits: int = 5,
    n_bins: int = 10,
    random_state: int = 42,
    transform_target: bool = False,
    verbose: bool = True
) -> tuple[np.ndarray, np.ndarray, float, float, float, float]:
    """Cross-validate a regression pipeline on folds stratified by binned target.

    The continuous target is cut into ``n_bins`` equal-width bins with
    ``pd.cut`` and those bin indices are used as the stratification classes
    for ``StratifiedKFold``. A fresh clone of ``pipeline`` is fitted per fold.

    Args:
        pipeline: Estimator to evaluate; it is cloned for every fold, so the
            object passed in is never fitted.
        features: Feature matrix, either a NumPy array or a pandas DataFrame.
            Rows are always selected by position.
        labels: Target values. Inputs with more than one dimension are
            flattened with ``ravel``.
        n_splits: Number of CV folds.
        n_bins: Number of equal-width bins used to stratify the target.
        random_state: Seed for the shuffled fold assignment.
        transform_target: If true, fit on ``log(y + offset)`` and map the
            predictions back with ``exp(pred) - offset``. ``offset`` is
            ``abs(min(labels)) + 1`` when the minimum label is <= 0, else 1.
        verbose: If true, print the pipeline, per-fold metrics and the final
            summary. If false, nothing is printed.

    Returns:
        ``(y_pred, y_true, mean_rmse, std_rmse, mean_r2, std_r2)``. ``y_pred``
        and ``y_true`` are the per-fold test predictions and targets
        concatenated in fold order (not in the original row order); the
        metrics are NaN-aware mean / standard deviation over the folds.
    """
    # Work on a plain 1-D array so that fold indices are always positional,
    # even if a pandas Series or a column vector is passed in.
    labels = np.asarray(labels)
    if labels.ndim > 1:
        labels = labels.ravel()

    def _take_rows(row_idx):
        # DataFrames need positional .iloc; arrays are indexed directly.
        if hasattr(features, "iloc"):
            return features.iloc[row_idx]
        return features[row_idx]

    logp_bins = pd.cut(labels, bins=n_bins, labels=False, duplicates="drop")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Shift applied before the log so that every transformed label is defined.
    offset = 0.0
    if transform_target:
        min_label = np.min(labels)
        offset = abs(min_label) + 1 if min_label <= 0 else 1

    all_rmse, all_r2, all_y_pred, all_y_test = [], [], [], []
    if verbose:
        print(f"\nStarting {n_splits}-Fold CV for:\n{pipeline}")

    for fold, (train_idx, test_idx) in enumerate(skf.split(features, logp_bins), 1):
        X_train, X_test = _take_rows(train_idx), _take_rows(test_idx)
        y_train_orig, y_test_orig = labels[train_idx], labels[test_idx]

        # Clone so no fitted state leaks between folds or back to the caller.
        fold_pipeline = clone(pipeline)
        target = np.log(y_train_orig + offset) if transform_target else y_train_orig
        fold_pipeline.fit(X_train, target)

        y_pred_raw = fold_pipeline.predict(X_test)
        if transform_target:
            # Undo the log transform so metrics are on the original scale.
            y_pred_original_scale = np.exp(y_pred_raw) - offset
        else:
            y_pred_original_scale = y_pred_raw

        rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred_original_scale))
        r2 = r2_score(y_test_orig, y_pred_original_scale)
        if verbose:
            print(f"Fold {fold} RMSE: {rmse:.4f}, R2 Score: {r2:.4f}")

        all_y_pred.append(y_pred_original_scale)
        all_y_test.append(y_test_orig)
        all_rmse.append(rmse)
        all_r2.append(r2)

    final_y_pred = np.concatenate(all_y_pred)
    final_y_test = np.concatenate(all_y_test)
    mean_rmse = np.nanmean(all_rmse)
    std_rmse = np.nanstd(all_rmse)
    mean_r2 = np.nanmean(all_r2)
    std_r2 = np.nanstd(all_r2)

    # The summary honours `verbose` like the rest of the output, so repeated
    # calls (e.g. from a hyperparameter search) can run silently.
    if verbose:
        print(f"\nSummary for: {pipeline}")
        print(f"Average RMSE: {mean_rmse:.4f} +/- {std_rmse:.4f}")
        print(f"Average R2 Score: {mean_r2:.4f} +/- {std_r2:.4f}")
        print("-------------------------------------------------")

    return final_y_pred, final_y_test, mean_rmse, std_rmse, mean_r2, std_r2


In [None]:
# Load the mol2vec dataset: the "exp" column becomes the regression target and
# the target/identifier columns are dropped from the feature table.
df = pd.read_parquet("./data/processed/deepchem_mol2vec_300.parquet")
labels = df["exp"].to_numpy()
features = df.drop(["exp", "smiles", "CMPD_CHEMBLID"], axis = 1)
features = features.to_numpy()
# Column vector of shape (n, 1); run_stratified_cv flattens labels that have
# more than one dimension.
labels = labels.reshape(-1, 1)
# presumably the single remaining column holds one embedding vector per row,
# which np.stack turns into a 2-D matrix — TODO confirm against the parquet schema
features = np.stack(features.squeeze())

# Morgan fingerprint features, prepared the same way as above.
# NOTE(review): morgan_features is not referenced later in the visible cells.
morgan_df = pd.read_parquet("./data/processed/deepchem_morgan_fp.parquet")
morgan_features = morgan_df.drop(["exp"], axis = 1)
morgan_features = morgan_features.to_numpy()
morgan_features = np.stack(morgan_features.squeeze())

# NOTE(review): this path starts with "../data" while the two reads above use
# "./data" — confirm which working directory the notebook is meant to run from.
extended_df = pd.read_parquet("../data/processed/deepchem_extended_mol2vec_300.parquet")
# NOTE(review): this value is overwritten by the np.hstack result at the end of
# the cell before anything reads it.
extended_features = extended_df.drop(["exp", "smiles", "CMPD_CHEMBLID"], axis=1)

mol2vec_col = extended_df["mol2vec"]
# Scalar descriptor columns appended to the mol2vec embedding.
extended_descriptors = [
    "molwt", "clogp", "hba", "hbd",
    "tpsa",
    "num_rotatable_bonds",
    "num_rings",
    "num_aromatic_rings",
    "fraction_csp3",
    "num_heavy_atoms",
    "num_valence_electrons"
]


# Expand the per-row embedding into a float64 matrix and pull the descriptor
# columns out as a second float64 matrix.
features_mol2vec = np.array(mol2vec_col.tolist(), dtype=np.float64)
features_descriptors = extended_df[extended_descriptors].to_numpy(dtype=np.float64)

# Final feature matrix: embedding columns first, descriptor columns after.
# NOTE(review): labels come from df, not extended_df; the two files are assumed
# to share the same row order — verify.
extended_features = np.hstack((features_mol2vec, features_descriptors))

In [None]:
import optuna
import lightgbm as lgb
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import RobustScaler

# Wrap the combined feature matrix in a DataFrame whose columns carry the
# generic names f0, f1, ... (one per column of extended_features).
num_features = extended_features.shape[1]
generic_feature_names = list(map("f{}".format, range(num_features)))
features_df = pd.DataFrame(data=extended_features, columns=generic_feature_names)

def objective_svr(trial):
    """Optuna objective: mean cross-validated RMSE of a robust-scaled RBF SVR.

    Samples ``C`` (log-uniform in [0.1, 1000]) and ``epsilon`` (uniform in
    [0.01, 0.5]) from ``trial``; ``gamma`` and ``kernel`` are fixed. The model
    is scored with ``run_stratified_cv`` (5 folds, no target transform) on the
    module-level ``features_df`` and ``labels``.

    Returns:
        The mean RMSE across the folds.
    """
    # C is suggested before epsilon, matching the order the trial records them.
    c_value = trial.suggest_float("C", 1e-1, 1e3, log=True)
    eps_value = trial.suggest_float("epsilon", 0.01, 0.5)

    regressor = SVR(C=c_value, epsilon=eps_value, gamma="scale", kernel="rbf")
    model = make_pipeline(RobustScaler(), regressor)

    cv_result = run_stratified_cv(
        pipeline=model,
        features=features_df,
        labels=labels,
        n_splits=5,
        verbose=False,
        transform_target=False,
    )
    # Index 2 of the result tuple is the mean RMSE.
    return cv_result[2]

def objective_lgbm(trial):
    """Optuna objective: mean cross-validated RMSE of a LightGBM regressor.

    Tree shape, boosting schedule, sub-sampling and regularisation parameters
    are sampled from ``trial``; objective, metric, boosting type, seed,
    parallelism and verbosity are fixed. The model is scored with
    ``run_stratified_cv`` (5 folds, no target transform) on the module-level
    ``features_df`` and ``labels``.

    Returns:
        The mean RMSE across the folds.
    """
    # Fixed settings first, then the tuned values in the order they are
    # suggested, then the remaining fixed settings.
    params = {"objective": "regression", "metric": "rmse", "boosting_type": "gbdt"}
    params["max_depth"] = trial.suggest_int("max_depth", 3, 10)
    params["n_estimators"] = trial.suggest_int("n_estimators", 500, 2500, step=100)
    params["learning_rate"] = trial.suggest_float("learning_rate", 0.005, 0.1, log=True)
    params["num_leaves"] = trial.suggest_int("num_leaves", 10, 60)
    params["feature_fraction"] = trial.suggest_float("feature_fraction", 0.5, 1.0)
    params["bagging_fraction"] = trial.suggest_float("bagging_fraction", 0.5, 1.0)
    params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 7)
    params["min_child_samples"] = trial.suggest_int("min_child_samples", 5, 50)
    params["reg_alpha"] = trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True)
    params["reg_lambda"] = trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True)
    params.update({"random_state": 42, "n_jobs": -1, "verbose": -1})

    model = make_pipeline(lgb.LGBMRegressor(**params))
    cv_result = run_stratified_cv(
        pipeline=model,
        features=features_df,
        labels=labels,
        n_splits=5,
        verbose=False,
        transform_target=False,
    )
    # Index 2 of the result tuple is the mean RMSE.
    return cv_result[2]

# Sanity check: container type and shape of the combined feature matrix,
# one per output line.
print(type(extended_features), np.shape(extended_features), sep="\n")

In [None]:
n_trials = 50

print(f"\nStarting LightGBM ({n_trials} trials):")
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every fresh run; the seed matches the CV helper's default random_state.
study_lgbm = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgbm.optimize(objective_lgbm, n_trials=n_trials)

# Report the lowest mean CV RMSE found and the parameters that produced it.
print(f"\nLightGBM Best RMSE: {study_lgbm.best_value:.4f}")
print("\nLightGBM Best hyperparameters:")
for key, value in study_lgbm.best_params.items():
    print(f"{key}: {value}")

In [None]:
n_trials = 50

print(f"\nStarting SVR ({n_trials} trials):")
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every fresh run; the seed matches the CV helper's default random_state.
study_svr = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_svr.optimize(objective_svr, n_trials=n_trials)

# Report the lowest mean CV RMSE found and the parameters that produced it.
print(f"\nSVR Best RMSE: {study_svr.best_value:.4f}")
print("\nSVR Best hyperparameters:")
for key, value in study_svr.best_params.items():
    print(f"{key}: {value}")