# Optuna Meta-model (tuned ensemble of regressors)

# 1. Definitions

In [1]:
import numpy as np; import pandas as pd; import matplotlib.pyplot as plt;
import os; import joblib; 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Names of the candidate regressors (36 entries, order preserved).
model_names = [
    'AdaBoostRegressor', 'ARDRegression', 'BaggingRegressor',
    'BayesianRidge', 'DecisionTreeRegressor', 'DummyRegressor',
    'ElasticNet', 'ElasticNetCV', 'ExtraTreeRegressor',
    'GradientBoostingRegressor', 'GaussianProcessRegressor', 'HistGradientBoostingRegressor',
    'HuberRegressor', 'KNeighborsRegressor', 'Lars',
    'LarsCV', 'Lasso', 'LassoLars',
    'LassoLarsCV', 'LassoLarsIC', 'LinearSVR',
    'MLPRegressor', 'NuSVR', 'OrthogonalMatchingPursuit',
    'OrthogonalMatchingPursuitCV', 'PassiveAggressiveRegressor', 'RANSACRegressor',
    'Ridge', 'RandomForestRegressor', 'SGDRegressor',
    'SVR', 'TheilSenRegressor', 'TransformedTargetRegressor',
    'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor',
]

from sklearn.linear_model import ARDRegression, BayesianRidge, ElasticNet, ElasticNetCV, Lasso, LassoLarsIC,Lars
from sklearn.linear_model import OrthogonalMatchingPursuit, OrthogonalMatchingPursuitCV, RANSACRegressor,TheilSenRegressor
from sklearn.linear_model import SGDRegressor, LarsCV, Ridge, LassoLars
from sklearn.svm import NuSVR, SVR
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor, HistGradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
import catboost as cb


# 2. Data loading

In [2]:
# Load the data: workbook/sheet names used throughout the notebook
data_file="data.xlsx"
data_exp_sheet="dataset"
data_ext_sheet="solvents"
data_MetaModel_sheet='meta model'
predictions_file="predictions.xlsx"
# Name of the dependent (target) column
Yname='log(x1)exp'
print('------------------')
print(' Benzenesulfonamide ')
print('solubility meta model ')
print('-------------------')
# Columns F:J of the experimental sheet hold the target column and the feature columns
YX=pd.read_excel('.//'+data_file,sheet_name=data_exp_sheet,usecols="F:J")
# Select the target by name and drop that same column by name, so the features
# can never contain the target even if the column order in the sheet changes
Y=YX[Yname]
XX=YX.drop(columns=[Yname])
# Feature columns (F:I) of the solvents used for screening
extXX=pd.read_excel('.//'+data_file,sheet_name=data_ext_sheet,usecols="F:I")
print('Dataset:',len(Y),'measurements')
print('Screening:',len(extXX),'solvents')


------------------
 Benzenesulfonamide 
solubility meta model 
-------------------
Dataset: 190 measurements
Screening: 1698 solvents


In [3]:
# Splitting and standardization
# Hold out 30% of the data, then halve the hold-out into test and validation parts
# (70% / 15% / 15% overall); both splits use a fixed seed.
X_train, X_test1, y_train, y_test1 = train_test_split(
    XX, Y, test_size=0.3, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test1, y_test1, test_size=0.5, random_state=42
)
# The scaler statistics come from the training subset only; every other subset
# (including the full dataset and the screening solvents) is transformed with them.
scaler = StandardScaler().fit(X_train)
X_train_std, X_test_std, X_val_std, XX_std, extXX_std = (
    scaler.transform(subset) for subset in (X_train, X_test, X_val, XX, extXX)
)


# 3. Models loading

In [4]:
# Columns A,B of the meta-model sheet; the 'Regressor' column drives the loading loop
regressors_pd=pd.read_excel('.//'+data_file,sheet_name=data_MetaModel_sheet,usecols="A,B")
# Load models from disk
models = []          # loaded estimators, in sheet order (missing files are skipped)
models_loaded = []   # [load index, name, test-set MSE, estimator] for every loaded model
i=0                  # number of models loaded so far
for name in regressors_pd['Regressor']:
    file=".//models//" + "model_"+ name + ".pkl"
    if not os.path.exists(file):
        print(f"Warning: Model file {file} does not exist.")
        continue
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source.
    model=joblib.load(file)
    models.append(model)
    # Mean squared error of the loaded model on each data subset
    y_train_pred = model.predict(X_train_std)
    y_test_pred = model.predict(X_test_std)
    y_val_pred = model.predict(X_val_std)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    mse_val = mean_squared_error(y_val, y_val_pred)
    loaded=[i,name,mse_test,model]
    models_loaded.append(loaded)
    # NOTE(review): overwritten on every iteration and not read in this cell;
    # kept only so the notebook-level name stays defined.
    extY_pred = model.predict(extXX_std)
    i=i+1
    print(f"{i}. {name}: train set: [mse={mse_train:.3f}] test set: [mse={mse_test:.3f} ]  validation set: [mse={mse_val:.3f} ]")  

# The loop iterates over the sheet's 'Regressor' column, so that column (not a
# hard-coded number or the model_names list) defines how many models were expected.
N_expected=len(regressors_pd['Regressor'])
if len(models)==N_expected:
    print("All models were loaded sucesfully:")
else:
    print("There are still missing some models:", N_expected-len(models))
N_models=i    

1. AdaBoostRegressor: train set: [mse=0.014] test set: [mse=0.025 ]  validation set: [mse=0.019 ]
2. ARDRegression: train set: [mse=0.026] test set: [mse=0.027 ]  validation set: [mse=0.018 ]
3. BaggingRegressor: train set: [mse=0.003] test set: [mse=0.018 ]  validation set: [mse=0.010 ]
4. BayesianRidge: train set: [mse=0.026] test set: [mse=0.027 ]  validation set: [mse=0.018 ]
5. CatBoostRegressor: train set: [mse=0.292] test set: [mse=0.359 ]  validation set: [mse=0.269 ]
6. DecisionTreeRegressor: train set: [mse=0.016] test set: [mse=0.029 ]  validation set: [mse=0.030 ]
7. DummyRegressor: train set: [mse=0.370] test set: [mse=0.382 ]  validation set: [mse=0.209 ]
8. ElasticNet: train set: [mse=0.030] test set: [mse=0.032 ]  validation set: [mse=0.018 ]
9. ElasticNetCV: train set: [mse=2.199] test set: [mse=2.247 ]  validation set: [mse=2.110 ]
10. ExtraTreeRegressor: train set: [mse=0.016] test set: [mse=0.028 ]  validation set: [mse=0.016 ]
11. GaussianProcessRegressor: train se

# 4. Prediction

In [24]:
# Features (columns C:F) and solvent names (column A) of the solvents to predict
newXX=pd.read_excel('.//'+predictions_file,usecols="C:F")
newSolvents=pd.read_excel('.//'+predictions_file,usecols="A")
newXX_std=scaler.transform(newXX)
# Look weights up by regressor name: `models` only contains the files that were
# found on disk, so pairing models with sheet rows by position would assign wrong
# weights as soon as one model file is missing.
weight_by_name=dict(zip(regressors_pd['Regressor'],regressors_pd['weight']))
# Meta-model prediction = weighted sum of the individual model predictions
newY_pred = np.zeros(newXX_std.shape[0])
for _idx, name, _mse_test, model in models_loaded:
    weight=weight_by_name[name]
    newY_pred += weight * model.predict(newXX_std)
# One row per solvent: name column from the file next to the 'meta model' prediction
newY_pred_df = pd.concat([newSolvents,pd.DataFrame({'meta model': newY_pred})], axis=1)
print('------------------------------------------------')
print(' Predicted benzenesulfonamide solubiliy ')
print('     in the foloowing solvents')
print('    (logarithm of mole fraction)')
print('------------------------------------------------')
for solvent, predicted in zip(newY_pred_df['solvent'], newY_pred_df['meta model']):
    print(f"{solvent} {predicted:.2f}" )
print('------------------------------------------------')

------------------------------------------------
 Predicted benzenesulfonamide solubiliy 
     in the foloowing solvents
    (logarithm of mole fraction)
------------------------------------------------
DMSO -0.58
DMF -0.62
Morpholine-4-carbaldehyde -1.06
n-Pentanol -1.95
pyrimidine -0.62
------------------------------------------------
