In [1]:
# Pandas dataframe
import pandas as pd
from pandas import DataFrame

# Scikit-Learn
import sklearn 
from sklearn import datasets, linear_model, metrics, tree
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score

# Models
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.ensemble import (BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, 
RandomForestRegressor, AdaBoostRegressor)
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
        
import joblib
from matplotlib import pyplot as plt
import numpy as np
import os
from tqdm import tqdm

In [2]:
df_abs = pd.read_csv("dataset_scaled_abs.csv")
df_abs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 61 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         152 non-null    int64  
 1   in_amount_mmol                     152 non-null    float64
 2   p_amount_mmol                      152 non-null    float64
 3   ligand_amount_mmol                 152 non-null    float64
 4   first_sol_amount_ml                152 non-null    float64
 5   second_sol_amount_ml               152 non-null    float64
 6   other_1_amount_mmol                152 non-null    float64
 7   other_2_amount_mmol                152 non-null    float64
 8   total_volume_ml                    152 non-null    float64
 9   temp_c                             152 non-null    float64
 10  time_min                           152 non-null    float64
 11  x0_indium acetate                  152 non-null    float64

In [3]:
#Checks the column names, and ensures that they do not have any leading or trailing spaces
df_abs.columns = df_abs.columns.str.strip()

In [4]:
# Input for ML models

input_col = ['in_amount_mmol',
             'p_amount_mmol',
             'ligand_amount_mmol',
             'first_sol_amount_ml',
             'second_sol_amount_ml',
             'other_1_amount_mmol',
             'other_2_amount_mmol',
             'total_volume_ml',
             'temp_c',
             'time_min',
             'x0_indium acetate',
             'x0_indium bromide',
             'x0_indium chloride',
             'x0_indium iodide',
             'x0_indium myristate',
             'x0_indium trifluoroacetate',
             'x1_bis(trimethylsilyl)phosphine',
             'x1_phosphorus trichloride',
             'x1_tris(diethylamino)phosphine',
             'x1_tris(dimethylamino)phosphine',
             'x1_tris(trimethylgermyl)phosphine',
             'x1_tris(trimethylsilyl)phosphine',
             'x2_None',
             'x2_lauric acid',
             'x2_myristic acid',
             'x2_oleic acid',
             'x2_palmitic acid',
             'x2_stearic acid',
             'x3_dodecylamine',
             'x3_octadecene',
             'x3_oleylamine',
             'x3_trioctylamine',
             'x3_trioctylphosphine',
             'x4_None',
             'x4_dioctyl ether',
             'x4_dioctylamine',
             'x4_hexadecylamine',
             'x4_octylamine',
             'x4_oleylamine',
             'x4_toluene',
             'x4_trioctylphosphine',
             'x4_trioctylphosphine oxide',
             'x5_None',
             'x5_acetic acid',
             'x5_superhydride',
             'x5_tetrabutylammonium myristate',
             'x5_zinc bromide'
             ,'x5_zinc chloride'
             ,'x5_zinc iodide'
             ,'x5_zinc oleate',
             'x5_zinc stearate',
             'x5_zinc undecylenate',
             'x6_None',
             'x6_copper bromide',
             'x6_trioctylphosphine',
             'x6_water',
             'x6_zinc iodide',
             ]

output_col = ['abs_nm']

X = df_abs[input_col]

Y = df_abs[output_col]

In [5]:
# Splitting dataset for training
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=45, shuffle=True)

## Grid search 10/26/2021

### Bagging


In [6]:
min_mae = 99999

min_i, min_j, min_k  = 0, 0, 0

for i in tqdm(range(1, 20)):
    for j in range(1, 20):
        for k in range(2, 36, 2):
            
            B_regr = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=i),
                                      n_estimators=j,
                                      random_state=k)
            
            B_regr.fit(X_train, np.ravel(Y_train))
            
            B_Y_pred = B_regr.predict(X_test)
            
            mae = mean_absolute_error(Y_test, B_Y_pred)
            
            if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
                min_k = k
            
print(min_mae, min_i, min_j, min_k)

100%|██████████| 19/19 [02:11<00:00,  6.92s/it]

29.61154443382705 9 2 16





### Random Forest

In [7]:
min_mae = 99999
min_i, min_j, min_k = 0, 0, 0
for i in tqdm(range(1, 26)):
    for j in range(1, 26):
        for k in range(2, 36, 2):
            RF_regr = RandomForestRegressor(max_depth=i, 
                                            n_estimators=j, 
                                            max_features=k,
                                            random_state=45
                                                )
            RF_regr.fit(X_train, np.ravel(Y_train))
            RF_Y_pred = RF_regr.predict(X_test)

            mae = mean_absolute_error(Y_test, RF_Y_pred)
            if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
                min_k = k
            
print(min_mae, min_i, min_j, min_k)

100%|██████████| 25/25 [03:36<00:00,  8.67s/it]

27.156625258799178 12 3 16





### Decision Tree

In [10]:
min_mae = 99999

min_i, min_j, min_k  = 0, 0, 0

for i in tqdm(range(1, 26)):
    for j in range(1, 26):
        for k in range(2, 45, 1):
            
            DT_regr = DecisionTreeRegressor(max_depth=i,
                                max_features=j,
                                random_state=k)
            
            DT_regr.fit(X_train, Y_train)

            DT_Y_pred = DT_regr.predict(X_test)

            mae = mean_absolute_error(Y_test, DT_Y_pred)
            
            if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
                min_k = k
            
print(min_mae, min_i, min_j, min_k)

100%|██████████| 25/25 [02:40<00:00,  6.43s/it]

23.436956521739127 12 12 16





### Extra Trees

In [9]:
min_mae = 99999
min_i, min_j, min_k = 0, 0, 0
for i in tqdm(range(1, 30)):
    for j in range(1, 30):
        for k in range(2, 40, 1):
            ET_regr = ExtraTreesRegressor(n_estimators=i, 
                                            max_features=j,
                                            random_state=k
                                                )
            ET_regr.fit(X_train, np.ravel(Y_train))
            ET_Y_pred = ET_regr.predict(X_test)

            mae = mean_absolute_error(Y_test, ET_Y_pred)
            if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
                min_k = k
            
print(min_mae, min_i, min_j, min_k)

100%|██████████| 29/29 [10:44<00:00, 22.24s/it]

24.217391304347824 1 24 19





### AdaBoost

In [11]:
min_mae = 99999
min_i, min_j = 0, 0
for i in tqdm(range(1, 30)):
    for j in range(1, 30):
        AB_regr = AdaBoostRegressor(n_estimators=i, loss='exponential', learning_rate=j)
        AB_regr.fit(X_train, np.ravel(Y_train))
        AB_Y_pred = AB_regr.predict(X_test)
        AB_mae = mean_absolute_error(Y_test, AB_Y_pred)
        if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
print(min_mae, min_i, min_j)

100%|██████████| 29/29 [00:16<00:00,  1.73it/s]

26.343328335832084 1 1





### Gradient Boosting

In [12]:
min_mae = 999
min_i, min_j, min_k, min_l = 0, 0, 0.0, 0
for i in tqdm(range(300, 450, 10)):
    for j in range(2, 30, 2):
        for k in np.arange(0.06, 0.22, 0.02):
            for l in range(2, 22, 2):
                GB_regr = GradientBoostingRegressor(n_estimators=i, max_depth=j, learning_rate=k, random_state=l)
                GB_regr.fit(X_train, np.ravel(Y_train))
                GB_Y_pred = GB_regr.predict(X_test)

                mae = mean_absolute_error(Y_test, GB_Y_pred)
                if (min_mae > mae):
                    min_mae = mae
                    min_i = i
                    min_j = j
                    min_k = k
                    min_l = l

print(min_mae, min_i, min_j, min_k, min_l)

100%|██████████| 15/15 [1:24:38<00:00, 338.54s/it]

23.664665516785554 370 4 0.16000000000000003 6





## Saving Extra Trees model

In [12]:
DT_regr = DecisionTreeRegressor(max_depth=12,
                                max_features=12,
                                random_state=16)
            
DT_regr.fit(X_train, Y_train)
DT_Y_pred = DT_regr.predict(X_test)

joblib.dump(DT_regr, "./model_aug_abs_DecisionTree.joblib")

['./model_aug_abs_DecisionTree.joblib']