In [18]:
# Pandas dataframe
import pandas as pd
from pandas import DataFrame

# Scikit-Learn
import sklearn 
from sklearn import datasets, linear_model, metrics, tree
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score

# Models
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.ensemble import (BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, 
RandomForestRegressor, AdaBoostRegressor)
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNet, Lasso,  RidgeCV
from sklearn.neighbors import KNeighborsRegressor
        
import joblib
from matplotlib import pyplot as plt
import numpy as np
import os
from tqdm import tqdm

In [19]:
df_size = pd.read_csv("dataset_scaled_size.csv")
df_size

Unnamed: 0.1,Unnamed: 0,metal_amount_mmol,ligand_amount_mmol,reductant_amount_mmol,sol1_vol_mL,solv2_vol_mL,time_min,temp_c,7,8,...,121,122,123,124,125,126,127,128,size_nm,g_factor*10^4.1
0,0,-0.454677,-0.189866,-0.196766,-0.120691,-0.523499,0.681196,-0.998532,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.06,0
1,1,-0.454677,-0.189866,-0.196766,-0.100205,-0.523499,0.681196,-0.998532,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.00,0.757989206
2,2,1.606371,0.030694,0.275068,-0.171905,-0.523499,-0.595993,-0.998532,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.00,0.58306862
3,3,1.773936,0.045732,0.286496,-0.028504,-0.523499,-0.595993,-0.998532,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.37,0
4,4,-0.454677,-0.189866,-0.196766,-0.120691,-0.523499,0.681196,-0.998532,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,4.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,192,-0.819828,-0.455390,-0.268493,-0.744075,-0.523499,-0.654047,-0.998532,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,20.00,0
140,198,0.573054,-0.204904,0.003505,-0.171905,-0.003814,-0.537939,-0.175924,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.05,15
141,199,0.573054,-0.204904,0.003505,-0.171905,-0.003814,-0.537939,-0.175924,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.30,10
142,200,0.573054,-0.330222,0.003505,-0.171905,-0.003814,-0.537939,-0.175924,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.05,7


In [20]:
input_col = [col for col in df_size.columns]
input_col.remove('Unnamed: 0')
input_col.remove('size_nm')
input_col.remove('g_factor*10^4.1')
input_col

['metal_amount_mmol',
 'ligand_amount_mmol',
 'reductant_amount_mmol',
 'sol1_vol_mL',
 'solv2_vol_mL',
 'time_min',
 'temp_c',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',


In [21]:
output_col = ['size_nm']

X = df_size[input_col]

Y = df_size[output_col]

In [22]:
# Splitting dataset for training
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=45, shuffle=True)

## Bagging

In [23]:
min_mae = 99999

min_i, min_j, min_k  = 0, 0, 0

for i in tqdm(range(1, 20)):
    for j in range(1, 20):
        for k in range(2, 36, 2):
            
            B_regr = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=i),
                                      n_estimators=j,
                                      random_state=k)
            
            B_regr.fit(X_train, np.ravel(Y_train))
            
            B_Y_pred = B_regr.predict(X_test)
            
            mae = mean_absolute_error(Y_test, B_Y_pred)
            
            if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
                min_k = k
            
print(min_mae, min_i, min_j, min_k)

100%|██████████| 19/19 [02:52<00:00,  9.09s/it]

1.2333589743589743 13 1 26





## Random Forest

In [24]:
min_mae = 99999
min_i, min_j, min_k = 0, 0, 0
for i in tqdm(range(1, 26)):
    for j in range(1, 26):
        for k in range(2, 23, 2):
            RF_regr = RandomForestRegressor(max_depth=i, 
                                            n_estimators=j, 
                                            max_features=k,
                                            random_state=45
                                                )
            RF_regr.fit(X_train, np.ravel(Y_train))
            RF_Y_pred = RF_regr.predict(X_test)

            mae = mean_absolute_error(Y_test, RF_Y_pred)
            if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
                min_k = k
            
print(min_mae, min_i, min_j, min_k)

100%|██████████| 25/25 [02:27<00:00,  5.91s/it]

1.1818434343434345 22 3 2





## Decision Tree

In [25]:
min_mae = 99999

min_i, min_j, min_k  = 0, 0, 0

for i in tqdm(range(1, 26)):
    for j in range(1, 23):
        for k in range(2, 45, 1):
            
            DT_regr = DecisionTreeRegressor(max_depth=i,
                                max_features=j,
                                random_state=k)
            
            DT_regr.fit(X_train, Y_train)

            DT_Y_pred = DT_regr.predict(X_test)

            mae = mean_absolute_error(Y_test, DT_Y_pred)
            
            if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
                min_k = k
            
print(min_mae, min_i, min_j, min_k)

100%|██████████| 25/25 [02:13<00:00,  5.34s/it]

0.8236688311688312 12 19 28





## Extra Trees

In [26]:
min_mae = 99999
min_i, min_j, min_k = 0, 0, 0
for i in tqdm(range(1, 30)):
    for j in range(1, 23):
        for k in range(2, 40, 1):
            ET_regr = ExtraTreesRegressor(n_estimators=i, 
                                            max_features=j,
                                            random_state=k
                                                )
            ET_regr.fit(X_train, np.ravel(Y_train))
            ET_Y_pred = ET_regr.predict(X_test)

            mae = mean_absolute_error(Y_test, ET_Y_pred)
            if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
                min_k = k
            
print(min_mae, min_i, min_j, min_k)

100%|██████████| 29/29 [08:11<00:00, 16.94s/it]

0.8881818181818183 2 3 23





## AdaBoost

In [27]:
min_mae = 99999
min_i, min_j = 0, 0
for i in tqdm(range(1, 30)):
    for j in range(1, 30):
        AB_regr = AdaBoostRegressor(n_estimators=i, loss='exponential', learning_rate=j)
        AB_regr.fit(X_train, np.ravel(Y_train))
        AB_Y_pred = AB_regr.predict(X_test)
        AB_mae = mean_absolute_error(Y_test, AB_Y_pred)
        if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
print(min_mae, min_i, min_j)

100%|██████████| 29/29 [00:19<00:00,  1.45it/s]

2.6996264367816094 1 1





## Gradient Boosting

In [28]:
'''
min_mae = 999
min_i, min_j, min_k, min_l = 0, 0, 0.0, 0
for i in tqdm(range(300, 450, 10)):
    for j in range(2, 30, 2):
        for k in np.arange(0.06, 0.22, 0.02):
            for l in range(2, 22, 2):
                GB_regr = GradientBoostingRegressor(n_estimators=i, max_depth=j, learning_rate=k, random_state=l)
                GB_regr.fit(X_train, np.ravel(Y_train))
                GB_Y_pred = GB_regr.predict(X_test)

                mae = mean_absolute_error(Y_test, GB_Y_pred)
                if (min_mae > mae):
                    min_mae = mae
                    min_i = i
                    min_j = j
                    min_k = k
                    min_l = l

print(min_mae, min_i, min_j, min_k, min_l)
'''

'\nmin_mae = 999\nmin_i, min_j, min_k, min_l = 0, 0, 0.0, 0\nfor i in tqdm(range(300, 450, 10)):\n    for j in range(2, 30, 2):\n        for k in np.arange(0.06, 0.22, 0.02):\n            for l in range(2, 22, 2):\n                GB_regr = GradientBoostingRegressor(n_estimators=i, max_depth=j, learning_rate=k, random_state=l)\n                GB_regr.fit(X_train, np.ravel(Y_train))\n                GB_Y_pred = GB_regr.predict(X_test)\n\n                mae = mean_absolute_error(Y_test, GB_Y_pred)\n                if (min_mae > mae):\n                    min_mae = mae\n                    min_i = i\n                    min_j = j\n                    min_k = k\n                    min_l = l\n\nprint(min_mae, min_i, min_j, min_k, min_l)\n'

## Others

In [29]:
REGRESSIONS = {
    "K-nn": KNeighborsRegressor(),                          
    "Ridge": RidgeCV(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(random_state=0),
                }
# mean absolute error is used to evaluate the performance of all regressions.


for name, reg in REGRESSIONS.items():     
    reg.fit(X_train, Y_train)                 
    Y_pred = pd.DataFrame(reg.predict(X_test))
    
    print(name)
    
    mae = mean_absolute_error(Y_test, Y_pred)
    
    print(' MAE for Abs is ', mae, '\n')

K-nn
 MAE for Abs is  2.821909090909091 

Ridge
 MAE for Abs is  3.794035061957014 

Lasso
 MAE for Abs is  4.805209099824539 

ElasticNet
 MAE for Abs is  5.8376051372191915 



## Saving the best model

In [31]:
DT_regr = DecisionTreeRegressor(max_depth=12,
                                max_features=19,
                                random_state=28)
            
DT_regr.fit(X_train, Y_train)
DT_Y_pred = DT_regr.predict(X_test)

joblib.dump(DT_regr, "./model_aug_size_DecisionTree.joblib")

['./model_aug_size_DecisionTree.joblib']