This notebook is used to create two ML models. 

One is used for predicting None values in Absorbance column.

One is used for predicting None values in PL max column.

Three algorithms are considered: Bagging, Decision Trees, and Random Forests

In [1]:
import numpy as np
import pandas as pd
import os
import joblib
import sklearn         
from sklearn import linear_model, datasets
from sklearn.utils import resample
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.svm import SVR
from tqdm import tqdm

# 1. Model for Absorbance 

In [2]:
# Absorbance: load the scaled CdSe dataset used to train the absorbance model.
df_ab = pd.read_csv('dataset_CdSe_scaled_abs.csv')

In [3]:
# This dataset excludes all rows that have 'None' in the 'Absorbance max (nm)' column.
# It is used to train the model that predicts absorbance (the 'abs_nm' column).
df_ab

Unnamed: 0.1,Unnamed: 0,Growth Temp (Celsius),Metal_amount (g),Metal_mmol (mmol),Metal_concentration (mmol/g),Chalcogen_amount (g),Chalcogen_mmol (mmol),Chalcogen_concentration (mmol/g),Metal/Se_ratio,CA_amount (g),...,x4_liquid parafin,x4_octadecene,x4_phenyl ether,x4_trioctylphosphine oxide,x5_None,x5_phosphinic acid,x5_trioctylphosphine oxide,diameter_nm,abs_nm,emission_nm
0,0,1.105103,-0.217616,-0.437486,-0.394948,-0.248190,-0.243779,-0.160457,-1.018315,-0.478072,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.41,566,575
1,1,-0.062540,-0.513517,-0.472027,-0.421421,-0.422206,-0.436361,-0.179158,-0.999133,-0.248749,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.50,474,617
2,19,0.404517,-0.414883,-0.371858,-0.024322,-0.538217,-0.550992,-0.172924,1.715206,0.201404,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.60,526,556
3,20,0.404517,-0.414883,-0.371858,-0.024322,-0.538217,-0.550992,-0.172924,1.715206,0.201404,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.20,559,580
4,21,0.404517,-0.414883,-0.371858,-0.024322,-0.538217,-0.550992,-0.172924,1.715206,0.201404,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.60,574,598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,229,0.482360,-0.500201,-0.465119,-0.407390,-0.486012,-0.500554,-0.185018,-0.778533,-0.334108,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.70,450,
192,230,0.482360,-0.500201,-0.465119,-0.407390,-0.486012,-0.500554,-0.185018,-0.778533,-0.334108,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.50,530,
193,231,0.482360,-0.500201,-0.465119,-0.407390,-0.486012,-0.500554,-0.185018,-0.778533,-0.334108,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.50,585,
194,232,0.482360,-0.500201,-0.465119,-0.407390,-0.486012,-0.500554,-0.185018,-0.778533,-0.334108,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.70,590,


In [4]:
# Below are all the input (feature) columns.
# Note: this includes the diameter ('diameter_nm') as an input to predict Abs max.
# NOTE(review): both 'x0_cadmium acetate dihtdrate' and 'x0_cadmium acetate dihydrate'
# are listed; presumably a spelling variant in the raw data produced two separate
# indicator columns -- confirm upstream before renaming anything here.

input_col = ['Growth Temp (Celsius)', 'Metal_mmol (mmol)', 'Chalcogen_mmol (mmol)',
             'Amines_mmol (mmol)', 'CA_mmol (mmol)', 'Phosphines_mmol (mmol)', 
             'S_I_amount (g)', 'S_II_amount (g)', 'Time_min (min)', 
             'x0_cadmium acetate', 'x0_cadmium acetate dihtdrate', 
             'x0_cadmium acetate dihydrate', 'x0_cadmium oxide', 
             'x0_cadmium stearate', 'x0_dimethylcadmium', 'x1_None', 
             'x1_benzoic acid', 'x1_dodecylphosphonic acid', 
             'x1_ethylphosphonic acid', 'x1_lauric acid', 
             'x1_myrstic acid', 'x1_oleic acid', 'x1_stearic acid',
             'x2_2-6-dimethylpyridine', 'x2_None', 'x2_aniline', 
             'x2_benzylamine', 'x2_dioctylamine/hexadecylamine',
             'x2_dodecylamine', 'x2_heptylamine', 'x2_hexadecylamine', 
             'x2_octadecylamine', 'x2_octylamine', 'x2_oleylamine', 
             'x2_pyridine', 'x2_trioctylamine', 'x3_None', 'x3_diphenylphosphine', 
             'x3_tributylphosphine', 'x3_trioctylphosphine', 
             'x3_triphenylphosphine', 'x4_None', 'x4_liquid parafin', 
             'x4_octadecene', 'x4_phenyl ether', 'x4_trioctylphosphine oxide', 
             'x5_None', 'x5_phosphinic acid', 'x5_trioctylphosphine oxide',
             'diameter_nm', ]

# Regression target of the absorbance model.
output_col_ab = ['abs_nm']

# Feature matrix for the absorbance model.
X_ab = df_ab[input_col]

# Target kept as a one-column DataFrame; it is flattened with np.ravel where a 1-D array is needed.
Y_ab = df_ab[output_col_ab]

In [5]:
# Splitting dataset for training: 15% of the rows are held out as the test set.
# The fixed random_state makes the shuffled split reproducible between runs.
X_train_ab, X_test_ab, Y_train_ab, Y_test_ab = train_test_split(X_ab, Y_ab, test_size=0.15, random_state=45, shuffle=True)

## 1a. Bagging

In [6]:
# This is a grid search for three parameters in the Bagging algorithm.
# Parameters are: max_depth (i), n_estimators (j), random_state (k).
# This gives the best combination of the three parameters for the smallest
# mean absolute error (MAE) on the held-out split.
# NOTE(review): the score is computed on X_test_ab / Y_test_ab, so the reported MAE is
# an optimistic estimate, and searching over random_state picks a favourable seed
# rather than a better model. Consider a separate validation split or cross-validation.

min_mae = 99999

min_i, min_j, min_k  = 0, 0, 0

for i in tqdm(range(1, 21)):
    for j in range(1, 21):
        for k in range(4, 70, 2):
            
            # The base estimator is passed positionally on purpose: the keyword was
            # renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
            # old name was removed in 1.4, so the positional form works with both.
            B_regr_ab = BaggingRegressor(DecisionTreeRegressor(max_depth=i),
                                         n_estimators=j,
                                         random_state=k)
            
            # np.ravel turns the one-column target DataFrame into the 1-D array expected by fit.
            B_regr_ab.fit(X_train_ab, np.ravel(Y_train_ab))
            
            B_Y_pred_ab = B_regr_ab.predict(X_test_ab)
            
            mae = mean_absolute_error(Y_test_ab, B_Y_pred_ab)
            
            # Strict comparison: on ties the first combination found is kept.
            if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
                min_k = k
            
print(min_mae, min_i, min_j, min_k)

100%|██████████| 20/20 [07:20<00:00, 22.01s/it]

8.039583333333333 12 4 42





## 1b. Decision Trees

In [7]:
# Brute-force search over three Decision Tree settings:
# max_depth, max_features and random_state.
# The triple with the lowest mean absolute error (MAE) on the test split is kept.

min_mae = 99999
min_i, min_j, min_k = 0, 0, 0

for depth in tqdm(range(1, 31)):
    for n_features in range(1, 31):
        for seed in range(2, 80, 2):
            DT_regr_ab = DecisionTreeRegressor(
                max_depth=depth,
                max_features=n_features,
                random_state=seed,
            )
            DT_regr_ab.fit(X_train_ab, Y_train_ab)
            DT_Y_pred_ab = DT_regr_ab.predict(X_test_ab)
            mae = mean_absolute_error(Y_test_ab, DT_Y_pred_ab)

            # Strict comparison: on ties the first combination found is kept.
            if mae < min_mae:
                min_mae, min_i, min_j, min_k = mae, depth, n_features, seed

print(min_mae, min_i, min_j, min_k)

100%|██████████| 30/30 [05:12<00:00, 10.41s/it]

9.303998316498316 6 29 28





## 1c. Random Forests

In [8]:
# Brute-force search over three Random Forest settings:
# max_depth, n_estimators and max_features (random_state is pinned to 45).
# The triple with the lowest mean absolute error (MAE) on the test split is kept.

min_mae = 99999
min_i, min_j, min_k = 0, 0, 0

for depth in tqdm(range(1, 29)):
    for n_trees in range(1, 29):
        for n_features in range(2, 44, 2):
            RF_regr_ab = RandomForestRegressor(
                max_depth=depth,
                n_estimators=n_trees,
                max_features=n_features,
                random_state=45,
            )
            # The one-column target DataFrame is flattened to 1-D for fit.
            RF_regr_ab.fit(X_train_ab, np.ravel(Y_train_ab))
            RF_Y_pred_ab = RF_regr_ab.predict(X_test_ab)
            mae = mean_absolute_error(Y_test_ab, RF_Y_pred_ab)

            # Strict comparison: on ties the first combination found is kept.
            if mae < min_mae:
                min_mae, min_i, min_j, min_k = mae, depth, n_trees, n_features

print(min_mae, min_i, min_j, min_k)

100%|██████████| 28/28 [07:31<00:00, 16.11s/it]

8.098095238095231 13 14 40





In [11]:

# Sweep random_state while max_depth, n_estimators and max_features stay fixed
# at 13, 14 and 40, tracking the lowest test-split MAE and the seed that gave it.
min_mae = 99999
min_i = 0

for seed in tqdm(range(1, 80)):
    RF_regr_ab = RandomForestRegressor(
        max_depth=13,
        n_estimators=14,
        max_features=40,
        random_state=seed,
    )
    RF_regr_ab.fit(X_train_ab, np.ravel(Y_train_ab))
    RF_Y_pred_ab = RF_regr_ab.predict(X_test_ab)
    mae = mean_absolute_error(Y_test_ab, RF_Y_pred_ab)

    # Strict comparison: on ties the first seed found is kept.
    if mae < min_mae:
        min_mae, min_i = mae, seed

print(min_mae, min_i)

100%|██████████| 79/79 [00:02<00:00, 34.01it/s]

8.098095238095231 45





## Conclusion:

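Random forest is selected, with the parameter combination max_depth = **13**, n_estimators = **14**, max_features = **40**, and random_state = **45** (MAE ≈ 8.10). Note that the Bagging search above reached a slightly lower MAE (≈ 8.04), so random forest is not strictly the minimum of the three searches; Decision Trees reached ≈ 9.30.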

## Saving model

In [14]:
# Refit the absorbance random forest with the selected settings on the training split.
RF_regr_ab = RandomForestRegressor(
    max_depth=13,
    n_estimators=14,
    max_features=40,
    random_state=45,
)
RF_regr_ab.fit(X_train_ab, np.ravel(Y_train_ab))

# Predictions on the held-out rows.
RF_Y_pred_ab = RF_regr_ab.predict(X_test_ab)

In [15]:
# Persist the fitted absorbance model to disk so it can be reloaded in the augmentation step.
joblib.dump(RF_regr_ab, "./model_RandomForest_aug_abs.joblib")

['./model_RandomForest_abs.joblib']

# 2. Model for Photoluminescence  (PL)

In [16]:
# PL: load the scaled CdSe dataset used to train the photoluminescence (emission) model.
df_pl = pd.read_csv('dataset_CdSe_scaled_emission.csv')

In [17]:
# This dataset excludes all rows that have 'None' in the 'PL max (nm)' column.
# It is used to train the model that predicts PL (the 'emission_nm' column).
df_pl

Unnamed: 0.1,Unnamed: 0,Injection Temp (Celsius),Growth Temp (Celsius),Metal_amount (g),Metal_mmol (mmol),Metal_concentration (mmol/g),Chalcogen_amount (g),Chalcogen_mmol (mmol),Chalcogen_concentration (mmol/g),Metal/Se_ratio,...,x4_liquid parafin,x4_octadecene,x4_phenyl ether,x4_trioctylphosphine oxide,x5_None,x5_phosphinic acid,x5_trioctylphosphine oxide,diameter_nm,abs_nm,emission_nm
0,0,0.759733,1.105103,-0.217616,-0.437486,-0.394948,-0.248190,-0.243779,-0.160457,-1.018315,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.41,566,575
1,1,0.014647,-0.062540,-0.513517,-0.472027,-0.421421,-0.422206,-0.436361,-0.179158,-0.999133,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.50,474,617
2,2,0.263009,0.326674,-0.370991,-0.472027,-0.354556,-0.573310,-0.587674,-0.193104,0.660163,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.99,,497
3,3,0.263009,0.326674,-0.370991,-0.472027,-0.355462,-0.573310,-0.587674,-0.193210,0.660163,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.13,,510
4,4,0.263009,0.326674,-0.370991,-0.472027,-0.355462,-0.573310,-0.587674,-0.193210,0.660163,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.27,,517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,200,-2.021922,0.793731,-0.482447,-0.476863,-0.412685,-0.311996,-0.326314,-0.144187,-1.123820,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,4.10,,580
153,201,-2.021922,0.793731,-0.482447,-0.476863,4.105508,-0.311996,-0.326314,7.525790,-1.123820,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.70,,535
154,202,-2.021922,0.793731,-0.482447,-0.476863,4.105508,-0.311996,-0.326314,7.525790,-1.123820,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,4.10,,575
155,203,-2.021922,0.793731,-0.482447,-0.476863,4.105508,-0.311996,-0.326314,7.525790,-1.123820,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.60,,530


In [18]:
# Regression target of the PL model; the features reuse the `input_col` list defined for the absorbance model.
output_col_pl = ['emission_nm']

# Feature matrix for the PL model.
X_pl = df_pl[input_col]

# Target kept as a one-column DataFrame; it is flattened with np.ravel where a 1-D array is needed.
Y_pl = df_pl[output_col_pl]

In [19]:
# Splitting dataset for training: 15% of the rows are held out as the test set.
# The fixed random_state makes the shuffled split reproducible between runs.
X_train_pl, X_test_pl, Y_train_pl, Y_test_pl = train_test_split(X_pl, Y_pl, test_size=0.15, random_state=45, shuffle=True)

## 2a. Bagging


In [20]:
# This is a grid search for three parameters in the Bagging algorithm.
# Parameters are: max_depth (i), n_estimators (j), random_state (k).
# This gives the best combination of the three parameters for the smallest
# mean absolute error (MAE) on the held-out split.
# NOTE(review): the score is computed on X_test_pl / Y_test_pl, so the reported MAE is
# an optimistic estimate, and searching over random_state picks a favourable seed
# rather than a better model. Consider a separate validation split or cross-validation.

min_mae = 99999

min_i, min_j, min_k  = 0, 0, 0

for i in tqdm(range(1, 21)):
    for j in range(1, 21):
        for k in range(5, 80, 5):
            
            # The base estimator is passed positionally on purpose: the keyword was
            # renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
            # old name was removed in 1.4, so the positional form works with both.
            B_regr_pl = BaggingRegressor(DecisionTreeRegressor(max_depth=i),
                                         n_estimators=j,
                                         random_state=k)
            
            # np.ravel turns the one-column target DataFrame into the 1-D array expected by fit.
            B_regr_pl.fit(X_train_pl, np.ravel(Y_train_pl))
            
            B_Y_pred_pl = B_regr_pl.predict(X_test_pl)
            
            mae = mean_absolute_error(Y_test_pl, B_Y_pred_pl)
            
            # Strict comparison: on ties the first combination found is kept.
            if (min_mae > mae):
                min_mae = mae
                min_i = i
                min_j = j
                min_k = k
            
print(min_mae, min_i, min_j, min_k)

100%|██████████| 20/20 [02:03<00:00,  6.15s/it]

10.635416666666666 13 4 60





## 2b. Decision Trees

In [23]:
# Brute-force search over three Decision Tree settings:
# max_depth, max_features and random_state.
# The triple with the lowest mean absolute error (MAE) on the test split is kept.

min_mae = 99999
min_i, min_j, min_k = 0, 0, 0

for depth in tqdm(range(1, 31)):
    for n_features in range(1, 31):
        for seed in range(2, 80, 2):
            DT_regr_pl = DecisionTreeRegressor(
                max_depth=depth,
                max_features=n_features,
                random_state=seed,
            )
            DT_regr_pl.fit(X_train_pl, Y_train_pl)
            DT_Y_pred_pl = DT_regr_pl.predict(X_test_pl)
            mae = mean_absolute_error(Y_test_pl, DT_Y_pred_pl)

            # Strict comparison: on ties the first combination found is kept.
            if mae < min_mae:
                min_mae, min_i, min_j, min_k = mae, depth, n_features, seed

print(min_mae, min_i, min_j, min_k)

100%|██████████| 30/30 [02:55<00:00,  5.85s/it]

9.583333333333334 14 30 42





## 2c. Random Forests

In [24]:
# Brute-force search over three Random Forest settings:
# max_depth, n_estimators and max_features (random_state is pinned to 45).
# The triple with the lowest mean absolute error (MAE) on the test split is kept.

min_mae = 99999
min_i, min_j, min_k = 0, 0, 0

for depth in tqdm(range(1, 31)):
    for n_trees in range(1, 31):
        for n_features in range(2, 44, 2):
            RF_regr_pl = RandomForestRegressor(
                max_depth=depth,
                n_estimators=n_trees,
                max_features=n_features,
                random_state=45,
            )
            # The one-column target DataFrame is flattened to 1-D for fit.
            RF_regr_pl.fit(X_train_pl, np.ravel(Y_train_pl))
            RF_Y_pred_pl = RF_regr_pl.predict(X_test_pl)
            mae = mean_absolute_error(Y_test_pl, RF_Y_pred_pl)

            # Strict comparison: on ties the first combination found is kept.
            if mae < min_mae:
                min_mae, min_i, min_j, min_k = mae, depth, n_trees, n_features

print(min_mae, min_i, min_j, min_k)

100%|██████████| 30/30 [07:01<00:00, 14.06s/it]

9.570216049382724 12 9 8





## Conclusion 

Random forest gives the lowest MAE (≈ 9.57, compared with ≈ 9.58 for Decision Trees and ≈ 10.64 for Bagging) with the parameter combination max_depth = **12**, n_estimators = **9**, max_features = **8**, and random_state = **45**.

## Saving model

In [25]:
# Refit the PL random forest with the selected settings on the training split.
RF_regr_pl = RandomForestRegressor(
    max_depth=12,
    n_estimators=9,
    max_features=8,
    random_state=45,
)
RF_regr_pl.fit(X_train_pl, np.ravel(Y_train_pl))

# Predictions on the held-out rows.
RF_Y_pred_pl = RF_regr_pl.predict(X_test_pl)

In [26]:
# Persist the fitted PL model to disk so it can be reloaded in the augmentation step.
joblib.dump(RF_regr_pl, "./model_RandomForest_aug_emission.joblib")

['./model_RandomForest_emission.joblib']

## Augmenting data

In [27]:

# Load the scaled CdSe dataset that is to be augmented.
# NOTE(review): presumably this is the full dataset, still containing the rows with
# missing 'abs_nm' / 'emission_nm' values -- confirm against the file.
df = pd.read_csv("dataset_CdSe_scaled.csv") 


# Feature columns passed to the saved models (same names as the list used for training).
input_col = ['Growth Temp (Celsius)', 'Metal_mmol (mmol)', 'Chalcogen_mmol (mmol)',
             'Amines_mmol (mmol)', 'CA_mmol (mmol)', 'Phosphines_mmol (mmol)', 
             'S_I_amount (g)', 'S_II_amount (g)', 'Time_min (min)', 
             'x0_cadmium acetate', 'x0_cadmium acetate dihtdrate', 
             'x0_cadmium acetate dihydrate', 'x0_cadmium oxide', 
             'x0_cadmium stearate', 'x0_dimethylcadmium', 'x1_None', 
             'x1_benzoic acid', 'x1_dodecylphosphonic acid', 
             'x1_ethylphosphonic acid', 'x1_lauric acid', 
             'x1_myrstic acid', 'x1_oleic acid', 'x1_stearic acid',
             'x2_2-6-dimethylpyridine', 'x2_None', 'x2_aniline', 
             'x2_benzylamine', 'x2_dioctylamine/hexadecylamine',
             'x2_dodecylamine', 'x2_heptylamine', 'x2_hexadecylamine', 
             'x2_octadecylamine', 'x2_octylamine', 'x2_oleylamine', 
             'x2_pyridine', 'x2_trioctylamine', 'x3_None', 'x3_diphenylphosphine', 
             'x3_tributylphosphine', 'x3_trioctylphosphine', 
             'x3_triphenylphosphine', 'x4_None', 'x4_liquid parafin', 
             'x4_octadecene', 'x4_phenyl ether', 'x4_trioctylphosphine oxide', 
             'x5_None', 'x5_phosphinic acid', 'x5_trioctylphosphine oxide',
             'diameter_nm']


In [28]:
# Load ML model for predicting absorbance
loaded_rf_ab = joblib.load('model_RandomForest_aug_abs.joblib')

# Replace missing entries in the 'abs_nm' column by predicted values.
#
# The full dataset `df` is augmented here. `df_ab` is the training subset that
# already excludes every row without an absorbance value, so looping over it
# would never find anything to fill and the saved file would be unchanged.
#
# A value counts as missing when it is NaN or the literal string 'None':
# depending on the pandas version, read_csv either keeps 'None' as a string or
# parses it as NaN, and blank cells are always read as NaN.
missing_ab = df['abs_nm'].isna() | (df['abs_nm'].astype(str).str.strip() == 'None')

# Number of absorbance values that get replaced.
a = int(missing_ab.sum())

if a > 0:
    # Predict all missing rows in a single call. Passing a DataFrame keeps the
    # column names the model was fitted with. The guard is needed because
    # predict() raises on an input with zero rows.
    df.loc[missing_ab, 'abs_nm'] = loaded_rf_ab.predict(df.loc[missing_ab, input_col])

# Save the dataset where all missing values in the 'abs_nm' column are replaced.
# NOTE(review): the index is written as an extra column and comes back as
# 'Unnamed: 0' when the file is read again; pass index=False if nothing
# downstream relies on that column.
df.to_csv('dataset_CdSe_first_augmentation.csv')

In [29]:
# Now, the dataset is expected to have missing values only in the 'emission_nm' column.

df_pl = pd.read_csv('dataset_CdSe_first_augmentation.csv') 

# Load ML model for predicting PL

loaded_rf_pl = joblib.load('model_RandomForest_aug_emission.joblib')


# Replace missing entries in the 'emission_nm' column by predicted values.
#
# A value counts as missing when it is NaN or the literal string 'None'.
# After the CSV round-trip above, missing cells can come back as NaN (blank
# cells always do, and newer pandas releases also parse 'None' as NaN), so a
# plain `== 'None'` comparison can silently match nothing.
missing_pl = df_pl['emission_nm'].isna() | (df_pl['emission_nm'].astype(str).str.strip() == 'None')

# Number of PL values that get replaced.
a = int(missing_pl.sum())

if a > 0:
    # Predict all missing rows in a single call. Passing a DataFrame keeps the
    # column names the model was fitted with. The guard is needed because
    # predict() raises on an input with zero rows.
    df_pl.loc[missing_pl, 'emission_nm'] = loaded_rf_pl.predict(df_pl.loc[missing_pl, input_col])

# Save the dataset where all missing values are replaced.
# Final augmented dataset.
# Ready to use for other analysis.

df_pl.to_csv('dataset_CdSe_augmented.csv')