In [51]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import joblib
import sklearn
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [52]:
# Load the raw synthesis dataset.
# The literal string 'None' is meaningful in this notebook: it shows up as a
# one-hot category (e.g. 'x8_None') and is the marker tested when filtering rows
# without an absorbance value. Newer pandas releases treat "None" as a missing-value
# marker by default in read_csv, which would silently turn those cells into NaN,
# so the NA markers are pinned explicitly and 'None' is left out of the list.
df = pd.read_csv(
    '../../dataset/hao_dataset.csv',
    keep_default_na=False,
    na_values=['', 'NA', 'N/A', 'NaN', 'nan', 'NULL', 'null'],
)
df

Unnamed: 0,doi,in_source,in_amount_mmol,p_source,p_amount_mmol,sol,sol_amount_ml,TOP,TOP_amount_mmol,acid,...,other,other_amount_mmol,total_volume_ml,temp_c,time_min,diameter_nm,abs_nm,emission_nm,date_input,user
0,10.1016/j.jcis.2010.06.037,chloroindium oxalate,4.20,tris(trimethylsilyl)phosphine,2.07,,0.0,trioctylphosphine,13.452570,,...,trioctylphosphine oxide,11.111111,17.112,280,4320.0,,599,612,6/25/2021,Hao
1,10.1021/j100019a063,chloroindium oxalate,0.88,tris(trimethylsilyl)phosphine,0.52,,0.0,trioctylphosphine,13.490343,,...,trioctylphosphine oxide,5.555556,11.573,270,4320.0,2.6,530,,6/25/2021,Hao
2,10.1021/j100019a063,chloroindium oxalate,1.76,tris(trimethylsilyl)phosphine,1.04,,0.0,trioctylphosphine,5.396137,,...,trioctylphosphine oxide,0.222222,2.629,270,4320.0,3.5,595,,6/25/2021,Hao
3,10.1021/j100019a063,chloroindium oxalate,1.76,tris(trimethylsilyl)phosphine,1.04,,0.0,trioctylphosphine,5.396137,,...,trioctylphosphine oxide,0.222222,2.629,300,4320.0,4.6,650,,6/25/2021,Hao
4,10.1021/j100070a004,chloroindium oxalate,0.88,tris(trimethylsilyl)phosphine,1.40,,0.0,trioctylphosphine,13.490343,,...,trioctylphosphine oxide,5.555556,11.573,270,4320.0,2.52,520,,6/25/2021,Hao
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,10.1557/jmr.2006.0068,indium trifluoroacetate,0.11,tris(trimethylsilyl)phosphine,0.10,octadecene,7.0,,0.000000,myristic acid,...,,0.000000,7.000,270,6.0,,570,,6/25/2021,Hao
215,10.1557/jmr.2006.0068,indium trifluoroacetate,0.11,tris(trimethylsilyl)phosphine,0.10,octadecene,7.0,,0.000000,myristic acid,...,,0.000000,7.000,270,60.0,,562,,6/25/2021,Hao
216,10.1557/jmr.2006.0068,indium trifluoroacetate,0.11,tris(trimethylsilyl)phosphine,0.10,octadecene,7.0,,0.000000,myristic acid,...,,0.000000,7.000,270,120.0,,549,,6/25/2021,Hao
217,10.1021/acs.chemmater.7b04069,"indium tris(N,N-diisopropylacetamidinato)",0.30,tris(trimethylsilyl)phosphine,0.15,mesitylene,5.0,,0.000000,palmitic acid,...,,0.000000,5.077,150,1260.0,2,523,,6/26/2021,Hao


In [53]:
# Count missing (NaN) entries per column; a nonzero count flags a column with gaps.
# NOTE(review): this only detects real NaN values - the literal string 'None' that
# this notebook filters on later is not counted here.
df.isna().sum()

doi                  0
in_source            0
in_amount_mmol       0
p_source             0
p_amount_mmol        0
sol                  0
sol_amount_ml        0
TOP                  0
TOP_amount_mmol      0
acid                 0
acid_amount_mmol     0
amine                0
amine_amount_mmol    0
thiol                0
thiol_amount_mmol    0
zinc                 0
zinc_amount_mmol     0
other                0
other_amount_mmol    0
total_volume_ml      0
temp_c               0
time_min             0
diameter_nm          0
abs_nm               0
emission_nm          0
date_input           0
user                 0
dtype: int64

In [54]:
# Split the raw frame: the three measured properties become the targets, the
# bookkeeping columns are discarded, and every remaining column is an input feature
output_cols = ['diameter_nm', 'abs_nm', 'emission_nm']
metadata_cols = ['doi', 'user', 'date_input']
df_input = df.drop(columns=output_cols + metadata_cols)
df_output = df[output_cols]

In [55]:
# Normalise the column labels of both frames by removing leading/trailing whitespace
for frame in (df_input, df_output):
    frame.columns = frame.columns.str.strip()

In [56]:
# Summarise the input features: column names, non-null counts and dtypes.
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   in_source          219 non-null    object 
 1   in_amount_mmol     219 non-null    float64
 2   p_source           219 non-null    object 
 3   p_amount_mmol      219 non-null    float64
 4   sol                219 non-null    object 
 5   sol_amount_ml      219 non-null    float64
 6   TOP                219 non-null    object 
 7   TOP_amount_mmol    219 non-null    float64
 8   acid               219 non-null    object 
 9   acid_amount_mmol   219 non-null    float64
 10  amine              219 non-null    object 
 11  amine_amount_mmol  219 non-null    float64
 12  thiol              219 non-null    object 
 13  thiol_amount_mmol  219 non-null    float64
 14  zinc               219 non-null    object 
 15  zinc_amount_mmol   219 non-null    float64
 16  other              219 non

In [57]:
# Cast the reaction temperature column to float, then re-inspect the schema
df_input = df_input.astype({'temp_c': float})
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   in_source          219 non-null    object 
 1   in_amount_mmol     219 non-null    float64
 2   p_source           219 non-null    object 
 3   p_amount_mmol      219 non-null    float64
 4   sol                219 non-null    object 
 5   sol_amount_ml      219 non-null    float64
 6   TOP                219 non-null    object 
 7   TOP_amount_mmol    219 non-null    float64
 8   acid               219 non-null    object 
 9   acid_amount_mmol   219 non-null    float64
 10  amine              219 non-null    object 
 11  amine_amount_mmol  219 non-null    float64
 12  thiol              219 non-null    object 
 13  thiol_amount_mmol  219 non-null    float64
 14  zinc               219 non-null    object 
 15  zinc_amount_mmol   219 non-null    float64
 16  other              219 non

In [58]:
# Partition the input columns by dtype: object-dtype columns are treated as
# categorical, everything else as numerical.
# The dtypes are read from df_input itself (not the raw df) so the lists reflect
# the column-name stripping and dtype casts already applied to df_input.
input_num_cols = [col for col, dtype in df_input.dtypes.items() if dtype != 'O']
input_cat_cols = [col for col, dtype in df_input.dtypes.items() if dtype == 'O']

In [59]:
# Display the object-dtype input columns (the ones handed to the OneHotEncoder).
input_cat_cols

['in_source',
 'p_source',
 'sol',
 'TOP',
 'acid',
 'amine',
 'thiol',
 'zinc',
 'other']

In [60]:
# Build the preprocessing transformer: numerical inputs are standardised and
# categorical inputs are one-hot encoded into a dense array; categories not seen
# during fit are ignored at transform time, and any column assigned to neither
# step is passed through unchanged
numeric_step = ('step1', StandardScaler(), input_num_cols)
categorical_step = ('step2', OneHotEncoder(sparse=False, handle_unknown='ignore'), input_cat_cols)
ct = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

In [61]:
# Fit the transformer on the input features and wrap the transformed array in a
# DataFrame (its columns are plain positional integers at this point)
transformed_inputs = ct.fit_transform(df_input)
df_input_scaled_encoded = pd.DataFrame(transformed_inputs)
df_input_scaled_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
0,2.798293,0.769563,-0.528015,1.951881,-0.431077,-0.641827,-0.197117,-0.629015,8.009708,0.412626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.055436,-0.361432,-0.528015,1.958217,-0.431077,-0.641827,-0.197117,-0.629015,3.873052,0.063284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.782458,0.017999,-0.528015,0.600491,-0.431077,-0.641827,-0.197117,-0.629015,-0.098139,-0.500811,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.782458,0.017999,-0.528015,0.600491,-0.431077,-0.641827,-0.197117,-0.629015,-0.098139,-0.500811,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.055436,0.280681,-0.528015,1.958217,-0.431077,-0.641827,-0.197117,-0.629015,3.873052,0.063284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,-0.580709,-0.667895,-0.070987,-0.304661,-0.316033,-0.641827,-0.197117,-0.629015,-0.263605,-0.225133,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
215,-0.580709,-0.667895,-0.070987,-0.304661,-0.316033,-0.641827,-0.197117,-0.629015,-0.263605,-0.225133,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216,-0.580709,-0.667895,-0.070987,-0.304661,-0.316033,-0.641827,-0.197117,-0.629015,-0.263605,-0.225133,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
217,-0.423738,-0.631411,-0.201566,-0.304661,-0.085945,-0.600008,-0.197117,-0.629015,-0.263605,-0.346416,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
# Inspect the fitted transformers and the columns each one was applied to.
ct.transformers_

[('step1',
  StandardScaler(),
  ['in_amount_mmol',
   'p_amount_mmol',
   'sol_amount_ml',
   'TOP_amount_mmol',
   'acid_amount_mmol',
   'amine_amount_mmol',
   'thiol_amount_mmol',
   'zinc_amount_mmol',
   'other_amount_mmol',
   'total_volume_ml',
   'temp_c',
   'time_min']),
 ('step2',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['in_source',
   'p_source',
   'sol',
   'TOP',
   'acid',
   'amine',
   'thiol',
   'zinc',
   'other'])]

In [63]:
# Pull the fitted OneHotEncoder (second entry of ct.transformers_) and read the
# names of the one-hot columns it generated
# NOTE(review): get_feature_names() is deprecated in newer scikit-learn releases in
# favour of get_feature_names_out(), which may name the columns differently -
# confirm against the installed version before switching
array_of_cat_titles = ct.transformers_[1][1].get_feature_names()
len(array_of_cat_titles) # Number of one-hot encoded categorical columns (60 for this dataset)

60

In [64]:
# Number of numerical input columns; the scaler is the first step of the
# ColumnTransformer, so these are the leading columns of the transformed frame
len(input_num_cols)

12

In [65]:
# Give the transformed frame readable column names.
# The ColumnTransformer emits the scaled numerical columns first, followed by the
# one-hot encoded columns, so the new labels are simply the two name lists
# concatenated in that order. Any trailing columns beyond those (e.g. from
# remainder='passthrough') keep their existing labels.
# A single positional assignment replaces the previous per-column rename loops,
# which relied on the first loop's leftover index variable to position the second.
new_column_names = list(input_num_cols) + list(array_of_cat_titles)
untouched_labels = list(df_input_scaled_encoded.columns[len(new_column_names):])
df_input_scaled_encoded.columns = new_column_names + untouched_labels

In [66]:
# Display the transformed input features to check the column labels.
df_input_scaled_encoded

Unnamed: 0,in_amount_mmol,p_amount_mmol,sol_amount_ml,TOP_amount_mmol,acid_amount_mmol,amine_amount_mmol,thiol_amount_mmol,zinc_amount_mmol,other_amount_mmol,total_volume_ml,...,x8_None,x8_acetic acid,x8_copper bromide,x8_oleic acid,x8_superhydride,x8_tetrabutylammonium myristate,x8_trioctylamine,x8_trioctylphosphine oxide,x8_water,x8_zinc iodide
0,2.798293,0.769563,-0.528015,1.951881,-0.431077,-0.641827,-0.197117,-0.629015,8.009708,0.412626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.055436,-0.361432,-0.528015,1.958217,-0.431077,-0.641827,-0.197117,-0.629015,3.873052,0.063284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.782458,0.017999,-0.528015,0.600491,-0.431077,-0.641827,-0.197117,-0.629015,-0.098139,-0.500811,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.782458,0.017999,-0.528015,0.600491,-0.431077,-0.641827,-0.197117,-0.629015,-0.098139,-0.500811,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.055436,0.280681,-0.528015,1.958217,-0.431077,-0.641827,-0.197117,-0.629015,3.873052,0.063284,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,-0.580709,-0.667895,-0.070987,-0.304661,-0.316033,-0.641827,-0.197117,-0.629015,-0.263605,-0.225133,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
215,-0.580709,-0.667895,-0.070987,-0.304661,-0.316033,-0.641827,-0.197117,-0.629015,-0.263605,-0.225133,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216,-0.580709,-0.667895,-0.070987,-0.304661,-0.316033,-0.641827,-0.197117,-0.629015,-0.263605,-0.225133,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
217,-0.423738,-0.631411,-0.201566,-0.304661,-0.085945,-0.600008,-0.197117,-0.629015,-0.263605,-0.346416,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
# Re-attach the (untransformed) output columns to the right of the processed inputs;
# the two frames are aligned on their row index
frames_side_by_side = [df_input_scaled_encoded, df_output]
df_scaled_encoded = pd.concat(frames_side_by_side, axis='columns')
df_scaled_encoded

Unnamed: 0,in_amount_mmol,p_amount_mmol,sol_amount_ml,TOP_amount_mmol,acid_amount_mmol,amine_amount_mmol,thiol_amount_mmol,zinc_amount_mmol,other_amount_mmol,total_volume_ml,...,x8_oleic acid,x8_superhydride,x8_tetrabutylammonium myristate,x8_trioctylamine,x8_trioctylphosphine oxide,x8_water,x8_zinc iodide,diameter_nm,abs_nm,emission_nm
0,2.798293,0.769563,-0.528015,1.951881,-0.431077,-0.641827,-0.197117,-0.629015,8.009708,0.412626,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,599,612
1,0.055436,-0.361432,-0.528015,1.958217,-0.431077,-0.641827,-0.197117,-0.629015,3.873052,0.063284,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.6,530,
2,0.782458,0.017999,-0.528015,0.600491,-0.431077,-0.641827,-0.197117,-0.629015,-0.098139,-0.500811,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.5,595,
3,0.782458,0.017999,-0.528015,0.600491,-0.431077,-0.641827,-0.197117,-0.629015,-0.098139,-0.500811,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.6,650,
4,0.055436,0.280681,-0.528015,1.958217,-0.431077,-0.641827,-0.197117,-0.629015,3.873052,0.063284,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.52,520,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,-0.580709,-0.667895,-0.070987,-0.304661,-0.316033,-0.641827,-0.197117,-0.629015,-0.263605,-0.225133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,570,
215,-0.580709,-0.667895,-0.070987,-0.304661,-0.316033,-0.641827,-0.197117,-0.629015,-0.263605,-0.225133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,562,
216,-0.580709,-0.667895,-0.070987,-0.304661,-0.316033,-0.641827,-0.197117,-0.629015,-0.263605,-0.225133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,549,
217,-0.423738,-0.631411,-0.201566,-0.304661,-0.085945,-0.600008,-0.197117,-0.629015,-0.263605,-0.346416,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,523,


In [68]:
# Persist the processed inputs together with the raw outputs to a CSV file in the
# current working directory.
df_scaled_encoded.to_csv('dataset_scaled.csv')

In [69]:
# Collect the index labels of the rows to drop for absorbance modelling, i.e. the
# rows whose 'abs_nm' entry is the literal string 'None'.
# A vectorised comparison replaces the row-by-row loop, and index labels (rather
# than positional offsets) are stored because DataFrame.drop removes rows by label;
# with the default RangeIndex the two are the same integers.
total_row_num = len(df_scaled_encoded)
abs_is_missing = df_scaled_encoded['abs_nm'] == 'None'
drop_list_abs = df_scaled_encoded.index[abs_is_missing].tolist()

len(drop_list_abs)

14

In [70]:
# Remove the rows listed in drop_list_abs and write the absorbance-modelling
# subset to its own CSV file
df_absorbance_scaled_encoded = df_scaled_encoded.drop(index=drop_list_abs)
df_absorbance_scaled_encoded.to_csv('dataset_scaled_abs.csv')