In [2]:
import numpy as np
import pandas as pd
import os
import joblib
import sklearn
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
# Load the raw dataset from its CSV file and display it for a visual check.
raw_dataset_path = '../../flo_dataset_test.csv'
df = pd.read_csv(raw_dataset_path)
df

Unnamed: 0,in_source,in_amount_mmol,p_source,p_amount_mmol,ligand_source,ligand_amount_mmol,first_sol,first_sol_amount_ml,second_sol,second_sol_amount_ml,other_1,other_1_amount_mmol,other_2,other_2_amount_mmol,total_volume_ml,temp_c,time_min,diameter_nm,abs_nm,emission_nm
0,indium acetate,1.00,tris(trimethylsilyl)phosphine,1.000,dodecanethiol,0.5,,0.000000,,0.0,zinc stearate,2.00,,0.0,0.120,300,30.0,,480,539
1,chloroindium oxalate,1.05,tris(trimethylsilyl)phosphine,1.400,,0.0,trioctylphosphine oxide,0.222222,,0.0,,0.00,,0.0,0.223,270,4320.0,2.61,610,
2,indium chloride,0.30,white phosphorus,0.450,,0.0,oleylamine,0.264000,,0.0,zinc chloride,1.47,,0.0,0.264,180,30.0,,560,595
3,indium chloride,0.30,white phosphorus,0.450,,0.0,oleylamine,0.264000,,0.0,zinc chloride,1.47,,0.0,0.264,210,30.0,,590,635
4,indium acetate,1.00,tris(trimethylsilyl)phosphine,1.000,dodecanethiol,0.5,octadecene,1.000000,,0.0,zinc octanoate,2.00,,0.0,1.120,180,60.0,,,500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,indium acetate,1.00,tris(trimethylsilyl)phosphine,0.499,myristic acid,3.0,octadecene,75.000000,,0.0,,0.00,,0.0,75.000,270,180.0,4,585,630
215,indium chloride,10.00,sodium phosphide,11.300,,0.0,dimethylformamide,90.000000,,0.0,,0.00,,0.0,90.000,160,120.0,,465,550
216,indium acetate,4.00,tris(trimethylsilyl)phosphine,2.000,palmitic acid,12.0,octadecene,100.000000,trioctylphosphine,10.0,,0.00,,0.0,110.000,260,1.0,,465,
217,indium acetate,4.00,tris(trimethylsilyl)phosphine,2.000,palmitic acid,12.0,octadecene,100.000000,trioctylphosphine,10.0,,0.00,,0.0,110.000,260,20.0,,495,


In [4]:
# Counts NaN cells per column.
# NOTE: only true NaN values are counted. The literal string 'None', which
# later cells test for in the output columns, is an ordinary string and is
# therefore not reported as missing by this check.
df.isna().sum()

in_source               0
in_amount_mmol          0
p_source                0
p_amount_mmol           0
ligand_source           0
ligand_amount_mmol      0
first_sol               0
first_sol_amount_ml     0
second_sol              0
second_sol_amount_ml    0
other_1                 0
other_1_amount_mmol     0
other_2                 0
other_2_amount_mmol     0
total_volume_ml         0
temp_c                  0
time_min                0
diameter_nm             0
abs_nm                  0
emission_nm             0
dtype: int64

# Scaling and Transforming

In [5]:
# Split the raw frame into the three output (target) columns and the remaining
# input features. Only the inputs are scaled/encoded further down.
output_columns = ['diameter_nm', 'abs_nm', 'emission_nm']
df_output = df[output_columns]
df_input = df.drop(columns=output_columns)

In [6]:
# Trim leading/trailing whitespace from the column labels of both frames so
# that later lookups by column name are not affected by stray spaces.
for frame in (df_input, df_output):
    frame.columns = frame.columns.str.strip()

In [7]:
# Summarise the input frame: column names, non-null counts and dtypes.
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   in_source             219 non-null    object 
 1   in_amount_mmol        219 non-null    float64
 2   p_source              219 non-null    object 
 3   p_amount_mmol         219 non-null    float64
 4   ligand_source         219 non-null    object 
 5   ligand_amount_mmol    219 non-null    float64
 6   first_sol             219 non-null    object 
 7   first_sol_amount_ml   219 non-null    float64
 8   second_sol            219 non-null    object 
 9   second_sol_amount_ml  219 non-null    float64
 10  other_1               219 non-null    object 
 11  other_1_amount_mmol   219 non-null    float64
 12  other_2               219 non-null    object 
 13  other_2_amount_mmol   219 non-null    float64
 14  total_volume_ml       219 non-null    float64
 15  temp_c                2

In [8]:
# Cast the temperature column from integer to float, then re-inspect the
# dtypes of the input frame.
df_input = df_input.astype({'temp_c': float})
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   in_source             219 non-null    object 
 1   in_amount_mmol        219 non-null    float64
 2   p_source              219 non-null    object 
 3   p_amount_mmol         219 non-null    float64
 4   ligand_source         219 non-null    object 
 5   ligand_amount_mmol    219 non-null    float64
 6   first_sol             219 non-null    object 
 7   first_sol_amount_ml   219 non-null    float64
 8   second_sol            219 non-null    object 
 9   second_sol_amount_ml  219 non-null    float64
 10  other_1               219 non-null    object 
 11  other_1_amount_mmol   219 non-null    float64
 12  other_2               219 non-null    object 
 13  other_2_amount_mmol   219 non-null    float64
 14  total_volume_ml       219 non-null    float64
 15  temp_c                2

In [9]:
# Build the two lists of input column names that drive the ColumnTransformer:
#   input_num_cols - columns whose dtype is not object (scaled)
#   input_cat_cols - columns whose dtype is object (one-hot encoded)
# The dtypes are read from df_input itself, i.e. the frame that is actually
# transformed, so the lists reflect its cleaned column names and the float
# cast of temp_c. Column order is preserved.
input_num_cols = [col for col, dtype in df_input.dtypes.items() if dtype != 'O']
input_cat_cols = [col for col, dtype in df_input.dtypes.items() if dtype == 'O']

In [10]:
# Print the numerical column names followed by the categorical column names.
print(input_num_cols, input_cat_cols)

['in_amount_mmol', 'p_amount_mmol', 'ligand_amount_mmol', 'first_sol_amount_ml', 'second_sol_amount_ml', 'other_1_amount_mmol', 'other_2_amount_mmol', 'total_volume_ml', 'temp_c', 'time_min'] ['in_source', 'p_source', 'ligand_source', 'first_sol', 'second_sol', 'other_1', 'other_2']


In [11]:
# Build the ColumnTransformer that preprocesses the input frame:
#   'step1' - StandardScaler on every numerical column
#   'step2' - OneHotEncoder on every categorical column, producing a dense
#             binary column per category; categories not seen during fitting
#             are ignored instead of raising (handle_unknown='ignore')
# Columns named in neither list are passed through unchanged.
#
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4. Try the new
# keyword first and fall back to the old one on older installations
# (an unknown keyword raises TypeError at construction time).
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

ct = ColumnTransformer([
    ('step1', StandardScaler(), input_num_cols),
    ('step2', one_hot_encoder, input_cat_cols)
], remainder = 'passthrough')

In [12]:
# Show the (name, transformer, columns) entries the ColumnTransformer was configured with
ct.transformers

[('step1',
  StandardScaler(),
  ['in_amount_mmol',
   'p_amount_mmol',
   'ligand_amount_mmol',
   'first_sol_amount_ml',
   'second_sol_amount_ml',
   'other_1_amount_mmol',
   'other_2_amount_mmol',
   'total_volume_ml',
   'temp_c',
   'time_min']),
 ('step2',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['in_source',
   'p_source',
   'ligand_source',
   'first_sol',
   'second_sol',
   'other_1',
   'other_2'])]

In [13]:
# Fit the ColumnTransformer on the input frame and wrap the transformed result
# in a DataFrame. No column names are supplied here, so the columns carry
# default integer labels until they are renamed further down.
transformed_inputs = ct.fit_transform(df_input)
df_input_scaled_encoded = pd.DataFrame(transformed_inputs)
df_input_scaled_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,71,72
0,0.154575,-0.011188,-0.247025,-0.673379,-0.370637,0.865472,-0.156742,-0.660116,1.573235,-0.282373,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.195883,0.280681,-0.439228,-0.658075,-0.370637,-0.675365,-0.156742,-0.653603,0.975942,3.449771,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.423738,-0.412509,-0.439228,-0.655198,-0.370637,0.457150,-0.156742,-0.651010,-0.815937,-0.282373,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-0.423738,-0.412509,-0.439228,-0.655198,-0.370637,0.457150,-0.156742,-0.651010,-0.218644,-0.282373,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.154575,-0.011188,-0.247025,-0.604513,-0.370637,0.865472,-0.156742,-0.596878,-0.815937,-0.256274,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,0.154575,-0.376755,0.713993,4.491519,-0.370637,-0.675365,-0.156742,4.075133,0.975942,-0.151879,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
215,7.590031,7.504453,-0.439228,5.524499,-0.370637,-0.675365,-0.156742,5.023700,-1.214132,-0.204076,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
216,2.633061,0.718486,4.173657,6.213152,4.521657,-0.675365,-0.156742,6.288457,0.776845,-0.307602,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
217,2.633061,0.718486,4.173657,6.213152,4.521657,-0.675365,-0.156742,6.288457,0.776845,-0.291073,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Names (and number) of the one-hot encoded categorical columns, in the order
# the fitted encoder emits them.
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2, so the
# names are rebuilt from the fitted `categories_` using the same
# 'x<feature index>_<category>' scheme that method produced. This keeps the
# resulting column names identical regardless of the installed version.
fitted_encoder = ct.named_transformers_['step2']
array_of_cat_titles = np.array(
    [
        'x%d_' % feature_idx + str(category)
        for feature_idx, categories in enumerate(fitted_encoder.categories_)
        for category in categories
    ],
    dtype=object,
)
len(array_of_cat_titles)

63

In [15]:
#Number of numerical input columns (the ones handled by the StandardScaler step)
len(input_num_cols)

10

In [16]:
# Give the transformed frame meaningful column labels. The scaled numerical
# columns come first, followed by the one-hot encoded categorical columns
# (the order in which the transformers are listed in the ColumnTransformer).
new_column_names = list(input_num_cols) + list(array_of_cat_titles)

# More names than columns means the name lists and the transformed frame are
# out of sync; fail loudly rather than silently mislabel columns.
if len(new_column_names) > df_input_scaled_encoded.shape[1]:
    raise ValueError(
        f'{len(new_column_names)} column names supplied for '
        f'{df_input_scaled_encoded.shape[1]} transformed columns'
    )

# Rename by position in a single pass. Any trailing columns without a supplied
# name (e.g. passthrough remainder columns) keep their current labels.
df_input_scaled_encoded = df_input_scaled_encoded.rename(
    columns=dict(zip(df_input_scaled_encoded.columns, new_column_names))
)

In [17]:
# Display the transformed input frame with its renamed columns.
df_input_scaled_encoded

Unnamed: 0,in_amount_mmol,p_amount_mmol,ligand_amount_mmol,first_sol_amount_ml,second_sol_amount_ml,other_1_amount_mmol,other_2_amount_mmol,total_volume_ml,temp_c,time_min,...,x5_zinc octanoate,x5_zinc oleate,x5_zinc stearate,x5_zinc undecylenate,x6_None,x6_copper bromide,x6_oleic acid,x6_trioctylphosphine,x6_water,x6_zinc iodide
0,0.154575,-0.011188,-0.247025,-0.673379,-0.370637,0.865472,-0.156742,-0.660116,1.573235,-0.282373,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.195883,0.280681,-0.439228,-0.658075,-0.370637,-0.675365,-0.156742,-0.653603,0.975942,3.449771,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.423738,-0.412509,-0.439228,-0.655198,-0.370637,0.457150,-0.156742,-0.651010,-0.815937,-0.282373,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-0.423738,-0.412509,-0.439228,-0.655198,-0.370637,0.457150,-0.156742,-0.651010,-0.218644,-0.282373,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.154575,-0.011188,-0.247025,-0.604513,-0.370637,0.865472,-0.156742,-0.596878,-0.815937,-0.256274,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,0.154575,-0.376755,0.713993,4.491519,-0.370637,-0.675365,-0.156742,4.075133,0.975942,-0.151879,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
215,7.590031,7.504453,-0.439228,5.524499,-0.370637,-0.675365,-0.156742,5.023700,-1.214132,-0.204076,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
216,2.633061,0.718486,4.173657,6.213152,4.521657,-0.675365,-0.156742,6.288457,0.776845,-0.307602,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
217,2.633061,0.718486,4.173657,6.213152,4.521657,-0.675365,-0.156742,6.288457,0.776845,-0.291073,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# List every column label of the transformed input frame.
df_input_scaled_encoded.columns

Index(['in_amount_mmol', 'p_amount_mmol', 'ligand_amount_mmol',
       'first_sol_amount_ml', 'second_sol_amount_ml', 'other_1_amount_mmol',
       'other_2_amount_mmol', 'total_volume_ml', 'temp_c', 'time_min',
       'x0_chloroindium oxalate', 'x0_indium acetate', 'x0_indium bromide',
       'x0_indium chloride', 'x0_indium iodide', 'x0_indium myristate',
       'x0_indium oxalate', 'x0_indium palmitate',
       'x0_indium trifluoroacetate',
       'x0_indium tris(N,N-diisopropylacetamidinato)',
       'x1_bis(trimethylsilyl)phosphine', 'x1_phosphine gas',
       'x1_phosphorus trichloride', 'x1_sodium phosphide',
       'x1_tris(diethylamino)phosphine', 'x1_tris(dimethylamino)phosphine',
       'x1_tris(trimethylgermyl)phosphine', 'x1_tris(trimethylsilyl)phosphine',
       'x1_white phosphorus', 'x2_None', 'x2_dodecanethiol', 'x2_lauric acid',
       'x2_myristic acid', 'x2_oleic acid', 'x2_palmitic acid',
       'x2_stearic acid', 'x3_4-ethylpyridine', 'x3_None',
       'x3_dimethy

In [19]:
# Attach the untouched output columns to the right of the scaled/encoded
# inputs; rows are aligned on the index of the two frames.
frames_side_by_side = [df_input_scaled_encoded, df_output]
df_scaled_encoded = pd.concat(frames_side_by_side, axis='columns')
df_scaled_encoded

Unnamed: 0,in_amount_mmol,p_amount_mmol,ligand_amount_mmol,first_sol_amount_ml,second_sol_amount_ml,other_1_amount_mmol,other_2_amount_mmol,total_volume_ml,temp_c,time_min,...,x5_zinc undecylenate,x6_None,x6_copper bromide,x6_oleic acid,x6_trioctylphosphine,x6_water,x6_zinc iodide,diameter_nm,abs_nm,emission_nm
0,0.154575,-0.011188,-0.247025,-0.673379,-0.370637,0.865472,-0.156742,-0.660116,1.573235,-0.282373,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,480,539
1,0.195883,0.280681,-0.439228,-0.658075,-0.370637,-0.675365,-0.156742,-0.653603,0.975942,3.449771,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.61,610,
2,-0.423738,-0.412509,-0.439228,-0.655198,-0.370637,0.457150,-0.156742,-0.651010,-0.815937,-0.282373,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,560,595
3,-0.423738,-0.412509,-0.439228,-0.655198,-0.370637,0.457150,-0.156742,-0.651010,-0.218644,-0.282373,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,590,635
4,0.154575,-0.011188,-0.247025,-0.604513,-0.370637,0.865472,-0.156742,-0.596878,-0.815937,-0.256274,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,0.154575,-0.376755,0.713993,4.491519,-0.370637,-0.675365,-0.156742,4.075133,0.975942,-0.151879,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4,585,630
215,7.590031,7.504453,-0.439228,5.524499,-0.370637,-0.675365,-0.156742,5.023700,-1.214132,-0.204076,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,465,550
216,2.633061,0.718486,4.173657,6.213152,4.521657,-0.675365,-0.156742,6.288457,0.776845,-0.307602,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,465,
217,2.633061,0.718486,4.173657,6.213152,4.521657,-0.675365,-0.156742,6.288457,0.776845,-0.291073,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,495,


In [20]:
# Write the combined frame (scaled/encoded inputs plus the output columns) to CSV.
df_scaled_encoded.to_csv('flo_dataset_scaled.csv')

# Making separate absorbance, emission and diameter datasets

In [21]:
# Collect the row positions whose absorbance entry is the 'None' placeholder,
# i.e. rows to be excluded from the absorbance dataset.
total_row_num = len(df_scaled_encoded)
abs_values = df_scaled_encoded['abs_nm'].values
drop_list_abs = [position for position, value in enumerate(abs_values) if value == 'None']

# Number of rows that remain once those rows are excluded
print(total_row_num - len(drop_list_abs))

205


In [22]:
# Remove the rows without an absorbance value and save the result as its own CSV.
# drop() treats the collected numbers as index labels; they match the row
# positions as long as the frame keeps its default 0..n-1 index.
df_absorbance_scaled_encoded = df_scaled_encoded.drop(index=drop_list_abs)
df_absorbance_scaled_encoded.to_csv('dataset_scaled_abs.csv')

In [23]:
# Collect the row positions whose emission entry is the 'None' placeholder,
# i.e. rows to be excluded from the emission dataset.
total_row_num = len(df_scaled_encoded)
emission_values = df_scaled_encoded['emission_nm'].values
drop_list_em = [position for position, value in enumerate(emission_values) if value == 'None']

# Number of rows that remain once those rows are excluded
print(total_row_num - len(drop_list_em))

85


In [24]:
# Remove the rows without an emission value and save the result as its own CSV.
# drop() treats the collected numbers as index labels; they match the row
# positions as long as the frame keeps its default 0..n-1 index.
df_emission_scaled_encoded = df_scaled_encoded.drop(index=drop_list_em)
df_emission_scaled_encoded.to_csv('dataset_scaled_em.csv')

In [25]:
# Collect the row positions whose diameter entry is the 'None' placeholder,
# i.e. rows to be excluded from the diameter dataset.
total_row_num = len(df_scaled_encoded)
diameter_values = df_scaled_encoded['diameter_nm'].values
drop_list_diam = [position for position, value in enumerate(diameter_values) if value == 'None']

# Number of rows that remain once those rows are excluded
print(total_row_num - len(drop_list_diam))

72


In [26]:
# Remove the rows without a diameter value and save the result as its own CSV.
# drop() treats the collected numbers as index labels; they match the row
# positions as long as the frame keeps its default 0..n-1 index.
df_diameter_scaled_encoded = df_scaled_encoded.drop(index=drop_list_diam)
df_diameter_scaled_encoded.to_csv('dataset_scaled_diam.csv')