In [31]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [32]:
#Reads 'dataset_CdSe_raw.csv' into a pandas dataframe
df = pd.read_csv('dataset_CdSe_raw.csv')
#A bare name on the last line renders the dataframe as the cell output
df

Unnamed: 0,Injection Temp (Celsius),Growth Temp (Celsius),Metal_source,Metal_amount (g),Metal_mmol (mmol),Metal_concentration (mmol/g),Chalcogen_source,Chalcogen_amount (g),Chalcogen_mmol (mmol),Chalcogen_concentration (mmol/g),...,S_I_amount (g),Solvent II,S_II_amount (g),Total_amount (g),Time_min (min),diameter_nm,abs_nm,emission_nm,Diameter from,Citation
0,300,300,cadmium stearate,0.1300,0.20,0.020000,Se powder,0.06000,0.80,0.070000,...,10.0000,,0.0000,11.65000,5.00,3.41,566,575,TEM,J. Phys. Chem. C 2012 116 47 25065 - 25073
1,250,250,cadmium stearate,0.0678,0.10,0.035258,Se powder,0.00395,0.05,0.017629,...,2.7645,,0.0000,2.83625,0.50,1.99,,497,TEM,J. Phys. Chem. Lett. 2017 8 15 3576-3580
2,250,250,cadmium stearate,0.0678,0.10,0.034916,Se powder,0.00395,0.05,0.017458,...,2.7645,,0.0000,2.86405,0.50,2.13,,510,TEM,J. Phys. Chem. Lett. 2017 8 15 3576-3580
3,250,250,cadmium stearate,0.0678,0.10,0.034916,Se powder,0.00395,0.05,0.017458,...,2.7645,,0.0000,2.86405,1.00,2.27,,517,TEM,J. Phys. Chem. Lett. 2017 8 15 3576-3580
4,250,250,cadmium stearate,0.0678,0.10,0.034916,Se powder,0.00395,0.05,0.017458,...,2.7645,,0.0000,2.86405,2.00,2.53,,522,TEM,J. Phys. Chem. Lett. 2017 8 15 3576-3580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,300,260,cadmium oxide,0.0154,0.12,0.015300,Se powder,0.01900,0.24,0.030600,...,4.3430,trioctylphosphine oxide,2.7835,7.85570,0.13,1.70,450,,TEM,J. Phys. Chem. B 2006 110 33 16508 - 16513
229,300,260,cadmium oxide,0.0154,0.12,0.015300,Se powder,0.01900,0.24,0.030600,...,4.3430,trioctylphosphine oxide,2.7835,7.85570,1.50,2.50,530,,TEM,J. Phys. Chem. B 2006 110 33 16508 - 16513
230,300,260,cadmium oxide,0.0154,0.12,0.015300,Se powder,0.01900,0.24,0.030600,...,4.3430,trioctylphosphine oxide,2.7835,7.85570,14.37,3.50,585,,TEM,J. Phys. Chem. B 2006 110 33 16508 - 16513
231,300,260,cadmium oxide,0.0154,0.12,0.015300,Se powder,0.01900,0.24,0.030600,...,4.3430,trioctylphosphine oxide,2.7835,7.85570,26.70,3.70,590,,TEM,J. Phys. Chem. B 2006 110 33 16508 - 16513


In [33]:
#Shows the (number of rows, number of columns) of the original dataframe
df.shape

(233, 33)

In [34]:
#Counts the missing (NaN) values in every column; a count of 0 means the column has no NaN entries
#NOTE(review): entries holding the literal text 'None' are presumably stored as strings, not NaN, and would not be counted here -- confirm against the raw CSV
df.isna().sum()

Injection Temp (Celsius)            0
Growth Temp (Celsius)               0
Metal_source                        0
Metal_amount (g)                    0
Metal_mmol (mmol)                   0
Metal_concentration (mmol/g)        0
Chalcogen_source                    0
Chalcogen_amount (g)                0
Chalcogen_mmol (mmol)               0
Chalcogen_concentration (mmol/g)    0
Metal/Se_ratio                      0
Carboxylic_Acid                     0
CA_amount (g)                       0
CA_mmol (mmol)                      0
Cd/CA_ratio                         0
Amines                              0
Amines_amount (g)                   0
Amines_mmol (mmol)                  0
Phosphines                          0
Phosphines_amount (g)               0
Phosphines_mmol (mmol)              0
Chalcogen/Ph_ratio                  0
Solvent I                           0
S_I_amount (g)                      0
Solvent II                          0
S_II_amount (g)                     0
Total_amount

In [35]:
#Separate out initial DataFrame into the input features and output features
#Inputs: every column except Chalcogen_source, the three measured outputs, and the 'Diameter from' / 'Citation' columns
non_input_cols = ['Chalcogen_source', 'diameter_nm', 'abs_nm', 'emission_nm', 'Diameter from', 'Citation']
df_input = df.drop(columns=non_input_cols)
#Outputs: the three target columns; .copy() makes this an independent frame rather than a slice of df
df_output = df[['diameter_nm', 'abs_nm', 'emission_nm']].copy()

In [36]:
#Removes any leading or trailing spaces from the column names of both frames
df_input = df_input.rename(columns=str.strip)
df_output = df_output.rename(columns=str.strip)

In [37]:
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233 entries, 0 to 232
Data columns (total 27 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Injection Temp (Celsius)          233 non-null    int64  
 1   Growth Temp (Celsius)             233 non-null    int64  
 2   Metal_source                      233 non-null    object 
 3   Metal_amount (g)                  233 non-null    float64
 4   Metal_mmol (mmol)                 233 non-null    float64
 5   Metal_concentration (mmol/g)      233 non-null    float64
 6   Chalcogen_amount (g)              233 non-null    float64
 7   Chalcogen_mmol (mmol)             233 non-null    float64
 8   Chalcogen_concentration (mmol/g)  233 non-null    float64
 9   Metal/Se_ratio                    233 non-null    float64
 10  Carboxylic_Acid                   233 non-null    object 
 11  CA_amount (g)                     233 non-null    float64
 12  CA_mmol 

In [38]:
#Casts the injection and growth temperature columns to float, then re-prints the column dtypes to confirm
temperature_cols = ['Injection Temp (Celsius)', 'Growth Temp (Celsius)']
df_input = df_input.astype({temp_col: float for temp_col in temperature_cols})

df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233 entries, 0 to 232
Data columns (total 27 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Injection Temp (Celsius)          233 non-null    float64
 1   Growth Temp (Celsius)             233 non-null    float64
 2   Metal_source                      233 non-null    object 
 3   Metal_amount (g)                  233 non-null    float64
 4   Metal_mmol (mmol)                 233 non-null    float64
 5   Metal_concentration (mmol/g)      233 non-null    float64
 6   Chalcogen_amount (g)              233 non-null    float64
 7   Chalcogen_mmol (mmol)             233 non-null    float64
 8   Chalcogen_concentration (mmol/g)  233 non-null    float64
 9   Metal/Se_ratio                    233 non-null    float64
 10  Carboxylic_Acid                   233 non-null    object 
 11  CA_amount (g)                     233 non-null    float64
 12  CA_mmol 

In [39]:
#Initializes 2 lists to contain all of the numerical and categorical input columns.
#The dtype is read from df_input (the frame that is actually transformed) rather than the raw df,
#so the lists stay correct after the column names were stripped and the dtypes converted above.
input_num_cols = df_input.select_dtypes(exclude=['object']).columns.tolist()
input_cat_cols = df_input.select_dtypes(include=['object']).columns.tolist()

In [40]:
#Displays the names of the categorical (object dtype) input columns
input_cat_cols

['Metal_source',
 'Carboxylic_Acid',
 'Amines',
 'Phosphines',
 'Solvent I',
 'Solvent II']

In [41]:
#Builds the ColumnTransformer: numerical columns are standardized, categorical columns are one-hot encoded
#NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output` in newer scikit-learn releases -- confirm the installed version before upgrading
numeric_step = ('step1', StandardScaler(), input_num_cols)
categorical_step = ('step2', OneHotEncoder(sparse=False, handle_unknown='ignore'), input_cat_cols)
#Any column not named in either step is passed through unchanged
ct = ColumnTransformer([numeric_step, categorical_step], remainder = 'passthrough')

In [42]:
#Fits the ColumnTransformer on the input columns and wraps the transformed array in a dataframe.
#The row index of df_input is reused so that the later column-wise concat with df_output lines up row for row
df_input_scaled_encoded = pd.DataFrame(ct.fit_transform(df_input), index=df_input.index)
df_input_scaled_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,0.758171,1.102480,-0.219474,-0.438782,-0.396059,-0.249563,-0.245227,-0.160892,-1.022609,-0.478178,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.262509,0.325710,-0.372608,-0.473265,-0.355738,-0.574111,-0.588527,-0.193471,0.655878,-0.478178,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
229,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
230,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
231,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


#### Now that we have transformed all of the relevant columns, how can we recover the corresponding column names?

In [43]:
#Shows each fitted step of the ColumnTransformer as a (name, transformer, columns) entry
ct.transformers_

[('step1',
  StandardScaler(),
  ['Injection Temp (Celsius)',
   'Growth Temp (Celsius)',
   'Metal_amount (g)',
   'Metal_mmol (mmol)',
   'Metal_concentration (mmol/g)',
   'Chalcogen_amount (g)',
   'Chalcogen_mmol (mmol)',
   'Chalcogen_concentration (mmol/g)',
   'Metal/Se_ratio',
   'CA_amount (g)',
   'CA_mmol (mmol)',
   'Cd/CA_ratio',
   'Amines_amount (g)',
   'Amines_mmol (mmol)',
   'Phosphines_amount (g)',
   'Phosphines_mmol (mmol)',
   'Chalcogen/Ph_ratio',
   'S_I_amount (g)',
   'S_II_amount (g)',
   'Total_amount (g)',
   'Time_min (min)']),
 ('step2',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['Metal_source',
   'Carboxylic_Acid',
   'Amines',
   'Phosphines',
   'Solvent I',
   'Solvent II'])]

In [44]:
#Pulls the generated one-hot column titles out of the fitted encoder ('step2' of ct)
#NOTE(review): get_feature_names() is the older scikit-learn API; newer releases provide get_feature_names_out() -- confirm before upgrading
array_of_cat_titles = ct.named_transformers_['step2'].get_feature_names()
len(array_of_cat_titles) #Total number of encoded categorical columns (39 in the recorded output)

39

In [45]:
#Number of numerical input columns (the columns handed to the StandardScaler step)
len(input_num_cols)

21

Pausing to do an initial sanity check here, we noticed that our initial dataset had 21 numerical columns (we checked the length of our `input_num_cols` array). 

From two cells above, we notice that our one hot encoding of the categorical columns resulted in 39 columns.

If we add 39 + 21, we obtain 60, which is consistent with the number of columns displayed when we printed `df_input_scaled_encoded` (columns 0 to 59). 

We note here that the one hot encoded categorical columns are all appended at the end, and this is apparent when we displayed `df_input_scaled_encoded`. 

What we need to do next is figure out which column index corresponds to which category. For example, which category does column 30 refer to?


In [46]:
#Restores readable column names on the transformed dataframe.
#The transformed columns are the numerical columns followed by the one-hot encoded categorical
#columns, so the two name lists are joined in that order and assigned in a single step.
new_column_names = list(input_num_cols) + list(array_of_cat_titles)
#Guards against silently mislabelling columns if the number of generated names and columns ever differ
assert len(new_column_names) == df_input_scaled_encoded.shape[1], "column count does not match the generated names"
df_input_scaled_encoded.columns = new_column_names


In [47]:
#Spot-checks the renaming by looking up the name now assigned to column position 43
df_input_scaled_encoded.columns[43]

'x2_octylamine'

In [48]:
#Displays the transformed input dataframe with its restored column names
df_input_scaled_encoded

Unnamed: 0,Injection Temp (Celsius),Growth Temp (Celsius),Metal_amount (g),Metal_mmol (mmol),Metal_concentration (mmol/g),Chalcogen_amount (g),Chalcogen_mmol (mmol),Chalcogen_concentration (mmol/g),Metal/Se_ratio,CA_amount (g),...,x3_trioctylphosphine,x3_triphenylphosphine,x4_None,x4_liquid parafin,x4_octadecene,x4_phenyl ether,x4_trioctylphosphine oxide,x5_None,x5_phosphinic acid,x5_trioctylphosphine oxide
0,0.758171,1.102480,-0.219474,-0.438782,-0.396059,-0.249563,-0.245227,-0.160892,-1.022609,-0.478178,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.262509,0.325710,-0.372608,-0.473265,-0.355738,-0.574111,-0.588527,-0.193471,0.655878,-0.478178,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
229,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
230,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
231,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [49]:
#Places the transformed inputs and the untouched output columns side by side (rows are matched on the index)
frames_to_join = [df_input_scaled_encoded, df_output]
df_scaled_encoded = pd.concat(frames_to_join, axis = 'columns')
df_scaled_encoded

Unnamed: 0,Injection Temp (Celsius),Growth Temp (Celsius),Metal_amount (g),Metal_mmol (mmol),Metal_concentration (mmol/g),Chalcogen_amount (g),Chalcogen_mmol (mmol),Chalcogen_concentration (mmol/g),Metal/Se_ratio,CA_amount (g),...,x4_liquid parafin,x4_octadecene,x4_phenyl ether,x4_trioctylphosphine oxide,x5_None,x5_phosphinic acid,x5_trioctylphosphine oxide,diameter_nm,abs_nm,emission_nm
0,0.758171,1.102480,-0.219474,-0.438782,-0.396059,-0.249563,-0.245227,-0.160892,-1.022609,-0.478178,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.41,566,575
1,0.262509,0.325710,-0.372608,-0.473265,-0.355738,-0.574111,-0.588527,-0.193471,0.655878,-0.478178,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.99,,497
2,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.13,,510
3,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.27,,517
4,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.53,,522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.70,450,
229,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.50,530,
230,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.50,585,
231,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.70,590,


In [50]:
#Saves the combined scaled/encoded dataset to CSV (with default to_csv settings the row index is written as the first, unnamed column)
df_scaled_encoded.to_csv('dataset_CdSe_scaled.csv')

In [51]:
#Displays the combined dataframe that was just written to CSV
df_scaled_encoded

Unnamed: 0,Injection Temp (Celsius),Growth Temp (Celsius),Metal_amount (g),Metal_mmol (mmol),Metal_concentration (mmol/g),Chalcogen_amount (g),Chalcogen_mmol (mmol),Chalcogen_concentration (mmol/g),Metal/Se_ratio,CA_amount (g),...,x4_liquid parafin,x4_octadecene,x4_phenyl ether,x4_trioctylphosphine oxide,x5_None,x5_phosphinic acid,x5_trioctylphosphine oxide,diameter_nm,abs_nm,emission_nm
0,0.758171,1.102480,-0.219474,-0.438782,-0.396059,-0.249563,-0.245227,-0.160892,-1.022609,-0.478178,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3.41,566,575
1,0.262509,0.325710,-0.372608,-0.473265,-0.355738,-0.574111,-0.588527,-0.193471,0.655878,-0.478178,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.99,,497
2,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.13,,510
3,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.27,,517
4,0.262509,0.325710,-0.372608,-0.473265,-0.356642,-0.574111,-0.588527,-0.193578,0.655878,-0.478178,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.53,,522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.70,450,
229,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.50,530,
230,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.50,585,
231,0.758171,0.481064,-0.501614,-0.466369,-0.408479,-0.486967,-0.501557,-0.185402,-0.782825,-0.334503,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.70,590,


In [52]:
#Saves the row labels to drop for absorbance modelling into a list.
#A row is dropped when its absorbance is missing: either the literal text 'None' or a real NaN
#(depending on the pandas version, read_csv may already have parsed 'None' as NaN).
#Index labels are collected, rather than positions, because DataFrame.drop removes rows by label.
total_row_num = len(df_scaled_encoded)  #kept in case other cells read it
abs_column = df_scaled_encoded['abs_nm']
abs_missing_mask = abs_column.isna() | (abs_column == 'None')
drop_list_abs = df_scaled_encoded.index[abs_missing_mask].tolist()

len(drop_list_abs)


38

In [53]:
#Removes the rows listed in drop_list_abs, keeping only the rows used for absorbance modelling
df_absorbance_scaled_encoded = df_scaled_encoded.drop(index=drop_list_abs)

In [27]:
#Saves the data for absorbance modelling to CSV (with default to_csv settings the row index is written as the first, unnamed column)
df_absorbance_scaled_encoded.to_csv('dataset_CdSe_abs_filler.csv')

Drop the columns that are not needed **MANUALLY**


In [28]:
#Saves the row labels to drop for PL modelling into a list.
#A row is dropped when its emission value is missing: either the literal text 'None' or a real NaN
#(depending on the pandas version, read_csv may already have parsed 'None' as NaN).
#Index labels are collected, rather than positions, because DataFrame.drop removes rows by label.
total_row_num = len(df_scaled_encoded)  #kept in case other cells read it
emission_column = df_scaled_encoded['emission_nm']
emission_missing_mask = emission_column.isna() | (emission_column == 'None')
drop_list_pl = df_scaled_encoded.index[emission_missing_mask].tolist()

len(drop_list_pl)

77

In [29]:
#Removes the rows listed in drop_list_pl, keeping only the rows used for PL modelling
df_pl_scaled_encoded = df_scaled_encoded.drop(index=drop_list_pl)

In [30]:
#Saves the data for PL modelling to CSV (with default to_csv settings the row index is written as the first, unnamed column)
df_pl_scaled_encoded.to_csv('dataset_CdSe_emission_filler.csv')

Drop the columns that are not needed **MANUALLY**
