In [13]:
import numpy as np 
import pandas as pd 
from pandas import DataFrame
import os
import joblib
import sklearn
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Import plotting libraries
import seaborn as sns
import matplotlib 
from matplotlib import pyplot as plt

# Set larger fontsize for all plots
matplotlib.rcParams.update({'font.size': 30})
from IPython.display import clear_output

In [14]:
# Getting dataset
df = pd.read_csv('../processed_data.csv')
df

Unnamed: 0,atomic_number,radius_pm,oxid_state,relativistic,metal_amount_mmol,ligand,num of group,spcaer,anchoring_group,chirality type,...,aromaticity1,sol1_vol_mL,protic2,polarity index 2,aromaticity2,solv2_vol_mL,time_min,temp_c,size_nm,g_factor*10^4
0,79,166,3,1,0.132,N-acetyl-L-cysteine,2,2.0,1,1,...,0,32.5,0,0.0,0,0.0,720,0,3.06,0.000000
1,79,166,3,1,0.132,N-acetyl-L-cysteine,2,2.0,1,1,...,0,33.5,0,0.0,0,0.0,720,0,1.00,0.757989
2,47,172,1,0,0.870,glutathione,4,2.0,1,1,...,0,30.0,0,0.0,0,0.0,60,0,1.00,0.583069
3,47,172,1,0,0.930,N-acetyl-Lcysteine,2,2.0,1,1,...,0,37.0,0,0.0,0,0.0,60,0,2.37,0.000000
4,79,166,3,1,0.132,glutathione,4,2.0,1,1,...,0,32.5,0,0.0,0,0.0,720,0,4.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,47,172,1,0,0.200,Dihydrolipoic acid,3,1.0,1,1,...,0,9.0,0,0.0,0,0.0,480,25,,2.000000
197,47,172,1,0,0.500,penicillamine,3,2.0,1,1,...,0,30.0,1,6.6,0,25.0,90,25,1.05,15.000000
198,47,172,1,0,0.500,penicillamine,3,2.0,1,1,...,0,30.0,1,6.6,0,25.0,90,25,1.30,10.000000
199,47,172,1,0,0.500,penicillamine,3,2.0,1,1,...,0,30.0,1,6.6,0,25.0,90,25,1.05,7.000000


In [15]:
#Checks if there are any columns with empty cells
df.isna().sum()

atomic_number             0
radius_pm                 0
oxid_state                0
relativistic              0
metal_amount_mmol         0
ligand                    0
num of group              0
spcaer                    0
anchoring_group           0
chirality type            0
hydrogen bond             0
aromaticity               0
ligand_amount_mmol        0
redu_num                  0
reductant_amount_mmol     0
protic1                   0
polarity index1           0
aromaticity1              0
sol1_vol_mL               0
protic2                   0
polarity index 2          0
aromaticity2              0
solv2_vol_mL              0
time_min                  0
temp_c                    0
size_nm                  52
g_factor*10^4            74
dtype: int64

# Scaling and Transforming

In [16]:
df_input = df.drop(columns =['ligand','size_nm','g_factor*10^4'], inplace = False, axis = 1)
df_output = df[['size_nm','g_factor*10^4']]

In [17]:
#Checks the column names, and ensures that they do not have any leading or trailing spaces
df_input.columns = df_input.columns.str.strip()
df_output.columns = df_output.columns.str.strip()

In [18]:
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   atomic_number          201 non-null    int64  
 1   radius_pm              201 non-null    int64  
 2   oxid_state             201 non-null    int64  
 3   relativistic           201 non-null    int64  
 4   metal_amount_mmol      201 non-null    float64
 5   num of group           201 non-null    int64  
 6   spcaer                 201 non-null    float64
 7   anchoring_group        201 non-null    int64  
 8   chirality type         201 non-null    int64  
 9   hydrogen bond          201 non-null    int64  
 10  aromaticity            201 non-null    int64  
 11  ligand_amount_mmol     201 non-null    float64
 12  redu_num               201 non-null    int64  
 13  reductant_amount_mmol  201 non-null    float64
 14  protic1                201 non-null    int64  
 15  polari

In [19]:
#change temp and time from integer to float
df_input['temp_c'] = df_input['temp_c'].astype(float)
df_input['time_min'] = df_input['time_min'].astype(float)
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   atomic_number          201 non-null    int64  
 1   radius_pm              201 non-null    int64  
 2   oxid_state             201 non-null    int64  
 3   relativistic           201 non-null    int64  
 4   metal_amount_mmol      201 non-null    float64
 5   num of group           201 non-null    int64  
 6   spcaer                 201 non-null    float64
 7   anchoring_group        201 non-null    int64  
 8   chirality type         201 non-null    int64  
 9   hydrogen bond          201 non-null    int64  
 10  aromaticity            201 non-null    int64  
 11  ligand_amount_mmol     201 non-null    float64
 12  redu_num               201 non-null    int64  
 13  reductant_amount_mmol  201 non-null    float64
 14  protic1                201 non-null    int64  
 15  polari

In [21]:
#Initializes 2 lists to contain all the numerical and categorical input columns
input_num_cols = [col for col in df_input.columns if df[col].dtypes !='O']
input_cat_cols = [col for col in df_input.columns if df[col].dtypes =='O']
print(input_num_cols, input_cat_cols)

['atomic_number', 'radius_pm', 'oxid_state', 'relativistic', 'metal_amount_mmol', 'num of group', 'spcaer', 'anchoring_group', 'chirality type', 'hydrogen bond', 'aromaticity', 'ligand_amount_mmol', 'redu_num', 'reductant_amount_mmol', 'protic1', 'polarity index1', 'aromaticity1', 'sol1_vol_mL', 'protic2', 'polarity index 2', 'aromaticity2', 'solv2_vol_mL', 'time_min', 'temp_c'] []


In [24]:
#Initializes the ColumnTransformer object, and specifies what it will do with a dataframe
#scaling numerical columns
#onehotencoder creates a binary column for each categorical entry
ct = ColumnTransformer([
    ('step1', StandardScaler(), input_num_cols),
    ('step2', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), input_cat_cols)
], remainder = 'passthrough')
#what are the transformers in ct
ct.transformers

[('step1',
  StandardScaler(),
  ['atomic_number',
   'radius_pm',
   'oxid_state',
   'relativistic',
   'metal_amount_mmol',
   'num of group',
   'spcaer',
   'anchoring_group',
   'chirality type',
   'hydrogen bond',
   'aromaticity',
   'ligand_amount_mmol',
   'redu_num',
   'reductant_amount_mmol',
   'protic1',
   'polarity index1',
   'aromaticity1',
   'sol1_vol_mL',
   'protic2',
   'polarity index 2',
   'aromaticity2',
   'solv2_vol_mL',
   'time_min',
   'temp_c']),
 ('step2', OneHotEncoder(handle_unknown='ignore', sparse_output=False), [])]

In [25]:
#Uses the ColumnTransformer object to modify the input columns
df_input_scaled_encoded = pd.DataFrame(ct.fit_transform(df_input))
df_input_scaled_encoded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.90070,-0.626518,0.980887,0.900450,-0.454943,-0.280991,-0.526910,-0.803389,0.120568,1.045825,...,0.443203,0.492773,-0.070711,-0.124583,-0.537086,-0.793018,-0.240613,-0.523310,0.680663,-0.997007
1,0.90070,-0.626518,0.980887,0.900450,-0.454943,-0.280991,-0.526910,-0.803389,0.120568,1.045825,...,0.443203,0.492773,-0.070711,-0.104117,-0.537086,-0.793018,-0.240613,-0.523310,0.680663,-0.997007
2,-1.03686,-0.252283,-0.853143,-1.110555,1.601355,0.049297,-0.526910,-0.803389,0.120568,1.045825,...,0.443203,0.492773,-0.070711,-0.175747,-0.537086,-0.793018,-0.240613,-0.523310,-0.593509,-0.997007
3,-1.03686,-0.252283,-0.853143,-1.110555,1.768534,-0.280991,-0.526910,-0.803389,0.120568,1.045825,...,0.443203,0.492773,-0.070711,-0.032487,-0.537086,-0.793018,-0.240613,-0.523310,-0.593509,-0.997007
4,0.90070,-0.626518,0.980887,0.900450,-0.454943,0.049297,-0.526910,-0.803389,0.120568,1.045825,...,0.443203,0.492773,-0.070711,-0.124583,-0.537086,-0.793018,-0.240613,-0.523310,0.680663,-0.997007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1.03686,-0.252283,-0.853143,-1.110555,-0.265474,-0.115847,-0.742392,-0.803389,0.120568,-0.956183,...,0.443203,0.492773,-0.070711,-0.605526,-0.537086,-0.793018,-0.240613,-0.523310,0.217328,-0.176375
197,-1.03686,-0.252283,-0.853143,-1.110555,0.570420,-0.115847,-0.526910,-0.803389,0.120568,1.045825,...,0.443203,0.492773,-0.070711,-0.175747,1.861899,1.679991,-0.240613,-0.004855,-0.535592,-0.176375
198,-1.03686,-0.252283,-0.853143,-1.110555,0.570420,-0.115847,-0.526910,-0.803389,0.120568,1.045825,...,0.443203,0.492773,-0.070711,-0.175747,1.861899,1.679991,-0.240613,-0.004855,-0.535592,-0.176375
199,-1.03686,-0.252283,-0.853143,-1.110555,0.570420,-0.115847,-0.526910,-0.803389,0.120568,1.045825,...,0.443203,0.492773,-0.070711,-0.175747,1.861899,1.679991,-0.240613,-0.004855,-0.535592,-0.176375


In [26]:
#Number of numerical columns
len(input_num_cols)

24

In [27]:
#renaming columns
for i in range(len(input_num_cols)):
    df_input_scaled_encoded.rename(columns={df_input_scaled_encoded.columns[i]: input_num_cols[i]}, inplace = True)

In [28]:
df_input_scaled_encoded.columns

Index(['atomic_number', 'radius_pm', 'oxid_state', 'relativistic',
       'metal_amount_mmol', 'num of group', 'spcaer', 'anchoring_group',
       'chirality type', 'hydrogen bond', 'aromaticity', 'ligand_amount_mmol',
       'redu_num', 'reductant_amount_mmol', 'protic1', 'polarity index1',
       'aromaticity1', 'sol1_vol_mL', 'protic2', 'polarity index 2',
       'aromaticity2', 'solv2_vol_mL', 'time_min', 'temp_c'],
      dtype='object')

In [29]:
#appends output columns
df_scaled_encoded = pd.concat([df_input_scaled_encoded, df_output], axis = 1)
df_scaled_encoded

Unnamed: 0,atomic_number,radius_pm,oxid_state,relativistic,metal_amount_mmol,num of group,spcaer,anchoring_group,chirality type,hydrogen bond,...,aromaticity1,sol1_vol_mL,protic2,polarity index 2,aromaticity2,solv2_vol_mL,time_min,temp_c,size_nm,g_factor*10^4
0,0.90070,-0.626518,0.980887,0.900450,-0.454943,-0.280991,-0.526910,-0.803389,0.120568,1.045825,...,-0.070711,-0.124583,-0.537086,-0.793018,-0.240613,-0.523310,0.680663,-0.997007,3.06,0.000000
1,0.90070,-0.626518,0.980887,0.900450,-0.454943,-0.280991,-0.526910,-0.803389,0.120568,1.045825,...,-0.070711,-0.104117,-0.537086,-0.793018,-0.240613,-0.523310,0.680663,-0.997007,1.00,0.757989
2,-1.03686,-0.252283,-0.853143,-1.110555,1.601355,0.049297,-0.526910,-0.803389,0.120568,1.045825,...,-0.070711,-0.175747,-0.537086,-0.793018,-0.240613,-0.523310,-0.593509,-0.997007,1.00,0.583069
3,-1.03686,-0.252283,-0.853143,-1.110555,1.768534,-0.280991,-0.526910,-0.803389,0.120568,1.045825,...,-0.070711,-0.032487,-0.537086,-0.793018,-0.240613,-0.523310,-0.593509,-0.997007,2.37,0.000000
4,0.90070,-0.626518,0.980887,0.900450,-0.454943,0.049297,-0.526910,-0.803389,0.120568,1.045825,...,-0.070711,-0.124583,-0.537086,-0.793018,-0.240613,-0.523310,0.680663,-0.997007,4.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1.03686,-0.252283,-0.853143,-1.110555,-0.265474,-0.115847,-0.742392,-0.803389,0.120568,-0.956183,...,-0.070711,-0.605526,-0.537086,-0.793018,-0.240613,-0.523310,0.217328,-0.176375,,2.000000
197,-1.03686,-0.252283,-0.853143,-1.110555,0.570420,-0.115847,-0.526910,-0.803389,0.120568,1.045825,...,-0.070711,-0.175747,1.861899,1.679991,-0.240613,-0.004855,-0.535592,-0.176375,1.05,15.000000
198,-1.03686,-0.252283,-0.853143,-1.110555,0.570420,-0.115847,-0.526910,-0.803389,0.120568,1.045825,...,-0.070711,-0.175747,1.861899,1.679991,-0.240613,-0.004855,-0.535592,-0.176375,1.30,10.000000
199,-1.03686,-0.252283,-0.853143,-1.110555,0.570420,-0.115847,-0.526910,-0.803389,0.120568,1.045825,...,-0.070711,-0.175747,1.861899,1.679991,-0.240613,-0.004855,-0.535592,-0.176375,1.05,7.000000


In [30]:
df_scaled_encoded.to_csv('dataset_scaled.csv')

# Making separate outputs dataset

In [31]:
#Saves into a list the row indexes to drop for size dataset
total_row_num = len(df_scaled_encoded)
drop_list_size =[]
for row_i in range(total_row_num):
    if df_scaled_encoded['size_nm'].values[row_i] == 'None':
        drop_list_size.append(row_i)
    
#number of entries
print(total_row_num-len(drop_list_size))

201


In [32]:
#Drops rows that don't have size output
df_size_scaled_encoded = df_scaled_encoded.drop(drop_list_size)
df_size_scaled_encoded.to_csv('dataset_scaled_size.csv')