This notebook prepares the data for the GNN and saves it for later use as
`gnn_smiles_texpi_kpl.csv` and `gnn_smiles_kpl.csv`.

<a id="import-libraries"></a>
# 1. Import Libraries & Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the full original dataset and the SMILES table; both are read from the
# "trial 1" folder.
# NOTE(review): the earlier comment mentioned switching to a "trial 3" folder --
# confirm which location is intended.
# NOTE(review): the unicode_escape codec also interprets backslash escape sequences;
# SMILES can use "\" for bond stereochemistry -- confirm no SMILES here contain one.
original_df, smiles = (
    pd.read_csv(csv_path, encoding='unicode_escape')
    for csv_path in (
        "../../GNN-trials/trial 1/data-original.csv",
        "../../GNN-trials/trial 1/smiles.csv",
    )
)

In [3]:
# (rows, columns) of each loaded table; the row counts should match.
tuple(frame.shape for frame in (original_df, smiles))

((476, 20), (476, 2))

In [4]:
# Preview the first two rows of the original dataset.
original_df.iloc[:2]

Unnamed: 0,No,Compound,SMILES,CAS No,set,MWa,logKowb,Mptc,LogSaqd,LogSoce,Hdf,Hag,MVh,Texpi,Skin thicknessj,Skin Integrity testk,Skin Selection criteria,logkpl,logJmaxm,Reference
0,80,Urea,C(=O)(N)N,57-13-6,t,60.1,-2.11,406.0,-1.85,-3.96,4,3,36.7,312,0.1,transepidermal electrical resistance,20-130 Kohmcm2,-3.55,-5.41,(Peck et al. 1995)
1,81,Urea,C(=O)(N)N,57-13-6,t,60.1,-2.11,406.0,-2.02,-4.13,4,3,36.7,300,0.1,transepidermal electrical resistance,20-130 Kohmcm2,-3.69,-5.71,(Peck et al. 1995)


In [5]:
# Preview the first two rows of the SMILES table.
smiles.iloc[:2]

Unnamed: 0,SMILES,Texpi
0,C(=O)(N)N,312
1,C(=O)(N)N,300


In [6]:
# Side-by-side comparison table: the SMILES from `smiles` next to the SMILES and
# logkpl columns of the original dataset (rows are matched by index label).
ready_df = smiles.assign(
    SMILES_2=original_df['SMILES'],
    logkpl=original_df['logkpl'],
)

In [8]:
# Show every row where the SMILES from `smiles` differs from the SMILES string
# stored in the original dataset.
mismatch_expr = "SMILES != SMILES_2"
ready_df.query(mismatch_expr)

Unnamed: 0,SMILES,Texpi,SMILES_2,logkpl
31,C[N+](C)(C)C1=CC2=C(C=C1)C=CC(=C2N=NC3=CC=CC=C...,305,C[N+](C)(C)C1=CC2=C(C=C1)C=CC(=C2N=NC3=CC=CC=C...,-2.77
127,C1CNP(=O)(OC1)N(CCCl)CCCl,305,C1CNP(=O)(OC1)N(CCCl)CCCl.O,-2.54
128,C1CNP(=O)(OC1)N(CCCl)CCCl,305,C1CNP(=O)(OC1)N(CCCl)CCCl.O,-2.91
167,CC[N+](CC)(CC)CC,310,CC[N+](CC)(CC)CC.[Br-],-3.37
168,CC[N+](CC)(CC)CC,298,CC[N+](CC)(CC)CC.[Br-],-3.25
338,CCCCCCCCCC(=O)O,310,CCCCCCCCCC(=O)[O-].[Na+],-2.95


In [9]:
# Build the modelling table: the columns of `smiles` plus the logkpl target taken
# from `original_df`.
# The target is matched to rows purely by index label, so fail loudly if the two
# tables are not row-aligned instead of silently ending up with NaN targets.
assert smiles.index.equals(original_df.index), "smiles and original_df are not row-aligned"
gnn_df = smiles.assign(logkpl=original_df['logkpl'])

In [10]:
# Preview the first three rows of the modelling table.
gnn_df.iloc[:3]

Unnamed: 0,SMILES,Texpi,logkpl
0,C(=O)(N)N,312,-3.55
1,C(=O)(N)N,300,-3.69
2,C(=O)(N)N,310,-3.83


In [11]:
# Number of missing values in each column of the modelling table.
missing_per_column = gnn_df.isna().sum()
missing_per_column

SMILES    0
Texpi     0
logkpl    0
dtype: int64

In [12]:
# Row labels 445..475 (the stop value 476 is exclusive): the rows removed as
# water compounds.
# NOTE(review): the earlier comment also said "432 excluded"; its meaning is
# unclear -- confirm.
index_to_drop = list(range(445, 476))

In [21]:
# Drop the rows listed in `index_to_drop` and renumber the remaining rows from 0.
# errors="ignore" keeps this cell safe to re-run: after the first run the index has
# been reset, so those labels no longer exist and a plain drop would raise KeyError.
# Chaining (instead of inplace=True) keeps the whole step a single assignment.
gnn_df = gnn_df.drop(index=index_to_drop, errors="ignore").reset_index(drop=True)

In [22]:
# Collapse repeated rows: one row per SMILES string, carrying the mean logkpl.
smiles_grouped = gnn_df.groupby('SMILES', as_index=False).agg(logkpl=('logkpl', 'mean'))
smiles_grouped

Unnamed: 0,SMILES,logkpl
0,C(=O)(N)N,-3.730000
1,C(C(C(C(C(CO)O)O)O)O)O,-4.354615
2,C(C(CO)O)O,-3.790000
3,C(C1C(C(C(C(O1)OC2(C(C(C(O2)CO)O)O)CO)O)O)O)O,-4.512857
4,C(C1C(C(C(C(O1)OCC2C(C(C(C(O2)OC3(C(C(C(O3)CO)...,-4.575000
...,...,...
141,COC(=O)CC1=CC=C(C=C1)O,-1.700000
142,COC(C(C=O)O)C(C(CO)O)O,-3.680000
143,COC1=C(C=CC(=C1)CC=C)O,-1.575000
144,COS(=O)(=O)C,-1.990000


In [23]:
# Same aggregation, but keyed on the (SMILES, Texpi) pair: one row per compound
# and Texpi value, carrying the mean logkpl.
smiles_texpi_grouped = (
    gnn_df
    .groupby(['SMILES', 'Texpi'], as_index=False)
    .agg(logkpl=('logkpl', 'mean'))
)
smiles_texpi_grouped

Unnamed: 0,SMILES,Texpi,logkpl
0,C(=O)(N)N,300,-3.690
1,C(=O)(N)N,310,-3.774
2,C(=O)(N)N,312,-3.550
3,C(C(C(C(C(CO)O)O)O)O)O,298,-3.970
4,C(C(C(C(C(CO)O)O)O)O)O,300,-4.190
...,...,...,...
217,COC(=O)CC1=CC=C(C=C1)O,310,-1.700
218,COC(C(C=O)O)C(C(CO)O)O,310,-3.680
219,COC1=C(C=CC(=C1)CC=C)O,305,-1.575
220,COS(=O)(=O)C,305,-1.990


In [24]:
# Final missing-value check on the two aggregated tables that are about to be saved
# (previously the second check looked at the raw `smiles` input instead).
smiles_grouped.isna().sum(), smiles_texpi_grouped.isna().sum()

(SMILES    0
 logkpl    0
 dtype: int64,
 SMILES    0
 Texpi     0
 dtype: int64)

In [25]:
# Write both aggregated tables to disk without the index column.
for frame, csv_path in (
    (smiles_grouped, '../../data/final/gnn_smiles_kpl.csv'),
    (smiles_texpi_grouped, '../../data/final/gnn_smiles_texpi_kpl.csv'),
):
    frame.to_csv(csv_path, index=False)