In [10]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Lipinski
from rdkit.Chem import Descriptors
from rdkit.Chem import GraphDescriptors

pd.options.mode.chained_assignment = None 
pd.set_option('display.max_rows',1000)
pd.set_option('display.max_column',20)

In [11]:
#Remove those  items that IUPACName without name
pubchem_data = pd.read_csv("data/pubchem_compounds.csv")
print("With NaN shape:",pubchem_data.shape)
pubchem_data.dropna(subset=['IUPACName'],inplace= True)
print("Without NaN shape:",pubchem_data.shape)
pubchem_data.head(5)


With NaN shape: (70, 4)
Without NaN shape: (70, 4)


Unnamed: 0,CID,IUPACName,MolecularFormula,IsomericSMILES
0,7812,4-chloroaniline,C6H6ClN,C1=CC(=CC=C1N)Cl
1,7423,3-nitroaniline,C6H6N2O2,C1=CC(=CC(=C1)[N+](=O)[O-])N
2,9731,4-fluoroaniline,C6H6FN,C1=CC(=CC=C1N)F
3,7447,2-methoxy-5-nitroaniline,C7H8N2O3,COC1=C(C=C(C=C1)[N+](=O)[O-])N
4,7441,2-methyl-4-nitroaniline,C7H8N2O2,CC1=C(C=CC(=C1)[N+](=O)[O-])N


In [12]:
#Preprocess data with A-site & concentration
df = pubchem_data

#A-site type equals to 0
df_A_site_0 = df.copy()
df_A_site_0['A_site'] = 0

# A:0 concentration 1
df_A_site_0_conc_1 = df_A_site_0.copy()
df_A_site_0_conc_1['Conc(mg/ml)'] = 1

# A:0 concentration 2
df_A_site_0_conc_2 = df_A_site_0.copy()
df_A_site_0_conc_2['Conc(mg/ml)'] = 2

# A:0 concentration 4
df_A_site_0_conc_4 = df_A_site_0.copy()
df_A_site_0_conc_4['Conc(mg/ml)'] = 4

# A:0 concentration 6
df_A_site_0_conc_6 = df_A_site_0.copy()
df_A_site_0_conc_6['Conc(mg/ml)'] = 6

# A:0 concentration 8
df_A_site_0_conc_8 = df_A_site_0.copy()
df_A_site_0_conc_8['Conc(mg/ml)'] = 8




#A-site type equals to 1
df_A_site_1 = df.copy()
df_A_site_1['A_site'] = 1

# A:1 concentration 1
df_A_site_1_conc_1 = df_A_site_1.copy()
df_A_site_1_conc_1['Conc(mg/ml)'] = 1


# A:1 concentration 2
df_A_site_1_conc_2 = df_A_site_1.copy()
df_A_site_1_conc_2['Conc(mg/ml)'] = 2

# A:1 concentration 4
df_A_site_1_conc_4 = df_A_site_1.copy()
df_A_site_1_conc_4['Conc(mg/ml)'] = 4

# A:1 concentration 6
df_A_site_1_conc_6 = df_A_site_1.copy()
df_A_site_1_conc_6['Conc(mg/ml)'] = 6

# A:1 concentration 8
df_A_site_1_conc_8 = df_A_site_1.copy()
df_A_site_1_conc_8['Conc(mg/ml)'] = 8


df_concated = pd.concat([df_A_site_0_conc_1,df_A_site_0_conc_2,df_A_site_0_conc_4,df_A_site_0_conc_6,df_A_site_0_conc_8,
                         df_A_site_1_conc_1,df_A_site_1_conc_2,df_A_site_1_conc_4,df_A_site_1_conc_6,df_A_site_1_conc_8],
                        axis = 0)
print("shape:",df_concated.shape)

shape: (700, 6)


In [13]:
#halide atom accumulator
def Halide_accumulator(mol):
    Num_Fluorine_atoms = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 9)#F
    Num_Chlorine_Atoms = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 17)#Cl
    Num_Bromine_Atoms = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 35)#Br
    Num_Iodine_atoms = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 53)#I
    Num_Halide_Atoms =  Num_Fluorine_atoms + Num_Chlorine_Atoms +Num_Bromine_Atoms +  Num_Iodine_atoms
    return Num_Halide_Atoms

In [14]:
data_set = df_concated
data_set.head()

Unnamed: 0,CID,IUPACName,MolecularFormula,IsomericSMILES,A_site,Conc(mg/ml)
0,7812,4-chloroaniline,C6H6ClN,C1=CC(=CC=C1N)Cl,0,1
1,7423,3-nitroaniline,C6H6N2O2,C1=CC(=CC(=C1)[N+](=O)[O-])N,0,1
2,9731,4-fluoroaniline,C6H6FN,C1=CC(=CC=C1N)F,0,1
3,7447,2-methoxy-5-nitroaniline,C7H8N2O3,COC1=C(C=C(C=C1)[N+](=O)[O-])N,0,1
4,7441,2-methyl-4-nitroaniline,C7H8N2O2,CC1=C(C=CC(=C1)[N+](=O)[O-])N,0,1


In [15]:
smi = data_set['IsomericSMILES'].to_list()
feature_columns = [

    'H_Acceptors',
    'H_Donors',
    'Rot_bonds',
    'Mol_Wt',
    'TPSA',
    'AvgIpc',

    'Hydrogen',
    'Oxide',
    'Nitrogen',
    'Halide',
    
]

for col in feature_columns:
    data_set[col] = ' ' 

print(f"shape:{data_set.shape}")
data_set = data_set.iloc[:,:]

shape:(700, 16)


In [16]:
for i in range(len(smi)):
    mol_hided_H = Chem.MolFromSmiles(smi[i])
    mol = Chem.AddHs(mol_hided_H)
    
    data_set.loc[i,'H_Acceptors'] = Lipinski.NumHAcceptors(mol)
    data_set.loc[i,'H_Donors'] = Lipinski.NumHDonors(mol)
    data_set.loc[i,'Rot_bonds'] = Lipinski.NumRotatableBonds(mol)
    data_set.loc[i,'Mol_Wt'] = Descriptors.MolWt(mol)
    data_set.loc[i,'TPSA'] = Descriptors.TPSA(mol)
    data_set.loc[i,'AvgIpc'] = GraphDescriptors.AvgIpc(mol)

    data_set.loc[i,'Hydrogen'] = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 1)
    data_set.loc[i,'Oxide'] = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 8)
    data_set.loc[i,'Nitrogen'] = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() == 7)
    data_set.loc[i,'Halide'] = Halide_accumulator(mol)
    
newpassivtors_add_features = data_set.iloc[:len(smi),:]

#Save
newpassivtors_add_features.to_csv("data/newpassivator_with_features.csv",index=False)


In [17]:
newpassivtors_add_features.shape

(700, 16)

In [18]:
newpassivtors_add_features.head(100)

Unnamed: 0,CID,IUPACName,MolecularFormula,IsomericSMILES,A_site,Conc(mg/ml),H_Acceptors,H_Donors,Rot_bonds,Mol_Wt,TPSA,AvgIpc,Hydrogen,Oxide,Nitrogen,Halide
0,7812.0,4-chloroaniline,C6H6ClN,C1=CC(=CC=C1N)Cl,0.0,1.0,1,1,1,127.574,26.02,2.112066,6,0,1,1
1,7423.0,3-nitroaniline,C6H6N2O2,C1=CC(=CC(=C1)[N+](=O)[O-])N,0.0,1.0,3,1,2,138.126,69.16,2.131108,6,2,2,0
2,9731.0,4-fluoroaniline,C6H6FN,C1=CC(=CC=C1N)F,0.0,1.0,1,1,1,111.119,26.02,2.112066,6,0,1,1
3,7447.0,2-methoxy-5-nitroaniline,C7H8N2O3,COC1=C(C=C(C=C1)[N+](=O)[O-])N,0.0,1.0,4,1,4,168.152,78.39,2.222235,8,3,2,0
4,7441.0,2-methyl-4-nitroaniline,C7H8N2O2,CC1=C(C=CC(=C1)[N+](=O)[O-])N,0.0,1.0,3,1,3,152.153,69.16,2.196466,8,2,2,0
5,79617.0,4-methylsulfonylaniline,C7H9NO2S,CS(=O)(=O)C1=CC=C(C=C1)N,0.0,1.0,3,1,3,171.221,60.16,2.242309,9,2,1,0
6,11118.0,5-nitro-2-propoxyaniline,C9H12N2O3,CCCOC1=C(C=C(C=C1)[N+](=O)[O-])N,0.0,1.0,4,1,6,196.206,78.39,2.416056,12,3,2,0
7,75655.0,4-morpholin-4-ylaniline,C10H14N2O,C1COCCN1C2=CC=C(C=C2)N,0.0,1.0,3,1,2,178.235,38.49,2.459172,14,1,2,0
8,74218.0,5-chloro-2-nitroaniline,C6H5ClN2O2,C1=CC(=C(C=C1Cl)N)[N+](=O)[O-],0.0,1.0,3,1,2,172.571,69.16,2.183714,5,2,2,1
9,136315.0,4-(trifluoromethylsulfonyl)aniline,C7H6F3NO2S,C1=CC(=CC=C1N)S(=O)(=O)C(F)(F)F,0.0,1.0,3,1,2,225.191,60.16,2.242309,6,2,1,3
