# Calculating additional descriptors for chemical compounds

## Importing libraries

In [1]:
import pandas as pd
from rdkit import Chem

## Reading the dataframe (an example)

In [20]:
# Relative path of the input CSV that is loaded into `data` below.
file_path = "../data/IA22+OPERA.csv"

In [21]:
# Read the whole CSV into a DataFrame; the following cells all operate on `data`.
data = pd.read_csv(filepath_or_buffer=file_path)

In [22]:
# List the column labels of the loaded table.
data.columns

Index(['cid', 'std_smiles', 'CompRate_ylog', 'dG_MD_ylog', 'MolWeight',
       'nbAtoms', 'nbHeavyAtoms', 'nbRing', 'nbRotBd', 'nbHBdAcc', 'ndHBdDon',
       'TopoPolSurfAir', 'MolarRefract', 'CombDipolPolariz', 'LogP_pred',
       'MP_pred', 'BP_pred', 'LogVP_pred', 'LogWS_pred', 'LogHL_pred',
       'RT_pred', 'LogKOA_pred', 'ionization', 'pKa_b_pred', 'LogD55_pred',
       'LogD74_pred', 'LogOH_pred', 'nbNring', 'pKa_b_pred_divN',
       'pKa_b_pred_multN', 'frac_OS_N', 'pKa_b_pred_div_O_S', 'frac_N',
       'frac_OS_atoms', 'frac_ring_N', 'frac_Nring_atoms', 'frac_ring_atoms',
       'CN', 'CC', 'CO', 'C', 'N', 'O', 'CS', 'S'],
      dtype='object')

### Calculating additional features from OPERA descriptors

In [23]:
#### We assume that all the necessary columns (e.g. pKa_b_pred, nbN, etc.) are available from OPERA

In [24]:
def parse_smiles(smiles):
    """Parse a SMILES string into an RDKit molecule.

    Raises:
        ValueError: if RDKit cannot parse the string (``MolFromSmiles``
            returns ``None`` in that case), so the offending SMILES is
            reported instead of an opaque ``AttributeError`` later on.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
    return mol


def count_atoms(mol, symbol):
    """Return the number of atoms in ``mol`` whose element symbol equals ``symbol``."""
    return sum(1 for atom in mol.GetAtoms() if atom.GetSymbol() == symbol)


# Parse every SMILES once and reuse the molecule for both element counts.
mols = data['std_smiles'].apply(parse_smiles)
data['nbN'] = mols.apply(lambda mol: count_atoms(mol, 'N'))  # Number of nitrogen atoms
data['nbO'] = mols.apply(lambda mol: count_atoms(mol, 'O'))  # Number of oxygen atoms

In [25]:
# Sanity check: show the rows where the two ring-nitrogen fractions disagree.
# An empty result means both columns hold the same value in every row.
fractions_differ = data['frac_Nring_atoms'].ne(data['frac_ring_N'])
data.loc[fractions_differ]

Unnamed: 0,cid,std_smiles,CompRate_ylog,dG_MD_ylog,MolWeight,nbAtoms,nbHeavyAtoms,nbRing,nbRotBd,nbHBdAcc,...,CN,CC,CO,C,N,O,CS,S,nbN,nbO


In [26]:
# Substring-based counts taken directly from the SMILES text.
# NOTE(review): 'S' also matches the first letter of bracketed two-letter
# elements such as [Si] or [Se] — confirm none occur in the data set.
data['nbS'] = data['std_smiles'].str.count('S')  # Number of non-aromatic sulfur atoms
# NOTE(review): 'N1' only matches an uppercase N immediately followed by
# ring-closure label 1, so this is a heuristic — confirm it is adequate.
data['nbNring'] = data['std_smiles'].str.count('N1')  # Number of cycles containing nitrogen

# various derivative descriptors
# Rows with nbN == 0 produce inf/NaN in the ratios that divide by nbN.
data['pKa_b_pred_divN'] = data['pKa_b_pred'] / data['nbN']
data['pKa_b_pred_multN'] = data['pKa_b_pred'] * data['nbN']
data['frac_OS_N'] = (data['nbO'] + data['nbS']) / data['nbN']
# NOTE(review): despite "div" in the name this multiplies by frac_OS_N;
# kept unchanged — confirm the intended definition.
data['pKa_b_pred_div_O_S'] = data['pKa_b_pred'] * data['frac_OS_N']
data['frac_N'] = data['nbN'] / data['nbAtoms']
data['frac_OS_atoms'] = (data['nbO'] + data['nbS']) / data['nbAtoms']
data['frac_ring_N'] = data['nbNring'] / data['nbN']
# Fixed: this used nbN as the denominator, which made the column an exact
# copy of frac_ring_N. Like the other *_atoms fractions it is now relative
# to the total atom count.
data['frac_Nring_atoms'] = data['nbNring'] / data['nbAtoms']
data['frac_ring_atoms'] = data['nbRing'] / data['nbAtoms']

In [27]:
# Display the table including the newly added columns (pandas abbreviates long frames in the rendered output).
data

Unnamed: 0,cid,std_smiles,CompRate_ylog,dG_MD_ylog,MolWeight,nbAtoms,nbHeavyAtoms,nbRing,nbRotBd,nbHBdAcc,...,CC,CO,C,N,O,CS,S,nbN,nbO,nbS
0,108-01-0,CN(C)CCO,-0.529534,-1.777381,89.084064,17.0,6.0,0.0,2.0,2.0,...,1.0,1.0,4.0,1.0,1.0,0.0,0.0,1,1,0
1,3179-63-3,CN(C)CCCO,-0.463405,-1.770849,103.099714,20.0,7.0,0.0,3.0,2.0,...,2.0,1.0,5.0,1.0,1.0,0.0,0.0,1,1,0
2,100-37-8,CCN(CC)CCO,-0.684356,-1.806987,117.115364,23.0,8.0,0.0,4.0,2.0,...,3.0,1.0,6.0,1.0,1.0,0.0,0.0,1,1,0
3,622-93-5,CCN(CC)CCCO,-0.704554,-1.816606,131.131014,26.0,9.0,0.0,5.0,2.0,...,4.0,1.0,7.0,1.0,1.0,0.0,0.0,1,1,0
4,108-16-7,CC(O)CN(C)C,-0.619906,-1.797589,103.099714,20.0,7.0,0.0,2.0,2.0,...,2.0,1.0,5.0,1.0,1.0,0.0,0.0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,102-81-8,CCCCN(CCO)CCCC,-0.549957,-1.799773,173.177964,35.0,12.0,0.0,8.0,2.0,...,7.0,1.0,10.0,1.0,1.0,0.0,0.0,1,1,0
120,2158-67-0,CN(CCCO)CCCO,-0.486586,-1.776835,147.125929,27.0,10.0,0.0,6.0,3.0,...,4.0,2.0,7.0,1.0,2.0,0.0,0.0,1,2,0
121,5842-08-0,CCCCN(CCS)CCCC,-0.690977,-1.847296,189.155121,35.0,12.0,0.0,8.0,1.0,...,7.0,0.0,10.0,1.0,0.0,1.0,1.0,1,0,1
122,19031-93-7,CCCN(CCS)CCS,-0.412004,-1.780960,179.080242,27.0,10.0,0.0,6.0,1.0,...,4.0,0.0,7.0,1.0,0.0,2.0,2.0,1,0,2
