# **RDKit Descriptor Calculation and Dataset Preparation for ML models**

In [1]:
# Attach Google Drive inside the Colab runtime; later cells read from and
# write to paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
!pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [4]:
# Load the beta-secretase 1 bioactivity table (including pIC50) from the mounted Drive
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

file1 = "/content/drive/MyDrive/bioactivity/beta_secretase1_bioactivity_data_pIC50.csv"
df = pd.read_csv(filepath_or_buffer=file1)

# Preview the first rows to check that the expected columns are present
df.head(n=5)

Unnamed: 0,molecule_chembl_id,bioactivity_class,standard_value,canonical_smiles,Molecular_Weight,LogP,HBD,HBA,pIC50
0,CHEMBL406146,intermediate,413.0,CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...,999.085,-1.4355,13,13,6.38405
1,CHEMBL78946,active,2.0,CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...,893.005,-1.7361,12,12,8.69897
2,CHEMBL324109,intermediate,460.0,CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...,751.988,2.3535,8,9,6.337242
3,CHEMBL114147,inactive,9000.0,CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...,737.895,1.9626,8,8,5.045757
4,CHEMBL419949,inactive,5600.0,CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...,828.02,3.5739,8,8,5.251812


In [7]:
# Keep only the molecule identifier, its SMILES string and the pIC50 target;
# all other columns of the loaded table are dropped from this view.
df_extracted = df.loc[:, ['molecule_chembl_id', 'canonical_smiles', 'pIC50']]

In [8]:
# Show how many descriptors RDKit exposes, then the first five (name, function) pairs
print(len(Descriptors._descList), Descriptors._descList[:5], sep="\n")

208
[('MaxEStateIndex', <function MaxEStateIndex at 0x7b6b03b8f520>), ('MinEStateIndex', <function MinEStateIndex at 0x7b6b03b8f5b0>), ('MaxAbsEStateIndex', <function MaxAbsEStateIndex at 0x7b6b03b8f640>), ('MinAbsEStateIndex', <function MinAbsEStateIndex at 0x7b6b03b8f6d0>), ('qed', <function qed at 0x7b6b03bc6710>)]


In [9]:
# Function to calculate RDKit descriptors
def getMolDescriptors(mol, missingVal=None):
    '''Calculate the full list of RDKit descriptors for a molecule.

    Every (name, function) pair in ``Descriptors._descList`` is evaluated on
    ``mol`` and the results are collected by descriptor name.

    Args:
        mol: RDKit Mol object to describe. ``None`` is accepted (this is what
            ``Chem.MolFromSmiles`` yields for a SMILES it cannot parse) and
            produces a result in which every descriptor is ``missingVal``.
        missingVal: Value stored for any descriptor that could not be computed.

    Returns:
        dict mapping each descriptor name to its value, or to ``missingVal``
        when that descriptor's function raised.
    '''
    import traceback  # imported once per call rather than inside the handler

    if mol is None:
        # There is no molecule to evaluate, so every descriptor is missing.
        # Returning early avoids printing one traceback per descriptor.
        return {nm: missingVal for nm, _ in Descriptors._descList}

    res = {}
    for nm, fn in Descriptors._descList:
        try:
            val = fn(mol)  # Compute descriptor value
        except Exception:
            # Only ordinary errors are absorbed. KeyboardInterrupt and
            # SystemExit still propagate, so a long run can be stopped.
            traceback.print_exc()
            val = missingVal
        res[nm] = val
    return res

In [10]:
# Pull the SMILES column out as a plain Python list, in row order
smi = df_extracted["canonical_smiles"].to_list()

In [11]:
# Build one descriptor dictionary per SMILES string, in the same order as `smi`:
# each string is parsed into an RDKit Mol object and passed to getMolDescriptors.
allDescrs = [getMolDescriptors(Chem.MolFromSmiles(smiles)) for smiles in smi]


In [12]:
# Turn the list of per-molecule descriptor dictionaries into a table
# (one row per molecule, one column per descriptor name)
df_descr = pd.DataFrame(data=allDescrs)
df_descr.head(n=5)

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,14.032746,-2.195009,14.032746,0.017379,0.041154,999.085,932.557,998.459678,390,0,...,0,0,0,0,0,0,0,0,0,0
1,13.595406,-1.573089,13.595406,0.09781,0.042501,893.005,828.493,892.454199,352,0,...,0,0,0,0,0,0,0,0,0,0
2,13.243577,-1.303772,13.243577,0.114162,0.077027,751.988,690.5,751.419,294,0,...,1,0,0,0,0,0,0,0,0,0
3,13.416202,-1.312338,13.416202,0.118038,0.09937,737.895,682.455,737.399978,290,0,...,0,0,0,0,0,0,0,0,0,0
4,13.721715,-1.361064,13.721715,0.112353,0.074085,828.02,766.532,827.446929,324,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Concatenate the extracted columns (molecule_chembl_id, canonical_smiles, pIC50)
# with the descriptors. pd.concat(axis=1) pairs rows by index label, not by
# position, so both frames are first put on a fresh 0..n-1 index; otherwise a
# non-default index on either side would silently misalign rows and add NaNs.
assert len(df_extracted) == len(df_descr), "descriptor rows do not match input rows"
df_final = pd.concat(
    [df_extracted.reset_index(drop=True), df_descr.reset_index(drop=True)],
    axis=1,
)

# Save the final DataFrame as a CSV file (index=False: no extra index column)
df_final.to_csv('beta_secretase1_bioactivity_data_pIC50_rdkit_descriptors.csv', index=False)

# Display the first rows as the cell's output
df_final.head()

  molecule_chembl_id                                   canonical_smiles  \
0       CHEMBL406146  CC(C)C[C@H](NC(=O)[C@@H](NC(=O)[C@@H](N)CCC(=O...   
1        CHEMBL78946  CC(C)C[C@H](NC(=O)[C@H](CC(N)=O)NC(=O)[C@@H](N...   
2       CHEMBL324109  CCC(C)C[C@H](NC(=O)[C@H](CC(C)C)NC(C)=O)[C@@H]...   
3       CHEMBL114147  CC(=O)NCC(=O)N[C@@H](Cc1ccccc1)[C@@H](O)CC(=O)...   
4       CHEMBL419949  CC(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](Cc1ccccc1...   

      pIC50  MaxEStateIndex  MinEStateIndex  MaxAbsEStateIndex  \
0  6.384050       14.032746       -2.195009          14.032746   
1  8.698970       13.595406       -1.573089          13.595406   
2  6.337242       13.243577       -1.303772          13.243577   
3  5.045757       13.416202       -1.312338          13.416202   
4  5.251812       13.721715       -1.361064          13.721715   

   MinAbsEStateIndex       qed    MolWt  HeavyAtomMolWt  ...  fr_sulfide  \
0           0.017379  0.041154  999.085         932.557  ...           0   


In [14]:
! cp beta_secretase1_bioactivity_data_pIC50_rdkit_descriptors.csv "/content/drive/MyDrive/bioactivity/"

In [15]:
! ls "/content/drive/MyDrive/bioactivity/"

beta_secretase1_bioactivity_data_pIC50.csv		      bioactivity_data.csv
beta_secretase1_bioactivity_data_pIC50_rdkit_descriptors.csv  bioactivity_preprocessed_data.csv
