IMPORTING LIBRARIES FOR MOLECULAR DESCRIPTORS 

In [7]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

MOLECULAR DESCRIPTORS CALCULATION

In [8]:
def calc_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol :
        return {
            Descriptors.MolWt(mol),
            Descriptors.NumRotatableBonds(mol),
            Descriptors.NumHAcceptors(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.TPSA(mol),
            Descriptors.MolLogP(mol),
        }
        
    return [None]*6
    

READING PRE-PROCESSED DATA, WHICH IS SAVED IN PREVIOUS PROCESSING STEPS & CALCULATING DESCRIPTORS

In [11]:
df = pd.read_csv('ChEMBL221_14062025_preprocessed.csv').dropna()

descriptors_cols = [
    'MolWt',
    'NumRotatableBonds',
    'NumHAcceptors',
    'NumHDonors',
    'TPSA',
    'MolLogP'
]

CALCULATING MOLECULAR DESCRIPTORS FROM SMILES & ADDING THEM AS NEW COLUMN TO THE DATASET 

In [12]:
df['descriptors'] = df['smiles'].apply(calc_descriptors)
df[descriptors_cols] = pd.DataFrame(df['descriptors'].tolist(), index=df.index)


df.head()

Unnamed: 0,molecule_chembl_id,smiles,standard_value,bioactivity_class,descriptors,MolWt,NumRotatableBonds,NumHAcceptors,NumHDonors,TPSA,MolLogP
0,CHEMBL420628,Cc1ccc(-c2ncc(Cl)cc2-c2ccc(S(N)(=O)=O)cc2)cn1,800.0,active,"{1, 3, 4, 3.4198200000000014, 359.838000000000...",1.0,3.0,4.0,3.41982,359.838,85.94
1,CHEMBL323732,CS(=O)(=O)c1ccc(C2=C(c3ccc(C(F)(F)F)c(F)c3)CCC...,100000.0,inactive,"{384.3940000000001, 0, 2, 3, 34.14, 5.34270000...",384.394,0.0,2.0,3.0,34.14,5.3427
2,CHEMBL139,O=C(O)Cc1ccccc1Nc1c(Cl)cccc1Cl,150.0,active,"{2, 4, 4.364100000000001, 296.153, 49.33}",2.0,4.0,4.3641,296.153,49.33,
3,CHEMBL355781,CS(=O)(=O)c1ccc(-c2csc(CC(=O)O)c2-c2ccc(F)cc2)cc1,50000.0,inactive,"{1, 4, 5, 390.45700000000005, 71.44, 4.2518000...",1.0,4.0,5.0,390.457,71.44,4.2518
4,CHEMBL406881,Nc1ccccc1NC(=O)c1ccccc1C(F)(F)F,100000.0,inactive,"{280.24899999999997, 2, 3.539900000000001, 55.12}",280.249,2.0,3.5399,55.12,,


SAVING THE FILE OF BIOACTIVITY DATA WITH DESCRIPTORS 

In [13]:
df.dropna().to_csv('excel_bioactivity_data_with_descriptors2.csv', index=False)

In [14]:
# Compute descriptors
descriptor_cols = ['MolWt', 'NumRotatableBonds', 'NumHAcceptors', 'NumHDonors', 'TPSA', 'MolLogP']
df['descriptors'] = df['smiles'].apply(calc_descriptors)
df[descriptor_cols] = pd.DataFrame(df['descriptors'].tolist(), index=df.index)

# Find rows with any missing descriptor
invalid_rows = df[df[descriptor_cols].isna().any(axis=1)]


print("❌ Molecules with descriptor failure:", len(invalid_rows))


invalid_rows.to_csv('failed_descriptor_molecules.csv', index=False)

❌ Molecules with descriptor failure: 1026
