In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

In [2]:
def getMolDescriptors(mol, missingVal=0):
    ''' Calculate the full list of RDKit descriptors for a molecule.

        Every (name, function) pair in ``Descriptors._descList`` is evaluated
        on ``mol`` in list order, except the descriptor named 'Ipc', which is
        skipped.

        mol        -- molecule object handed to each descriptor function
        missingVal -- value stored in place of any descriptor whose function
                      raises an exception (default 0)

        Returns a list with one value per non-'Ipc' descriptor, so its length
        always equals the number of such descriptors.
    '''
    res = []
    for nm, fn in Descriptors._descList:
        # NOTE(review): 'Ipc' is deliberately excluded -- presumably because of
        # its numerically unstable / very large values; confirm with the author.
        if nm == 'Ipc':
            continue
        try:
            val = fn(mol)
        except Exception as exc:
            # Report which descriptor failed, then fall back to missingVal so a
            # single bad descriptor does not discard the whole molecule.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            print(f"{nm}: {exc!r}")
            val = missingVal
        res.append(val)
    return res

# Read the semicolon-delimited CSV file containing ChEMBL IDs and SMILES
data_df = pd.read_csv("fda_original.csv", delimiter=';')

# Descriptor column names: every RDKit descriptor except 'Ipc', in the same
# order as the values returned by getMolDescriptors
descriptor_names = [nm for nm, _ in Descriptors._descList if nm != 'Ipc']

# Accumulate results in plain lists and build the DataFrame once at the end;
# appending rows to a DataFrame one at a time re-allocates on every insert.
descriptor_rows = []
row_labels = []
chembl_id_list = []
smiles_list = []

# Iterate through ChEMBL IDs and SMILES, and compute descriptors
for c, (chembl_id, smiles) in enumerate(zip(data_df["ChEMBL ID"], data_df["Smiles"])):
    # Empty CSV cells are read as NaN (a float), which cannot be parsed as
    # SMILES; skip such rows instead of passing a non-string to RDKit.
    if not isinstance(smiles, str):
        continue

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # SMILES could not be turned into a molecule; skip the row
        continue

    descriptors = getMolDescriptors(mol)
    # Keep only complete descriptor vectors (None signals a failed calculation)
    if descriptors is None or len(descriptors) != len(descriptor_names):
        continue

    descriptor_rows.append(descriptors)
    row_labels.append(c)
    chembl_id_list.append(chembl_id)
    smiles_list.append(smiles)

# Build the descriptor table in one step; rows keep their position in the
# input file as index label
descriptors_df = pd.DataFrame(descriptor_rows, columns=descriptor_names, index=row_labels)

# Add ChEMBL ID and SMILES columns to descriptors DataFrame
descriptors_df['ChEMBL ID'] = chembl_id_list
descriptors_df['SMILES'] = smiles_list


In [3]:
# Save the descriptor table (descriptor columns plus ChEMBL ID and SMILES)
# to disk, leaving the row index out of the file
descriptors_df.to_csv(path_or_buf="FDA_features.csv", index=False)