# Compute Features

This notebook uses RDKit to compute 2D molecular [descriptors](https://www.rdkit.org/docs/GettingStartedInPython.html#list-of-available-descriptors) for each of our sets of ligands, starting from the SMILES strings we obtained from ChEMBL. The descriptors for each target are saved as a CSV file in `./features/`.

In [9]:
import json
import os

import pandas as pd

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors

# Helper that evaluates every available RDKit descriptor for one molecule
def compute_descriptors(molecule):
    """Evaluate each entry of ``Descriptors.descList`` on a single molecule.

    Parameters
    ----------
    molecule
        Object handed unchanged to every descriptor function. In this
        notebook it is the result of ``Chem.MolFromSmiles``.

    Returns
    -------
    pandas.Series
        One value per descriptor, labelled by descriptor name and kept in
        the order in which ``Descriptors.descList`` lists them.
    """
    values = {}
    # Each descList entry is a (name, function) pair
    for name, function in Descriptors.descList:
        values[name] = function(molecule)
    return pd.Series(values)

# Read the targets that have associated ChEMBL IDs from the JSON file
with open('dude_target_chembl_ids.json') as id_file:
    target_chembl_ids = json.loads(id_file.read())


In [12]:
# Make sure the output directory exists before writing any results:
# DataFrame.to_csv does not create missing parent directories.
os.makedirs('./features', exist_ok=True)

for target in target_chembl_ids:
    # Load the extracted ChEMBL data for this target; the first CSV column becomes the index
    data = pd.read_csv(f'./chembl_data/{target}_extracted_chembl_data.csv', index_col=0)

    # For simplicity, drop any molecules with multiple measurements,
    # i.e. every row whose index label occurs more than once.
    # Could explore averaging experimental data, or prioritising e.g. Ki over Kd/IC50
    data = data.loc[~data.index.duplicated(keep=False)]

    # Skip rows with no SMILES at all: a missing value is read as NaN, which is
    # not a string and would make the SMILES parser raise instead of being skipped
    smiles = data['canonical_smiles'].dropna()

    # Create 'Molecule' objects from SMILES strings, dropping anything that RDKit can't sanitise
    molecules = smiles.apply(Chem.MolFromSmiles).dropna()

    # Compute descriptors: one row per molecule, one column per descriptor
    descriptors = molecules.apply(compute_descriptors)

    # Save named descriptors as a csv file
    descriptors.to_csv(f'./features/{target}_ligands_descriptors.csv')