In [2]:
import numpy as np
import pandas as pd
import os

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors

In [3]:
# Load the extracted ChEMBL data; among other columns it holds chembl_id, pchembl_value, assay_type and canonical_smiles
file_name = 'cah2_extracted_chembl_data_ours.csv'
data_path = os.path.join('../DSProject/data', file_name) # ! diff path
df = pd.read_csv(data_path)
## keep only the columns of interest
### note (original author): in every row, standard_type='Ki', standard_relation='=', data_validity_comment=NaN, potential_duplicate=0, assay_type='B'
columns_of_interest = [
    'target_chembl_id',
    'molecule_chembl_id',
    'assay_chembl_id',
    'pchembl_value',
    'canonical_smiles',
    'standard_type',
    'assay_type',
    'description',
]
df = df[columns_of_interest]

In [4]:
# Collapse duplicated measurements of the same molecule into a single row.
# All entries are Ki values, so replicate pchembl_values are averaged.
# 'pchembl_value' is selected explicitly so that the text columns still in df
# ('assay_chembl_id', 'description') stay out of the aggregation: pandas >= 2.0
# raises a TypeError when .mean() is applied to non-numeric columns, whereas
# older versions silently dropped them.
group_keys = ['target_chembl_id','molecule_chembl_id','canonical_smiles','standard_type','assay_type']
df = df.groupby(group_keys)['pchembl_value'].mean().reset_index()

# Remove molecules for which no 3D coordinates can be computed, as electroshape requires 3D coordinates
## Entry 1187; CHEMBL2105487; smiles=[S]
# df = df.drop(df.loc[df['molecule_chembl_id'] == 'CHEMBL2105487'].index, axis=0)
df

Unnamed: 0,target_chembl_id,molecule_chembl_id,canonical_smiles,standard_type,assay_type,pchembl_value
0,CHEMBL205,CHEMBL100075,CC(C)c1cc(-c2ccccc2)cc(C(C)C)[n+]1CC(=O)OCCOc1...,Ki,B,8.15
1,CHEMBL205,CHEMBL100266,CN(C)CCOC(=O)c1cccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O...,Ki,B,8.68
2,CHEMBL205,CHEMBL100329,CCN(CC)CCNC(=O)c1cccc(S(=O)(=O)Nc2nnc(S(N)(=O)...,Ki,B,8.77
3,CHEMBL205,CHEMBL100456,CCc1cc(-c2ccccc2)cc(CC)[n+]1CC(=O)Oc1ccc2nc(S(...,Ki,B,8.15
4,CHEMBL205,CHEMBL100580,Cc1cc(C)[n+](CC(=O)NNc2ccc(S(N)(=O)=O)cc2)c(C)...,Ki,B,6.50
...,...,...,...,...,...,...
4961,CHEMBL205,CHEMBL99697,CCc1cc(-c2ccccc2)cc(CC)[n+]1CC(=O)Nc1ccc(S(N)(...,Ki,B,7.82
4962,CHEMBL205,CHEMBL99736,Cc1cc(C)[n+](CC(=O)NCCC(=O)Nc2nnc(S(N)(=O)=O)s...,Ki,B,8.15
4963,CHEMBL205,CHEMBL99855,Cc1cc(-c2ccccc2)cc(C)[n+]1CC(=O)NCCC(=O)Nc1nnc...,Ki,B,8.40
4964,CHEMBL205,CHEMBL99927,COCCOC(=O)c1ccc(S(=O)(=O)Nc2nnc(S(N)(=O)=O)s2)cc1,Ki,B,8.55


In [5]:
# Compute features

In [6]:
## Compute molecular descriptors using RDKit

In [7]:
# Get rdkit molecule info – code from Fergus Boyles
# Chem.MolFromSmiles returns None for SMILES it cannot parse. Dropping those
# entries from a separate Series has no effect once that Series is assigned
# back to df: the assignment aligns on the index and re-inserts the missing
# rows as NaN. The unparsable rows are therefore removed from df itself, and
# the index is reset so it stays a gap-free 0..n-1 range.
df['RDKit_Molecule'] = df['canonical_smiles'].apply(Chem.MolFromSmiles)
df = df.dropna(subset=['RDKit_Molecule']).reset_index(drop=True)
molecules = df['RDKit_Molecule']

In [7]:
### Obtain RDKit molecule descriptors

# Helper function to compute descriptors for a single molecule – from Fergus B
def compute_descriptors(molecule):
    """Evaluate every entry of ``Descriptors.descList`` on a single molecule.

    Parameters
    ----------
    molecule
        Object passed unchanged to each descriptor function.

    Returns
    -------
    pd.Series
        One value per descriptor, labelled with the descriptor name and
        ordered as in ``Descriptors.descList``.
    """
    values = {}
    # Each descList entry is a (name, function) pair.
    for name, function in Descriptors.descList:
        values[name] = function(molecule)
    return pd.Series(values)

# Series.apply expands the pd.Series returned for each molecule into one row,
# giving a DataFrame with one column per descriptor, indexed like the input.
# Entries without a parsed molecule (None/NaN) are skipped first, because the
# descriptor functions cannot be evaluated on them; such rows are simply
# absent from `descriptors`.
descriptors = df['RDKit_Molecule'].dropna().apply(compute_descriptors)

In [1]:
## Compute fingerprint features
# Requires df['RDKit_Molecule'] to exist, so the cells above must have been
# run in this kernel first (otherwise: NameError: name 'df' is not defined).

bond_radius = 2
nBits = 2048

# Only rows with a parsed molecule can be fingerprinted; None/NaN entries
# would make the fingerprint call fail.
mols_for_fps = df['RDKit_Molecule'].dropna()

fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, bond_radius, nBits=nBits)
    for mol in mols_for_fps
]

fps_arr = [np.array(bitvec) for bitvec in fps]
# Reuse the molecules' index so every fingerprint row stays aligned with the
# df row it was computed from (one column per fingerprint bit).
fps_df = pd.DataFrame(fps_arr, index=mols_for_fps.index)

NameError: name 'df' is not defined

In [8]:
# Append the descriptor columns to the right of the ChEMBL columns; rows are matched on the index.
df_w_rdkit_desc = pd.concat(objs=[df, descriptors], axis='columns')
# df_w_rdkit_desc.to_csv('data/cah2_chembl_data_plus_rdkit_descriptors.csv', index=False)