In [6]:
import os
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors, MACCSkeys, rdFingerprintGenerator
import pandas as pd
import numpy as np

In [7]:
# Root of the curated data tree. The default is the original machine-specific
# location; set the DISSERTATION_CURATED_DIR environment variable to run the
# notebook on another machine without editing this cell.
curated_data_directory = os.environ.get('DISSERTATION_CURATED_DIR', '/Users/Avi/Dissertation/Data/Curated')

# Input: curated + preprocessed CSVs, one per ChEMBL target.
preprocessed_data_directory = os.path.join(curated_data_directory, 'Preprocessed')

# Output: the same tables with fingerprint columns added; created on demand.
fingerprinted_data_directory = os.path.join(curated_data_directory, 'Fingerprinted')
os.makedirs(fingerprinted_data_directory, exist_ok=True)

In [8]:
def _read_preprocessed(chembl_id):
    """Read the curated, preprocessed CSV of one ChEMBL target into a DataFrame."""
    csv_name = f'Target_{chembl_id}_Curated_Preprocessed.csv'
    csv_path = os.path.join(preprocessed_data_directory, csv_name)
    return pd.read_csv(csv_path)


# One DataFrame per target, loaded from the preprocessed data directory.
target_CHEMBL4078 = _read_preprocessed('CHEMBL4078')
target_CHEMBL279 = _read_preprocessed('CHEMBL279')
target_CHEMBL5763 = _read_preprocessed('CHEMBL5763')
target_CHEMBL240 = _read_preprocessed('CHEMBL240')
target_CHEMBL4005 = _read_preprocessed('CHEMBL4005')

### Defining Functions to Generate Fingerprints

In [9]:
def generate_ecfp(smiles, radius=2, n_bits=1024):
    """Return the Morgan (ECFP-style) bit fingerprint of a SMILES string as text.

    Parameters
    ----------
    smiles : str
        SMILES string to fingerprint.
    radius : int, optional
        Morgan radius. Defaults to 2, the previously hard-coded value.
    n_bits : int, optional
        Size of the bit vector. Defaults to 1024, the previously hard-coded value.

    Returns
    -------
    str or None
        The fingerprint converted with ``DataStructs.BitVectToText``, or None
        when ``smiles`` is not a string (e.g. NaN from a blank CSV cell) or
        RDKit cannot parse it.
    """
    # pandas represents missing cells as float NaN; MolFromSmiles raises on
    # non-string input instead of returning None, so reject it up front.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # Use the generator API (as generate_atom_pair does) instead of the
    # deprecated AllChem.GetMorganFingerprintAsBitVect, which logs a
    # deprecation warning on every call in recent RDKit releases.
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    fp = generator.GetFingerprint(mol)
    return DataStructs.BitVectToText(fp)

def generate_maccs_keys(smiles):
    """Return the MACCS keys fingerprint of a SMILES string as text.

    Parameters
    ----------
    smiles : str
        SMILES string to fingerprint.

    Returns
    -------
    str or None
        The fingerprint converted with ``DataStructs.BitVectToText``, or None
        when ``smiles`` is not a string (e.g. NaN from a blank CSV cell) or
        RDKit cannot parse it.
    """
    # pandas represents missing cells as float NaN; MolFromSmiles raises on
    # non-string input instead of returning None, so reject it up front.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp = MACCSkeys.GenMACCSKeys(mol)
    return DataStructs.BitVectToText(fp)

def generate_atom_pair(smiles, min_distance, max_distance, fp_size=1024):
    """Return an atom-pair bit fingerprint of a SMILES string as text.

    Parameters
    ----------
    smiles : str
        SMILES string to fingerprint.
    min_distance : int
        Passed to the atom-pair generator as ``minDistance``.
    max_distance : int
        Passed to the atom-pair generator as ``maxDistance``.
    fp_size : int, optional
        Size of the bit vector. Defaults to 1024, the previously hard-coded value.

    Returns
    -------
    str or None
        The fingerprint converted with ``DataStructs.BitVectToText``, or None
        when ``smiles`` is not a string (e.g. NaN from a blank CSV cell) or
        RDKit cannot parse it.
    """
    # pandas represents missing cells as float NaN; MolFromSmiles raises on
    # non-string input instead of returning None, so reject it up front.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    generator = rdFingerprintGenerator.GetAtomPairGenerator(
        minDistance=min_distance, maxDistance=max_distance, fpSize=fp_size
    )
    fp = generator.GetFingerprint(mol)
    return DataStructs.BitVectToText(fp)

def combine_ap2_ap3(smiles):
    """Return the distance-2 and distance-3 atom-pair fingerprints concatenated.

    Parameters
    ----------
    smiles : str
        SMILES string to fingerprint.

    Returns
    -------
    str or None
        The text of ``generate_atom_pair(smiles, 2, 2)`` followed by the text
        of ``generate_atom_pair(smiles, 3, 3)``, or None when ``smiles`` is not
        a string or either fingerprint could not be generated.
    """
    # Missing values (None / float NaN) cannot be fingerprinted; treat them
    # like any other invalid input instead of letting RDKit raise.
    if not isinstance(smiles, str):
        return None
    ap2 = generate_atom_pair(smiles, 2, 2)
    if ap2 is None:
        # No point generating the second fingerprint once the first has failed.
        return None
    ap3 = generate_atom_pair(smiles, 3, 3)
    if ap3 is None:
        return None
    return ap2 + ap3

### Generating and Saving Fingerprinted Data

In [10]:
# Add the fingerprint columns to each target frame (in place) and write the
# result to the fingerprinted data directory under the paired file name.
for target, name in [(target_CHEMBL4078, 'Target_CHEMBL4078_Curated_Fingerprinted.csv'),
                     (target_CHEMBL279, 'Target_CHEMBL279_Curated_Fingerprinted.csv'),
                     (target_CHEMBL5763, 'Target_CHEMBL5763_Curated_Fingerprinted.csv'),
                     (target_CHEMBL240, 'Target_CHEMBL240_Curated_Fingerprinted.csv'),
                     (target_CHEMBL4005, 'Target_CHEMBL4005_Curated_Fingerprinted.csv')]:

    smiles_column = target['canonical_smiles']

    target['ECFP'] = smiles_column.apply(generate_ecfp)
    target['MACCS Keys'] = smiles_column.apply(generate_maccs_keys)
    target['AP2'] = smiles_column.apply(lambda x: generate_atom_pair(x, 2, 2))
    target['AP3'] = smiles_column.apply(lambda x: generate_atom_pair(x, 3, 3))

    # Concatenate the AP2 and AP3 strings already computed above rather than
    # re-parsing every SMILES and regenerating both fingerprints. Rows where
    # either fingerprint is missing get None, matching combine_ap2_ap3.
    target['AP2+AP3'] = [
        ap2 + ap3 if isinstance(ap2, str) and isinstance(ap3, str) else None
        for ap2, ap3 in zip(target['AP2'], target['AP3'])
    ]

    target.to_csv(os.path.join(fingerprinted_data_directory, name), index=False)

