# RDKit Fingerprints

Import the required modules and load the data.

In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
from rdkit.Chem import AllChem, rdMolDescriptors, Draw, PandasTools

In [3]:
# Build the RDKit fingerprint generator once, with its default settings;
# generate_fpts() reuses this single instance for every molecule.
fpgen = AllChem.GetRDKitFPGenerator()

In [4]:
src = pd.read_csv("../output/backup_unique.csv")
# Work on a one-column copy so the molecule objects are not attached to `src`.
smiles = src[['Smiles']].copy()
# Parse every SMILES string into an RDKit Mol object, stored in a new 'mol' column.
PandasTools.AddMoleculeColumnToFrame(smiles, 'Smiles', 'mol')
# Fail fast here rather than part-way through the slow fingerprint loop:
# a missing Mol cannot be fingerprinted.
n_unparsed = int(smiles['mol'].isna().sum())
assert n_unparsed == 0, f"{n_unparsed} SMILES could not be parsed into molecules"

Define a function that generates the fingerprints.

In [5]:
def generate_fpts(data, generator=None):
    """Compute one fingerprint per molecule and stack them into an array.

    Parameters
    ----------
    data : iterable of rdkit.Chem.Mol
        Molecules to fingerprint, e.g. the ``mol`` column created by
        ``PandasTools.AddMoleculeColumnToFrame``.
    generator : object with a ``GetFingerprint(mol)`` method, optional
        Fingerprint generator to use. When omitted, the notebook-level
        ``fpgen`` (an RDKit fingerprint generator, not a Morgan one) is used.

    Returns
    -------
    numpy.ndarray
        The list of fingerprints converted with ``np.array``; one row per
        molecule, in the same order as ``data``.

    Raises
    ------
    ValueError
        If an entry of ``data`` is ``None`` (a molecule that could not be
        built), reported with its position in ``data``.
    """
    if generator is None:
        generator = fpgen

    fingerprints = []

    # enumerate() wraps tqdm (not the other way round) so the progress bar
    # can still read the total length from `data`.
    for position, mol in enumerate(tqdm(data)):
        if mol is None:
            raise ValueError(
                f"No molecule at position {position}; cannot compute a fingerprint"
            )
        fingerprints.append(generator.GetFingerprint(mol))

    return np.array(fingerprints)

In [6]:
# Fingerprint every molecule in the 'mol' column; `mfpts` is the array returned by generate_fpts().
mfpts = generate_fpts(smiles['mol'])

100%|██████████| 23485/23485 [01:33<00:00, 249.93it/s]


Join the fingerprints back onto the source table, alongside the SMILES strings.

In [7]:
# Place the fingerprint columns to the right of the original source columns.
final = pd.concat(
    [src, pd.DataFrame(mfpts)],
    axis="columns",
)
# Keep everything except the first column of the combined frame.
out = final.loc[:, final.columns[1:]]

In [8]:
# Preview the assembled table: identifier/label columns followed by the fingerprint columns.
out

Unnamed: 0,Molecule ChEMBL ID,Smiles,a/i,0,1,2,3,4,5,6,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,CHEMBL99951,N=C(N)c1ccc2nc(Cc3nc4ccc(C(=N)N)cc4[nH]3)[nH]c2c1,1.0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
1,CHEMBL9995,CC(C)(C)C(=O)OCc1cc(=O)c(OC(=O)C(C)(C)C)co1,-1.0,0,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
2,CHEMBL99939,CC(Cc1c[nH]c2ccccc12)(NC(=O)OC1[C@H]2C[C@@H]3C...,1.0,1,1,0,0,0,1,1,...,1,0,0,1,0,1,1,0,0,1
3,CHEMBL9991,CC(C)(C)C(=O)Oc1ccc(C(=O)c2ccccc2)cc1CCC(=O)O,1.0,0,0,0,0,0,1,0,...,1,0,0,0,0,1,0,1,0,1
4,CHEMBL9990,CC(C)(C)C(=O)Oc1cc(C(=O)OCc2cc(=O)c(OC(=O)C(C)...,1.0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23480,CHEMBL100059,C/C(=C(\F)C(=O)Nc1ccc(-c2ccccc2S(N)(=O)=O)cc1)...,1.0,1,1,1,1,1,1,0,...,1,0,0,0,1,1,0,0,1,1
23481,CHEMBL100049,Cc1cnc(NCC(F)(F)c2ccccc2)c(=O)n1CC(=O)NCc1ccc2...,1.0,1,0,1,1,1,1,0,...,1,0,1,1,0,0,1,1,1,1
23482,CHEMBL10003,O=c1oc(NCCc2ccccc2I)nc2ccccc12,-1.0,1,1,0,0,0,1,0,...,0,1,1,1,1,0,0,0,0,1
23483,CHEMBL100013,Nc1nccc2ccc(C(=O)N3Cc4ccccc4C[C@H]3C(=O)Nc3ccc...,1.0,1,1,1,1,1,1,1,...,1,1,0,1,1,1,0,0,0,1


In [11]:
# Write the fingerprint table to disk. The target folder is created first so
# the cell does not fail when the directory is missing (e.g. a fresh checkout).
output_path = Path("../rdkit_fgprnt/rdkit_fgprnt.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
# The row index is written as well (pandas default), so the file starts with
# an unnamed index column.
out.to_csv(output_path)