# Morgan Fingerprints

importing modules and data

In [1]:
!source myenv/bin/activate

In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem, rdMolDescriptors, Draw, PandasTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit.ML.Cluster import Butina

In [4]:
src = pd.read_csv("../data/merged.csv")
src = src.drop_duplicates(subset='Molecule ChEMBL ID')
selected_cols = ['Molecule ChEMBL ID', 'Smiles', 'Standard Type', 'Standard Relation', 'Standard Units']
filtered_df = src[selected_cols]
df_ic50ki = filtered_df[(filtered_df['Standard Type']=='IC50') | (filtered_df['Standard Type']=='Ki')]
df_ic50ki = df_ic50ki[df_ic50ki['Standard Units'] == 'nM' ]
df_ic50ki = df_ic50ki.dropna(subset=selected_cols)

uni  = pd.read_csv('../new_csv_out/uniquestandarvalueandchem.csv')

merg = pd.merge(df_ic50ki, uni, on = 'Molecule ChEMBL ID', how = 'right')
# merg = merg[~merg.applymap(lambda x: x == 'NaN').any(axis=1)]
merg = merg.dropna()
merg
merg.to_csv('../new_csv_out/newmerge.csv', index = False)

In [6]:
# src = pd.read_csv("../output/threshold_ic50_noisy.csv")
# smiles = pd.DataFrame(src['Smiles'])

# # make mol object from smiles column and add it to the dataframe
# PandasTools.AddMoleculeColumnToFrame(smiles,'Smiles', 'mol')
# smiles

# d 

src = pd.read_csv("../new_csv_out/newmerge.csv")
smiles = pd.DataFrame(src['Smiles'])

# make mol object from smiles column and add it to the dataframe
# PandasTools.AddMoleculeColumnToFrame(smiles,'Smiles', 'mol')
# smiles
src.shape

(21056, 6)

defining function to generate fingerprints

In [7]:
def generate_fpts(data):
    # Dictionary to store fingerprints
    dicto = {}

    # Add RDKit molecule column to the DataFrame using PandasTools
    PandasTools.AddMoleculeColumnToFrame(data, 'Smiles', 'mol')
    
    # Iterate over the SMILES and molecule columns
    for smile, mol in tqdm(zip(data['Smiles'], data['mol'])):
        if mol is None:
            print(f"Invalid SMILES: {smile}")
            continue

        mfpt = AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=2048)

        fingerprint = np.array(mfpt)
        dicto[smile] = fingerprint
    fpt_df = pd.DataFrame.from_dict(dicto, orient='index')

    # Rename the columns with numeric values (bit indices)
    fpt_df.columns = [f'Bit_{i}' for i in range(fpt_df.shape[1])]

    return fpt_df
# Generate fingerprints for each SMILES string
fingerprints_dict = generate_fpts(smiles)

# Output the resulting dictionary
print(fingerprints_dict)

21056it [01:13, 286.55it/s]


                                                    Bit_0  Bit_1  Bit_2  \
CCOC(=O)c1cc(-c2ccc(NC(=O)C(F)(F)F)cc2)[nH]n1           0      0      0   
CCOC(=O)c1cc(-c2ccc(NC(=O)c3ccc(C)cc3)cc2)[nH]n1        0      0      0   
NC(N)=NS(=O)(=O)c1ccc(NC(=O)COc2ccc(Cl)cc2)cc1          0      0      0   
CCCOC(=O)C1CCN(CC(=O)Nc2ccc(S(=O)(=O)N=C(N)N)cc...      0      0      0   
NC(N)=NS(=O)(=O)c1ccc(NC(=O)COc2ccccc2)cc1              0      0      0   
...                                                   ...    ...    ...   
NCCCCCCOC[C@@H]1O[C@@H](OCCc2cc3ccccc3[nH]2)[C@...      0      0      0   
C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@H](...      0      1      0   
C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)C2C(=O)[C@@H...      0      1      0   
C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@H](...      0      1      0   
C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@H](...      0      1      0   

                                                    Bit_3  Bit_4  Bit_5  \
CCOC(=O)c1cc(-c2ccc(NC(=

add fingerprints back with smiles 

In [8]:
x = pd.DataFrame(fingerprints_dict)
x= x.reset_index()
x = x.rename(columns =  {'index': 'Smiles'})

In [9]:
x

Unnamed: 0,Smiles,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,...,Bit_2038,Bit_2039,Bit_2040,Bit_2041,Bit_2042,Bit_2043,Bit_2044,Bit_2045,Bit_2046,Bit_2047
0,CCOC(=O)c1cc(-c2ccc(NC(=O)C(F)(F)F)cc2)[nH]n1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CCOC(=O)c1cc(-c2ccc(NC(=O)c3ccc(C)cc3)cc2)[nH]n1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,NC(N)=NS(=O)(=O)c1ccc(NC(=O)COc2ccc(Cl)cc2)cc1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCCOC(=O)C1CCN(CC(=O)Nc2ccc(S(=O)(=O)N=C(N)N)c...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,NC(N)=NS(=O)(=O)c1ccc(NC(=O)COc2ccccc2)cc1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21051,NCCCCCCOC[C@@H]1O[C@@H](OCCc2cc3ccccc3[nH]2)[C...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
21052,C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@H]...,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21053,C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)C2C(=O)[C@@...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21054,C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@H]...,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
src

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Relation,Standard Units,Standard Value
0,CHEMBL1834549,CCOC(=O)c1cc(-c2ccc(NC(=O)C(F)(F)F)cc2)[nH]n1,IC50,'=',nM,7900000.00
1,CHEMBL1834506,CCOC(=O)c1cc(-c2ccc(NC(=O)c3ccc(C)cc3)cc2)[nH]n1,IC50,'=',nM,34600000.00
2,CHEMBL1911249,NC(N)=NS(=O)(=O)c1ccc(NC(=O)COc2ccc(Cl)cc2)cc1,IC50,'=',nM,9350000.00
3,CHEMBL1911241,CCCOC(=O)C1CCN(CC(=O)Nc2ccc(S(=O)(=O)N=C(N)N)c...,IC50,'=',nM,3310000.00
4,CHEMBL1911245,NC(N)=NS(=O)(=O)c1ccc(NC(=O)COc2ccccc2)cc1,IC50,'=',nM,2300000.00
...,...,...,...,...,...,...
21051,CHEMBL2303623,NCCCCCCOC[C@@H]1O[C@@H](OCCc2cc3ccccc3[nH]2)[C...,IC50,'=',nM,11000.00
21052,CHEMBL376484,C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@H]...,IC50,'=',nM,0.50
21053,CHEMBL1790896,C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)C2C(=O)[C@@...,IC50,'=',nM,15000.00
21054,CHEMBL376485,C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@H]...,IC50,'=',nM,0.35


In [11]:

final = pd.merge(src, x, how='left', on='Smiles')
final

Unnamed: 0,Molecule ChEMBL ID,Smiles,Standard Type,Standard Relation,Standard Units,Standard Value,Bit_0,Bit_1,Bit_2,Bit_3,...,Bit_2038,Bit_2039,Bit_2040,Bit_2041,Bit_2042,Bit_2043,Bit_2044,Bit_2045,Bit_2046,Bit_2047
0,CHEMBL1834549,CCOC(=O)c1cc(-c2ccc(NC(=O)C(F)(F)F)cc2)[nH]n1,IC50,'=',nM,7900000.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL1834506,CCOC(=O)c1cc(-c2ccc(NC(=O)c3ccc(C)cc3)cc2)[nH]n1,IC50,'=',nM,34600000.00,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,CHEMBL1911249,NC(N)=NS(=O)(=O)c1ccc(NC(=O)COc2ccc(Cl)cc2)cc1,IC50,'=',nM,9350000.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL1911241,CCCOC(=O)C1CCN(CC(=O)Nc2ccc(S(=O)(=O)N=C(N)N)c...,IC50,'=',nM,3310000.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL1911245,NC(N)=NS(=O)(=O)c1ccc(NC(=O)COc2ccccc2)cc1,IC50,'=',nM,2300000.00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21051,CHEMBL2303623,NCCCCCCOC[C@@H]1O[C@@H](OCCc2cc3ccccc3[nH]2)[C...,IC50,'=',nM,11000.00,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
21052,CHEMBL376484,C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@H]...,IC50,'=',nM,0.50,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
21053,CHEMBL1790896,C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)C2C(=O)[C@@...,IC50,'=',nM,15000.00,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
21054,CHEMBL376485,C[C@@H](O)[C@@H]1NC(=O)[C@H](CCCCN)NC(=O)[C@H]...,IC50,'=',nM,0.35,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
final.to_csv('../new_csv_out/morgan_mfpts.csv', index = False)

In [11]:
final.shape

(21055, 2055)

In [36]:
pd.DataFrame(mfpts)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15718,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15719,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15720,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15721,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
final.to_csv("../output/ic50_fingerprint.csv", index=False)

#  **Concat Std. Vals**

In [16]:
final = pd.concat([smiles, src['Standard Value'], pd.DataFrame(mfpts)], axis=1)
final

Unnamed: 0,Smiles,mol,Standard Value,0,1,2,3,4,5,6,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,COC(=O)c1cccc(OC(=O)c2ccc(NC(=N)N)cc2)c1.Cl,<rdkit.Chem.rdchem.Mol object at 0x7a6d0f208f20>,5.4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CCOc1ccc(NC(=O)c2cc(-c3ccc(NC(=N)N)cc3)on2)cc1.Cl,<rdkit.Chem.rdchem.Mol object at 0x7a6d0f208f90>,6400.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CCCCCCc1ccc(OC(=O)c2ccc(NC(=N)N)cc2)cc1O,<rdkit.Chem.rdchem.Mol object at 0x7a6d0f209000>,99.7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Cc1ccc(C(C)C)c(OC(=O)c2ccc(NC(=N)N)cc2)c1.Cc1c...,<rdkit.Chem.rdchem.Mol object at 0x7a6d0f209070>,300.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,COc1ccc(OC(=O)c2ccc(NC(=N)N)cc2)cc1,<rdkit.Chem.rdchem.Mol object at 0x7a6d0f2090e0>,16.3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15718,CCN(C(=O)C1CCC=C1C(=O)NCc1ccc(C(=N)N)cc1)c1ccc...,<rdkit.Chem.rdchem.Mol object at 0x7a6d0ffc2880>,93000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15719,[2H]C(=C=C1C(=O)N2C(C(=O)OC(c3ccccc3)c3ccccc3)...,<rdkit.Chem.rdchem.Mol object at 0x7a6d0ffc28f0>,13500000.0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15720,CC[C@H](NC(=O)[C@H](CCCN=C(N)N)NC(=O)[C@@H](NC...,<rdkit.Chem.rdchem.Mol object at 0x7a6d0ffc2960>,26000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15721,CCN(c1nc2ccccc2c(=O)o1)S(=O)(=O)c1ccc(Cl)cc1,<rdkit.Chem.rdchem.Mol object at 0x7a6d0ffc29d0>,30000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
final.to_csv('../output/ic50_fingerprints_w_stdvals.csv')