# Classical fingerprints generation and benchmarking
Fingerprints covered: Morgan, topological torsion, MACCS keys, and atom pairs.

## 1. Checking the fingerprint calculation

In [1]:
# Put the parent folder first on the module search path so that project modules
# located there can be imported from this notebook (presumably where the
# `fingerprints` module lives - confirm against the repository layout).
import sys
sys.path.insert(0, '..')

# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Suppress ALL Python warnings for the rest of the session.
# NOTE(review): this is a blanket filter - deprecation and runtime warnings from
# pandas/numpy are hidden too; narrow it if a warning needs to be investigated.
import warnings
warnings.filterwarnings('ignore')

# Suppress RDKit log messages for every logger matching 'rdApp.*'.
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

In [3]:
# Wildcard import of the project's fingerprint helpers. The cells visible in this
# notebook use morgan_fp, topological_torsion_fp, pair_fp and macc_fp.
# NOTE(review): consider importing these names explicitly once it is confirmed that
# no other cell relies on additional names pulled in by the wildcard.
from fingerprints import *

In [4]:
# Small test molecule (SMILES for toluene) used to smoke-test each fingerprint helper.
smi = 'Cc1ccccc1'

# Morgan fingerprint of the test molecule; the recorded output is a mostly-zero vector.
morgan = morgan_fp(smi)
print(morgan)

[0 0 0 ... 0 0 0]


In [5]:
# Topological-torsion fingerprint of the same test molecule.
top_torsion = topological_torsion_fp(smi)

In [6]:
# Show the torsion fingerprint; like the Morgan one, the recorded output is mostly zeros.
print(top_torsion)

[0 0 0 ... 0 0 0]


In [7]:
# Both fingerprints should have the same length (2048 each in the recorded output).
len(morgan), len(top_torsion)

(2048, 2048)

In [8]:
# Atom-pair fingerprint of the same test molecule.
pair = pair_fp(smi)

In [9]:
# The result exposes `.shape`; the recorded output is a 1-D vector of 2048 entries.
pair.shape

(2048,)

In [10]:
# MACCS fingerprint of the same test molecule (the helper is spelled `macc_fp`).
maccs = macc_fp(smi)

In [11]:
# Display the full vector - short enough to inspect by eye; only a few bits are set.
maccs

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0])

In [12]:
# MACCS length differs from the other fingerprints: 167 entries in the recorded output.
len(maccs)

167

## 2. Calculate fingerprints for the molecules in the dataset file

In [21]:
import pandas as pd
from pathlib import Path
import seaborn as sns
from tqdm.notebook import tqdm
import pickle
import matplotlib.pyplot as plt
import os

In [14]:
# Location of the input dataset (a Papyrus-derived parquet file, per its name),
# relative to the notebook's working directory.
path_papyrus = Path("../data/2023_09_12_papyrus1k_dataset_more_params_STD_MFP_lessColumns.parquet")
# Fail fast with a readable message that shows the resolved location, so a wrong
# working directory or a missing download is obvious at a glance.
assert path_papyrus.exists(), f"Dataset file not found: {path_papyrus.resolve()}"

In [15]:
# Load the whole dataset into memory. The recorded preview shows the columns
# target_id, class_label, STD_SMILES, STD_SELFIES and split.
df = pd.read_parquet(path_papyrus)
# Preview the first rows; note the same molecule appears once per target.
df.head()

Unnamed: 0,target_id,class_label,STD_SMILES,STD_SELFIES,split
0,B2RXH2_WT,0,Oc1ccccc1C=NNC(=S)NC1CC2C=CC1C2,[O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=...,train
1,P00352_WT,0,Oc1ccccc1C=NNC(=S)NC1CC2C=CC1C2,[O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=...,train
2,P43220_WT,0,Oc1ccccc1C=NNC(=S)NC1CC2C=CC1C2,[O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=...,train
3,P51450_WT,0,Oc1ccccc1C=NNC(=S)NC1CC2C=CC1C2,[O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=...,train
4,Q03164_WT,0,Oc1ccccc1C=NNC(=S)NC1CC2C=CC1C2,[O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=...,train


In [16]:
# Rows x columns of the loaded table (881081 x 5 in the recorded run).
df.shape

(881081, 5)

Can we calculate a fingerprint for an entry of the `STD_SMILES` column?

In [17]:
# Sanity check on real data: take the SMILES of the first row and make sure the
# Morgan helper accepts it.
# `.iloc[0]` selects the first row by position, so this keeps working even if the
# frame's index does not contain the label 0 (e.g. after filtering or re-indexing).
smi = df["STD_SMILES"].iloc[0]
print(smi)
morgan = morgan_fp(smi)
print(morgan)

Oc1ccccc1C=NNC(=S)NC1CC2C=CC1C2
[0 0 0 ... 0 0 0]


In [18]:
# Number of distinct SMILES strings (489402 in the recorded run) - fewer than the
# number of rows because one molecule can be listed against several targets.
df.STD_SMILES.unique().shape

(489402,)

In [19]:
# Map each distinct SMILES string to its Morgan fingerprint, so every molecule is
# featurised once even though it occurs in several rows of the table.
d_morgan = {}

# tqdm renders a progress bar over the distinct SMILES strings.
for smi in tqdm(df.STD_SMILES.unique()):
    d_morgan[smi] = morgan_fp(smi)

  0%|          | 0/489402 [00:00<?, ?it/s]

In [22]:
# create the output folder for the pickled fingerprint dictionaries
# (exist_ok=True makes re-running this cell harmless)
os.makedirs("../out/fingerprints_dicts", exist_ok=True)

In [23]:
# Persist the Morgan fingerprint dictionary. The context manager guarantees the
# file handle is closed (and its buffer flushed) even if pickling raises, instead
# of leaving that to the garbage collector.
with open("../out/fingerprints_dicts/d_morgan.pkl", "wb") as pkl_file:
    pickle.dump(d_morgan, pkl_file)

In [26]:
# Drop the reference so the dictionary can be garbage-collected before the next
# fingerprint set is built.
del d_morgan

In [27]:
# Topological-torsion fingerprint for every distinct SMILES string in the dataset.
d_topological_torsion = {}
for smi in tqdm(df.STD_SMILES.unique()):
    d_topological_torsion[smi] = topological_torsion_fp(smi)
# Write through a context manager so the file is closed (and flushed) even if
# pickling raises.
with open("../out/fingerprints_dicts/d_topological_torsion.pkl", "wb") as pkl_file:
    pickle.dump(d_topological_torsion, pkl_file)
# Drop the reference so the dictionary can be garbage-collected before the next
# fingerprint set is built.
del d_topological_torsion

  0%|          | 0/489402 [00:00<?, ?it/s]

In [28]:
# Atom-pair fingerprint for every distinct SMILES string in the dataset.
d_pair = {}
for smi in tqdm(df.STD_SMILES.unique()):
    d_pair[smi] = pair_fp(smi)
# Write through a context manager so the file is closed (and flushed) even if
# pickling raises.
with open("../out/fingerprints_dicts/d_pair.pkl", "wb") as pkl_file:
    pickle.dump(d_pair, pkl_file)
# Drop the reference so the dictionary can be garbage-collected before the next
# fingerprint set is built.
del d_pair

  0%|          | 0/489402 [00:00<?, ?it/s]

In [29]:
# MACCS fingerprint for every distinct SMILES string in the dataset.
d_maccs = {}
for smi in tqdm(df.STD_SMILES.unique()):
    d_maccs[smi] = macc_fp(smi)
# Write through a context manager so the file is closed (and flushed) even if
# pickling raises.
with open("../out/fingerprints_dicts/d_maccs.pkl", "wb") as pkl_file:
    pickle.dump(d_maccs, pkl_file)
# Drop the reference once the pickle has been written so the dictionary can be
# garbage-collected.
del d_maccs

  0%|          | 0/489402 [00:00<?, ?it/s]