In [15]:
from collections import defaultdict
import pandas as pd
import numpy as np

from rdkit.Chem import AllChem
from rdkit import DataStructs

In [5]:
# Relative path to the ion-mobility CSV; the frame read from it provides the
# 'SMILES' column used by the fingerprint cell below.
file_path = '../data/ion_mobility_paper/ion_mobility_paper_data_w_inchikey.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

The Wei et al. (2019) paper, "Rapid Prediction of Electron−Ionization Mass Spectrometry Using Neural Networks", uses Extended Circular Fingerprints as the input to their data-generation model. They describe Extended Circular Fingerprints as similar to [Extended-Connectivity Fingerprints](https://pubs.acs.org/doi/10.1021/ci100050t), "in that they record molecular subgraphs made up from local neighborhoods around each atom node in the molecule, but differ in that they count the occurrences for each subgroup. This information is then hashed into a vector representation. The difference is that additive fingerprints record the frequency that each bit is set, rather than just the presence." In what follows, Extended Circular Fingerprints are referred to as additive Extended-Connectivity Fingerprints (additive ECFPs).

The code below uses Wei et al.'s [GitHub codebase](https://github.com/brain-research/deep-molecular-massspec/tree/main) and [this blog post](https://www.blopig.com/blog/2022/11/how-to-turn-a-smiles-string-into-an-extended-connectivity-fingerprint-using-rdkit/) as guides for computing Extended Circular Fingerprints with RDKit, with Wei et al.'s parameters: a radius of 2 and a fingerprint length of 4096.

In [20]:
# Fingerprint parameters taken from Wei et al. (see the markdown cell above).
MORGAN_RADIUS = 2
FINGERPRINT_LENGTH = 4096

smiles_list = data['SMILES']

# Maps each SMILES string to its additive (count-based) ECFP as a 1-D numpy array.
# A plain dict is used: the previous `defaultdict()` had no default factory, so it
# already raised KeyError on missing keys exactly like a dict.
additive_ecfp_dict = {}

# SMILES strings RDKit could not parse; collected instead of aborting the loop so
# they can be inspected after the cell has run.
unparsable_smiles = []

# Missing values are dropped (MolFromSmiles only accepts strings) and each distinct
# SMILES is processed once; `unique()` keeps first-appearance order, so the dict
# keys come out in the same order as before.
for smiles in smiles_list.dropna().unique():
    mol = AllChem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals a parse failure by returning None rather than raising.
        unparsable_smiles.append(smiles)
        continue

    # Hashed Morgan fingerprint with counts: each position holds how many times
    # a substructure hashed to it, not just whether it is present.
    additive_ecfp = AllChem.GetHashedMorganFingerprint(
        mol, radius=MORGAN_RADIUS, nBits=FINGERPRINT_LENGTH
    )

    # Destination buffer sized to the fingerprint length; ConvertToNumpyArray
    # writes the counts into it in place.
    additive_ecfp_arr = np.zeros(FINGERPRINT_LENGTH)
    DataStructs.ConvertToNumpyArray(additive_ecfp, additive_ecfp_arr)
    additive_ecfp_dict[smiles] = additive_ecfp_arr

In [29]:
# Preview the last ten rows of the loaded frame.
data.tail(n=10)

Unnamed: 0,Compound,m/z,SV 1500,SV 2000,SV 2500,SV 3000,SV 3250,SV 3500,SV 3750,SV 4000,Boltzmann-weighted CCS,InChIKey,SMILES
358,Midazolam,326.1,0.46,1.0,1.98,3.53,4.5,5.74,7.15,8.83,172.96,DDLIGBOFAVUZHB-UHFFFAOYSA-N,CC1=NC=C2N1C3=C(C=C(C=C3)Cl)C(=NC2)C4=CC=CC=C4F
359,Norcodeine,286.1,0.32,0.75,1.47,2.52,3.24,4.15,5.26,6.52,166.2,HKOIXWVRNLGFOR-KOFBORESSA-N,COC1=C2C3=C(CC4C5C3(CCN4)C(O2)C(C=C5)O)C=C1
360,Prazosin,383.8,0.48,1.18,2.28,3.84,4.82,6.02,7.38,8.89,197.63,IENZQIKPVFGBNW-UHFFFAOYSA-N,COC1=C(C=C2C(=C1)C(=NC(=N2)N3CCN(CC3)C(=O)C4=C...
361,Prednisolone,361.1,0.42,0.93,1.79,3.0,3.69,4.78,5.82,7.08,177.84,OIGNJSKKLXVSLS-VWUMJDOOSA-N,CC12CC(C3C(C1CCC2(C(=O)CO)O)CCC4=CC(=O)C=CC34C)O
362,Safranin O,315.1,0.31,0.86,1.82,3.24,4.14,5.31,6.71,8.33,176.27,OARRHUQTFTUEOS-UHFFFAOYSA-N,CC1=CC2=C(C=C1N)[N+](=C3C=C(C(=CC3=N2)C)N)C4=C...
363,Sotalol,273.1,0.45,1.05,2.08,3.64,4.7,6.01,7.58,9.38,162.81,ZBMZVLHSJCTVON-UHFFFAOYSA-N,CC(C)NCC(C1=CC=C(C=C1)NS(=O)(=O)C)O
364,Tamoxifen,372.1,0.59,1.52,2.99,5.13,6.52,8.06,9.85,11.98,200.25,NKANXQFJJICGDU-QPLCGJKRSA-N,CCC(=C(C1=CC=CC=C1)C2=CC=C(C=C2)OCCN(C)C)C3=CC...
365,Taurocholic Acid,516.3,0.85,1.82,3.42,5.76,7.34,9.24,11.55,14.18,195.22,WBWWGRHZICKQGZ-HZAMXZRMSA-N,CC(CCC(=O)NCCS(=O)(=O)O)C1CCC2C1(C(CC3C2C(CC4C...
366,Terfenadine,472.3,0.75,1.53,2.77,4.49,5.59,6.84,8.35,9.99,231.07,GUGOEEXESWIERI-UHFFFAOYSA-N,CC(C)(C)C1=CC=C(C=C1)C(CCCN2CCC(CC2)C(C3=CC=CC...
367,Urapidil,388.2,0.65,1.17,2.06,3.51,4.47,5.54,6.85,8.43,184.83,ICMGLRUYEQNHPF-UHFFFAOYSA-N,CN1C(=CC(=O)N(C1=O)C)NCCCN2CCN(CC2)C3=CC=CC=C3OC


In [25]:
# First SMILES string by position. `smiles_list[0]` on a Series is a label lookup
# and only works while the index happens to contain the label 0.
smiles_list.iloc[0]

'CC1=NC2=C(C=CC(=C2C=C1)Cl)O'