# How to use the fingerprint-related functions in this repository

# SMILES -> Fingerprint

First, it's important to import some of the modules that have been built.


In [1]:
from src.utils import path_check
from src.data.mgf_tools.mgf_get import mgf_get_smiles, mgf_get_spectra
from src.data.fingerprints_tools.fingerprint_generator import *

import warnings
# Ignore every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by the imported libraries; consider a narrower filter.
warnings.filterwarnings("ignore")

Next, point to the data to be loaded — in this case, a .mgf file.

In [2]:
# Location of the .mgf library file that the next cells load.
# NOTE(review): this is a machine-specific absolute path; anyone else running
# the notebook must edit it (or derive it from a project-relative directory).
mgf_data = r"/Users/carla/PycharmProjects/Mestrado/Transformer-Based-Models-for-Chemical-Fingerprint-Prediction/datasets/raw/cleaned_gnps_library.mgf"

# presumably validates that the path exists before loading — confirm in src.utils
path_check(mgf_data)

In [22]:
# Load the spectra from the .mgf file; only the file path is passed here.
spectra = mgf_get_spectra(mgf_data)

Once the data has been loaded, use the following function to obtain the molecules' SMILES.

In [23]:
# Extract the SMILES information from the loaded spectra. The result is
# indexed by 'smiles' and 'spectrum_id' in the cells that follow.
smiles_data = mgf_get_smiles(spectra)

Once you have the data loaded, you can generate the fingerprints for the SMILES using the following steps:

In [8]:
# Display the extracted SMILES data for a quick visual check.
smiles_data

In [24]:
# Pull out the two inputs passed to smiles_to_fingerprint below:
# the 'smiles' entries and their matching 'spectrum_id' entries.
smiles = smiles_data['smiles']
ids = smiles_data['spectrum_id']


In [10]:
# Generate the fingerprints for the SMILES. `return_df` is not passed here,
# and the returned object exposes the fingerprint matrix as `.X` (used below).
dataset = smiles_to_fingerprint(smiles_data=smiles, ids=ids)

In [14]:
# Inspect the fingerprint matrix held by the dataset.
dataset.X

In [21]:
import numpy as np

# Tally the fingerprint matrix: total number of entries, how many are
# non-zero, and (by difference) how many are zero.
n_total = dataset.X.size
n_ones = np.count_nonzero(dataset.X)
n_zeros = n_total - n_ones
print(n_ones, n_total)

# Ratio of zero entries to non-zero entries.
# NOTE(review): presumably intended as a positive-class weight for an
# imbalanced loss — confirm. The division fails if the matrix has no
# non-zero entries.
pos_weight = n_zeros / n_ones
print(pos_weight)

In this way, you have obtained Morgan fingerprints generated with DeepMol.

You can check the generated fingerprints (Smiles dataset by DeepMol) using the following code:

In [9]:
# Show the fingerprint stored in the first row of the matrix.
dataset.X[0]

In [10]:
# Count how often each distinct value occurs in the first fingerprint.
np.unique(dataset.X[0], return_counts=True)

In [24]:
# Print the distinct values and their counts for the first five fingerprints,
# numbered from 1. Slicing with [:5] (instead of indexing 0..4) avoids an
# IndexError when the dataset holds fewer than five rows.
# The arrow in the message replaces a mis-encoded "→" (UTF-8 read as cp1252).
for position, fingerprint in enumerate(dataset.X[:5], start=1):
    print(f"Fingerprint {position} → {np.unique(fingerprint, return_counts=True)}")


You can check the generated fingerprints (as a DataFrame) using the following code:


In [12]:
# Same conversion as before, but with return_df=True to request the result
# as a DataFrame (presumably a pandas DataFrame — confirm).
dataframe = smiles_to_fingerprint(smiles_data=smiles, ids=ids, return_df=True)

In [21]:
# Display the fingerprint table.
dataframe

# .mgf -> Fingerprint

In [1]:
# NOTE(review): wildcard import — `path_check` and `mgf_deconvoluter`, used in
# the cells below, are not imported by name in this section, so they
# presumably come from here. Prefer explicit imports.
from src.utils import *
# NOTE(review): the first section of this notebook imports the same module as
# `src.data.mgf_tools.mgf_get`; confirm which package path is current.
from src.mgf_tools.mgf_get import mgf_get_spectra
import numpy as np

In [2]:
# Location of the .mgf library file that the next cell loads.
# NOTE(review): this is a machine-specific absolute path; anyone else running
# the notebook must edit it (or derive it from a project-relative directory).
mgf_data = r"/Users/carla/PycharmProjects/Mestrado/Transformer-Based-Models-for-Chemical-Fingerprint-Prediction/datasets/raw/cleaned_gnps_library.mgf"

# presumably validates that the path exists before loading — confirm in src.utils
path_check(mgf_data)

In [3]:
# Load spectra with num_spectra=10 — presumably caps the load at 10 spectra
# to keep the example small; confirm against mgf_get_spectra.
data = mgf_get_spectra(mgf_data=mgf_data, num_spectra=10)

In [4]:
# Grid of values starting at 50 and advancing in steps of 0.1 up to
# (nominally excluding) 2000; handed to the deconvoluter as `mz_vocabs`.
mz_vocabs = np.arange(50, 2000, 0.1)

# Run the deconvolution on the loaded spectra with logging and plotting on.
mgf_deconvoluter(
    mgf_data=data,
    mz_vocabs=mz_vocabs,
    min_num_peaks=2,
    max_num_peaks=100,
    noise_rmv_threshold=0.01,
    mass_error=0.01,
    log=True,
    plot=True,
)