# Real vs Pred fingerprints

This notebook compares the fingerprints predicted by the model with the actual fingerprints obtained from SMILES using RDKit.

We start by importing the necessary packages and modules, as well as defining some important variables such as the seed and the dataset.

In [None]:
from src.models.Transformer import Transformer
from src.utils import calculate_tanimoto
from src.data.data_loader import *
from src.data.mgf_tools.mgf_get import *
from rdkit import Chem
from rdkit.Chem import Draw
from IPython.display import display
import numpy as np
import json
# `Path` and `pd` are used by later cells; import them explicitly (after the
# wildcard imports, so these bindings take precedence) instead of relying on
# whatever the wildcard imports happen to export.
from pathlib import Path
import pandas as pd

# Placeholder: seed of the run to analyse. It names the artifacts sub-directory
# and is passed to both the data loader and the model loader in later cells.
seed = ''
# Placeholder: path to the MGF file. It is passed to the data loader and is
# read again later to look up the SMILES of each spectrum.
mgf_path = ''

This notebook is based on the assumption that a checkpoint of the trained model already exists, as well as its entire pipeline.

Therefore, let us look at some of the values related to the dataset and its respective processing.

In [None]:
artifacts_dir = ''

# The artifacts of a run are stored in a sub-directory named after its seed.
artifacts_dir = Path(artifacts_dir).joinpath(str(seed))

# Load the preprocessing settings that were saved alongside the run.
with (artifacts_dir / 'pipeline_config.json').open('r') as f:
    pipeline_config = json.load(f)

# Pull out the individual settings used by the following cells.
max_num_peaks = pipeline_config['max_num_peaks']
max_seq_len = pipeline_config['max_seq_len']
mz_vocabs = pipeline_config['mz_vocabs']
vocab_size = pipeline_config['vocab_size']

Next, we create a data loader using the values above. The seed must be the same one used when the dataset was split, so that the same split is reproduced.

In [None]:
# `batch_size` was not defined anywhere earlier in the notebook, so on a fresh
# kernel (Restart & Run All) the call below failed with NameError.
# NOTE(review): 32 is a placeholder value — adjust it to the available memory,
# and confirm whether the project expects a specific batch size here.
batch_size = 32

# Build the data loaders with the same preprocessing settings as the trained run.
loaders = data_loader(
    seed=seed,
    batch_size=batch_size,
    num_workers=2,
    mgf_path=mgf_path,
    max_num_peaks=max_num_peaks,
    mz_vocabs=mz_vocabs,
)

Then, we choose the checkpoint of the model to use and make predictions.

In [None]:
# Placeholder: path to the checkpoint of the trained model.
checkpoint_path = ''

# Restore the model from the chosen checkpoint.
model = Transformer.load_model(checkpoint_path, seed=seed)

# Predict on the test loader.
# NOTE(review): `return_probabilities=False` presumably yields binary
# fingerprints and `save_results=True` presumably writes the predictions to
# disk — confirm against `Transformer.predict`.
preds = model.predict(
    loaders['test'],
    save_results=True,
    return_probabilities=False,
)

print(f'Preds size {len(preds)}')

Then, we use the data loader to retrieve the IDs of the predicted spectra, as well as their true fingerprints.

In [None]:
# Accumulators: one array of target fingerprints per batch, and a flat list of IDs.
y_true_list = []
ids_list = []

# NOTE(review): each batch is indexed positionally — element 4 is used as the
# target fingerprints and element 3 as the spectrum IDs; confirm against the
# dataset's collate function.
# NOTE(review): `preds` came from a separate pass over the same loader, so the
# row-wise comparison later assumes the loader yields samples in the same
# order on every pass (i.e. the test loader is not shuffled) — TODO confirm.
for test_batch in loaders['test']:
    y_true_list.append(test_batch[4].numpy())
    ids_list.extend(test_batch[3])

# Stack the per-batch arrays into a single array, one row per spectrum.
y_true = np.vstack(y_true_list)

print(f'True size {len(y_true)}')

Using the IDs, we go to the main dataset to retrieve the SMILES.

In [None]:
full_dataset = mgf_get_spectra(mgf_path)

# Map spectrum ID -> SMILES, skipping entries that lack either field.
smiles_dict = {
    spectrum["params"]["spectrum_id"]: spectrum["params"]["smiles"]
    for spectrum in full_dataset
    if "spectrum_id" in spectrum["params"] and "smiles" in spectrum["params"]
}

# Fail early with every offending ID instead of a bare KeyError on the first
# one: an ID from the loader may be absent here because its entry was filtered
# out above or because the ID types differ between the loader and the MGF file.
missing_ids = [spectrum_id for spectrum_id in ids_list if spectrum_id not in smiles_dict]
if missing_ids:
    raise KeyError(
        f'{len(missing_ids)} test spectrum IDs have no SMILES in {mgf_path!r}; '
        f'first few: {missing_ids[:5]}'
    )

# Same order as `ids_list`, so position i refers to the same spectrum everywhere.
smiles_list = [smiles_dict[spectrum_id] for spectrum_id in ids_list]

print(f'Smiles size {len(smiles_list)}')

We then confirm that everything went according to plan by checking that the predictions, the true fingerprints and the SMILES list all have the same size.

In [None]:
# The three sequences are indexed in parallel below, so they must all have the
# same length (this checks sizes only, not that the rows are aligned).
assert len({len(preds), len(y_true), len(smiles_list)}) == 1

Next, we calculate the Tanimoto similarity between the predicted and real fingerprints.

In [None]:
# NOTE(review): presumably returns one score per row of `y_true`/`preds`; the
# next cell indexes the result by sample position — confirm in `src.utils`.
tanimoto_scores = calculate_tanimoto(y_true=y_true, y_pred=preds)

We then identify which spectra have the best scores and which have the worst.

In [None]:
# Number of spectra reported at each end of the ranking. The CSV file name
# used when exporting these rows also hardcodes 10, so keep the two in sync.
TOP_K = 10

# Ascending sort: the lowest Tanimoto scores come first.
# NOTE(review): np.argsort places NaN last, so a NaN score would be listed
# among the "best" — TODO confirm calculate_tanimoto cannot return NaN.
sorted_idx = np.argsort(tanimoto_scores)

worsts_idx = sorted_idx[:TOP_K]
# Take the TOP_K highest scores and reverse them so the best one comes first.
bests_idx = sorted_idx[-TOP_K:][::-1]

analysis_data = []

# One shared loop for both categories (previously two copy-pasted loops).
for category, header, indices in (
    ('Worst', f'Top {TOP_K} worsts scores', worsts_idx),
    ('Best', f'Top {TOP_K} bests scores', bests_idx),
):
    print(header)
    for idx in indices:
        score = tanimoto_scores[idx]
        spectrum_id = ids_list[idx]
        smiles = smiles_list[idx]

        # The headers used to be printed with nothing underneath; list the rows.
        print(f'  {spectrum_id}: {score:.4f}')

        analysis_data.append({
            'Category': category,
            'Spectrum_ID': spectrum_id,
            'Tanimoto_Score': score,
            'SMILES': smiles,
        })

And we save everything in a CSV file.

In [None]:
# One row per selected spectrum: category, spectrum ID, Tanimoto score and SMILES.
df_analysis = pd.DataFrame(data=analysis_data)

# Write the table to the working directory without the positional index column.
df_analysis.to_csv(path_or_buf='error_analysis_top_bottom_10.csv', index=False)

Then, we can use RDKit to draw the structures of the compounds with the best and worst scores.

In [None]:
def desenhar_grelha_rdkit(df, categoria):
    """Draw, display and save a grid image of the molecules in one category.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns 'Category', 'SMILES', 'Spectrum_ID' and
        'Tanimoto_Score'.
    categoria : str
        Value of the 'Category' column to select (the notebook uses 'Best'
        and 'Worst').

    Returns
    -------
    None
        The grid is shown with ``display`` and written to
        ``image_{categoria}_predictions.png`` in the working directory.
        If no molecule can be drawn (no row matches ``categoria`` or no
        SMILES could be parsed), a message is printed and nothing is saved.
    """
    selected = df[df['Category'] == categoria]

    mols = []
    legends = []

    # Walk the three needed columns in parallel rather than using iterrows().
    for spectrum_id, score, smiles in zip(
        selected['Spectrum_ID'], selected['Tanimoto_Score'], selected['SMILES']
    ):
        mol = Chem.MolFromSmiles(smiles)

        # MolFromSmiles returns None for SMILES it cannot parse; report and skip.
        if mol is None:
            print(f"RDKit couldn't interpret the SMILES.: {smiles}")
            continue

        mols.append(mol)
        legends.append(f"#{spectrum_id}\nTanimoto: {score:.2f}")

    # Nothing to draw: stop here instead of asking RDKit for an empty grid and
    # then trying to save it.
    if not mols:
        print(f"No molecules to draw for category '{categoria}'.")
        return

    # returnPNG=False requests an image object (rather than PNG bytes), which
    # is what `display` and `.save` below operate on.
    img = Draw.MolsToGridImage(
        mols,
        molsPerRow=5,
        subImgSize=(350, 300),
        legends=legends,
        returnPNG=False,
    )

    display(img)

    img.save(f'image_{categoria}_predictions.png')


In [None]:
# Render and save one grid per category, best first.
for category in ('Best', 'Worst'):
    desenhar_grelha_rdkit(df_analysis, category)