In [1]:
import copy
import os
import random

import importlib_resources
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from cmmrt.projection.data import get_representatives
from cmmrt.rt.predictions import load_cmm_predictions
from cmmrt.projection.models.projector.loader import _load_projector_pipeline_from

In [2]:
def get_ppm_error(mass, ppm_error=10):
    """Return the absolute mass tolerance for ``mass`` at ``ppm_error`` parts per million.

    The mass is rounded to the nearest integer (Python ``round`` semantics)
    before being scaled, so the tolerance is computed from the rounded value
    rather than from the exact mass.
    """
    rounded_mass = round(mass)
    scaled = rounded_mass * ppm_error
    return scaled / 1_000_000

In [13]:
def rank_data(train_df, nreps, nid_metabolites,
              output_dir='results/results_test_loop_2', mass_error_seed=123, top_n=3):
    """Fit the projector on random subsets of features and rank candidates for the rest.

    For each of ``nreps`` repetitions, ``nid_metabolites`` rows of ``train_df``
    are sampled without replacement and used to fit the projector (predicted RT
    -> experimental RT). Every remaining row is then matched against
    ``predicted_pubchem`` by exact mass (within ``get_ppm_error`` of
    ``calc_mw``), the matches are scored with ``projector.z_score`` and the
    ``top_n`` highest-scoring candidates per feature are kept. One CSV and one
    PNG are written per repetition into ``output_dir``.

    Relies on notebook-level names defined in other cells: ``projector``,
    ``predicted_pubchem`` and ``get_ppm_error``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must provide the columns ``Name``, ``prediction``, ``rt``, ``calc_mw``
        and ``FeatureID``. ``rt`` is multiplied by 60 before use
        (presumably minutes -> seconds; TODO confirm against the data source).
    nreps : int
        Number of random repetitions.
    nid_metabolites : int
        Number of rows sampled per repetition to fit the projector.
    output_dir : str, optional
        Directory for the CSV/PNG outputs; created if it does not exist.
    mass_error_seed : int or None, optional
        Seed applied to NumPy's global RNG at the start of each repetition so
        the tie-breaking noise added to ``mass_error`` is repeatable. ``None``
        leaves the RNG untouched. The sampling of rows uses the ``random``
        module and is NOT seeded here.
    top_n : int, optional
        Number of candidates kept per feature.
    """
    # Make sure the destination exists so to_csv/savefig do not fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    index_list = range(train_df.shape[0])
    for i in range(nreps):
        ii = random.sample(index_list, nid_metabolites)
        calibration = train_df.iloc[ii]
        id_remain = np.setdiff1d(index_list, ii)
        to_rank = train_df.iloc[id_remain]
        print(' '.join(calibration['Name']))

        x = torch.from_numpy(calibration.prediction.values.reshape(-1, 1))
        y = torch.from_numpy(calibration.rt.values * 60)
        projector.projector.prepare_metatesting()
        projector.fit(x, y)

        if mass_error_seed is not None:
            np.random.seed(mass_error_seed)

        candidates_list = []
        for _, row in to_rank.iterrows():
            # Candidates are every database entry whose exact mass lies inside the ppm window.
            error = get_ppm_error(row.calc_mw)
            in_window = (
                (predicted_pubchem["ExactMass"] >= (row.calc_mw - error))
                & (predicted_pubchem["ExactMass"] <= (row.calc_mw + error))
            )
            candidates = predicted_pubchem[in_window].copy()
            if candidates.shape[0] == 0:
                continue

            candidates = candidates.drop(['Unnamed: 0', 'MolecularWeight', 'cmm_id'], axis=1)
            candidates = candidates.rename(columns={'prediction': 'rt_predicted'})

            rt_experimental = row.rt * 60
            candidates['FeatureID'] = row.FeatureID
            candidates['rt_experimental'] = rt_experimental
            candidates['mass_experimental'] = row.calc_mw
            candidates['mass_error'] = abs(candidates.ExactMass - row.calc_mw)
            # Add small noise so that identical mass errors do not tie.
            candidates['mass_error'] = (
                candidates['mass_error'] + np.random.uniform(0, 1e-6, candidates.shape[0])
            )
            # Order by mass error first: nlargest keeps the first occurrence on
            # z-score ties, so the closest mass wins among equally scored candidates.
            candidates = candidates.sort_values(by='mass_error')

            scores = projector.z_score(candidates[['rt_predicted']].values,
                                       np.array([rt_experimental]))
            # Assign a plain float column directly; a pd.NA placeholder would make the
            # column object-typed, which nlargest refuses to rank.
            candidates['z_score'] = np.asarray(scores.cpu().numpy(), dtype=float).reshape(-1)
            candidates_list.append(candidates.nlargest(top_n, 'z_score'))

        if candidates_list:
            candidates_final = pd.concat(candidates_list).reset_index(drop=True)
            candidates_final = candidates_final[['FeatureID', 'mass_experimental', 'rt_experimental',
                                                 'rt_predicted', 'mass_error', 'z_score', 'Title',
                                                 'MolecularFormula', 'ExactMass', 'InChIKey', 'InChI',
                                                 'pid']]
            candidates_final.to_csv(
                os.path.join(output_dir,
                             f'Candidate_annotation_nannot_{str(nid_metabolites)}_rep_{str(i)}.csv'),
                index=False)
        else:
            # pd.concat([]) would raise; report and carry on with the plot instead.
            print(f'No candidates found for rep {i}; skipping CSV export.')

        # plotting
        sorted_x = torch.arange(x.min() - 0.5, x.max() + 0.5, 0.1, dtype=torch.float32)
        fig, ax = plt.subplots()
        try:
            # Projection of every database prediction under the current fit.
            ax.scatter(predicted_pubchem.prediction.values,
                       projector.predict(predicted_pubchem.prediction.values)[0])
            preds_mean, lb, ub = projector.predict(sorted_x)
            # Points used to fit the projector in this repetition.
            ax.scatter(x, y, marker='x')
            ax.fill_between(sorted_x, lb, ub, alpha=0.2, color='orange')
            ax.plot(sorted_x, preds_mean, color='orange')
            ax.set_title(f'N_annotated {str(nid_metabolites)}, rep {str(i)}')
            ax.set_xlabel("Predicted RT")
            ax.set_ylabel("Projected/Experimental RT")
            with torch.no_grad():
                # Output of the GP mean module alone, mapped back to the original y scale.
                sorted_x_ = torch.from_numpy(
                    projector.x_scaler.transform(sorted_x.numpy().reshape(-1, 1)))
                prior_mean = projector.projector.gp.mean_module(sorted_x_)
                prior_mean = projector.y_scaler.inverse_transform(prior_mean.reshape(-1, 1)).flatten()
                ax.plot(sorted_x, prior_mean, color='green')
            fig.savefig(
                os.path.join(output_dir,
                             f'Candidate_plot_nannot_{str(nid_metabolites)}_rep_{str(i)}.png'))
        finally:
            # Always release the figure, even if plotting fails, to avoid leaking figures in the loop.
            plt.close(fig)

## Adding our own data

In [4]:
# PubChem metadata table; CID is cast to string so it can be joined on the string `pid` key.
pubchem_db = pd.read_csv('data/final_pubchem_results.csv').astype({'CID': 'str'})

# Predicted retention times, enriched with the PubChem metadata (left join keeps every prediction).
predicted_pubchem = (
    pd.read_csv('results/predicted_rt_db.csv')
    .astype({'pid': 'str'})
    .merge(pubchem_db, left_on='pid', right_on='CID', how='left')
)

In [5]:
# All detected features, and the subset that carries an identification (CID).
data_to_rank = pd.read_csv('data/RP_skin_all.csv')
# CID is cast to string so it matches the string CID key of the PubChem tables.
data_annotated = pd.read_csv('data/RP_skin_id.csv').astype({'CID': 'str'})
data_to_rank.head()

Unnamed: 0,FeatureID,Name,Formula,calc_mw,mz,rt,annot_source,Annot. Source: mzVault Search
0,Feature2369,Indole-3-carboxaldehyde,C9 H7 N O,145.05268,146.05996,12.402,mzVault and mzCloud,Full match
1,Feature2368,Ornithine,C5 H12 N2 O2,115.06321,116.07049,1.057,mzVault and mzCloud,Invalid mass
2,Feature2367,Ornithine,C5 H12 N2 O2,132.08982,133.09709,1.058,mzVault and mzCloud,Full match
3,Feature2366,Histamine,C5 H9 N3,94.053,112.08682,1.003,mzVault and mzCloud,Invalid mass
4,Feature2365,D-Aspartic acid,C4 H7 N O4,133.03737,134.04465,1.215,mzVault and mzCloud,Full match


In [8]:
# Load the meta-learned projector pipeline (constant mean, RBF + linear kernel).
projector = _load_projector_pipeline_from(
    "../cmmrt/cmmrt/data/metalearned_projectors/p2e_rl.pt",
    mean='constant',
    kernel='rbf+linear',
)
# CIDs of the annotated features, as a NumPy array.
pubchems = np.array(data_annotated['CID'])

In [9]:
# Mask of annotated rows whose CID is in `pubchems`.
# NOTE(review): `pubchems` is built from this same column, so the mask looks all-True — confirm intent.
train_logical = data_annotated.CID.isin(pubchems)
# Attach predicted RTs / PubChem metadata by CID, then discard rows with a missing value in ANY column.
train = (
    data_annotated.loc[train_logical]
    .merge(predicted_pubchem, how='left', on='CID')
    .dropna()
)

In [14]:
# 10 repetitions, fitting the projector on 10 randomly chosen annotated features each time.
rank_data(train_df=train, nreps=10, nid_metabolites=10)

L-Carnitine Glycyl-L-leucine D-Serine L-Carnitine L-Carnitine L(-)-Pipecolinic acid L-Carnitine L-Carnosine L-Norleucine D-Arginine






D-Histidine L-Asparagine 2-Deoxy-D-glucose D-Arginine L-Cystine 2-Pyrrolidinone Betaine Creatinine 4-Aminosalicylic acid Glycyl-L-leucine
N-Acetylglutamic acid Glycyl-L-leucine Cytidine Maltotriose Creatinine N-Acetylneuraminic acid Maltotriose Creatinine Uric acid L-Pyroglutamic acid






L-Norleucine Creatine trans-Urocanic Acid Phosphocholine Indole-3-carboxaldehyde L-Carnitine N-Acetyl-L-arginine Cytidine-5'-monophosphate N-Acetylneuraminic acid trans-Urocanic Acid






4-Aminosalicylic acid L-Alanine O-Phosphorylethanolamine Bicine sn-glycero-3-Phosphocholine L-Carnitine N-Acetylglycine N-Acetylneuraminic acid Uric acid L-Tyrosine
Creatine O-Phosphorylethanolamine Phosphocholine Hippuric acid Adenine Uridine Ornithine L-Asparagine Bicine 5-Methylcytidine
N-Acetylglycine trans-Urocanic Acid sn-glycero-3-Phosphocholine Choline L-Carnosine 2-Pyrrolidinone L-Asparagine Phenylac-Gly-OH D-Glutamic acid Indole-3-carboxaldehyde








Cytidine L-Cystine L(-)-Pipecolinic acid Adenine 2-Deoxy-D-glucose N-Acetylneuraminic acid L-Carnitine Uridine-5'-phosphate trans-Urocanic Acid Uridine
Choline Ornithine L-Tyrosine Trigonelline 2'-Deoxycytidine Citicoline Bicine 4-(Acetylamino)butanoic acid D-Valine Uridine-5'-phosphate










D-Aspartic acid Phosphocholine L(-)-Pipecolinic acid D-Histidine Phosphocholine Niacinamide Uridine-5'-phosphate D-Valine Choline L-Pyroglutamic acid








In [15]:
# 10 repetitions, fitting the projector on 20 randomly chosen annotated features each time.
rank_data(train_df=train, nreps=10, nid_metabolites=20)

Cytidine-5'-monophosphate L-Carnitine 2'-Deoxyadenosine D-Arginine Phosphocholine trans-Urocanic Acid Phosphocholine Uridine 2-Deoxy-D-glucose D-Glutamine 4-(Acetylamino)butanoic acid L-Carnitine L-Norleucine Niacinamide Thymine L-Carnitine L-Pyroglutamic acid L-Pyroglutamic acid D-Glutamic acid L-Carnitine










Cytidine-5'-monophosphate trans-Urocanic Acid Maltotriose D-Glutamic acid N-Acetyl-D-mannosamine D-Threonine Citicoline L-Carnitine N-Acetylneuraminic acid Ornithine 5-Methylcytidine Xanthine L-Alanine D-Histidine N-Acetyl-D-galactosamine Indole-3-carboxaldehyde 4-Aminosalicylic acid N-Acetylneuraminic acid D-Glutamic acid Indole-3-carboxaldehyde






4-Guanidinobutyric acid D-Aspartic acid Hippuric acid L(-)-Pipecolinic acid D-Leucine trans-Urocanic Acid Bicine L-Norleucine L-Carnosine L-Carnitine N-Acetyl-D-galactosamine Choline L-Asparagine Citicoline Maltotriose Glycyl-L-leucine DL-Kynurenine L-Pyroglutamic acid 2-Pyrrolidinone trans-Urocanic Acid






trans-Urocanic Acid D-Arginine L-Pyroglutamic acid N-Acetylneuraminic acid 2-Deoxy-D-glucose trans-Urocanic Acid 4-(Acetylamino)butanoic acid N-Acetyl-D-mannosamine Creatinine L(-)-Pipecolinic acid Creatinine Uric acid Maltotriose Phenylac-Gly-OH 4-Aminosalicylic acid L-Cystine L-Carnitine L-Carnosine L-Carnitine N-Acetylglycine






L-Norleucine D-Threonine L-Asparagine Cytidine-5'-monophosphate Creatine Ornithine O-Phosphorylethanolamine trans-Urocanic Acid N-Acetyl-D-galactosamine Maltotriose 4-(Acetylamino)butanoic acid Choline L-Cystine 2'-Deoxycytidine Phosphocholine Adenine Taurine L-Pyroglutamic acid 2'-Deoxyadenosine N-Acetyl-D-mannosamine










L-Carnitine Phenylac-Gly-OH 2'-Deoxycytidine 5-Methylcytidine Maltotriose trans-Urocanic Acid L-Alanine sn-glycero-3-Phosphocholine D-Serine Ornithine L-Norleucine Allantoin D-Valine Choline Xanthine D-Histidine trans-Urocanic Acid L-Alanine Uridine-5'-phosphate L-Carnitine






L-Carnosine Phosphocholine Indole-3-carboxaldehyde Citicoline L-Carnitine trans-Urocanic Acid N-Acetyl-D-mannosamine N-Acetylneuraminic acid N-Acetylneuraminic acid Trigonelline L-Carnitine 2-Pyrrolidinone trans-Urocanic Acid Uridine D-Arginine L-Carnitine Niacinamide Maltotriose Indole-3-carboxaldehyde 5-Methylcytidine






D-Threonine Creatinine Creatinine L-Carnitine L-Carnitine Cytidine-5'-monophosphate Ornithine L-Norleucine Hippuric acid Thymine Maltotriose L-Carnitine 5-Methylcytidine 4-Guanidinobutyric acid Glycyl-L-leucine L-Asparagine Adenosine Taurine 2'-Deoxyadenosine L-Carnitine






L-Carnitine Taurine L-Asparagine D-Arginine Thymine D-Histidine N-Acetylglycine Creatine Uridine Maltotriose Cytidine Maltotriose L(-)-Pipecolinic acid D-Histidine Adenosine Ornithine L-Alanine N-Acetyl-D-mannosamine D-Aspartic acid Bicine










Trigonelline L-Norleucine 2'-Deoxyadenosine D-Glutamine L(-)-Pipecolinic acid Citicoline Adenine Creatine D-Glutamic acid Hippuric acid Adenosine L-Cystine L-Pyroglutamic acid Creatinine D-Valine L-Carnitine Ornithine N-Acetyl-D-mannosamine D-Aspartic acid Maltotriose










In [16]:
# 10 repetitions, fitting the projector on 30 randomly chosen annotated features each time.
rank_data(train_df=train, nreps=10, nid_metabolites=30)

Phosphocholine Betaine Maltotriose Adenine Cytidine Hippuric acid Creatine Taurine 2-Deoxy-D-glucose N-Acetylglycine L(-)-Pipecolinic acid L-Carnitine Indole-3-carboxaldehyde L(-)-Pipecolinic acid trans-Urocanic Acid D-Histidine Trigonelline Ornithine L-Carnitine Maltotriose L-Carnitine Choline Indole-3-carboxaldehyde Creatinine D-Glutamic acid Phenylac-Gly-OH L-Asparagine L-Carnitine Uridine-5'-phosphate sn-glycero-3-Phosphocholine








L-Carnitine N-Acetylglutamic acid D-Histidine trans-Urocanic Acid trans-Urocanic Acid Uridine Choline Nicotinic acid N-Acetylneuraminic acid Phosphocholine Phosphocholine D-Arginine trans-Urocanic Acid Creatine D-Leucine D-Glutamine Uric acid 2-Deoxy-D-glucose L-Carnitine 4-Aminosalicylic acid Phenylac-Gly-OH Creatine D-Glutamic acid L-Norleucine Niacinamide L-Carnitine Ornithine Bicine sn-glycero-3-Phosphocholine Thymine








N-Acetyl-L-arginine N-Acetylglutamic acid L-Alanine Betaine L-Asparagine 5-Methylcytidine Cytidine-5'-monophosphate D-Glutamine L-Pyroglutamic acid 2-Pyrrolidinone N-Acetyl-D-mannosamine Ornithine Indole-3-carboxaldehyde Maltotriose Maltotriose Bicine trans-Urocanic Acid Niacinamide L-Carnitine trans-Urocanic Acid L-Pyroglutamic acid Cytidine Creatinine L-Cystine L-Tyrosine Phosphocholine 4-Aminosalicylic acid L-Norleucine Phosphocholine Uric acid








L-Carnosine 2'-Deoxycytidine Hippuric acid Creatine Betaine L-Carnitine D-Serine L-Asparagine DL-Kynurenine D-Arginine Bicine L(-)-Pipecolinic acid 4-Guanidinobutyric acid 4-Aminosalicylic acid N-Acetylneuraminic acid trans-Urocanic Acid D-Valine Indole-3-carboxaldehyde Adenine Ornithine L-Carnitine L-Carnitine Taurine Glycyl-L-leucine D-Aspartic acid N-Acetylglutamic acid Allantoin Creatine Phosphocholine N-Acetyl-D-mannosamine








L-Carnitine Phosphocholine Creatine L(-)-Pipecolinic acid Phosphocholine Xanthine N-Acetylneuraminic acid D-Leucine Choline N-Acetyl-D-mannosamine 2-Deoxy-D-glucose N-Acetyl-D-mannosamine Thymine L-Alanine Cytidine-5'-monophosphate N-Acetylglutamic acid L-Carnosine Ornithine L-Cystine L-Tyrosine Bicine trans-Urocanic Acid L-Pyroglutamic acid Creatine L-Carnitine Adenine N-Acetylneuraminic acid Phenylac-Gly-OH L-Carnitine 4-Guanidinobutyric acid










L-Carnitine DL-Kynurenine Maltotriose D-Proline N-Acetylneuraminic acid Maltotriose L-Tyrosine Creatinine Creatinine Glycyl-L-leucine D-Leucine Taurine Creatine D-Valine 2-Pyrrolidinone trans-Urocanic Acid Creatine trans-Urocanic Acid L-Carnitine N-Acetylglycine Indole-3-carboxaldehyde N-Acetyl-D-galactosamine N-Acetyl-D-mannosamine D-Histidine trans-Urocanic Acid 2'-Deoxycytidine 4-Guanidinobutyric acid L-Carnosine L-Citrulline Hippuric acid










Trigonelline Indole-3-carboxaldehyde L-Carnitine N-Acetyl-L-arginine Creatinine 2'-Deoxyadenosine N-Acetylglutamic acid Hippuric acid D-Glutamine L-Carnitine L-Carnitine Choline N-Acetylneuraminic acid D-Threonine L-Pyroglutamic acid L-Asparagine trans-Urocanic Acid N-Acetyl-D-mannosamine Phosphocholine Phosphocholine Cytidine Adenine L-Carnitine D-Histidine trans-Urocanic Acid Betaine L-Carnitine N-Acetyl-D-galactosamine Taurine L-Alanine










Creatine Creatinine L(-)-Pipecolinic acid L-Norleucine D-Serine L-Carnitine 2'-Deoxyadenosine D-Arginine Taurine D-Glutamic acid Indole-3-carboxaldehyde N-Acetyl-D-galactosamine L-Carnitine L-Citrulline Indole-3-carboxaldehyde D-Valine trans-Urocanic Acid Choline N-Acetylneuraminic acid D-Threonine L-Carnitine L-Pyroglutamic acid 4-(Acetylamino)butanoic acid Maltotriose L(-)-Pipecolinic acid 2-Deoxy-D-glucose 4-Guanidinobutyric acid O-Phosphorylethanolamine 4-Aminosalicylic acid Creatine










Cytidine Maltotriose Phenylac-Gly-OH trans-Urocanic Acid Creatinine L-Alanine Betaine L-Cystine L-Carnitine sn-glycero-3-Phosphocholine Choline 4-Guanidinobutyric acid D-Glutamic acid Hippuric acid Adenine L(-)-Pipecolinic acid Indole-3-carboxaldehyde trans-Urocanic Acid 2'-Deoxycytidine Creatine Phosphocholine 4-(Acetylamino)butanoic acid L-Carnosine L-Carnitine Uridine-5'-phosphate 4-Aminosalicylic acid 5-Methylcytidine L-Carnitine D-Glutamine D-Threonine










L-Asparagine Citicoline L-Norleucine L(-)-Pipecolinic acid L-Cystine L-Carnitine trans-Urocanic Acid L(-)-Pipecolinic acid D-Threonine 4-(Acetylamino)butanoic acid Creatine 5-Methylcytidine N-Acetyl-D-mannosamine L-Carnitine 2'-Deoxyadenosine 4-Aminosalicylic acid Maltotriose L-Carnitine Adenine Creatinine N-Acetyl-D-mannosamine N-Acetylglycine D-Serine Bicine Phosphocholine Thymine D-Valine D-Glutamic acid O-Phosphorylethanolamine Indole-3-carboxaldehyde








