In [1]:
# Automatically reload imported modules whenever their source files change.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import json
import os

# RDKit
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw

from main_functions import *

# Molecule database merge with experimental MS/MS data for METASPACE.

This script merges a molecule database with experimental MS/MS data. These data can then be submitted to METASPACE to search for neutral-loss and in-source fragments within datasets.

See: /Users/dis/PycharmProjects/word2vec/main_functions.py

In [3]:
# Load the reference molecule database from the path below.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted
# before the notebook can run on another machine.
path = '/Users/dis/PycharmProjects/core_metabolome/core_metabolome_v3.pickle'
ref_db = load_molecule_database(path)

In [4]:
# Parse the GNPS JSON file at GNPS_json into GNPS_df.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
GNPS_json = '/Users/dis/PycharmProjects/word2vec/ALL_GNPS.json'
GNPS_df = parse_GNPS_json(GNPS_json)

In [5]:
# Identify the subset of GNPS entries with experimental spectra for molecules in the ref db.
# Result recorded from a previous run: 3920 spectra for 508 can_no_stereo_smiles
GNPS_hits_df = search_GNPS_targets(ref_db, GNPS_df)

In [6]:
# Count how many GNPS hit spectra were recorded for each adduct type.
GNPS_hits_df['Adduct'].value_counts()

M+H        1982
M-H         923
M+Na        464
M+K         219
M+NH4+      176
M-H2O+H     135
M+NH4        13
M+            6
M+Cl          1
M-H2O-H       1
Name: Adduct, dtype: int64

In [7]:
# Identify the subset of Mona entries with experimental spectra for molecules in the ref db.
# The search itself runs for a few hours, so it is left commented out here.
# Result recorded from a previous run: 7646 spectra for 648 inchi matches
# Mona_hits_df = search_MONA(ref_db)

# Reload the saved search result instead of re-running the search.
# NOTE(review): the path is relative, so it is resolved against the kernel's
# working directory; only unpickle files from a trusted source.
Mona_hits_df = pd.read_pickle('mona_2020_Apr_15.pickle')

# Filter Mona_hits_df for high-res instruments, good adducts, and <20 ppm error.
# Result recorded from a previous run: 3437 spectra for 535 inchi matches
Mona_hits_df = parse_MONA_out(Mona_hits_df)

In [8]:
# Count how many filtered Mona spectra were recorded for each adduct type.
Mona_hits_df['adduct'].value_counts()

M+H        2805
M+Na        254
M+K         153
M+NH4+      142
M-H2O+H      56
M+           27
Name: adduct, dtype: int64

In [9]:
# Count how many filtered Mona spectra come from each source library.
Mona_hits_df['source'].value_counts()

MassBank                                   1197
Vaniya/Fiehn Natural Products Library       836
unknown                                     471
Fiehn HILIC Library                         379
GNPS                                        296
ReSpect                                     199
RIKEN PlaSMA Authentic Standard Library      59
Name: source, dtype: int64

In [22]:
# Reduce the ref db to only those entries that have experimental MS/MS spectra,
# using the GNPS and Mona hit tables built above.
# NOTE(review): preparser_Sirius is not defined in this notebook; the loop below
# reads the columns formula, inchi, can_smiles and db_index from its result.
df = preparser_Sirius(ref_db, GNPS_hits_df, Mona_hits_df)

In [None]:
# Main loop for running Sirius: for every row of df, gather the spectra found in
# the Mona and GNPS hit tables, build one Sirius command per adduct, run it
# through the shell, and store the parsed result in sirius_output_dict.
# Old spectra and trees have to be deleted before rerunning.
# NOTE(review): the original comment ended with "leave" — presumably "leave it
# running"; confirm with the author.
sirius_output_dict = {}
# Work on copies restricted to the columns the loop needs.
mona_df = Mona_hits_df[['inchi', 'adduct', 'spectrum']].copy(deep=True)
gnps_df = GNPS_hits_df[['can_smiles', 'Adduct', 'peaks_json']].copy(deep=True)

# Positive mode: keep only spectra recorded with one of these adducts.
current_adducts = ['M+', 'M+H', 'M+Na', 'M+K']
mona_df = mona_df[mona_df.adduct.isin(current_adducts)]
gnps_df = gnps_df[gnps_df.Adduct.isin(current_adducts)]

# Loop over the rows of df by index label.
# NOTE(review): df.loc[idx] yields a single row only if the index labels are
# unique — confirm.
index_list = list(df.index)
for idx in index_list:
    print('\n',idx)
    ser = df.loc[idx]
    formula = ser.formula
    inchi = ser.inchi
    can_smiles = ser.can_smiles
    db_index = ser.db_index
    # Mona spectra are matched by InChI, GNPS spectra by canonical SMILES.
    mo_df = mona_df[mona_df.inchi == inchi]
    gn_df = gnps_df[gnps_df.can_smiles == can_smiles]
    # Adducts seen in either source. The order comes from a set of strings, so
    # it is not guaranteed to be the same between kernel sessions; add_counter
    # below therefore does not map to a fixed adduct.
    unique_adducts = list(set(list(mo_df.adduct) + list(gn_df.Adduct)))
    print(unique_adducts)
    add_counter = 0
    for add in unique_adducts:
        print(idx, ' ', add)
        add_counter += 1
        # String concatenation: db_index must be a str. The directory does not
        # depend on the adduct, so all adducts of one molecule share it.
        output_dir = '/Users/dis/PycharmProjects/word2vec/trees/' + db_index
        m_df = mo_df[mo_df.adduct == add]
        g_df = gn_df[gn_df.Adduct == add]
        spectra_list = ms_format_writer(m_df, g_df, db_index, add)
        t_add = adduct_translate(add)
        sirius_input = runner_Sirius(formula, t_add, spectra_list, output_dir, db_index)
        # IPython shell capture: runs the command string in a shell and returns
        # its output lines. The string is interpolated unescaped.
        sirius_output = !{sirius_input}
        # NOTE(review): the key is db_index only, so for a molecule with several
        # adducts each pass overwrites the previous entry and only the last
        # adduct's result is kept — confirm this is intended.
        sirius_output_dict[db_index] = output_Sirius_parser(sirius_output, output_dir, 
                                                            db_index, add_counter)


 0
['M+H']
0   M+H
ran sirius

 5
['M+H']
5   M+H
ran sirius

 15
['M+H']
15   M+H
ran sirius

 16
['M+H']
16   M+H
ran sirius

 17
['M+H']
17   M+H
ran sirius

 18
['M+H']
18   M+H
ran sirius

 19
['M+H']
19   M+H
ran sirius

 20
['M+H']
20   M+H
ran sirius

 21
['M+H']
21   M+H
ran sirius

 23
['M+H']
23   M+H
ran sirius

 24
['M+H']
24   M+H
ran sirius

 25
[]

 33
['M+H']
33   M+H
ran sirius

 38
['M+H']
38   M+H
ran sirius

 39
['M+H']
39   M+H
ran sirius

 50
['M+H']
50   M+H
ran sirius

 52
['M+H']
52   M+H
ran sirius

 53
['M+H']
53   M+H
ran sirius

 54
['M+H']
54   M+H
ran sirius

 55
['M+H']
55   M+H
ran sirius

 56
['M+H']
56   M+H
ran sirius

 57
['M+H']
57   M+H
ran sirius

 58
['M+H']
58   M+H
ran sirius

 59
['M+H']
59   M+H
ran sirius

 60
['M+H']
60   M+H
ran sirius

 61
['M+H']
61   M+H
ran sirius

 63
['M+H']
63   M+H
ran sirius

 65
['M+H']
65   M+H
ran sirius

 67
[]

 68
['M+H']
68   M+H
ran sirius

 69
['M+H']
69   M+H
ran sirius

 70
['M+', 'M+H']
70   M+
ran 

ran sirius

 543
[]

 545
[]

 550
['M+H']
550   M+H
ran sirius

 552
['M+H']
552   M+H
ran sirius

 563
['M+H']
563   M+H
ran sirius

 567
['M+H']
567   M+H
ran sirius

 568
['M+H']
568   M+H
ran sirius

 571
['M+H']
571   M+H
ran sirius

 573
['M+H', 'M+Na']
573   M+H
ran sirius
573   M+Na
ran sirius

 574
['M+H', 'M+Na']
574   M+H
ran sirius
574   M+Na
ran sirius

 575
['M+H', 'M+Na']
575   M+H
ran sirius
575   M+Na
ran sirius

 578
['M+H', 'M+Na', 'M+K']
578   M+H
ran sirius
578   M+Na
ran sirius
578   M+K
ran sirius

 580
['M+H', 'M+Na', 'M+K']
580   M+H
ran sirius
580   M+Na
ran sirius
580   M+K
ran sirius

 582
['M+H']
582   M+H
ran sirius

 585
['M+H']
585   M+H
ran sirius

 589
['M+Na']
589   M+Na
ran sirius

 591
['M+H', 'M+Na']
591   M+H
ran sirius
591   M+Na
ran sirius

 592
['M+H', 'M+Na']
592   M+H
ran sirius
592   M+Na
ran sirius

 593
['M+H', 'M+Na']
593   M+H
ran sirius
593   M+Na
ran sirius

 601
['M+H']
601   M+H
ran sirius

 604
['M+H']
604   M+H
ran sirius

 605
['

In [17]:
# Tabulate the Sirius results: one row per dictionary key, with the stored
# value placed in a single 'file' column.
sirius_output_df = pd.DataFrame.from_dict(
    sirius_output_dict,
    orient='index',
    columns=['file'],
)

In [18]:
def exists(ms_path):
    """Return 1 if ``ms_path`` points to an existing file, otherwise 0.

    The check is delegated to ``os.path.isfile``, so directories and missing
    paths both yield 0. The result is an ``int`` flag rather than a ``bool``.
    """
    return int(os.path.isfile(ms_path))

In [19]:
# Flag each recorded output path with 1 if the file is on disk, else 0.
sirius_output_df['exists'] = sirius_output_df['file'].apply(exists)

In [20]:
# Tally how many recorded output files are present (1) versus missing (0).
sirius_output_df['exists'].value_counts()

1    46
Name: exists, dtype: int64

In [21]:
# List the recorded output paths whose file is missing on disk.
sirius_output_df.loc[sirius_output_df['exists'] == 0, 'file'].tolist()

[]

# Wednesday:
1. Parse collected daughters.
2. Determine delta exactmass and explanation, modify formula for m only
3. Optional: filtering mechanism from MS1 first search
4. Generate METASPACE database of parsed parents and daughters (formula/adduct?).
5. Run METASPACE (off-line)
6. API access ion images and put in folders as pairs