In [1]:
# Reload imported modules automatically whenever their source changes.
%load_ext autoreload
%autoreload 2

In [202]:
import pandas as pd
import numpy as np
import json
import os
from string import digits

# RDKit
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw

from molmass import Formula

from main_functions import *

# Molecule database merge with experimental MS/MS data for METASPACE.

This script merges a molecule database with experimental MS/MS data.  These data can then be submitted to METASPACE to search for neutral losses and in-source fragments within datasets.

See: /Users/dis/PycharmProjects/word2vec/main_functions.py

In [3]:
# Load reference database by path:
# NOTE(review): absolute, machine-specific path — presumably must be edited to run elsewhere.
path = '/Users/dis/PycharmProjects/core_metabolome/core_metabolome_v3.pickle'
ref_db = load_molecule_database(path)

In [4]:
# Parses the GNPS JSON file at this path into GNPS_df:
# NOTE(review): absolute, machine-specific path — presumably must be edited to run elsewhere.
GNPS_json = '/Users/dis/PycharmProjects/word2vec/ALL_GNPS.json'
GNPS_df = parse_GNPS_json(GNPS_json)

In [5]:
# Identifies the subset of GNPS with experimental spectra in the ref db:
# Result: # 3920 spectra for 508 can_no_stereo_smiles
GNPS_hits_df = search_GNPS_targets(ref_db, GNPS_df)

In [6]:
# Number of GNPS hit rows per value of the Adduct column.
GNPS_hits_df.Adduct.value_counts()

M+H        1982
M-H         923
M+Na        464
M+K         219
M+NH4+      176
M-H2O+H     135
M+NH4        13
M+            6
M+Cl          1
M-H2O-H       1
Name: Adduct, dtype: int64

In [7]:
# Identifies the subset of Mona with experimental spectra in the ref db; runs in a few hours
# Result: # 7646 spectra for 648 inchi matches
# Mona_hits_df = search_MONA(ref_db)

# Reload the previously saved search result instead of re-running the search:
Mona_hits_df = pd.read_pickle('mona_2020_Apr_15.pickle')

# Filters Mona_hits_df for high-res instruments, good adducts, and <20 ppm error.
# Result: # 3437 spectra for 535 inchi matches
Mona_hits_df = parse_MONA_out(Mona_hits_df)

In [8]:
# Number of filtered Mona hit rows per value of the adduct column.
Mona_hits_df.adduct.value_counts()

M+H        2805
M+Na        254
M+K         153
M+NH4+      142
M-H2O+H      56
M+           27
Name: adduct, dtype: int64

In [9]:
# Number of filtered Mona hit rows per value of the source column.
Mona_hits_df.source.value_counts()

MassBank                                   1197
Vaniya/Fiehn Natural Products Library       836
unknown                                     471
Fiehn HILIC Library                         379
GNPS                                        296
ReSpect                                     199
RIKEN PlaSMA Authentic Standard Library      59
Name: source, dtype: int64

In [22]:
# Cleans up ref db for only entries with experimental MS/MS spectra.
# The result is bound to the generic name `df`, which later cells read and also rebind.
df = preparser_Sirius(ref_db, GNPS_hits_df, Mona_hits_df)

In [23]:
# Main loop for running Sirius, leave
# Have to delete old spectra and trees before rerunning
# Results are collected in sirius_output_dict, keyed by db_index.
sirius_output_dict = {}
# Work on independent copies holding only the columns this loop reads.
mona_df = Mona_hits_df[['inchi', 'adduct', 'spectrum']].copy(deep=True)
gnps_df = GNPS_hits_df[['can_smiles', 'Adduct', 'peaks_json']].copy(deep=True)

# If positive mode
# Keep only spectra whose adduct is one of the four listed here.
current_adducts = ['M+', 'M+H', 'M+Na', 'M+K']
mona_df = mona_df[mona_df.adduct.isin(current_adducts)]
gnps_df = gnps_df[gnps_df.Adduct.isin(current_adducts)]

# Loops over dataframe
index_list = list(df.index)
for idx in index_list:
    print('\n',idx)
    ser = df.loc[idx]
    formula = ser.formula
    inchi = ser.inchi
    can_smiles = ser.can_smiles
    db_index = ser.db_index
    # Mona spectra are matched on inchi, GNPS spectra on can_smiles.
    mo_df = mona_df[mona_df.inchi == inchi]
    gn_df = gnps_df[gnps_df.can_smiles == can_smiles]
    # Adducts seen in either source; an empty list means the inner loop does nothing.
    unique_adducts = list(set(list(mo_df.adduct) + list(gn_df.Adduct)))
    print(unique_adducts)
    add_counter = 0
    for add in unique_adducts:
        print(idx, ' ', add)
        add_counter += 1
        # NOTE(review): absolute, machine-specific path; it depends only on db_index,
        # so every adduct of one compound uses the same directory.
        output_dir = '/Users/dis/PycharmProjects/word2vec/trees/' + db_index
        m_df = mo_df[mo_df.adduct == add]
        g_df = gn_df[gn_df.Adduct == add]
        spectra_list = ms_format_writer(m_df, g_df, db_index, add)
        t_add = adduct_translate(add)
        sirius_input = runner_Sirius(formula, t_add, spectra_list, output_dir, db_index)
        # IPython shell escape: runs the string in sirius_input as a shell command and
        # captures its output.
        sirius_output = !{sirius_input}
        # NOTE(review): the key is db_index alone, so when a compound has several adducts
        # each iteration overwrites the previous entry — confirm this is intended.
        sirius_output_dict[db_index] = output_Sirius_parser(sirius_output, output_dir, 
                                                            db_index, add_counter)


 0
['M+H']
0   M+H
ran sirius

 5
['M+H']
5   M+H
ran sirius

 15
['M+H']
15   M+H
ran sirius

 16
['M+H']
16   M+H
ran sirius

 17
['M+H']
17   M+H
ran sirius

 18
['M+H']
18   M+H
ran sirius

 19
['M+H']
19   M+H
ran sirius

 20
['M+H']
20   M+H
ran sirius

 21
['M+H']
21   M+H
ran sirius

 23
['M+H']
23   M+H
ran sirius

 24
['M+H']
24   M+H
ran sirius

 25
[]

 33
['M+H']
33   M+H
ran sirius

 38
['M+H']
38   M+H
ran sirius

 39
['M+H']
39   M+H
ran sirius

 50
['M+H']
50   M+H
ran sirius

 52
['M+H']
52   M+H
ran sirius

 53
['M+H']
53   M+H
ran sirius

 54
['M+H']
54   M+H
ran sirius

 55
['M+H']
55   M+H
ran sirius

 56
['M+H']
56   M+H
ran sirius

 57
['M+H']
57   M+H
ran sirius

 58
['M+H']
58   M+H
ran sirius

 59
['M+H']
59   M+H
ran sirius

 60
['M+H']
60   M+H
ran sirius

 61
['M+H']
61   M+H
ran sirius

 63
['M+H']
63   M+H
ran sirius

 65
['M+H']
65   M+H
ran sirius

 67
[]

 68
['M+H']
68   M+H
ran sirius

 69
['M+H']
69   M+H
ran sirius

 70
['M+', 'M+H']
70   M+
ran 

ran sirius

 543
[]

 545
[]

 550
['M+H']
550   M+H
ran sirius

 552
['M+H']
552   M+H
ran sirius

 563
['M+H']
563   M+H
ran sirius

 567
['M+H']
567   M+H
ran sirius

 568
['M+H']
568   M+H
ran sirius

 571
['M+H']
571   M+H
ran sirius

 573
['M+H', 'M+Na']
573   M+H
ran sirius
573   M+Na
ran sirius

 574
['M+H', 'M+Na']
574   M+H
ran sirius
574   M+Na
ran sirius

 575
['M+H', 'M+Na']
575   M+H
ran sirius
575   M+Na
ran sirius

 578
['M+H', 'M+Na', 'M+K']
578   M+H
ran sirius
578   M+Na
ran sirius
578   M+K
ran sirius

 580
['M+H', 'M+Na', 'M+K']
580   M+H
ran sirius
580   M+Na
ran sirius
580   M+K
ran sirius

 582
['M+H']
582   M+H
ran sirius

 585
['M+H']
585   M+H
ran sirius

 589
['M+Na']
589   M+Na
ran sirius

 591
['M+H', 'M+Na']
591   M+H
ran sirius
591   M+Na
ran sirius

 592
['M+H', 'M+Na']
592   M+H
ran sirius
592   M+Na
ran sirius

 593
['M+H', 'M+Na']
593   M+H
ran sirius
593   M+Na
ran sirius

 601
['M+H']
601   M+H
ran sirius

 604
['M+H']
604   M+H
ran sirius

 605
['

ran sirius

 1030
['M+H']
1030   M+H
ran sirius

 1031
['M+H']
1031   M+H
ran sirius

 1035
['M+H', 'M+Na']
1035   M+H
ran sirius
1035   M+Na
ran sirius

 1037
['M+H', 'M+Na', 'M+K']
1037   M+H
ran sirius
1037   M+Na
ran sirius
1037   M+K
ran sirius

 1040
['M+H', 'M+Na', 'M+K']
1040   M+H
ran sirius
1040   M+Na
ran sirius
1040   M+K
ran sirius

 1041
['M+H', 'M+Na', 'M+K']
1041   M+H
ran sirius
1041   M+Na
ran sirius
1041   M+K
ran sirius

 1044
['M+H']
1044   M+H
ran sirius

 1045
['M+H', 'M+Na']
1045   M+H
ran sirius
1045   M+Na
ran sirius

 1054
['M+H']
1054   M+H
ran sirius

 1056
['M+H']
1056   M+H
ran sirius

 1057
['M+H']
1057   M+H
ran sirius

 1074
['M+H']
1074   M+H
ran sirius

 1093
[]

 1098
['M+H']
1098   M+H
ran sirius

 1106
['M+H']
1106   M+H
ran sirius

 1110
['M+H', 'M+Na']
1110   M+H
ran sirius
1110   M+Na
ran sirius

 1114
['M+H']
1114   M+H
ran sirius

 1118
['M+H']
1118   M+H
ran sirius

 1120
['M+H']
1120   M+H
ran sirius

 1125
['M+H']
1125   M+H
ran sirius

 1

ran sirius

 1791
['M+H']
1791   M+H
ran sirius

 1805
['M+H']
1805   M+H
ran sirius

 1806
['M+H']
1806   M+H
ran sirius

 1812
['M+H']
1812   M+H
ran sirius

 1815
['M+H']
1815   M+H
ran sirius

 1820
['M+H', 'M+Na']
1820   M+H
ran sirius
1820   M+Na
ran sirius

 1829
['M+Na']
1829   M+Na
ran sirius

 1831
['M+H', 'M+Na']
1831   M+H
ran sirius
1831   M+Na
ran sirius

 1835
['M+H']
1835   M+H
ran sirius

 1837
['M+H', 'M+Na']
1837   M+H
ran sirius
1837   M+Na
ran sirius

 1840
['M+H']
1840   M+H
ran sirius

 1841
[]

 1845
['M+H']
1845   M+H
ran sirius

 1846
['M+H']
1846   M+H
ran sirius

 1855
['M+H']
1855   M+H
ran sirius

 1857
['M+H']
1857   M+H
ran sirius

 1868
['M+H']
1868   M+H
ran sirius

 1870
['M+H']
1870   M+H
ran sirius

 1874
[]

 1879
[]

 1880
[]

 1881
[]

 1882
[]

 1883
[]

 1886
['M+H', 'M+Na']
1886   M+H
ran sirius
1886   M+Na
ran sirius

 1887
['M+H', 'M+Na']
1887   M+H
ran sirius
1887   M+Na
ran sirius

 1888
['M+H', 'M+Na']
1888   M+H
ran sirius
1888   M+Na
ra

ran sirius
2456   M+Na
ran sirius
2456   M+K
ran sirius

 2457
['M+H', 'M+Na', 'M+K']
2457   M+H
ran sirius
2457   M+Na
ran sirius
2457   M+K
ran sirius

 2458
['M+H', 'M+K', 'M+Na']
2458   M+H
ran sirius
2458   M+K
ran sirius
2458   M+Na
ran sirius

 2459
[]

 2460
['M+H', 'M+Na', 'M+K']
2460   M+H
ran sirius
2460   M+Na
ran sirius
2460   M+K
ran sirius

 2461
['M+H', 'M+Na', 'M+K']
2461   M+H
ran sirius
2461   M+Na
ran sirius
2461   M+K
ran sirius

 2462
['M+H', 'M+Na', 'M+K']
2462   M+H
ran sirius
2462   M+Na
ran sirius
2462   M+K
ran sirius

 2463
['M+H', 'M+Na', 'M+K']
2463   M+H
ran sirius
2463   M+Na
ran sirius
2463   M+K
ran sirius

 2464
['M+H', 'M+Na', 'M+K']
2464   M+H
ran sirius
2464   M+Na
ran sirius
2464   M+K
ran sirius

 2465
['M+H']
2465   M+H
ran sirius

 2466
['M+H', 'M+Na', 'M+K']
2466   M+H
ran sirius
2466   M+Na
ran sirius
2466   M+K
ran sirius

 2467
['M+H', 'M+Na', 'M+K']
2467   M+H
ran sirius
2467   M+Na
ran sirius
2467   M+K
ran sirius

 2468
['M+H', 'M+Na', '

In [24]:
# One row per key of sirius_output_dict (the db_index), with the stored value in a 'file' column.
sirius_output_df = pd.DataFrame.from_dict(sirius_output_dict, orient='index', columns=['file'])

In [25]:
def exists(ms_path):
    """Return 1 if ``ms_path`` is an existing regular file, otherwise 0."""
    return 1 if os.path.isfile(ms_path) else 0

In [26]:
# 1/0 flag per row: does the path in the 'file' column point to an existing file?
sirius_output_df['exists'] = sirius_output_df['file'].apply(exists)

In [27]:
# How many output files were found (1) versus missing (0).
sirius_output_df.exists.value_counts()

1    776
0      2
Name: exists, dtype: int64

In [28]:
# List the 'file' values of the rows whose file was not found.
list(sirius_output_df[sirius_output_df.exists == 0].file)



In [30]:
# Keep only rows whose file was found.
sirius_output_df = sirius_output_df[sirius_output_df.exists == 1]

In [31]:
# Save the filtered table to a pickle in the current working directory.
sirius_output_df.to_pickle('sirius_output_df.pickle')

In [33]:
# Reload the saved table from the pickle in the current working directory.
sirius_output_df = pd.read_pickle('sirius_output_df.pickle')

In [36]:
# Save the current `df` under the name has_MS2_df, reload it from disk, and display it.
# NOTE(review): the first line only binds a second name to the same object (no copy);
# it relies on `df` still holding the table wanted here — confirm when running out of order.
has_MS2_df = df
has_MS2_df.to_pickle('has_MS2_df.pickle')
has_MS2_df = pd.read_pickle('has_MS2_df.pickle')
has_MS2_df

Unnamed: 0,id,name,formula,inchi,can_smiles,temp,db_index
0,HMDB0003361,Pyrimidine,C4H4N2,InChI=1S/C4H4N2/c1-2-5-4-6-3-1/h1-4H,c1cncnc1,0,0_HMDB0003361_Pyrimidine
5,HMDB0029713,Thiazole,C3H3NS,InChI=1S/C3H3NS/c1-2-5-3-4-1/h1-3H,c1cscn1,5,5_HMDB0029713_Thiazole
15,HMDB0001873,Isobutyric acid,C4H8O2,"InChI=1S/C4H8O2/c1-3(2)4(5)6/h3H,1-2H3,(H,5,6)",CC(C)C(=O)O,15,15_HMDB0001873_Isobutyric_acid
16,HMDB0000039,Butyric acid,C4H8O2,"InChI=1S/C4H8O2/c1-2-3-4(5)6/h2-3H2,1H3,(H,5,6)",CCCC(=O)O,16,16_HMDB0000039_Butyric_acid
17,HMDB0001414,Putrescine,C4H12N2,InChI=1S/C4H12N2/c5-3-1-2-4-6/h1-6H2,NCCCCN,17,17_HMDB0001414_Putrescine
...,...,...,...,...,...,...,...
7605,HMDB0005393,TG(18:0/18:0/18:0),C57H110O6,InChI=1S/C57H110O6/c1-4-7-10-13-16-19-22-25-28...,CCCCCCCCCCCCCCCCCC(=O)OCC(COC(=O)CCCCCCCCCCCCC...,7605,7605_HMDB0005393_TG180180180
10679,11966124,C14-CoA,C35H62N7O17P3S,InChI=1S/C35H62N7O17P3S/c1-4-5-6-7-8-9-10-11-1...,CCCCCCCCCCCCCC(=O)SCCN=C(O)CCN=C(O)C(O)C(C)(C)...,10679,10679_11966124_C14CoA
11001,HMDB0001035,Angiotensin II,C50H71N13O12,InChI=1S/C50H71N13O12/c1-5-28(4)41(47(72)59-36...,CCC(C)C(N=C(O)C(Cc1ccc(O)cc1)N=C(O)C(N=C(O)C(C...,11001,11001_HMDB0001035_Angiotensin_II
11026,HMDB0061196,Angiotensin I,C62H89N17O14,InChI=1S/C62H89N17O14/c1-7-35(6)51(78-56(87)44...,CCC(C)C(N=C(O)C(Cc1ccc(O)cc1)N=C(O)C(N=C(O)C(C...,11026,11026_HMDB0061196_Angiotensin_I


In [101]:
# Inner join: the db_index column of has_MS2_df is matched against the index of
# sirius_output_df, so only compounds present in both tables remain.
MS2_meta_df = has_MS2_df.merge(sirius_output_df, how='inner',
                               left_on='db_index', right_index=True)

In [55]:
def ms_pd_reader(ms_path, db_index):
    """Read one tab-separated file and tag its rows with ``db_index``.

    The file's first line is used as the header. Only the 'exactmass' and
    'explanation' columns are kept, and a 'db_index' column holding the given
    value is appended.

    Returns a DataFrame with columns ['exactmass', 'explanation', 'db_index'].
    """
    table = pd.read_csv(ms_path, sep='\t', header=0)
    return table[['exactmass', 'explanation']].assign(db_index=db_index)

In [125]:
def df_merge(input_df):
    """Merge per-compound MS2 tables with their metadata (pre-METASPACE table).

    For every row of ``input_df`` the file named in its 'file' column is read
    with ``ms_pd_reader`` and tagged with the row's 'db_index'. All resulting
    tables are stacked and then left-joined to ``input_df`` on 'db_index'.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the columns 'file' and 'db_index'.

    Returns
    -------
    pandas.DataFrame
        One row per line read from the files, followed by the metadata
        columns of ``input_df``. An empty ``input_df`` gives an empty frame
        with the same column layout instead of raising.
    """
    # Read every file first and concatenate once: calling pd.concat inside the
    # loop re-copies all rows collected so far on each iteration. Iterating the
    # two columns directly also works when the index has duplicate labels.
    frames = [
        ms_pd_reader(ms_path, db_index)
        for ms_path, db_index in zip(input_df['file'], input_df['db_index'])
    ]

    if frames:
        out_df = pd.concat(frames)
    else:
        # Nothing to read: keep the merge key present so the join below works.
        out_df = pd.DataFrame(columns=['exactmass', 'explanation', 'db_index'])

    return pd.merge(out_df, input_df, how='left', on='db_index')

In [416]:
# Joins MS2 spectra to metadata (reads every file listed in MS2_meta_df via df_merge)
pre_METASPACE_df = df_merge(MS2_meta_df)

In [417]:
# Drop every row that has a missing value in any column.
pre_METASPACE_df = pre_METASPACE_df.dropna() 

In [418]:
def ex(formula):
    """Return the mass for a formula string, passing non-strings through.

    For a ``str`` the value of molmass ``Formula(formula).isotope.mass`` is
    returned. Anything else (for example NaN) is printed and returned
    unchanged.
    """
    if type(formula) is not str:
        print(formula)
        return formula
    return Formula(formula).isotope.mass

In [419]:
# Mass of each 'explanation' formula, as computed by ex().
pre_METASPACE_df['expl_ex'] = pre_METASPACE_df['explanation'].apply(ex)

In [420]:
# dmass = 'exactmass' read from the file minus the mass computed for the explanation formula.
pre_METASPACE_df['dmass'] = pre_METASPACE_df['exactmass'] - pre_METASPACE_df['expl_ex']
# Casting to int truncates the fractional part (e.g. 1.007277 -> 1), turning the
# difference into an integer label that later cells compare against fixed values.
pre_METASPACE_df = pre_METASPACE_df.astype({'dmass': int})

In [421]:
# Keep rows with a non-negative dmass, then display the mass-related columns.
pre_METASPACE_df = pre_METASPACE_df[pre_METASPACE_df.dmass >= 0]
pre_METASPACE_df[['formula', 'explanation', 'exactmass', 'expl_ex', 'dmass']]

Unnamed: 0,formula,explanation,exactmass,expl_ex,dmass
0,C4H4N2,C4H4,53.038577,52.031300,1
1,C4H4N2,C3H3N,54.033826,53.026549,1
2,C4H4N2,C4H4N2,81.044725,80.037448,1
3,C3H3NS,C2H2S,58.994998,57.987721,1
4,C3H3NS,C3H3NS,86.005897,84.998620,1
...,...,...,...,...,...
10044,C62H89N17O14,C35H53N11O8,756.415134,755.407858,1
10045,C62H89N17O14,C40H60N12O9,853.467898,852.460622,1
10046,C62H89N17O14,C49H69N13O10,1000.536312,999.529035,1
10047,C62H89N17O14,C55H76N16O11,1137.595224,1136.587947,1


In [422]:
# Number of rows per integer dmass value.
pre_METASPACE_df['dmass'].value_counts()

1     7690
22    1835
38     522
Name: dmass, dtype: int64

In [423]:
def add_A(A, formula, n):
    """Add ``n`` atoms of element ``A`` to a molecular formula string.

    E.g. ``add_A('H', 'C6H12O6', 1)`` returns ``'C6H13O6'``; a negative ``n``
    removes atoms.

    Parameters
    ----------
    A : str
        Element symbol such as 'H', 'Na' or 'K'.
    formula : str
        Formula such as 'C6H12O6'. Only the first occurrence of ``A`` as a
        complete element symbol is adjusted; the rest is copied unchanged.
    n : int
        Number of atoms to add (negative to remove).

    Returns
    -------
    str or float
        The adjusted formula. If ``A`` is absent and ``n`` is negative,
        'Error!' is printed and ``numpy.nan`` is returned. If more atoms are
        removed than present, the count is written as a negative number
        (e.g. 'CH-1'), which callers can detect by searching for '-'.
    """
    if n == 0:
        return formula

    # Find the first place where A is a whole element symbol. A plain
    # substring test would mistake 'H' for part of 'Hg' or 'N' for part of
    # 'Na', so a match followed by a lowercase letter is skipped.
    start = formula.find(A)
    while start != -1:
        stop = start + len(A)
        if stop == len(formula) or not formula[stop].islower():
            break
        start = formula.find(A, stop)

    if start == -1:
        if n < 0:
            print('Error!')
            return np.nan
        # Element not present yet: append it, writing a count of 1 implicitly.
        return formula + (A if n == 1 else A + str(n))

    # Read the (possibly absent, arbitrarily long) count that follows the symbol.
    count_stop = stop
    while count_stop < len(formula) and formula[count_stop] in digits:
        count_stop += 1
    count = int(formula[stop:count_stop] or 1)

    new_count = count + n
    head = formula[:start]
    # Everything after the count is kept, including later repeats of A.
    tail = formula[count_stop:]
    if new_count == 0:
        return head + tail
    return head + A + str(new_count) + tail

In [425]:
def ion_form(formula, dmass):
    """Generate the ion formula from a formula and its integer mass shift.

    Will need to be updated for new adducts and - mode.

    ``dmass`` 0 returns the formula unchanged; 1 and 2 add one or two H;
    22 and 23 add one Na; 38 and 39 add one K (no hydrogen is removed in the
    Na/K cases). Any other value prints 'dmass not known!' and returns
    ``numpy.nan``.
    """
    if dmass == 0:
        return formula

    # (mass shift, element to add, number of atoms)
    adduct_rules = (
        (1, 'H', 1),
        (2, 'H', 2),
        (22, 'Na', 1),
        (23, 'Na', 1),
        (38, 'K', 1),
        (39, 'K', 1),
    )
    for shift, element, count in adduct_rules:
        if dmass == shift:
            return add_A(element, formula, count)

    print('dmass not known!')
    return np.nan

In [426]:
# Ions which break mass calculations
pre_METASPACE_df = pre_METASPACE_df.dropna()
pre_METASPACE_df['H_check'] = pre_METASPACE_df.explanation.str.contains('H')
df = pre_METASPACE_df
df = df[~((df.dmass == 38) & (df.H_check == False))]
df = df[~((df.dmass == 22) & (df.H_check == False))]
pre_METASPACE_df = df

In [427]:
# Remaining rows with and without an 'H' in the explanation string.
pre_METASPACE_df.H_check.value_counts()

True     9966
False      53
Name: H_check, dtype: int64

In [428]:
# Row by row, build the ion formula from the explanation formula and its dmass.
pre_METASPACE_df['ion_formula'] = pre_METASPACE_df.apply(lambda x:
                                                        ion_form(x.explanation,
                                                                x.dmass),
                                                        axis =1)

In [430]:
def ionmasspos(ionformula):
    """Return ``ex(ionformula)`` reduced by 0.00055.

    NOTE(review): 0.00055 is presumably the electron mass in Da, rounded,
    subtracted for a singly charged positive ion — confirm.
    """
    neutral_mass = ex(ionformula)
    return neutral_mass - 0.00055
    

In [431]:
# Flag ion formulas that contain '-' (add_A writes a negative atom count as e.g. 'H-1')
# and keep only the rows where the flag is False.
pre_METASPACE_df['bad_if'] = pre_METASPACE_df.ion_formula.str.contains('-')
pre_METASPACE_df = pre_METASPACE_df[pre_METASPACE_df.bad_if == False]

In [432]:
# Take an independent deep copy before adding the column, then compute the ion mass
# of every ion formula with ionmasspos().
pre_METASPACE_df = pre_METASPACE_df.copy(deep=True)
pre_METASPACE_df['ion_mass'] = pre_METASPACE_df['ion_formula'].apply(ionmasspos)

In [433]:
# Display the columns needed to compare the file's exactmass with the computed ion_mass.
pre_METASPACE_df[['formula', 'exactmass', 'expl_ex', 'dmass', 'ion_formula', 'ion_mass']]

Unnamed: 0,formula,exactmass,expl_ex,dmass,ion_formula,ion_mass
0,C4H4N2,53.038577,52.031300,1,C4H5,53.038575
1,C4H4N2,54.033826,53.026549,1,C3H4N,54.033824
2,C4H4N2,81.044725,80.037448,1,C4H5N2,81.044723
3,C3H3NS,58.994998,57.987721,1,C2H3S,58.994996
4,C3H3NS,86.005897,84.998620,1,C3H4NS,86.005895
...,...,...,...,...,...,...
10044,C62H89N17O14,756.415134,755.407858,1,C35H54N11O8,756.415133
10045,C62H89N17O14,853.467898,852.460622,1,C40H61N12O9,853.467897
10046,C62H89N17O14,1000.536312,999.529035,1,C49H70N13O10,1000.536311
10047,C62H89N17O14,1137.595224,1136.587947,1,C55H77N16O11,1137.595222


In [434]:
def mass_check(em, im, tol=0.001):
    """Return True when two masses agree within ``tol``.

    Parameters
    ----------
    em : float
        First mass (callers pass the exact mass).
    im : float
        Second mass (callers pass the calculated ion mass).
    tol : float, optional
        Largest accepted absolute difference; defaults to 0.001.

    Returns
    -------
    bool
        ``abs(em - im) <= tol``. NaN inputs give False.
    """
    # Compare the absolute difference: a signed `em - im` would accept any
    # `im` larger than `em`, however far off it is.
    return bool(abs(em - im) <= tol)

In [435]:
# Row by row, store the result of mass_check(exactmass, ion_mass) in 'good_mass_calc'.
# `df` is only a second name for pre_METASPACE_df here, so the column is added in place.
df = pre_METASPACE_df
df['good_mass_calc'] = df.apply(lambda x: mass_check(x.exactmass,
                                                    x.ion_mass),
                               axis=1)
pre_METASPACE_df = df

In [436]:
# How many rows passed / failed the mass check.
pre_METASPACE_df.good_mass_calc.value_counts()

True    10019
Name: good_mass_calc, dtype: int64

In [437]:
# Display the rows that failed the mass check.
pre_METASPACE_df[pre_METASPACE_df.good_mass_calc == False]

Unnamed: 0,exactmass,explanation,db_index,id,name,formula,inchi,can_smiles,temp,file,exists,expl_ex,dmass,H_check,ion_formula,bad_if,ion_mass,good_mass_calc


In [451]:
# Independent copy of the columns used to build the output table.
# Rebinds the generic name `df` once more.
df = pre_METASPACE_df[['explanation', 'formula', 'id', 'name', 
                 'ion_formula', 'inchi']].copy(deep=True)

In [453]:
# 1-based running number of each row within its 'name' group.
df['f_num'] = df.groupby(['name']).cumcount()+1

In [455]:
def f_or_p_ion(d, p):
    """Return 'p' when ``d`` equals ``p`` and 'f' otherwise.

    NOTE(review): presumably 'p' marks the parent ion and 'f' a fragment;
    callers pass the explanation formula as ``d`` and the compound formula
    as ``p``.
    """
    return 'p' if d == p else 'f'

In [458]:
# 'p' where the explanation formula equals the compound formula, 'f' otherwise.
df['f_or_p'] = df.apply(lambda x: f_or_p_ion(x.explanation,
                                            x.formula), axis=1)

In [464]:
# out_id   = <id>_<f_num><f_or_p>, e.g. HMDB0003361_1f
# out_name = <out_id>_<name>,      e.g. HMDB0003361_1f_Pyrimidine
df['out_id'] = df.id + '_' + df.f_num.astype(str) + df.f_or_p
df['out_name'] = df.out_id + '_' + df.name

In [480]:
# Output columns, in export order, plus the original compound id as the last column.
df_prefilter = df[['out_id', 'out_name', 'ion_formula', 'inchi', 'id']]

In [481]:
# Rename to the output schema: the per-ion id, name and formula take the plain column
# names, and the original compound id is kept as 'old_id'.
df_prefilter = df_prefilter.rename(columns={'out_id':'id', 'out_name':'name', 
                                            'ion_formula':'formula', 'id':'old_id'})
# Export the first four columns (id, name, formula, inchi).
# The pickle gets its own '.pickle' path: both writers used to target the same file
# name, so the TSV written second overwrote the pickle. The TSV keeps the original name.
df_prefilter.iloc[:,0:4].to_pickle('whole_body_msms_test_v2.pickle')
df_prefilter.iloc[:,0:4].to_csv('whole_body_msms_test_v2', sep='\t')
df_prefilter

Unnamed: 0,id,name,formula,inchi,old_id
0,HMDB0003361_1f,HMDB0003361_1f_Pyrimidine,C4H5,InChI=1S/C4H4N2/c1-2-5-4-6-3-1/h1-4H,HMDB0003361
1,HMDB0003361_2f,HMDB0003361_2f_Pyrimidine,C3H4N,InChI=1S/C4H4N2/c1-2-5-4-6-3-1/h1-4H,HMDB0003361
2,HMDB0003361_3p,HMDB0003361_3p_Pyrimidine,C4H5N2,InChI=1S/C4H4N2/c1-2-5-4-6-3-1/h1-4H,HMDB0003361
3,HMDB0029713_1f,HMDB0029713_1f_Thiazole,C2H3S,InChI=1S/C3H3NS/c1-2-5-3-4-1/h1-3H,HMDB0029713
4,HMDB0029713_2p,HMDB0029713_2p_Thiazole,C3H4NS,InChI=1S/C3H3NS/c1-2-5-3-4-1/h1-3H,HMDB0029713
...,...,...,...,...,...
10044,HMDB0061196_10f,HMDB0061196_10f_Angiotensin I,C35H54N11O8,InChI=1S/C62H89N17O14/c1-7-35(6)51(78-56(87)44...,HMDB0061196
10045,HMDB0061196_11f,HMDB0061196_11f_Angiotensin I,C40H61N12O9,InChI=1S/C62H89N17O14/c1-7-35(6)51(78-56(87)44...,HMDB0061196
10046,HMDB0061196_12f,HMDB0061196_12f_Angiotensin I,C49H70N13O10,InChI=1S/C62H89N17O14/c1-7-35(6)51(78-56(87)44...,HMDB0061196
10047,HMDB0061196_13f,HMDB0061196_13f_Angiotensin I,C55H77N16O11,InChI=1S/C62H89N17O14/c1-7-35(6)51(78-56(87)44...,HMDB0061196


In [482]:
# Keep only ions whose original compound id is listed in the 'moleculeIds' column of f2.txt.
f2_ids = list(pd.read_csv('f2.txt').moleculeIds)
df_prefilter = df_prefilter[df_prefilter.old_id.isin(f2_ids)]
# Export the first four columns of the filtered table.
# The pickle gets its own '.pickle' path: both writers used to target the same file
# name, so the TSV written second overwrote the pickle. The TSV keeps the original name.
df_prefilter.iloc[:,0:4].to_pickle('whole_body_msms_test_v3.pickle')
df_prefilter.iloc[:,0:4].to_csv('whole_body_msms_test_v3', sep='\t')

# Wednesday:
1. Run Vitally/Lachlan
2. API access ion images and put in folders as pairs