In [1]:
#
# Import libraries
#

import os
import pandas as pd
from dotmap import DotMap
import requests
import json
import numpy as np
from tqdm import tqdm
import time

In [2]:
modes = ['cp', 'cn', 'hp']
f2i_path = r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\PESA_V2\OriginalFiles\RBR_f2i.xlsx"
f2i = DotMap({
    'cp': pd.read_excel(f2i_path, sheet_name='C18P').set_index('fid'),
    'cn': pd.read_excel(f2i_path, sheet_name='C18N').set_index('fid'),
    'hp': pd.read_excel(f2i_path, sheet_name='HILP').set_index('fid'),
})

In [3]:
#
# Obtain feature data for each CMM table and run TurboPutative
#

xm = pd.read_csv(
    r"S:\U_Proteomica\UNIDAD\software\MacrosRafa\data\Metabolomics\PESA_Integromics\Data\Metabolomics\PESA_V2\WorkingFiles\Xm_norm.tsv", 
    sep='\t', index_col=0)

In [4]:
#
# Adducts
#

posAdd = ["M+H", "M+2H", "M+Na", "M+K", "M+H-H2O", "M+H+HCOONa"]
negAdd = ["M-H", "M-2H", "M+Cl", "M-H-H20", "M-H+HCOONa", "M+Na-2H", "M+HCOOH-H"]

C18P

In [5]:
mz = f2i.cp.loc[
    np.isin(f2i.cp.index, xm.columns),
    'Apex m/z'
].tolist()

n=5
mz = [mz[i:i+n] for i in range(0, len(mz), n)]

In [6]:
cmm = DotMap()

uri = "http://ceumass.eps.uspceu.es/mediator/api/v3/batch"

# C18P
res_all = []

for mzi in tqdm(mz):
    res = requests.post(
        uri, headers={'Content-Type': 'application/json; charset=utf-8'}, 
        json={
        "metabolites_type": "all-except-peptides",
        "databases": ["all-except-mine"],
        "masses_mode": "mz",
        "ion_mode": "positive",
        "adducts": posAdd,
        "tolerance": 10.0,
        "tolerance_mode": "ppm",
        "masses": mzi
        }
    )
    res_all.extend(res.json()['results'])
    time.sleep(2)

cmm.cp = pd.DataFrame(res_all)

100%|██████████| 83/83 [03:09<00:00,  2.28s/it]


C18N

In [7]:
mz = f2i.cn.loc[
    np.isin(f2i.cn.index, xm.columns),
    'Apex m/z'
].tolist()

n=5
mz = [mz[i:i+n] for i in range(0, len(mz), n)]

In [8]:
# C18N
res_all = []

for mzi in tqdm(mz):
    res = requests.post(
        uri, headers={'Content-Type': 'application/json; charset=utf-8'}, 
        json={
        "metabolites_type": "all-except-peptides",
        "databases": ["all-except-mine"],
        "masses_mode": "mz",
        "ion_mode": "negative",
        "adducts": ['all'],
        "tolerance": 10.0,
        "tolerance_mode": "ppm",
        "masses": mzi
        }
    )
    res_all.extend(res.json()['results'])
    time.sleep(2)

cmm.cn = pd.DataFrame(res_all)

  0%|          | 0/78 [00:00<?, ?it/s]

100%|██████████| 78/78 [03:33<00:00,  2.74s/it]


HILP

In [9]:
mz = f2i.hp.loc[
    np.isin(f2i.hp.index, xm.columns),
    'Apex m/z'
].tolist()

n=2
mz = [mz[i:i+n] for i in range(0, len(mz), n)]

In [10]:
# HILP
res_all = []

for mzi in tqdm(mz):
    res = requests.post(
        uri, headers={'Content-Type': 'application/json; charset=utf-8'}, 
        json={
        "metabolites_type": "all-except-peptides",
        "databases": ["all-except-mine"],
        "masses_mode": "mz",
        "ion_mode": "positive",
        "adducts": posAdd,
        "tolerance": 5.0,
        "tolerance_mode": "ppm",
        "masses": mzi
        }
    )
    res_all.extend(res.json()['results'])
    time.sleep(2)

cmm.hp = pd.DataFrame(res_all)

 29%|██▉       | 74/253 [09:48<23:43,  7.95s/it]


JSONDecodeError: [Errno Expecting value] <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>502 Proxy Error</title>
</head><body>
<h1>Proxy Error</h1>
<p>The proxy server received an invalid
response from an upstream server.<br />
The proxy server could not handle the request<p>Reason: <strong>Error reading from remote server</strong></p></p>
<hr>
<address>Apache/2.4.10 (Debian) Server at ceumass.eps.uspceu.es Port 80</address>
</body></html>
: 0

In [12]:
cmmF = DotMap()
cmmF.cp = cmm.cp[np.isin(cmm.cp.adduct, posAdd)]
cmmF.cn = cmm.cn[np.isin(cmm.cn.adduct, negAdd)]
# cmmF.hp = cmm.hp[np.isin(cmm.hp.adduct, posAdd)]

In [13]:
for i in ['cp', 'cn']:#, 'hp']:
    print(i)
    print('Number of features: ' + str((~cmm[i].EM.duplicated()).sum()))
    print('Number of annotations: ' + str(cmm[i].shape[0]))
    print('Number of filtered annotations: ' + str(cmmF[i].shape[0]))

cp
Number of features: 357
Number of annotations: 11961
Number of filtered annotations: 11961
cn
Number of features: 387
Number of annotations: 41776
Number of filtered annotations: 19666


In [15]:
# Adapt columns to TP

for i in ['cp', 'cn']:#, 'hp']:
    cmmF[i] = cmmF[i].rename(columns={
        'identifier': 'Identifier',
        'EM': 'Experimental mass',
        'adduct': 'Adduct',
        'error_ppm': 'mz Error (ppm)',
        'molecular_weight': 'Molecular Weight',
        'name': 'Name',
        'formula': 'Formula'
    })

In [16]:
# Identifiers to be removed
remid = [188282]

In [18]:
# Write MS_Table.tsv
for i in ['cp', 'cn']:#, 'hp']:
    cmmF[i][~np.isin(cmmF[i].Identifier, remid)].to_csv(f'CMM_{i}.tsv', sep='\t', index=False)

['seqn_6',
 'seqn_8',
 'seqn_9',
 'seqn_11',
 'seqn_16',
 'seqn_17',
 'seqn_26',
 'seqn_34',
 'seqn_35',
 'seqn_38',
 'seqn_41',
 'seqn_45',
 'seqn_52',
 'seqn_56',
 'seqn_57',
 'seqn_58',
 'seqn_73',
 'seqn_84',
 'seqn_87',
 'seqn_91',
 'seqn_96',
 'seqn_97',
 'seqn_109',
 'seqn_115',
 'seqn_121',
 'seqn_125',
 'seqn_131',
 'seqn_138',
 'seqn_140',
 'seqn_169',
 'seqn_175',
 'seqn_188',
 'seqn_194',
 'seqn_207',
 'seqn_219',
 'seqn_227',
 'seqn_243',
 'seqn_255',
 'seqn_257',
 'seqn_266',
 'seqn_269',
 'seqn_271',
 'seqn_279',
 'seqn_280',
 'seqn_296',
 'seqn_299',
 'seqn_302',
 'seqn_307',
 'seqn_319',
 'seqn_344',
 'seqn_345',
 'seqn_361',
 'seqn_364',
 'seqn_370',
 'seqn_385',
 'seqn_398',
 'seqn_411',
 'seqn_439',
 'seqn_442',
 'seqn_443',
 'seqn_444',
 'seqn_449',
 'seqn_456',
 'seqn_461',
 'seqn_466',
 'seqn_496',
 'seqn_508',
 'seqn_520',
 'seqn_522',
 'seqn_525',
 'seqn_546',
 'seqn_551',
 'seqn_559',
 'seqn_575',
 'seqn_592',
 'seqn_600',
 'seqn_603',
 'seqn_610',
 'seqn_630'

In [19]:
# Write Feature_Info.tsv
xm_tp = xm.copy()
xm_tp.index = [f'seqn_{i}' for i in xm_tp.index]
for i in ['cp', 'cn']:#, 'hp']:
    f2i[i].loc[
        np.isin(f2i[i].index, xm.columns),
        ['Apex m/z', 'RT [min]']
    ].join(xm_tp.T, how='inner')\
            .reset_index(names='FeatureInfo_Name')\
                .to_csv(f'FInfo_{i}.tsv', sep='\t', index=False)
