In [4]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import os
from toolsets.file_io import save_df, read_df
# Directory that holds the MassWiki CSV exports read and written by this
# notebook. Set the MASSWIKI_DATA_PATH environment variable to point at a
# different location; the original machine-specific path is kept as the
# fallback so existing runs behave exactly as before.
data_path = os.environ.get(
    'MASSWIKI_DATA_PATH',
    '/Users/fanzhoukong/Documents/GitHub/Libgen_data/masswiki',
)
def format_method_string(input_string):
    """Escape a method description so it can be placed in a URL path.

    Surrounding whitespace is stripped first. After that every space is
    rewritten as ``%20`` and every ``|`` as ``%7C``; all other characters
    are passed through untouched.
    """
    escapes = str.maketrans({" ": "%20", "|": "%7C"})
    return input_string.strip().translate(escapes)
import requests
import toolsets.chem_utils as cu
def get_masswiki_data(method, splash, version='LCB2023', api_key=None, timeout=30):
    """Request the compound entry for one spectrum from the metabolomics.us CIS API.

    Parameters
    ----------
    method : str
        Method path segment, already URL-escaped (see ``format_method_string``).
    splash : str
        SPLASH identifier; inserted into the URL path as-is.
    version : str
        Library version path segment.
    api_key : str, optional
        Value sent in the ``x-api-key`` header. When omitted, the
        ``MASSWIKI_API_KEY`` environment variable is used.
    timeout : float
        Seconds handed to ``requests.get`` so a stalled connection cannot
        block the caller indefinitely.

    Returns
    -------
    The decoded JSON body when the server answers with status 200.
    Otherwise an error dict is printed and ``np.nan`` is returned.

    Raises
    ------
    RuntimeError
        If no API key was passed and ``MASSWIKI_API_KEY`` is not set.
    """
    # The key must not live in the notebook source.
    # NOTE(review): the key that used to be embedded here has been exposed
    # in plain text and should be rotated.
    if api_key is None:
        api_key = os.environ.get("MASSWIKI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No API key available: pass api_key=... or set the "
            "MASSWIKI_API_KEY environment variable."
        )

    url = "https://api.metabolomics.us/cis/compound/{}/{}/{}".format(method, splash, version)
    response = requests.get(url, headers={"x-api-key": api_key}, timeout=timeout)

    if response.status_code == 200:
        return response.json()

    print({"error": "Request failed with status code {}".format(response.status_code)})
    # np.nan rather than the np.NAN alias, which NumPy 2.0 removed.
    return np.nan

import re
def extract_smiles(input_string):
    """Pull the SMILES string out of a ``SMILES: "..."`` fragment.

    Parameters
    ----------
    input_string : str
        Text to search. The first occurrence of ``SMILES: "<text>"`` is used.

    Returns
    -------
    str or float
        The text between the quotes, or ``np.nan`` when the pattern is
        absent or when *input_string* is not a string (for example ``None``
        or NaN). Note that ``np.nan`` is truthy, so callers should test the
        result with ``isinstance(..., str)`` or ``pd.notna`` rather than
        plain truthiness.
    """
    # re.search raises TypeError on non-string input; treat that as "no SMILES".
    if not isinstance(input_string, str):
        return np.nan
    match = re.search(r'SMILES: "(.*?)"', input_string)
    return match.group(1) if match else np.nan
from tqdm import tqdm
def complete_metadata(hilic, method_long = '5m hilic premier | orbitrap | beh amide | negative', version = 'LCB2023'):
    """Attach structure and spectrum fields to every user-annotated row.

    Rows of *hilic* with a non-null ``user_annotation-name`` are kept,
    reduced to a fixed set of identifying columns, and looked up one by one
    through ``get_masswiki_data`` using the row's ``binbase-splash``.

    Parameters
    ----------
    hilic : pandas.DataFrame
        Input table; must contain the columns listed in ``keep_columns``.
        It is not modified.
    method_long : str
        Human-readable method string. It is escaped with
        ``format_method_string`` for the request and stored verbatim in the
        ``method`` column of the result.
    version : str
        Library version forwarded to ``get_masswiki_data``.

    Returns
    -------
    pandas.DataFrame
        A new frame. For each row whose lookup returned a non-empty list,
        whose first entry has at least one associated name, and whose first
        name's comment contains a SMILES string, the columns ``smiles``,
        ``msms``, ``hidden``, ``ms1_inte`` and ``adduct`` are filled.
        Other rows are left without values in those columns (the columns
        are only created once some row succeeds). ``method`` is set on
        every row.
    """
    keep_columns = [
        'wiki_id', 'scan', 'rt', 'precursor_mz', 'charge', 'binbase-splash',
        'user_annotation-name', 'key', 'library_search-identity_score',
    ]
    # .copy() makes the result independent of the caller's frame, so the
    # column writes below are not performed on a slice of `hilic`.
    annotated = hilic.loc[hilic['user_annotation-name'].notnull(), keep_columns].copy()
    method = format_method_string(method_long)

    for index, splash in tqdm(annotated['binbase-splash'].items(), total = len(annotated)):
        result = get_masswiki_data(method, splash, version)
        # A failed request yields NaN, which is truthy; only a non-empty
        # list can be indexed safely.
        if not isinstance(result, (list, tuple)) or len(result) == 0:
            continue

        entry = result[0]
        names = entry['associated_names']
        if len(names) == 0:
            continue

        smiles = extract_smiles(names[0]['comment'])
        # extract_smiles signals "not found" with NaN (also truthy), so
        # check for a real, non-empty string instead of truthiness.
        if not isinstance(smiles, str) or not smiles:
            continue

        annotated.loc[index, 'smiles'] = smiles
        annotated.loc[index, 'msms'] = entry['msms_raw']
        annotated.loc[index, 'hidden'] = entry['hidden']
        annotated.loc[index, 'ms1_inte'] = entry['pre_cursors_intensity']
        annotated.loc[index, 'adduct'] = entry['preferred_adduct']

    annotated['method'] = method_long
    return annotated

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Read the two C18 Orbitrap CSV exports (positive file first, then negative).
c18_pos, c18_neg = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('c18_orbi_pos.csv', 'c18_orbi_neg.csv')
)

In [6]:
from toolsets.search import string_search

In [7]:
# Look up metadata for the annotated positive-mode C18 rows.
# complete_metadata issues one API request per annotated row.
c18_pos_method = '5m splash one premier | orbitrap | beh c18 | positive'
c18_pos_meta = complete_metadata(c18_pos, method_long=c18_pos_method)

  0%|          | 0/1780 [00:00<?, ?it/s]

100%|██████████| 1780/1780 [05:02<00:00,  5.88it/s] 


In [8]:
# Look up metadata for the annotated negative-mode C18 rows.
# complete_metadata issues one API request per annotated row.
c18_neg_method = '5m splash one premier | orbitrap | beh c18 | negative'
c18_neg_meta = complete_metadata(c18_neg, method_long=c18_neg_method)

100%|██████████| 1509/1509 [04:18<00:00,  5.84it/s]


In [9]:
# Stack the positive and negative C18 results (positive first) and renumber
# the rows from 0.
c18_frames = [c18_pos_meta, c18_neg_meta]
c18_all = pd.concat(c18_frames, ignore_index=True)

In [10]:
# Show the combined C18 table as the cell output (pandas elides the middle rows).
c18_all

Unnamed: 0,wiki_id,scan,rt,precursor_mz,charge,binbase-splash,user_annotation-name,key,library_search-identity_score,smiles,msms,hidden,ms1_inte,adduct,method
0,aYSUJ8A/WNJMTV4W,1,251.400000,947.879800,1,splash10-0002-6300009000-31dd6f8fee8a76f152a0,1_TG 18:1-21:2-18:1-d5,1,0.940749,[2H]C([2H])(OC(=O)CCCCCCCC=CCCCCCCCC)C([2H])(O...,67.054350:251807.000000 69.069950:199305.00000...,False,0.0,[M+NH4]+,5m splash one premier | orbitrap | beh c18 | p...
1,aYSUJ8A/6QML39WN,2,138.600000,766.678290,1,splash10-0006-0900000000-c07a35ed3ee1d6c25cf9,1_SM 20:1 (d18:1/20:1)-d9,2,0.757248,[2H]C([2H])([2H])[N+](CCOP(=O)([O-])OC[CH](N=C...,64.104000:12043.000000 66.116620:4521.000000 6...,False,0.0,[M+H]+,5m splash one premier | orbitrap | beh c18 | p...
2,aYSUJ8A/GQSUQ82R,3,12.600000,319.267070,1,splash10-00kr-9105000000-a38bc2632afcb3c80728,"acylcarnitine, 10:0- D3",3,0.785778,[2H]C([2H])([2H])[N+](C)(C)CC(CC(=O)O)OC(=O)CC...,63.099770:65655.000000 69.070080:2148.000000 7...,False,0.0,[M+H]+,5m splash one premier | orbitrap | beh c18 | p...
3,aYSUJ8A/389CCTTR,4,170.400000,655.603180,1,splash10-00lr-6519000000-88a04fa0493c3930f681,1_17:0-20:3 DG-d5,4,0.949732,[2H]C([2H])(O)[C]([2H])(OC(=O)CCCCCCCCCC=CCC=C...,61.058770:19477.000000 67.054320:76147.000000 ...,False,0.0,[M+NH4]+,5m splash one premier | orbitrap | beh c18 | p...
4,aYSUJ8A/2W9WOA60,5,16.200000,341.279870,1,splash10-014i-0690000000-4e51f8e4594cd5f41b77,1_CUDA,5,0.942374,C1CCC(CC1)NC(=O)NCCCCCCCCCCCC(=O)O,68.584220:50929.000000 83.085400:309526.000000...,False,0.0,[M+H]+,5m splash one premier | orbitrap | beh c18 | p...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3284,aA9N54F/ZI550AQ3,16618,125.997952,661.528267,-1,splash10-03dl-3500009000-43e0f2fc3f4d5e5bd1ee,PE-Cer d34:0,16618,0.814259,CCCCCCCCCCCCCCCC(=O)NC(COP(=O)(O)OCCN)C(O)CCCC...,73.504677:22172.000000 78.953117:6580.000000 7...,False,1993218.0,[M-H]-,5m splash one premier | orbitrap | beh c18 | n...
3285,aA9N54F/16FFB09Y,16653,86.454934,736.491984,-1,splash10-002r-0180000900-27be6f05db5c907d749e,PE 18:2_18:3,16653,0.691011,CCC=CCC=CCC=CCCCCCCCC(=O)OC(COC(=O)CCCCCCCC=CC...,78.959068:13464.000000 81.833855:14360.000000 ...,False,781115.0,[M-H]-,5m splash one premier | orbitrap | beh c18 | n...
3286,aA9N54F/1R75NGBQ,16730,115.849657,999.624759,-1,splash10-004i-0092000000-e72c264c2a01a0da4397,DGDG 18:2_18:2,16730,0.672181,,,,,,5m splash one premier | orbitrap | beh c18 | n...
3287,aA9N54F/G3B7PWVO,16774,232.457968,1406.013767,-1,splash10-0a4i-0262900000-1be80916a2cf594d3284,CL 16:0_18:0_16:0_18:1,16774,0.675722,,,,,,5m splash one premier | orbitrap | beh c18 | n...


In [12]:
import toolsets.spectra_operations as so

In [13]:
# Run every 'msms' entry through so.convert_lcb_to_arr and overwrite the
# column with the converted values.
# NOTE: the column is replaced in place, so re-running this cell would pass
# already-converted values back into the converter.
msms = [so.convert_lcb_to_arr(raw_spectrum) for raw_spectrum in c18_all['msms']]
c18_all['msms'] = msms

In [14]:
# Write the combined C18 table next to the input files.
c18_out_path = os.path.join(data_path, 'c18_all_with_scores.csv')
save_df(c18_all, c18_out_path)

['msms']


100%|██████████| 3289/3289 [00:00<00:00, 24917.62it/s]


In [136]:
# Read the two HILIC Orbitrap CSV exports (negative file first, then positive).
hilic_neg, hilic_pos = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('hilic_orbi_neg.csv', 'hilic_orbi_pos.csv')
)

Index(['wiki_id', 'scan', 'rt', 'precursor_mz', 'charge', 'binbase-splash',
       'binbase-name', 'binbase-adduct', 'entropy', 'entropy_quality',
       'peak_number', 'normalized_rt', 'user_annotation-name',
       'user_annotation-adduct', 'annotation_search-name',
       'annotation_search-adduct', 'annotation_search-identity_score',
       'annotation_search-fuzzy_score', 'library_search-name',
       'library_search-adduct', 'library_search-identity_score',
       'library_search-fuzzy_score', 'name', 'adduct', 'identity_score',
       'annotation_method', 'fuzzy_score', 'key'],
      dtype='object')

In [169]:
# Look up metadata for the annotated negative-mode HILIC rows. The method
# string is the same value complete_metadata uses as its default; it is
# spelled out here to mirror the other calls.
hilic_neg_method = '5m hilic premier | orbitrap | beh amide | negative'
hilic_neg_meta = complete_metadata(hilic_neg, method_long=hilic_neg_method)

  0%|          | 0/1788 [00:00<?, ?it/s]

100%|██████████| 1788/1788 [04:26<00:00,  6.70it/s]


In [173]:
# Run every 'msms' entry through so.convert_lcb_to_arr and overwrite the
# column with the converted values.
# NOTE: the column is replaced in place, so re-running this cell would pass
# already-converted values back into the converter.
msms = [so.convert_lcb_to_arr(raw_spectrum) for raw_spectrum in hilic_neg_meta['msms']]
hilic_neg_meta['msms'] = msms
        

In [177]:
# Write the negative-mode HILIC table next to the input files.
hilic_neg_out_path = os.path.join(data_path, 'hilic_orbi_neg_meta.csv')
save_df(hilic_neg_meta, hilic_neg_out_path)

['msms']


100%|██████████| 1788/1788 [00:00<00:00, 23769.34it/s]


In [179]:
# Look up metadata for the annotated positive-mode HILIC rows.
# complete_metadata issues one API request per annotated row.
hilic_pos_method = '5m hilic premier | orbitrap | beh amide | positive'
hilic_pos_meta = complete_metadata(hilic_pos, method_long=hilic_pos_method)

100%|██████████| 3250/3250 [08:24<00:00,  6.44it/s]


In [180]:
# Run every 'msms' entry of the positive-mode HILIC table through
# so.convert_lcb_to_arr. The resulting list stays in `msms`; it is written
# back to the frame in the following cell, not here.
msms = [so.convert_lcb_to_arr(raw_spectrum) for raw_spectrum in hilic_pos_meta['msms']]

In [182]:
# Overwrite the 'msms' column with the list held in `msms`.
# Depends on `msms` having been built from hilic_pos_meta immediately before;
# running this after a different cell has reassigned `msms` would store the
# wrong values.
hilic_pos_meta['msms']=msms

In [183]:
# Write the positive-mode HILIC table next to the input files.
# NOTE(review): this file name lacks the 'orbi' part used by
# 'hilic_orbi_neg_meta.csv' — confirm whether that is intentional.
hilic_pos_out_path = os.path.join(data_path, 'hilic_pos_meta.csv')
save_df(hilic_pos_meta, hilic_pos_out_path)

['msms']


100%|██████████| 3250/3250 [00:00<00:00, 18489.99it/s]


In [184]:
# Stack the negative and positive HILIC results (negative first) and
# renumber the rows from 0.
hilic_frames = [hilic_neg_meta, hilic_pos_meta]
hilic_all = pd.concat(hilic_frames, ignore_index=True)

In [185]:
# Write the combined HILIC table next to the input files.
hilic_all_out_path = os.path.join(data_path, 'hilic_all_meta.csv')
save_df(hilic_all, hilic_all_out_path)

['msms']


100%|██████████| 5038/5038 [00:00<00:00, 17017.46it/s]
