In [355]:
import re
from rdkit import Chem
import pandas as pd
import numpy as np
from rdkit.Chem.PandasTools import LoadSDF
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from rdkit.Chem.rdmolfiles import MolFromSmiles
from rdkit.Chem.inchi import MolToInchi
import json
import pickle
import zipfile

from CM_curate_utils import *

## Import required data

### Load MDBs and LipidMaps

In [None]:
# Unpack the LMSD archive into the data directory so its contents can be
# read from disk by later cells.
with zipfile.ZipFile('data/LMSD_2022.zip', mode='r') as lmsd_archive:
    lmsd_archive.extractall(path='data/')

In [None]:
# Load the raw database exports: one CSV, one JSON and two SDF files.
hmdb_v4 = pd.read_csv('data/HMDB_v4.csv')
ecmdb_df = pd.read_json('data/ecmdb.json')
# LoadSDF is asked to add a generated SMILES column named 'smiles'.
ymdb_df = LoadSDF('data/ymdb.sdf', smilesName='smiles')
# NOTE(review): this path is presumably created by unpacking LMSD_2022.zip — confirm.
lm_df = LoadSDF('data/LMSD.sdf/structures.sdf', smilesName='smiles')

### Load and parse KEGG biological compounds

In [167]:
# Parse the KEGG compound JSON into a DataFrame and keep a flat CSV copy
# (written without the index) for use outside the notebook.
# NOTE(review): parse_KEGG_json_to_df presumably comes from the CM_curate_utils star import — confirm.
json_file_path = 'data/kegg_bio_cmpds.json'
KEGG_df = parse_KEGG_json_to_df(json_file_path)
KEGG_df.to_csv('data/KEGG_bio_cmpds_flat.csv', index=False)

The KEGG IDs from `KEGG_df` were submitted as a batch query to [CTS](http://cts.fiehnlab.ucdavis.edu/batch) through the web app, with `KEGG` as the source type and both `InChIKey` and `Human Metabolome Database` as the target types. The output was downloaded and saved as `data/KEGG_CTS_convert_res.csv`.

In [19]:
# Result of the manual CTS batch conversion of the KEGG IDs (downloaded from
# the CTS web app and stored alongside the other inputs).
KEGG_CTS_res = pd.read_csv('data/KEGG_CTS_convert_res.csv')

In [48]:
# Select KEGG entries that CTS could not map to an HMDB ID but that do have
# an InChIKey, and export them for the second CTS query.
#
# The HMDB column marks an unmapped ID with the literal string 'No result',
# so a NaN check alone cannot detect a missing InChIKey written the same way.
# Both representations of "missing" are therefore excluded here.
# NOTE(review): assumes the InChIKey column uses the same 'No result' marker
# as the HMDB column — confirm against the downloaded CSV.
cts_missing = 'No result'
lacks_hmdb_id = KEGG_CTS_res['Human Metabolome Database'] == cts_missing
has_inchikey = KEGG_CTS_res['InChIKey'].notna() & (KEGG_CTS_res['InChIKey'] != cts_missing)
KEGG_inchi_no_hmdb = KEGG_CTS_res[lacks_hmdb_id & has_inchikey]
KEGG_inchi_no_hmdb.to_csv('data/KEGG_inchi_no_hmdb.csv', index=False)

KEGG IDs that were not mapped to an HMDB ID but do have a corresponding InChIKey were exported and submitted to [CTS](http://cts.fiehnlab.ucdavis.edu/batch) as a batch query, as before, with `KEGG` as the source type and `InChI Code` as the target type. The output was downloaded and saved as `data/KEGG_inchi_no_hmdb_CTS_convert_res.csv`.

In [49]:
# Result of the second manual CTS batch conversion (KEGG ID -> InChI Code)
# for the KEGG entries that have no HMDB mapping.
KEGG_inchi_no_hmdb_CTS_convert_res = pd.read_csv('data/KEGG_inchi_no_hmdb_CTS_convert_res.csv')

### Load custom data and standards

In [15]:
# Plate map of the MSMLS standards (the file name indicates it includes molecular weights).
msmls = pd.read_csv('data/MSMLS2015PlateMap_withMW - Sheet1.csv')
# Manually curated list of core metabolites.
custom_df = pd.read_csv('data/Core_metabolites_manual - Sheet1.csv')

## Preprocessing 

### Preprocess MDBs and LipidMaps

In [None]:
# Run every database through preprocess_db, naming the column that holds its
# SMILES strings.
# NOTE: hmdb_v4 is replaced by its processed form, so re-running this cell
# feeds already-processed data back into preprocess_db.
hmdb_v4 = preprocess_db(hmdb_v4, smiles_col='smiles')

# HMDB entries whose status is one of the two listed values.
detected_statuses = ['quantified', 'detected']
hmdb_detected = hmdb_v4.loc[hmdb_v4['status'].isin(detected_statuses)]

ecmb_df_processed = preprocess_db(ecmdb_df, smiles_col='moldb_smiles')
ymdb_df_processed = preprocess_db(ymdb_df, smiles_col='SMILES')
lm_df_processed = preprocess_db(lm_df, smiles_col='SMILES')

### Preprocess KEGG 

In [51]:
# KEGG entries for which CTS returned something other than 'No result' in the
# HMDB column.
has_hmdb_mapping = KEGG_CTS_res['Human Metabolome Database'].ne('No result')
kegg_hmdbs = KEGG_CTS_res[has_hmdb_mapping]

# The remaining KEGG entries are processed from their InChI Code instead.
kegg_struct_df = preprocess_db(
    KEGG_inchi_no_hmdb_CTS_convert_res,
    inchi_col='InChI Code',
)

### Preprocess custom and standards

In [None]:
# Run the MSMLS plate map through preprocess_db using its 'SMILES' column.
# NOTE(review): later cells read a 'can_smiles' column from the result, so
# preprocess_db presumably adds it — confirm in CM_curate_utils.
msmls_processed = preprocess_db(msmls, smiles_col='SMILES')

## Overlap analysis

In [94]:
# Step 1: SMILES shared by HMDB and ECMDB.
he_core = intersect_smiles(
    hmdb_v4, ecmb_df_processed,
    smiles_col1='can_smiles', smiles_col2='can_smiles',
)

# Step 2: intersect that shared set with YMDB. The intermediate result is
# wrapped in a one-column frame because intersect_smiles is called with frames.
he_core_frame = pd.DataFrame(he_core, columns=['Smiles'])
hey_core = intersect_smiles(
    he_core_frame, ymdb_df_processed,
    smiles_col1='Smiles', smiles_col2='can_smiles',
)

# HMDB rows whose canonical SMILES is in the three-way intersection.
in_hey_core = hmdb_v4['can_smiles'].isin(hey_core)
hmdb_hey_df = hmdb_v4[in_hey_core].copy()

In [171]:
# HMDB accessions whose canonical SMILES matches one of the KEGG structures.
matches_kegg_structure = hmdb_v4['can_smiles'].isin(kegg_struct_df['can_smiles'])
KEGG_hmdb_struct_ids = list(hmdb_v4.loc[matches_kegg_structure, 'accession'])

# One CTS cell can hold several HMDB IDs separated by newlines; flatten them
# into a single list of IDs.
KEGG_hmdbs_list = [
    hmdb_id
    for cts_cell in kegg_hmdbs['Human Metabolome Database']
    for hmdb_id in cts_cell.split('\n')
]

# HMDB rows reachable from KEGG either by ID mapping or by structure match.
kegg_accessions = KEGG_hmdbs_list + KEGG_hmdb_struct_ids
kegg_hmdb_df = hmdb_v4[hmdb_v4['accession'].isin(kegg_accessions)].copy()

# KEGG structures that are still not covered by any of those HMDB rows.
already_covered = kegg_struct_df['can_smiles'].isin(list(kegg_hmdb_df['can_smiles']))
kegg_not_hmdb = kegg_struct_df[~already_covered]


In [160]:
# SMILES shared by HMDB and LipidMaps, and the HMDB rows that carry them.
hmdb_lm = intersect_smiles(
    hmdb_v4, lm_df_processed,
    smiles_col1='can_smiles', smiles_col2='can_smiles',
)
in_lipidmaps = hmdb_v4['can_smiles'].isin(hmdb_lm)
hmdb_lm_df = hmdb_v4[in_lipidmaps].copy()

In [128]:
# SMILES shared by HMDB and the MSMLS standards.
msmls_hmdb = intersect_smiles(
    hmdb_v4, msmls_processed,
    smiles_col1='can_smiles', smiles_col2='can_smiles',
)

# HMDB rows matching a standard ...
in_msmls = hmdb_v4['can_smiles'].isin(msmls_hmdb)
hmdb_ms_df = hmdb_v4[in_msmls].copy()

# ... and standards whose canonical SMILES does not occur in HMDB at all.
standard_in_hmdb = msmls_processed['can_smiles'].isin(hmdb_v4['can_smiles'])
msmls_not_hm = msmls_processed[~standard_in_hmdb]

In [200]:
# NOTE(review): the column is spelled 'HDMB' here; presumably that is the
# header used in the manual sheet — confirm before renaming anything.
# HMDB rows referenced by accession in the manually curated list.
listed_in_manual = hmdb_v4['accession'].isin(custom_df['HDMB'])
hmdb_man_df = hmdb_v4[listed_in_manual].copy()

# Manual entries with no HMDB accession but with a SMILES string.
lacks_accession = custom_df['HDMB'].isna()
has_smiles = custom_df['SMILES'].notna()
not_hmdb = custom_df[lacks_accession & has_smiles]

## Adding formulas not found in HMDB

In [191]:
# Attach the KEGG compound records to the structures missing from HMDB; the
# join key from the right-hand frame ('cpd') is dropped afterwards.
to_add_KEGG = (
    kegg_not_hmdb
    .merge(KEGG_df, left_on='KEGG', right_on='cpd', how='left')
    .drop(columns='cpd')
)

# Molecular formula computed from the canonical SMILES with RDKit.
to_add_KEGG = to_add_KEGG.assign(
    formula=to_add_KEGG['can_smiles'].apply(
        lambda smi: CalcMolFormula(Chem.MolFromSmiles(smi))
    )
)

# Bring the frame to the shared id / name / formula / inchi layout.
kegg_column_names = {'cpd_name': 'name', 'KEGG': 'id', 'InChI Code': 'inchi'}
to_add_KEGG = (
    to_add_KEGG
    .rename(columns=kegg_column_names)
    [['id', 'name', 'formula', 'inchi']]
    .drop_duplicates()
)

In [None]:
# Standards absent from HMDB, renumbered from zero.
to_add_msmls = msmls_not_hm[['can_smiles', 'CNAME', 'FORMULA']].reset_index(drop=True)

# Synthetic identifiers 'msmls1', 'msmls2', ... based on the new row position.
to_add_msmls['id'] = ['msmls' + str(position + 1) for position in range(len(to_add_msmls))]

# InChI generated from the canonical SMILES with RDKit.
to_add_msmls['inchi'] = to_add_msmls['can_smiles'].apply(
    lambda smi: MolToInchi(MolFromSmiles(smi))
)

# Bring the frame to the shared id / name / formula / inchi layout.
msmls_column_names = {'CNAME': 'name', 'FORMULA': 'formula'}
to_add_msmls = (
    to_add_msmls
    .rename(columns=msmls_column_names)
    [['id', 'name', 'formula', 'inchi']]
    .drop_duplicates()
)

In [None]:
# Manual entries without an HMDB accession, renumbered from zero.
to_add_custom = (
    not_hmdb[['SMILES', 'Name', 'Formula', 'PubChem CID:']]
    .reset_index(drop=True)
)

# InChI generated from the listed SMILES with RDKit.
to_add_custom['inchi'] = to_add_custom['SMILES'].apply(
    lambda smi: MolToInchi(MolFromSmiles(smi))
)

# The 'PubChem CID:' column is used as the identifier for these entries.
custom_column_names = {'Name': 'name', 'PubChem CID:': 'id', 'Formula': 'formula'}
to_add_custom = (
    to_add_custom
    .rename(columns=custom_column_names)
    [['id', 'name', 'formula', 'inchi']]
    .drop_duplicates()
)

## Compiling CoreMetabolome-V1

In [None]:
# Stack every HMDB-derived subset and drop rows that appear more than once.
hmdb_subsets = [
    hmdb_hey_df,
    kegg_hmdb_df,
    hmdb_lm_df,
    hmdb_ms_df,
    hmdb_man_df,
    hmdb_detected,
]
core_df_v1 = pd.concat(hmdb_subsets).drop_duplicates()

# InChI generated from the HMDB SMILES with RDKit.
core_df_v1['inchi'] = core_df_v1['smiles'].apply(
    lambda smi: MolToInchi(MolFromSmiles(smi))
)

# Keep the four shared columns and align their names with the other tables.
cols = ['accession', 'name', 'chemical_formula', 'inchi']
core_df_v1 = core_df_v1[cols].rename(
    columns={'accession': 'id', 'chemical_formula': 'formula'}
)


In [266]:
# Combine the compounds that are not in HMDB (KEGG, MSMLS, manual list) with
# the HMDB-derived core and drop exact duplicate rows.
to_add_new = pd.concat([to_add_KEGG, to_add_msmls, to_add_custom])
core_metabolome_v1 = pd.concat([core_df_v1, to_add_new]).drop_duplicates()

# The concatenated index is a mix of the source frames' row labels and has no
# meaning of its own, so it is not written out. This also keeps the V1 file
# in the same layout as the V2 export, which already uses index=False.
core_metabolome_v1.to_csv('coremetabolome_curated/core_metabolome_v1.csv', index=False)

## Optimizing V1 to V2

In [344]:
# V2 starts as an independent copy of V1 so that V1 stays untouched.
core_metabolome_v2 = core_metabolome_v1.copy(deep=True)

# Pass every formula through neutral().
# NOTE(review): neutral() presumably strips the charge from a formula — confirm in CM_curate_utils.
core_metabolome_v2['formula'] = core_metabolome_v2['formula'].apply(neutral)

# Formulas that still contain a '+' after that step. The match is a literal
# one (regex=False), which avoids the invalid '\+' escape in a non-raw string,
# and na=False keeps the mask strictly boolean when a formula is missing.
still_has_plus = core_metabolome_v2['formula'].str.contains('+', regex=False, na=False)
metals = list(core_metabolome_v2.loc[still_has_plus, 'name'])

# Remove every row that shares a name with one of those entries.
core_metabolome_v2 = core_metabolome_v2[~core_metabolome_v2['name'].isin(metals)]
core_metabolome_v2.to_csv('coremetabolome_curated/core_metabolome_v2.csv', index=False)