Import necessary packages

In [1]:
import pandas as pd
import numpy as np

Import the BiGG metabolite table used to convert metabolite names to BiGG IDs

In [2]:
# Load the BiGG metabolite table (tab-separated) and key its rows by the
# universal BiGG ID.
bigg_df = pd.read_csv(
    "../Data/bigg_models_metabolites.txt", sep="\t"
).set_index("universal_bigg_id")

# Series of metabolite names with repeated names removed (first occurrence is
# kept); its index carries the universal BiGG IDs.
bigg = bigg_df["name"].drop_duplicates()

Import metabolite data

In [None]:
# Read the original metabolite table and use its CCLE_ID column as the row
# index (the column itself is dropped from the data).
metabolite_df = (
    pd.read_csv("../Data/original_metabolite_data.csv")
    .set_index("CCLE_ID", drop=True)
)

Find metabolite column names that are identical to a BiGG metabolite name

In [4]:
# Names that appear verbatim both among the metabolite column headers and in
# the BiGG name list.
perfect_fits = set(metabolite_df.columns) & set(bigg)
len(perfect_fits)

0

Remove the "D-" and "L-" markers from the BiGG names (every occurrence is removed, not only a leading one)

In [None]:
# Strip the "D-" and then the "L-" marker from every BiGG name in one pass.
# str.replace removes every occurrence inside the name, not only a leading
# one, and str() turns a missing name into the literal text "nan".
bigg = bigg.map(lambda name: str(name).replace("D-", "").replace("L-", ""))

Convert names to lower case

In [6]:
# Lower-case both sides so that the name comparison ignores capitalisation.
lowercase_columns = metabolite_df.columns.str.lower()
metabolite_df.columns = lowercase_columns
bigg = bigg.map(lambda name: str(name).lower())

Find Identical Matches between the two again

In [8]:
# Recount the exact name matches after the marker removal and lower-casing.
perfect_fits = set(metabolite_df.columns).intersection(bigg)
len(perfect_fits)

57

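Replace BiGG names that start with an unmatched column name followed by " c" (e.g. a cytosol tag or a chemical formula) with the plain column name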

In [9]:
# For every metabolite column that has no exact BiGG match yet, look for BiGG
# names that begin with "<column name> c" and overwrite them with the bare
# column name so that they match on the next comparison.
unmatched_columns = [
    column_name
    for column_name in metabolite_df.columns
    if column_name not in perfect_fits
]
for column_name in unmatched_columns:
    name_prefix = column_name + " c"
    starts_with_prefix = bigg.map(
        lambda bigg_name: str(bigg_name).startswith(name_prefix)
    )
    # Boolean-mask assignment: every flagged BiGG name becomes the column name.
    bigg[starts_with_prefix] = column_name

Find Identical Matches between the two again

In [10]:
# Recount the exact name matches after collapsing the suffixed BiGG names.
bigg_name_set = set(bigg)
perfect_fits = bigg_name_set & set(metabolite_df.columns)
len(perfect_fits)

72

Manual correction of abbreviations, hyphens and inversions
# ONLY FOR IDs that can be confidently mapped

In [11]:

# Hand-curated renames: key = metabolite column name as it appears in the
# (lower-cased) data set, value = the normalised BiGG name it should match.
dict_correct = {
    '3-phosphoglycerate': '3-phospho-glycerate',
    'alpha-ketoglutarate': '2-oxoglutarate',
    'erythrose-4-phosphate': 'erythrose 4-phosphate',
    'glutathione oxidized': 'oxidized glutathione',
    'glutathione reduced': 'reduced glutathione',
    # NOTE(review): hippurate and o-methylhippurate look like different
    # compounds - confirm this mapping is intended.
    'hippurate': 'o-methylhippurate',
    'kynurenine': 'l kynurenine c10h12n2o3',
    'nad': 'nicotinamide adenine dinucleotide',
    'nadp': 'nicotinamide adenine dinucleotide phosphate',
    'pantothenate': '(r)-pantothenate',
    'pep': 'phosphoenolpyruvate',
    '6-phosphogluconate': '6-phospho-gluconate',
    # NOTE(review): alpha-hydroxybutyrate is normally 2-hydroxybutyrate, not
    # 4-hydroxybutyrate - confirm this mapping is intended.
    'alpha-hydroxybutyrate': '4-hydroxybutyrate',
    'inositol': 'myo-inositol',
    # NOTE(review): malondialdehyde and malonate semialdehyde look like
    # different compounds - confirm this mapping is intended.
    'malondialdehyde': 'malonate semialdehyde',
    'cis/trans-hydroxyproline': 'cis 4 hydroxy d proline',
    # NOTE(review): the target is the gamma-glutamyl conjugate rather than
    # GABA itself - confirm this mapping is intended.
    'gaba': 'gamma-glutamyl-gamma aminobutyric acid',
    'acetylglycine': 'acetyl-glycine',
    'dimethylglycine': 'n,n-dimethylglycine',
    'anthranilic acid': 'anthranilate',
    'kynurenic acid': '4-hydroxy-2-quinolinecarboxylic acid',
    'thiamine': 'thiamin',
    'niacinamide': 'nicotinamide',
    'betaine': 'glycine betaine',
    'trimethylamine-n-oxide': 'trimethylamine n-oxide',
    '2-deoxyadenosine': 'deoxyadenosine',
    '2-deoxycytidine': 'deoxycytidine',
    'pipecolic acid': 'pipecolic acid; piperidine-2-carboxylic acid',
    '1-methylnicotinamide': '1 methylnicotinamide c7h9n2o',
    'butyrobetaine': 'butyro-betaine',
    'acetylcarnitine': 'o acetylcarnitine c9h17no4',
    'propionylcarnitine': 'propionyl-carnitine',
    'malonylcarnitine': 'malonyl carnitine',
    'hexanoylcarnitine': 'hexanoyl carnitine',
    'heptanoylcarnitine': 'heptanoyl-l-carnitine',
    'lauroylcarnitine': 'lauroyl carnitine',
}

# Apply the renames to the column headers; columns without an entry keep
# their current name.
metabolite_df = metabolite_df.rename(columns=dict_correct)

Find Identical Matches between the two again

In [13]:
# Recount the exact name matches after the manual column renames.
column_name_set = set(metabolite_df.columns)
perfect_fits = column_name_set.intersection(set(bigg))
len(perfect_fits)

108

Annotate the dataframe with newfound BiGG IDs

In [14]:
# Map each matched metabolite column name to its universal BiGG ID.
# `bigg` is indexed by BiGG ID and holds the normalised name as its value;
# when several IDs carry the same name, the last one iterated wins.
bigg_ids = {}
for bigg_id, name in bigg.items():  # Series.iteritems() was removed in pandas 2.0
    if name in metabolite_df.columns:
        bigg_ids[name] = bigg_id

# Add the IDs as one extra row labelled "BIGG_ID". Columns without an ID get
# NaN in that row. DataFrame.append() was removed in pandas 2.0, so the row is
# attached with pd.concat instead.
bigg_id_row = pd.DataFrame([bigg_ids], index=pd.Index(["BIGG_ID"]))
metabolite_df_with_ids = pd.concat([metabolite_df, bigg_id_row])

# Drop every column that contains a NaN. This removes the columns that have no
# BiGG ID, and also any matched column that has a missing measurement.
metabolite_df_with_ids = metabolite_df_with_ids.dropna(axis=1)

# Promote the BIGG_ID row to the column header, then remove that helper row
# (it is the last row) so that only the measurements remain.
metabolite_df_with_ids.columns = metabolite_df_with_ids.loc["BIGG_ID"]
metabolite_df_with_ids = metabolite_df_with_ids.iloc[:-1]

  for (index, name) in bigg.iteritems():
  metabolite_df_with_ids = metabolite_df.append(bigg_ids, ignore_index = True)


Write to processed metabolomic file

In [16]:
# Write the BiGG-annotated table to disk. index_label=False leaves the header
# cell of the index column out of the file.
processed_csv_path = "../Data/Generated Data/processed_metabolite_data.csv"
metabolite_df_with_ids.to_csv(processed_csv_path, index_label=False)

Include this next section for adding required IDs to data
(in this case we add IDs from Human Metabolome Database)

In [31]:
import re

In [53]:
# Text that precedes the HMDB identifier inside a BiGG "database_links" entry.
HMDB_LINK_PREFIX = "Human Metabolome Database: http://identifiers.org/hmdb/"

# Map each matched metabolite name to the HMDB ID found in the database links
# of its BiGG entry. Metabolites without a usable link are left out.
required_ids = {}
for met, bigg_id in bigg_ids.items():
    links = bigg_df.loc[bigg_id, "database_links"]
    if isinstance(links, pd.Series):
        # Several rows share this BiGG ID: use the first row. .iloc makes the
        # positional access explicit; plain [0] on a label-indexed Series
        # relies on a deprecated positional fallback.
        links = links.iloc[0]
    if not isinstance(links, str):
        # Missing links (NaN) carry no ID.
        continue
    # Literal search for the prefix; the ID is the text after it, up to the
    # next ";" separator (or the end of the string).
    _, prefix_found, after_prefix = links.partition(HMDB_LINK_PREFIX)
    if prefix_found:
        required_ids[met] = after_prefix.split(";", 1)[0]

In [55]:
# The columns of metabolite_df_with_ids are BiGG IDs, whereas required_ids is
# keyed by metabolite name. Translate the keys through bigg_ids first so that
# each HMDB ID ends up under the column of its own metabolite, rather than in
# newly created name columns.
hmdb_by_bigg_id = {
    bigg_ids[met]: hmdb_id for met, hmdb_id in required_ids.items()
}

# Build a single "HMDB_ID" row aligned with the existing columns; columns
# without an HMDB ID get NaN.
hmdb_row = pd.DataFrame(
    [[hmdb_by_bigg_id.get(col, np.nan) for col in metabolite_df_with_ids.columns]],
    index=pd.Index(["HMDB_ID"]),
    columns=metabolite_df_with_ids.columns,
)

# DataFrame.append() was removed in pandas 2.0; pd.concat keeps the existing
# row labels and adds the new row at the bottom.
metabolite_df_with_required_ids = pd.concat([metabolite_df_with_ids, hmdb_row])

In [56]:
# Save the table including the HMDB_ID row; the file is written relative to
# the current working directory.
required_ids_csv_path = "metabolite_df_with_required_ids.csv"
metabolite_df_with_required_ids.to_csv(required_ids_csv_path)