**Import necessary packages**

In [10]:
import pandas as pd
import numpy as np

**Import conversion factor**

In [11]:
bigg_df = pd.read_csv("../Data/bigg_models_metabolites.txt", sep = "\t")
bigg_df = bigg_df.set_index("universal_bigg_id") 
bigg = bigg_df["name"]
bigg = bigg.drop_duplicates()

**Reindex metabolites as necessary and write out**

In [31]:
list_of_tumor_types = ["BRCA1", "BRCA2","ccRCC1","ccRCC2", "ccRCC3","ccRCC4","COAD", "DLBCL", "GBM", "HCC", "HurthleCC", "ICC", "OV", "PDAC", "PRAD"]
for tumor_type in list_of_tumor_types:
    metabolite_df = pd.read_excel("/Users/tcong/MetabolismResearch/Data/Tumour Data/tumour_metabolomics_processed/PreprocessedData_%s.xlsx" %tumor_type, sheet_name = 4, index_col= 0).transpose()
    # Identical Matches between the two
    perfect_fits = set(bigg).intersection(set(metabolite_df.columns))
    len(perfect_fits)
    # Remove "D-" and "L-" prefixes
    bigg = bigg.apply(lambda x: str(x).replace("D-",""))
    bigg = bigg.apply(lambda x: str(x).replace("L-",""))
    # Convert to lower case
    bigg = bigg.apply(lambda x: str(x).lower())
    metabolite_df.columns= metabolite_df.columns.str.lower()
    # Identical Matches between the two
    perfect_fits = set(bigg).intersection(set(metabolite_df.columns))
    len(perfect_fits)
    # Replace names followed by cytosol/formula 
    for col in metabolite_df.columns:
        if col not in perfect_fits:
            test_str = " ".join([col,"c"])
            test = bigg.apply(lambda x: str(x).startswith(test_str))
            bigg[test] = col
    # Identical Matches between the two
    perfect_fits = set(bigg).intersection(set(metabolite_df.columns))
    len(perfect_fits)
    # Manual correction of abrevviations, hyphens and inversions
    # ONLY FOR IDs that can be confidently mapped
    dict_correct = {
        '3-phosphoglycerate': '3-phospho-glycerate',
        'alpha-ketoglutarate': '2-oxoglutarate',
        'erythrose-4-phosphate': 'erythrose 4-phosphate',
        'glutathione oxidized': 'oxidized glutathione',
        'glutathione reduced': 'reduced glutathione',
        'hippurate': 'o-methylhippurate',
        'kynurenine': 'l kynurenine c10h12n2o3',
        'nad': 'nicotinamide adenine dinucleotide',
        'nadp': 'nicotinamide adenine dinucleotide phosphate',
        'pantothenate': '(r)-pantothenate',
        'pep': 'phosphoenolpyruvate',
        '6-phosphogluconate': '6-phospho-gluconate',
        'alpha-hydroxybutyrate': '4-hydroxybutyrate',
        'inositol': 'myo-inositol',
        'malondialdehyde': 'malonate semialdehyde',
        'cis/trans-hydroxyproline': 'cis 4 hydroxy d proline',
        'gaba': 'gamma-glutamyl-gamma aminobutyric acid',
        'acetylglycine': 'acetyl-glycine',
        'dimethylglycine': 'n,n-dimethylglycine',
        'anthranilic acid': 'anthranilate',
        'kynurenic acid': '4-hydroxy-2-quinolinecarboxylic acid',
        'thiamine': 'thiamin',
        'niacinamide': 'nicotinamide',
        'betaine': 'glycine betaine',
        'trimethylamine-n-oxide': 'trimethylamine n-oxide',
        '2-deoxyadenosine': 'deoxyadenosine',
        '2-deoxycytidine': 'deoxycytidine',
        'pipecolic acid': 'pipecolic acid; piperidine-2-carboxylic acid',
        '1-methylnicotinamide': '1 methylnicotinamide c7h9n2o',
        'butyrobetaine': 'butyro-betaine',
        'acetylcarnitine': 'o acetylcarnitine c9h17no4',
        'propionylcarnitine': 'propionyl-carnitine',
        'malonylcarnitine': 'malonyl carnitine',
        'hexanoylcarnitine': 'hexanoyl carnitine',
        'heptanoylcarnitine': 'heptanoyl-l-carnitine',
        'lauroylcarnitine': 'lauroyl carnitine'
    }
    metabolite_df = metabolite_df.rename(dict_correct, axis=1)
    bigg_ids = {}
    for (index, name) in bigg.iteritems():
        if name in metabolite_df:
            bigg_ids[name] = index
    metabolite_df_with_ids = metabolite_df.append(bigg_ids, ignore_index = True)
    metabolite_df_with_ids.index = metabolite_df.index.append(pd.Index(["BIGG_ID"]))
    metabolite_df_with_ids = metabolite_df_with_ids.dropna(axis = 1)
    metabolite_df_with_ids.columns = metabolite_df_with_ids.loc["BIGG_ID"]
    metabolite_df_with_ids = metabolite_df_with_ids[:-1]
    metabolite_df_with_ids.to_csv("../Data/Generated Data/processed_%s_metabolite_data.csv" %tumor_type, index_label= False)

  for (index, name) in bigg.iteritems():
  metabolite_df_with_ids = metabolite_df.append(bigg_ids, ignore_index = True)
  for (index, name) in bigg.iteritems():
  metabolite_df_with_ids = metabolite_df.append(bigg_ids, ignore_index = True)
  for (index, name) in bigg.iteritems():
  metabolite_df_with_ids = metabolite_df.append(bigg_ids, ignore_index = True)
  for (index, name) in bigg.iteritems():
  metabolite_df_with_ids = metabolite_df.append(bigg_ids, ignore_index = True)
  for (index, name) in bigg.iteritems():
  metabolite_df_with_ids = metabolite_df.append(bigg_ids, ignore_index = True)
  for (index, name) in bigg.iteritems():
  metabolite_df_with_ids = metabolite_df.append(bigg_ids, ignore_index = True)
  for (index, name) in bigg.iteritems():
  metabolite_df_with_ids = metabolite_df.append(bigg_ids, ignore_index = True)
  for (index, name) in bigg.iteritems():
  metabolite_df_with_ids = metabolite_df.append(bigg_ids, ignore_index = True)
  for (index, name) in bigg.iteritems():