# DATABASE GENERATOR

In this notebook we generate the final food and drug databases by merging the "partial" databases obtained from different repositories (HMDB, FooDB, DrugBank and KEGG Drug).

In [1]:
# import modules

import sys
import os
import numpy as np
import pandas as pd

In [2]:
# Paths of the partial-database TSV files to merge, one list per output database.
# Only the HMDB exports are active; the other repositories are kept as
# commented-out entries and can be re-enabled by uncommenting them.

food_DB_paths = [
    "./HMDB/hmdb_food.tsv",
    # "./FooDB/fooDB_food.tsv",
]

drug_DB_paths = [
    "./HMDB/hmdb_drug.tsv",
    # "./DrugBank/drugbank_drug.tsv",
    # "./KEGG_Drug/keggDrug_drug.tsv",
]

In [3]:
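# Scratch check (disabled): preview up to four rows that share an ID (second column)
# with the first row whose Name is not "Val". Note: `df` is not defined at notebook
# scope in the cells shown, so this line cannot run as-is.
# df.loc[df.loc[:, df.columns[1]] == df.loc[df.loc[:, "Name"] != "Val", df.columns[1]].to_numpy()[0], :][:4]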

In [4]:
def removeAminoAcid(df):
    """
    Drop every row whose ID belongs to a standard amino acid.

    A row counts as an amino-acid entry when the value in the first column
    equals one of the 20 three-letter codes or 20 full names listed below
    (exact, case-sensitive match). Every row that shares an ID (second
    column) with such an entry is removed as well, so other names stored
    under the same ID are dropped together with the matching one.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least two columns: the first is compared against the
        amino-acid names, the second is used as the identifier.
        (In this notebook these look like "Name" and "HMDB_ID" -- confirm
        for other inputs.)

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ID is not shared with any amino-acid entry.
        Original index labels are kept; ``df`` itself is not modified.

    Raises
    ------
    IndexError
        If ``df`` has fewer than two columns.
    """

    aa_list = ["Ala", "Arg", "Asn", "Asp", "Cys", "Gln", "Glu", "Gly", "His", "Ile",
               "Leu", "Lys", "Met", "Phe", "Pro", "Ser", "Thr", "Trp", "Tyr", "Val",
               "Alanine", "Arginine", "Asparagine", "Aspartate", "Cysteine", "Glutamine",
               "Glutamate", "Glycine", "Histidine", "Isoleucine", "Leucine", "Lysine",
               "Methionine", "Phenylalanine", "Proline", "Serine", "Threonine", "Tryptophan",
               "Tyrosine", "Valine"]

    # Columns are addressed by position, not by label, as the contract above states.
    names = df.iloc[:, 0]
    ids = df.iloc[:, 1]

    # IDs carried by at least one amino-acid row. Missing IDs are skipped
    # because a missing value never compares equal to anything, so it must
    # not cause any row to be removed.
    aa_ids = ids[names.isin(aa_list)].dropna().unique()

    # Single vectorised pass instead of re-filtering the whole frame once per ID.
    return df.loc[~ids.isin(aa_ids), :]
        

def mergeDB(paths_list):
    """
    Load several tab-separated partial databases and combine them into one table.

    Each file is read with its first line as the header and with pandas'
    missing-value detection switched off (``na_filter=False``), then passed
    through ``removeAminoAcid``. The first table is the starting point; every
    further table is outer-merged into it on the ``'Name'`` column, in the
    order given.

    Parameters
    ----------
    paths_list : sequence of str
        Paths of the TSV files to combine. Must contain at least one path.
        When more than one path is given, every table needs a ``'Name'``
        column for the merge.

    Returns
    -------
    pandas.DataFrame
        The cleaned first table when only one path is given, otherwise the
        outer merge of all cleaned tables.

    Raises
    ------
    IndexError
        If ``paths_list`` is empty.
    """

    def _load_clean(tsv_path):
        # Read one partial database and strip the amino-acid entries from it.
        table = pd.read_csv(tsv_path, header=0, sep="\t", na_filter=False)
        return removeAminoAcid(table)

    merged = _load_clean(paths_list[0])

    for extra_path in paths_list[1:]:
        merged = pd.merge(merged, _load_clean(extra_path), how='outer', on='Name')

    return merged

In [5]:
# Build the food table from the TSV files listed in food_DB_paths.
food_DB = mergeDB(food_DB_paths)

In [6]:
# Display the resulting food table (the notebook shows a truncated view).
food_DB

Unnamed: 0,Name,HMDB_ID
0,"0211, SPI",HMDB0015180
1,"10,10-dibromo-9(10H)-Anthracenone",HMDB0041543
2,"10,11-dihydro-8-(1-Methylethyl)-11-(2-methylpr...",HMDB0035218
3,"10,11-Epidioxycalamene",HMDB0041062
4,"10,12-Dotriacontanedione",HMDB0035530
...,...,...
29832,δ-hydroxyproline,HMDB0036576
29833,δ-hydroxyvalerate,HMDB0061927
29834,δ-hydroxyvaleric acid,HMDB0061927
29835,δ-phenylvalerate,HMDB0002043


In [7]:
# Save the food table as a tab-separated file; index=False leaves the row index out.
food_DB.to_csv("food_database.tsv", index=False, sep="\t")

In [8]:
# Build the drug table from the TSV files listed in drug_DB_paths.
drug_DB = mergeDB(drug_DB_paths)

In [9]:
# Display the resulting drug table (the notebook shows a truncated view).
drug_DB

Unnamed: 0,Name,HMDB_ID
0,"001, RAD",HMDB0015529
1,"0211, SPI",HMDB0015180
2,"027, FK",HMDB0014809
3,03-(2-Morpholinoethyl)-morphine,HMDB0041984
4,"073, AMG",HMDB0015147
...,...,...
33643,δ-hydroxyvalerate,HMDB0061927
33644,δ-hydroxyvaleric acid,HMDB0061927
33645,δ-methorphan,HMDB0001920
33646,δ-phenylvalerate,HMDB0002043


In [10]:
# Save the drug table as a tab-separated file; index=False leaves the row index out.
drug_DB.to_csv("drug_database.tsv", index=False, sep="\t")