In [6]:
'''
Download the FoodEx database from the public openefsa GitHub release, unless it has already been downloaded
'''
import os
import requests
from tqdm import tqdm    

# Source release of the catalogue and the local path it is stored under.
url = 'https://github.com/openefsa/efsa-catalogues/releases/download/12.0/MTX_FULL_12_0.ecf'
f = 'data/MTX_FULL_12_0.ecf'
# Set to True to download again even when the local file already exists.
force = False
if not os.path.exists(f) or force:
    # open() does not create missing folders, so make sure the target folder exists first.
    os.makedirs(os.path.dirname(f), exist_ok=True)
    # Stream into a temporary sibling file and only move it into place once complete;
    # otherwise an interrupted download would be mistaken for a finished one on the next run.
    f_part = f + '.part'
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving the error page as the catalogue.
        response.raise_for_status()
        with open(f_part, "wb") as handle:
            for data in tqdm(response.iter_content(chunk_size=1024), unit="kB"):
                handle.write(data)
    os.replace(f_part, f)
    print("File downloaded succesfully")
else:
    print("Skipping download, file already exists")


3715kB [00:00, 5787.73kB/s]

File downloaded succesfully





In [24]:
'''
Storing FoodEx database in parent-child tabular form if not already done
'''

import os
import gc
import pandas as pd
from bs4 import BeautifulSoup
from zipfile import ZipFile
from bs4.element import Tag, NavigableString

def main(force=False):
    """Export the downloaded ecf catalogue to an Excel workbook and pickle files.

    Reads 'MTX_FULL_12_0.xml' from the 'data/MTX_FULL_12_0.ecf' zip archive, extracts the
    hierarchies, attributes and terms as dataframes and writes each of them to a sheet of
    'data/MTX_FULL_12_0.xlsx' and to a pickle file in 'data/'.

    The export is skipped when the workbook already exists, unless ``force`` is set. Only
    the workbook is checked; the pickle files are not.

    Args:
        force: when True, regenerate the outputs even if the workbook already exists.

    Raises:
        FileNotFoundError: if the ecf archive has not been downloaded yet.
    """
    f_ecf = 'data/MTX_FULL_12_0.ecf'
    f_xl = 'data/MTX_FULL_12_0.xlsx'
    if not os.path.exists(f_ecf):
        raise FileNotFoundError("Please execute the previous step to download the ecf file")

    # Decide whether there is anything to do before paying for the XML parse.
    if os.path.exists(f_xl) and not force:
        print("Skipping wrinting to file since it exists already")
        return

    print("reading ecf file")
    with ZipFile(f_ecf) as zf:
        with zf.open('MTX_FULL_12_0.xml') as f:
            soup = BeautifulSoup(f, 'xml')

    # Build every dataframe before touching the outputs, so that an extraction error
    # cannot leave a partial workbook behind that later runs would treat as complete.
    hierarchies = get_hierarchies(soup)
    attrs = get_attributes(soup)
    terms = get_terms(soup)

    # Only reached with an existing workbook when force is set; guarded so that forcing
    # a first-time export does not fail on a missing file.
    if os.path.exists(f_xl):
        os.unlink(f_xl)

    with pd.ExcelWriter(f_xl, engine='xlsxwriter') as writer:
        print(f"writing {len(hierarchies)} hierarchies to excel and pickle")
        hierarchies.to_excel(writer, sheet_name='Hierarchies')
        hierarchies.to_pickle("data/hierarchies.pickle")
        print(f"writing {len(attrs)} attributes to excel and pickle")
        attrs.to_excel(writer, sheet_name='Attributes')
        attrs.to_pickle("data/attributes.pickle")
        print(f"writing {len(terms)} terms to excel and pickle")
        terms.to_excel(writer, sheet_name='Terms')
        terms.to_pickle("data/terms.pickle")
        print("Dataframes saved successfully")

def leafmap(node, excluded_parents=frozenset(), force_set=False):
    """Collect the text of the leaf tags found below ``node`` into a dict.

    A descendant counts as a leaf when it is a ``Tag`` whose only content is a single
    ``NavigableString``. The tag name is used as key and the string's text as value.

    Args:
        node: element whose ``descendants`` are scanned.
        excluded_parents: container of tag names; a leaf is ignored when the name of its
            direct parent is in it (only the direct parent is checked, not all ancestors).
        force_set: when True every value is a set, even if the key occurs only once.

    Returns:
        dict mapping tag name to its text. When a name occurs more than once, or when
        ``force_set`` is True, the value is the set of all texts seen for that name.
    """
    res = {}
    for desc in node.descendants:
        # Exact type checks are intentional: subclasses of Tag / NavigableString are skipped.
        if type(desc) is not Tag or len(desc.contents) != 1:
            continue
        only_child = desc.contents[0]
        if type(only_child) is not NavigableString:
            continue
        if desc.parent.name in excluded_parents:
            continue

        key = desc.name
        val = only_child.text
        if key not in res:
            res[key] = {val} if force_set else val
        elif isinstance(res[key], set):
            res[key].add(val)
        else:
            # Second occurrence of a key that so far held a single text: promote it to a set.
            res[key] = {res[key], val}
    return res

def get_hierarchies(soup):
    """Return one row per "hierarchy" element of ``soup``, ordered by hierarchyOrder.

    Each element is flattened with ``leafmap``; the "hierarchyOrder" column is cast to
    int before sorting and the resulting index is renumbered from 0.
    """
    records = [leafmap(element) for element in soup.find_all("hierarchy")]
    table = pd.DataFrame(records).assign(
        hierarchyOrder=lambda frame: frame["hierarchyOrder"].astype(int)
    )
    return table.sort_values(by="hierarchyOrder", ignore_index=True)

def get_attributes(soup):
    """Return one row per "attribute" element of ``soup``, ordered by attributeOrder.

    Each element is flattened with ``leafmap``; the "attributeOrder" column is cast to
    int so that the sort is numeric, and the resulting index is renumbered from 0.
    """
    attrs = pd.DataFrame([leafmap(h) for h in soup.find_all("attribute")])
    attrs["attributeOrder"] = attrs["attributeOrder"].astype(int)
    return attrs.sort_values("attributeOrder", ignore_index=True)

def get_terms(soup):
    """Return the "term" elements of ``soup`` as a dataframe, one row per hierarchy assignment.

    For each term the leaf values are collected with ``leafmap`` (skipping leaves whose
    direct parent is a hierarchyAssignment, implicitAttribute or attributeValues tag).
    Every implicitAttribute adds a column named after its attributeCode holding the set
    of its attributeValue texts. The term is then repeated once per hierarchyAssignment,
    merged with that assignment's leaf values; a term without any assignment is kept as
    a single row.

    Raises:
        KeyError: if an implicit attribute code clashes with a key already set on the term.
    """
    skipped_parents = {"hierarchyAssignment", "implicitAttribute", "attributeValues"}
    rows = []
    for term_node in soup.find_all("term"):
        base = leafmap(term_node, skipped_parents)

        # Implicit attributes: attribute code -> set of its values.
        for implicit in term_node.find_all("implicitAttribute"):
            code = implicit.find("attributeCode").text
            code_values = {item.text for item in implicit.find_all("attributeValue")}
            if code in base:
                raise KeyError("The implicit attributs is going to override an existuing value, this is unexpected")
            base[code] = code_values

        # Hierarchy assignments: one output row each, or the bare term when there are none.
        assignments = term_node.find_all("hierarchyAssignment")
        if assignments:
            rows.extend({**base, **leafmap(assignment)} for assignment in assignments)
        else:
            rows.append(base)

    terms = pd.DataFrame(rows)
    terms["order"] = terms["order"].astype(int)
    return terms.sort_values(by=["hierarchyCode", "parentCode", "order"], ignore_index=True)

# Run the export defined above.
main()
# Trigger a garbage-collection pass once main() has returned. Being the last expression
# of the cell, the value returned by gc.collect() is what the notebook displays.
gc.collect()

reading ecf file
writing 36 hierarchies to excel and pickle
writing 47 attributes to excel and pickle
writing 79901 terms to excel and pickle
Dataframes saved successfully


13675740