In [4]:
'''
Downloading the case study dataset if not already done
'''

import os
import requests
from tqdm import tqdm    

url = 'https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/tender-details/docs/745b51da-4cfd-4ce9-86f6-ae0d6738f67a-CN/FoodEx2-CaseStudy2-Dataset_V1.xlsx'
f = 'data/FoodEx2-CaseStudy2-Dataset_V1.xlsx'
force = False
if not os.path.exists(f) or force:
    response = requests.get(url, stream=True)
    with open(f, "wb") as handle:
        for data in tqdm(response.iter_content(chunk_size=1024), unit="kB"):
            handle.write(data)
    print("File downloaded succesfully")
else:
    print("Skipping download, file already exists")

1861kB [00:00, 2722.34kB/s]

File downloaded succesfully





In [19]:
'''
Extracting separate training and test datasets for all classification task
1. baseterm (expo)
2. facets
3. F01, F02, etc
'''
import pandas as pd
import numpy as np
import pickle
import os

SEED = 44
def main():
    f_case = "data/FoodEx2-CaseStudy2-Dataset_V1.xlsx"
    f_datasets = "data/datasets-training-test.pickle"
    if not os.path.exists(f_case):
        raise FileNotFoundError("Please run previous notebook to obtain the case dataset")
    force = False
    if not os.path.exists(f_datasets) or force:
        case_df = pd.read_excel(f_case)
        # adding all term and facet categories
        datasets = {
            "baseterm":term_df(case_df),
            "facets":facets_df(case_df)
        }
        # getting all facets categories found
        fcats = datasets["facets"].category.unique().tolist()
        # iterating trough all facets categories and adding the associated training dataset
        for fcat in fcats:
            df = facet_df(case_df, fcat)
            datasets[fcat] = df
    
        # splitting into training 90% and testing 10% of data and saving to disk
        for hierarchy, df in datasets.items():
            if df is not None:
                shuffled = df.sample(frac=1, random_state = SEED)
                limit = int(len(df)/5)
                training = df.iloc[limit:, :]
                test = df.iloc[:limit, :]
            else:
                training = None
                test = None
                
            datasets[hierarchy] = {"hierarchy":hierarchy, "training":training, "test":test}
        with open(f_datasets, "wb") as f:
            pickle.dump(datasets, f)
        print(f"Training test datasets stored in {f_datasets}")
    else:
        print("Training test datasets file already exist, skipping")

def term_df(df):
    df = df.rename(columns={"ENFOODNAME":"text"})
    df["hierarchy"] = "expo"
    df["category"] = df["FACETS"].str.split("#").str[0]
    df = (
        df[["text", "hierarchy", "category"]][pd.notna(df.text) & pd.notna(df.text)]
            .drop_duplicates()
            .reset_index(drop = True)
    
    )
    return df

def facets_df(df):
    df = df.rename(columns={"ENFOODNAME":"text"})
    df["hierarchy"] = "facets"
    # extracting all facets
    df["category"] = (
        df["FACETS"]
            .str.split("#") #split term and facets
            .str[1] #choose only facets
            .str.split("$") #split on each facet
    )
    # tranforming multiple facets into different rows
    df = df[["text", "hierarchy", "category"]][pd.notna(df.text) & pd.notna(df.text)].explode("category")

    # extracting the facet hierarchy from the facet detail
    df["category"] = df["category"].str.split(".").str[0]

    df = df[pd.notna(df.category)].drop_duplicates().reset_index(drop = True)
    return df

def facet_df(df, fcat):
    # gettin a translation from facets to respetive hierrarcy codes
    fmap = {fcat:hcode for i, fcat, hcode in pd.read_pickle("data/attributes.pickle")[["code", "name"]].itertuples()}    
    if fcat not in fmap:
        print(f"Could not find a hierarchy for the code {fcat}, skipping this model") 
        return None
    df = df.rename(columns={"ENFOODNAME":"text"})
    # extracting all facets
    df["category"] = (
        df["FACETS"]
            .str.split("#") #split term and facets
            .str[1] #choose only facets
            .str.split("$") #split on each facet
    )
    df["fcat"] = df.category.str.split("")
    # tranforming multiple facets into different rows
    df = df[["text", "category"]][pd.notna(df.text) & pd.notna(df.text)].explode("category")
    # extracting the facet hierarchy from the facet detail
    df["fcat"] = df["category"].str.split(".").str[0]
    # limiting to the expected category
    df = df[df.fcat == fcat]
    df["category"] = df["category"].str.split(".").str[1]
    df["hierarchy"] = fmap[fcat]


    df = df[pd.notna(df.category)][["text", "hierarchy", "category"]].drop_duplicates().reset_index(drop = True)
    return df


main()

Could not find a hierarchy for the code RISKF04, skipping this model
Could not find a hierarchy for the code F15, skipping this model
Could not find a hierarchy for the code F14, skipping this model
Training test datasets stored in data/datasets-training-test.pickle


In [20]:
# showing training test datasets metricsabs
import pickle
def showdatasets():
    with open("data/datasets-training-test.pickle", "rb") as f:
        d = pickle.load(f)
    for cases in d.values():
        h_n = cases["hierarchy"] + "".join(" " for x in range(0,10 - len(cases["hierarchy"])))

        n_tr = len(cases["training"]) if cases["training"] is not None else None
        n_te = len(cases["test"]) if cases["test"] is not None else None
        print(f'Hierarchy:{ h_n }: \ttraining: { n_tr } rows, \ttest: { n_te } rows')
showdatasets()

Hierarchy:baseterm  : 	training: 26813 rows, 	test: 6703 rows
Hierarchy:facets    : 	training: 43076 rows, 	test: 10769 rows
Hierarchy:F21       : 	training: 799 rows, 	test: 199 rows
Hierarchy:F03       : 	training: 3968 rows, 	test: 992 rows
Hierarchy:F04       : 	training: 11860 rows, 	test: 2965 rows
Hierarchy:F10       : 	training: 8854 rows, 	test: 2213 rows
Hierarchy:F09       : 	training: 556 rows, 	test: 139 rows
Hierarchy:F28       : 	training: 12394 rows, 	test: 3098 rows
Hierarchy:F07       : 	training: 455 rows, 	test: 113 rows
Hierarchy:F19       : 	training: 3986 rows, 	test: 996 rows
Hierarchy:F23       : 	training: 116 rows, 	test: 28 rows
Hierarchy:F27       : 	training: 1327 rows, 	test: 331 rows
Hierarchy:F24       : 	training: 376 rows, 	test: 94 rows
Hierarchy:F02       : 	training: 366 rows, 	test: 91 rows
Hierarchy:F17       : 	training: 2665 rows, 	test: 666 rows
Hierarchy:F26       : 	training: 3060 rows, 	test: 765 rows
Hierarchy:F20       : 	training: 2134 r

In [21]:
# showing base term training dataset
import pickle
def showtraining():
    with open("data/datasets-training-test.pickle", "rb") as f:
        d = pickle.load(f)
    return d["F09"]["training"]
showtraining()

Unnamed: 0,text,hierarchy,category
139,ANONYMOUS C ANONYMOUS D TABLET,fort,A0EXH
140,ANONYMOUS C ANONYMOUS D TABLET,fort,A0EXM
141,ANONYMOUS CD ANONYMOUS BOORI TABLET,fort,A0EXH
142,ANONYMOUS CD ANONYMOUS BOORI TABLET,fort,A0EXM
143,PIRKKA CALCIUM SUPPLEMENT,fort,A0EXH
...,...,...,...
690,ZINC 25MG CAPSULES/TABLETS,fort,A03SM
691,ZINC PREPARATION ANONYMOUS CODE,fort,A0EXE
692,"ZINC SUPPLEMENT, 15MG",fort,A03SK
693,ANONYMOUS COD LIVER OIL 1000MG ONLY,fort,A03SK
