# CFIA Importer
[![Static Badge](https://img.shields.io/badge/Jupyter_Notebook-F37726?style=for-the-badge)](https://jupyter.org/)

<br>

## Requirements
- Python (version 3.6 or later)

<br>
<br>

### Install Required Dependencies
Run the code block below to install the required dependencies.

<br>

> ***📝 NOTE:*** <br>
>
> If you already have the required dependencies, you can *optionally* run the code block below.
> 
> (In this case, running the code only imports the required libraries and does not send any HTTP requests to PyPI.)

In [23]:
###################
# Note: The code below is modified from AGRemap for dynamically installing
#   packages at runtime. That way, we do not need to make extra HTTP requests
#   to Pypi's server if the user's python environment already has the required libraries
#
# Reference:
#    https://github.com/nhok0169/Anime-Game-Remap/blob/nhok0169/Anime%20Game%20Remap%20(for%20all%20users)/api/src/FixRaidenBoss2/tools/PackageManager.py
#    https://github.com/nhok0169/Anime-Game-Remap/blob/nhok0169/Anime%20Game%20Remap%20(for%20all%20users)/api/src/FixRaidenBoss2/tools/PackageData.py


import pip._internal as pip
import importlib
from types import ModuleType
from typing import Optional, Dict


# PackageData: Holds the names needed to import and to install a single package
class PackageData():
    # __init__(module, installName): 'module' is the name used to import the package and
    #   'installName' is the name used to install the package. When 'installName' is not
    #   given, the import name is used for the installation as well
    def __init__(self, module: str, installName: Optional[str] = None):
        if (installName is None):
            installName = module

        self.module = module
        self.installName = installName


# PackageManager: Class to manage the packages. Imports packages on demand, installs
#   the packages that are missing and optionally caches the imported modules
class PackageManager():
    def __init__(self):
        # cache of the loaded modules, keyed by the name used to import the module
        self._packages: Dict[str, ModuleType] = {}

    # _install(installName): Installs/upgrades a package using the pip belonging to the
    #   running interpreter.
    #
    # Note: pip is ran in a subprocess instead of calling 'pip._internal.main', since pip
    #   does not support being called from within a running program. The exit status is
    #   intentionally not checked: if the installation failed, the import that follows in
    #   'load' raises the ModuleNotFoundError
    @staticmethod
    def _install(installName: str):
        import subprocess
        import sys

        completed = subprocess.run([sys.executable, "-m", "pip", "install", "-U", installName],
                                   stdout = subprocess.PIPE, stderr = subprocess.STDOUT, universal_newlines = True)

        # echo pip's output so that the installation log is still visible to the user
        if (completed.stdout):
            print(completed.stdout)

    # load(module, installName, save): Tries to import a package and install the package if the package
    #   is not installed yet. Can optionally save to cache.
    #
    # module: name used to import the package
    # installName: name used to install the package (defaults to 'module')
    # save: whether to store the imported module in the cache
    #
    # Returns the imported module
    # Raises ModuleNotFoundError if the package still cannot be imported after the installation
    def load(self, module: str, installName: Optional[str] = None, save: bool = True) -> ModuleType:
        if (installName is None):
            installName = module

        try:
            result = importlib.import_module(module)
        except ModuleNotFoundError:
            self._install(installName)

            # the import system may not notice modules that got installed while the program is running
            importlib.invalidate_caches()
            result = importlib.import_module(module)

        # save the module regardless of whether the package had to be installed first
        if (save):
            self._packages[module] = result

        return result

    # get(packageData, cache): Retrieves a package and installs the package if the package is not installed yet.
    #   Has optional caching capability.
    #
    # packageData: the import/install names of the package
    # cache: whether to read the module from the cache and to store a newly loaded module into the cache
    #
    # Returns the imported module
    def get(self, packageData: "PackageData", cache: bool = True) -> ModuleType:
        if (not cache):
            return self.load(packageData.module, installName = packageData.installName, save = False)

        try:
            return self._packages[packageData.module]
        except KeyError:
            return self.load(packageData.module, installName = packageData.installName, save = True)


##############################
# The required installations #
##############################
Packages = [PackageData(moduleName) for moduleName in ("pandas", "openpyxl")]

Packager = PackageManager()

# Each package only needs to be importable (and installed when missing),
#   so the loaded modules are not kept in the manager's cache
for package in Packages:
    Packager.get(package, cache = False)


<br>

## User Settings

The settings below may differ from user to user.

<br>

> ***❇️ Important***
>
> Please ensure settings below are configured correctly.
>

In [24]:
import os

# Folder that contains the raw data files
DataFolder = "data"

# Folder of the output files (2 levels above the current folder, inside "data")
#   and the base name shared by the output files
OutputFolder = os.path.join(os.pardir, os.pardir, "data")
OutputFileName = "CFIA Data"


<br>

## Running the Importer

The code block below cleans up the raw CFIA data files so that they look similar to the Health Canada data.

<br>

In [36]:
import glob
import os
from concurrent.futures import ThreadPoolExecutor
from enum import Enum
from threading import Lock, Thread
from typing import Dict, List

import numpy as np
import pandas as pd


# Languages: Different languages available for the cleaned data.
#   One output file is written per language and the value of each member
#   is used as the language suffix in the name of that output file
class Languages(Enum):
    English = "en"
    French = "fr"


# CFIADataCols: Different columns for the CFIA data.
#   The value of each member is the column name used to index the data tables.
#   "Ecoli CFIA Category" is not read from the raw data: the importer creates that
#   column from the "Agent", "Genus", "Species" and "Serotype" columns
class CFIADataCols(Enum):
    FoodGroup = "Food Group"
    FoodName = "Food Name"
    Agent = "Agent"
    Genus = "Genus"
    Species = "Species"
    SeroType = "Serotype"
    EColiCategory = "Ecoli CFIA Category"


# CFIAStrConsts: Some string keywords used in the raw CFIA data
class CFIAStrConsts(Enum):
    # delimiter the importer splits on to separate the translations combined in a single value
    LangSeperator = "//"


# Index position of each language's part within a combined translation value,
#   after the value has been split on the language separator
CFIALangPos = {Languages.English: 0, Languages.French: 1}

# Names of the columns whose values get separated into English and French
CFIALangSeperatedCols = [CFIADataCols.FoodName.value]


# Importer: Class to clean up the CFIA data.
#   Reads every excel file of the data folder, separates the combined translations,
#   creates the E-Coli category column and writes one CSV file per language
class Importer():
    # __init__(dataFolder, outputFolder, outputFileName):
    #   dataFolder: folder that contains the raw excel (.xlsx) files
    #   outputFolder: folder where the CSV files are written
    #   outputFileName: base name of the CSV files (the language is appended to this name)
    def __init__(self, dataFolder: str = DataFolder, outputFolder: str = OutputFolder, outputFileName: str = OutputFileName):
        self.dataFolder = dataFolder
        self.outputFolder = outputFolder
        self.outputFileName = outputFileName

        # the accumulated output table of each language and the lock guarding that table.
        #   A table of None means that no file has been merged for the language yet
        self._outputDataLocks = {lang: Lock() for lang in Languages}
        self._outputData = {lang: None for lang in Languages}

    # _seperateTranslationVal(colValue): Seperate out a value consisting of the combined
    #   translations into individual translated parts
    #
    # Returns a list with exactly 1 part per language:
    #   - extra parts are dropped
    #   - missing parts are filled with the first part
    #   - a value that is not a string (eg. the NaN of an empty cell) is repeated as-is for every language
    @classmethod
    def _seperateTranslationVal(cls, colValue: str) -> List[str]:
        langLen = len(Languages)

        # empty cells are read as NaN, which cannot be split
        if (not isinstance(colValue, str)):
            return [colValue] * langLen

        result = [translatedPart.strip() for translatedPart in colValue.split(CFIAStrConsts.LangSeperator.value)]
        result = result[:langLen]

        # a value without any translation is used for all the languages
        result.extend([result[0]] * (langLen - len(result)))
        return result

    # _seperateTranslationCol(col, rawData, data, dataLocks): Seperate out the combined translated values in a column
    #   and writes the part of each language into the column of that language's result data
    #
    # Raises KeyError if the raw data does not have the column
    def _seperateTranslationCol(self, col: str, rawData: pd.DataFrame, data: Dict[Languages, pd.DataFrame], dataLocks: Dict[Languages, Lock]):
        # index the column directly so that a missing column is reported by its name
        colVals = rawData[col].apply(self._seperateTranslationVal)

        for lang in Languages:
            langInd = CFIALangPos[lang]
            langVals = colVals.apply(lambda translatedParts: translatedParts[langInd])

            with dataLocks[lang]:
                data[lang][col] = langVals

    # _seperateTranslations(rawData, data, dataLocks): Seperate out the combined translated values from the raw data
    #   for each seperate language result data
    #
    # Note: the columns are processed one after the other on the calling thread, since an
    #   error raised inside a plain worker thread never reaches the caller and the column
    #   would silently be left uncleaned
    def _seperateTranslations(self, rawData: pd.DataFrame, data: Dict[Languages, pd.DataFrame], dataLocks: Dict[Languages, Lock]):
        for col in CFIALangSeperatedCols:
            self._seperateTranslationCol(col, rawData, data, dataLocks)

    # _createEColiCategory(rawData, data, dataLocks): Creates the E-Coli category for the microorganism breadcrumb
    #
    # The category of a row is:
    #   - "O157" for an E-Coli row whose serotype contains "O157"
    #   - "Verotoxigenic" for any other E-Coli row
    #   - "" for a row that is not E-Coli
    def _createEColiCategory(self, rawData: pd.DataFrame, data: Dict[Languages, pd.DataFrame], dataLocks: Dict[Languages, Lock]):
        agentVals = rawData[CFIADataCols.Agent.value]
        genusVals = rawData[CFIADataCols.Genus.value]
        speciesVals = rawData[CFIADataCols.Species.value]

        # normalize to text so that the search below also works when the column has no text values
        seroTypeVals = rawData[CFIADataCols.SeroType.value].fillna("").astype(str)

        isEColi = ((agentVals == "Bacteria") & (genusVals == "Escherichia") & (speciesVals == "coli"))
        seroTypeIsO157 = seroTypeVals.str.contains("O157", regex = False)

        eColiCategoryVals = np.select([isEColi & seroTypeIsO157, isEColi & ~seroTypeIsO157], ["O157", "Verotoxigenic"], default = "")

        for lang in Languages:
            with dataLocks[lang]:
                data[lang][CFIADataCols.EColiCategory.value] = eColiCategoryVals

    # _cleanFile(excelFile): Reads a single excel file and creates the cleaned data of each language
    #   without touching the combined output
    def _cleanFile(self, excelFile: str) -> Dict[Languages, pd.DataFrame]:
        rawData = pd.read_excel(excelFile)

        # every language starts from its own copy of the raw data
        resultData = {lang: rawData.copy() for lang in Languages}
        resultDataLocks = {lang: Lock() for lang in Languages}

        self._seperateTranslations(rawData, resultData, resultDataLocks)
        self._createEColiCategory(rawData, resultData, resultDataLocks)
        return resultData

    # _mergeResult(resultData): Appends the cleaned data of a file to the combined output of each language
    def _mergeResult(self, resultData: Dict[Languages, pd.DataFrame]):
        for lang in Languages:
            with self._outputDataLocks[lang]:
                outputData = self._outputData[lang]

                if (outputData is None):
                    self._outputData[lang] = resultData[lang].copy()
                else:
                    self._outputData[lang] = pd.concat([outputData, resultData[lang]], ignore_index = True)

    # processFile(excelFile): Cleanups a single excel file and merges the
    #   result into the output data
    def processFile(self, excelFile: str):
        self._mergeResult(self._cleanFile(excelFile))

    # run(): Runs the importer to create the cleaned-up data
    #
    # The excel files are cleaned simultaneously, but are merged in the sorted order of their
    #   paths, so the rows of the output come out in the same order on every run. Any error
    #   raised while cleaning a file is raised again here instead of producing partial output.
    def run(self):
        excelFiles = glob.glob(os.path.join(self.dataFolder, "*.xlsx"))

        # skip the "~$" lock files that Excel creates next to a workbook that is currently opened
        excelFiles = sorted(filter(lambda excelFile: not os.path.basename(excelFile).startswith("~$"), excelFiles))

        # simultaneously process multiple excel files at the same time.
        #   'map' gives back the results in the order of the files and re-raises
        #   the error of a file that failed
        with ThreadPoolExecutor() as executor:
            for resultData in executor.map(self._cleanFile, excelFiles):
                self._mergeResult(resultData)

        for lang in Languages:
            outputData = self._outputData[lang]
            if (outputData is None):
                continue

            # the output folder may not exist yet
            if (self.outputFolder):
                os.makedirs(self.outputFolder, exist_ok = True)

            file = os.path.join(self.outputFolder, f"{self.outputFileName}-{lang.value}.csv")
            print(f"Writing CSV output for language: {lang.value} ...")
            outputData.to_csv(file, index = False, encoding = "utf-8")


########
# MAIN #
########
# Build the importer from the user settings and create the cleaned CSV files
importer = Importer(dataFolder = DataFolder, outputFolder = OutputFolder, outputFileName = OutputFileName)
importer.run()

Writing CSV output for language: en ...
Writing CSV output for language: fr ...
