# process_data.ipynb

Filters extracted ingredients to remove unwanted categories and branded products.
After filtering, ingredient descriptions are doctored to reduce their overall length and improve readability.

In [1]:
import json
import data_util
from constants import DESCRIPTION_KEY

In [2]:
# Loads the extracted ingredients produced by the previous pipeline stage.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the file
# decodes the same way on every platform instead of depending on the locale.
with open("data/intermediate/extracted_ingredients.json", encoding="utf-8") as in_file:
    extracted_ingredient_data = json.load(in_file)

Filters ingredients and updates descriptions.

In [3]:
# Step 1: drop branded entries and entries whose category is listed in
# constants.EXCLUDED_CATEGORIES (the filtering itself lives in data_util).
filtered_ingredient_data: list[dict] = data_util.filter_ingredients(
    extracted_ingredient_data
)

# Step 2: doctor the description of every remaining entry.
# The return value is ignored, so update_description presumably edits each
# entry in place -- TODO confirm against data_util.
for entry in filtered_ingredient_data:
    data_util.update_description(entry)

Checks for duplicate ingredient descriptions and merges duplicates.

In [4]:
# Bucket the filtered entries by description. Every entry that lands in a
# bucket which already holds something counts as one duplicate.
num_duplicates = 0
descriptions = {}

for ingredient in filtered_ingredient_data:
    description: str = ingredient.get(DESCRIPTION_KEY)
    ingredients: list[dict] = descriptions.setdefault(description, [])
    if ingredients:
        # this description was seen before
        num_duplicates += 1
    ingredients.append(ingredient)

# Collapse each bucket where possible. Buckets with a single entry pass
# through untouched; larger buckets are handed to data_util.resolve_multiple,
# which may hand back more than one entry, so duplicates can remain.
resolved_ingredient_data = []

for ingredients in descriptions.values():
    if len(ingredients) == 1:
        resolved_ingredient_data.append(ingredients[0])
    else:
        resolved_ingredient_data.extend(data_util.resolve_multiple(ingredients))

# Entries dropped by resolution = size before - size after; whatever is left
# of the original duplicate count is still present in the processed data.
num_duplicates_removed = len(filtered_ingredient_data) - len(resolved_ingredient_data)
num_duplicates -= num_duplicates_removed

print("INFO: ", num_duplicates_removed, " duplicates removed!")
print("INFO: ", num_duplicates, " duplicates left in processed data!")

INFO:  22  duplicates removed!
INFO:  171  duplicates left in processed data!


Writes filtered and resolved ingredients to intermediate file.

In [6]:
# Writes the filtered and resolved ingredients to
# data/intermediate/processed_ingredients.json.
# Serialization happens before the file is opened, so a failure in json.dumps
# cannot leave a truncated output file behind.
processed_ingredient_json = json.dumps(resolved_ingredient_data)

# Plain write mode is enough (nothing is read back), and the encoding is
# pinned to UTF-8 so the result does not depend on the platform locale.
with open("data/intermediate/processed_ingredients.json", "w", encoding="utf-8") as outfile:
    outfile.write(processed_ingredient_json)

{'description': 'Tomatoes, grape, raw', 'fdcId': 321360, 'category': 'Vegetables and Vegetable Products', 'calories': {'value': 27.0, 'unit': 'kcal'}, 'protein': {'value': 0.83, 'unit': 'g'}, 'carbs': {'value': 5.51, 'unit': 'g'}, 'fat': {'value': 0.63, 'unit': 'g'}, 'sodium': {'value': 6.0, 'unit': 'mg'}, 'sugars': {}, 'fiber': {'value': 2.1, 'unit': 'g'}, 'measures': [{'value': 5.0, 'unit': 'tomatoes', 'gWeight': 49.7}, {'value': 1.0, 'unit': 'cup', 'gWeight': 152.0}], 'keywords': ['t', 'to', 'tom', 'toma', 'tomat', 'tomato', 'tomatoe', 'tomatoes', 'o', 'om', 'oma', 'omat', 'omato', 'omatoe', 'omatoes', 'g', 'gr', 'gra', 'grap', 'grape', 'r', 'ra', 'rap', 'rape', 'r', 'ra', 'raw', 'a', 'aw']}
