# Validate Ingredient Data

Basic checks for data before uploading.

In [1]:
import json
import math
from constants import CATEGORY_KEY, CALORIES_KEY, DESCRIPTION_KEY, FDC_ID_KEY

In [2]:
# load processed data
# The context manager closes the file even when json.load raises
# (e.g. on malformed JSON), so the handle is never leaked.
with open("data/intermediate/processed_ingredients.json") as processed_ingredient_file:
    processed_ingredient_data = json.load(processed_ingredient_file)

In [3]:
# Counters for entries lacking a required field:
#   num_no_caloric - no caloric data (expected to be very low; the write step
#                    further down tolerates at most a 0.01 fraction)
#   num_no_id      - no fdc id (should be 0)
#   num_no_desc    - no description (should be 0)
#   num_no_cat     - no category (should be 0)
num_no_caloric = num_no_id = num_no_desc = num_no_cat = 0

# A field counts as missing when its key is absent OR its value is falsy.
# NOTE(review): a stored value of 0 (e.g. zero calories) or "" is therefore
# counted as missing too - confirm that this is intended.
for ingredient in processed_ingredient_data:
    num_no_caloric += 0 if ingredient.get(CALORIES_KEY) else 1
    num_no_id += 0 if ingredient.get(FDC_ID_KEY) else 1
    num_no_desc += 0 if ingredient.get(DESCRIPTION_KEY) else 1
    num_no_cat += 0 if ingredient.get(CATEGORY_KEY) else 1

In [4]:
# Report how many entries are missing each required field, one line per field.
# NOTE(review): "entires" in the messages is a typo for "entries"; the strings
# are kept as-is so they keep matching the recorded output.
missing_field_report = (
    ("INFO: number of entires w/o caloric data: ", num_no_caloric),
    ("INFO: number of entires w/o fdc id: ", num_no_id),
    ("INFO: number of entires w/o category: ", num_no_cat),
    ("INFO: number of entires w/o description: ", num_no_desc),
)
for label, count in missing_field_report:
    print(label, count)

INFO: number of entires w/o caloric data:  27
INFO: number of entires w/o fdc id:  0
INFO: number of entires w/o category:  0
INFO: number of entires w/o description:  0


Write the validated data if all checks pass. Data is written in batches of 1000 ingredients. At most a fraction of 0.01 (1%) of ingredients may be without caloric data.
This limit was determined by analysis of the data and may change in future versions.

In [5]:
# check before writing
# Largest tolerated fraction of entries without caloric data (0.01 == 1%).
max_no_caloric_ratio = 0.01
# Number of ingredients written to each output file.
entries_per_batch = 1000
num_entries = len(processed_ingredient_data)

# The data is valid only when every entry has an id, a description and a
# category, and the share of entries without caloric data is within the limit.
# An empty dataset is treated as invalid; checking `num_entries > 0` first also
# keeps the ratio below from dividing by zero.
is_data_valid = (
    num_entries > 0
    and num_no_caloric / num_entries <= max_no_caloric_ratio
    and num_no_cat == 0
    and num_no_desc == 0
    and num_no_id == 0
)

if is_data_valid:
    num_batches = math.ceil(num_entries / entries_per_batch)

    # write in batches of `entries_per_batch` ingredients
    for i in range(num_batches):
        start_index = i * entries_per_batch
        end_index = start_index + entries_per_batch
        # Slicing clamps at the end of the list, so the last (possibly
        # shorter) batch needs no special case.
        batch_ingredients = processed_ingredient_data[start_index:end_index]
        # writes batch to output file in data/final
        output_filename = "data/final/batch_" + str(i) + "_ingredients.json"
        with open(output_filename, "w") as outfile:
            # serialize the batch straight into the file
            json.dump(batch_ingredients, outfile)
else:
    # Make the skipped write visible instead of silently producing no files.
    print("WARNING: validation failed, no batches were written")