In [1]:
import numpy as np
import json
import inflect
import re

In [2]:
# Load the raw recipe collection from disk into `data`.
with open('recipes_raw_nosource_ar.json') as recipes_file:
    data = json.loads(recipes_file.read())

A lot of filtering, cleaning, normalizing and adding additional information to ingredients was used in order to make the dataset optimal for our needs. In the end, each cookie recipe should have a list of ingredients, each containing its name, quantity, measure, and role. Separating the quantity and the measure used will make it easier to compare and add in the generating phases. A role for each ingredient will be also added to later assist with the fitness function and the categorization process (where each cookie recipe will be classified by their crunchiness and spread). The possible roles for each ingredient are:
- fat
- sugar
- flour
- binding agent
- rising agent
- flavor texture - additional ingredients that are not necessary for the structural integrity of the cookie

First, we select only the recipes that have 'Cookie' or 'cookie' in the title, since we only want cookie recipes.

In [3]:
# Keep only the recipes whose title contains 'Cookie' or 'cookie'.
new_data = [
    recipe
    for recipe in data.values()
    if 'title' in recipe
    and ('Cookie' in recipe['title'] or 'cookie' in recipe['title'])
]

The original dataset contained a lot of non-ingredients in the middle of the ingredient list. These entries did not start with a quantity, so every element of the ingredients list that does not begin with a digit is removed. All the elements of the ingredients list also ended with the word 'ADVERTISEMENT', so it is removed as well.

In [4]:
# Remove non-ingredient entries and strip the trailing 'ADVERTISEMENT' marker
# from the entries that are kept.
AD_SUFFIX = 'ADVERTISEMENT'
for elem in new_data:
    kept = []
    for item in elem['ingredients']:
        # Real ingredient lines begin with their quantity. Slicing with [:1]
        # instead of indexing with [0] lets empty strings be dropped rather
        # than raising IndexError.
        if not item[:1].isdigit():
            continue
        # Strip the marker only when it is really there, so an entry without
        # it (or a second run of this cell) does not lose genuine text.
        if item.endswith(AD_SUFFIX):
            item = item[:-len(AD_SUFFIX)].rstrip()
        kept.append(item)
    # Slice assignment keeps the same list object inside the recipe dict.
    elem['ingredients'][:] = kept

In the middle of each ingredient, there were sometimes parentheses that only included additional information or alternative measures. Also, the text after a ',' commonly describes how the ingredient is prepared. Both were deemed unnecessary since they would only make it harder to recognize the same ingredient across recipes. Since we use the rarity of the ingredient in, for example, the fitness function of a recipe, it is important not to have a lot of noise in the ingredient list.

e.g. '1 (8 ounce) package cream cheese, softened'

In [5]:
# Remove parenthesised notes and everything after the first comma,
# e.g. '1 (8 ounce) package cream cheese, softened' -> '1  package cream cheese'.
# The pattern is a raw string so that '\(' is not an invalid string escape.
PAREN_RE = re.compile(r'\(.*?\)')
for elem in new_data:
    elem['ingredients'][:] = [
        PAREN_RE.sub('', item).split(",", 1)[0]
        for item in elem['ingredients']
    ]

In [6]:

# Splits a string into a leading numeric part (group 1) and the remainder
# (group 2). The character class accepts digits, whitespace, '.', ',', '/'
# and the literal characters '*' and '['; the bracket is escaped here only
# to make it obvious that it is a literal, the set of matched characters is
# unchanged.
SEPARATOR_RE = re.compile(r'^([\d\s*\[.,/]*)\s*(.+)')


def normalize(st):
    """Put a single space between the leading quantity and the rest of the text.

    :param st: raw ingredient string, e.g. ``'1/2cup  sugar'``
    :return: the string with a space inserted after the leading numeric part,
        every run of whitespace collapsed to one space, and leading/trailing
        whitespace removed, e.g. ``'1/2 cup sugar'``
    """
    # Raw replacement string: '\g' is not a valid escape in a normal literal.
    separated = SEPARATOR_RE.sub(r'\g<1> \g<2>', st)
    return re.sub(r'\s+', ' ', separated).strip()


def escape_re_string(text):
    """Prepare a literal unit spelling for use inside a regular expression.

    Only the dot is escaped; other regex metacharacters are left untouched.

    :param text: literal text such as ``'fl. oz.'``
    :return: the text with every ``.`` replaced by ``\\.`` and every run of
        whitespace collapsed to a single space
    """
    # Raw string: '\.' is not a valid escape sequence in a normal literal.
    text = text.replace('.', r'\.')
    return re.sub(r'\s+', ' ', text)

In [7]:


import re
from itertools import chain


# Canonical unit name -> every spelling recognised in an ingredient line.
UNITS = {
    "cup": ["cups", "cup", "c.", "c"],
    "fluid_ounce": ["fl. oz.", "fl oz", "fluid ounce", "fluid ounces"],
    "package": ["package", "packages"],
    "jar": ["jar", "jars"],
    "box": ["box", "boxes"],
    "gallon": ["gal", "gal.", "gallon", "gallons"],
    "ounce": ["oz", "oz.", "ounce", "ounces"],
    "pint": ["pt", "pt.", "pint", "pints"],
    "pound": ["lb", "lb.", "pound", "pounds"],
    "quart": ["qt", "qt.", "qts", "qts.", "quart", "quarts"],
    "tablespoon": ["tbsp.", "tbsp", "T", "T.", "tablespoon", "tablespoons", "tbs.", "tbs"],
    "teaspoon": ["tsp.", "tsp", "t", "t.", "teaspoon", "teaspoons"],
    "gram": ["g", "g.", "gr", "gr.", "gram", "grams"],
    "kilogram": ["kg", "kg.", "kilogram", "kilograms"],
    "liter": ["l", "l.", "liter", "liters"],
    "milligram": ["mg", "mg.", "milligram", "milligrams"],
    "milliliter": ["ml", "ml.", "milliliter", "milliliters"],
    "pinch": ["pinch", "pinches"],
    "dash": ["dash", "dashes"],
    "touch": ["touch", "touches"],
    "handful": ["handful", "handfuls"],
    "stick": ["stick", "sticks"],
    "clove": ["cloves", "clove"],
    "can": ["cans", "can"],
    "small": ["small"],
    "scoop": ["scoop", "scoops"],
    "filets": ["filet", "filets"],
    "sprig": ["sprigs", "sprig"],
}

# Number words accepted as (part of) a quantity. Longer words come first so
# that the regex alternation tries e.g. 'eighteen' before 'eight'.
NUMBERS = ['seventeen', 'eighteen', 'thirteen', 'nineteen', 'fourteen', 'sixteen', 'fifteen', 'seventy', 'twelve',
           'eleven', 'eighty', 'thirty', 'ninety', 'twenty', 'seven', 'fifty', 'sixty', 'forty', 'three', 'eight',
           'four', 'zero', 'five', 'nine', 'ten', 'one', 'six', 'two', 'an', 'a']

# Filler words that may sit between the unit and the ingredient name.
prepositions = ["of"]

# Every unit spelling, longest first so the alternation prefers 'cups' over
# 'cup' over 'c', then escaped for literal use inside the pattern. A list
# (rather than a one-shot map iterator) keeps `a` reusable after the join.
a = list(chain.from_iterable(UNITS.values()))
a.sort(key=len, reverse=True)
a = [re.escape(unit) for unit in a]

# Groups: optional quantity (digits/fractions and/or number words), optional
# unit followed by whitespace, optional preposition, then the name.
# The \b after the number words is essential: without it the article 'a'
# matches the first letter of the name, so '1 apple' would be parsed as
# quantity '1 a' and name 'pple'.
PARSER_RE = re.compile(
    r'(?P<quantity>(?:[\d\.,][\d\.,\s/]*)?\s*(?:(?:%s)\b\s*)*)?(\s*(?P<unit>%s)\s+)?(\s*(?:%s)\s+)?(\s*(?P<name>.+))?' % (
        '|'.join(NUMBERS), '|'.join(a), '|'.join(prepositions)))


def parse(st):
    """Split an ingredient line into its measure and its name.

    The line is first passed through ``normalize`` and then matched against
    ``PARSER_RE``.

    :param st: ingredient text, e.g. ``'1 1/2 cups flour'``
    :return: dict with ``'measure'`` (the matched quantity and unit joined by
        one space, trimmed) and ``'name'`` (the matched name, trimmed); a
        part that did not match becomes an empty string
    """
    match = PARSER_RE.match(normalize(st))

    quantity = (match.group('quantity') or '').strip()
    unit = (match.group('unit') or '').strip()
    measure = (quantity + ' ' + unit).strip()
    name = (match.group('name') or '').strip()

    return {'measure': measure, 'name': name}


Separate the measure and the name of the ingredient. In this phase, the quantity and the unit are still stored together in the same field ('measure').

In [8]:
# Replace every raw ingredient string with the dict returned by parse().
# NOTE(review): parse() is the function defined in this notebook; the import
# from ingredient_parser.en that used to sit here was already commented out.
for recipe in new_data:
    ingredients = recipe['ingredients']
    for index in range(len(ingredients)):
        ingredients[index] = parse(ingredients[index])

Make every ingredient name singular, using the `inflect` library.

In [9]:
# Singularize every ingredient name.
p = inflect.engine()
for elem in new_data:
    for ingredient in elem['ingredients']:
        name = ingredient['name']
        if not name:
            # Nothing to singularize. NOTE(review): some inflect releases
            # appear to reject empty words outright -- confirm for the
            # installed version.
            continue
        # Look the word up once; singular_noun() returns False when it does
        # not recognise the text as a plural, otherwise the singular form.
        singular = p.singular_noun(name)
        if singular is not False:
            ingredient['name'] = singular

In [10]:
#new_data

Add the roles to each ingredient. The roles.json file was manually created and each ingredient was added until almost all of them had a role.

In [11]:
# roles.json maps each role name to the list of keywords that identify it.
with open('roles.json') as roles_file:
    roles = json.load(roles_file)

# Give each ingredient the first role (in file order) that has a keyword
# occurring in the lower-cased ingredient name. Ingredients that already
# carry a role are left untouched.
for recipe in new_data:
    for ingredient in recipe['ingredients']:
        lowered_name = ingredient['name'].lower()
        for role, keywords in roles.items():
            if 'role' in ingredient:
                break
            if any(keyword in lowered_name for keyword in keywords):
                ingredient['role'] = role
        

The few ingredients that have no role assigned are removed from their recipe

In [12]:
# Drop every ingredient that did not receive a role.
for recipe in new_data:
    recipe['ingredients'][:] = [
        ingredient
        for ingredient in recipe['ingredients']
        if 'role' in ingredient
    ]

The measure property of each ingredient still contains fractions and mixed numbers (a whole number followed by a fraction). To make calculations easier these were converted to decimal numbers.

e.g. 2 1/2 -> 2.5

In [13]:
# Turn fractional quantities into decimals, e.g. '2 1/2 cups' -> '2.5 cups'.
from fractions import Fraction


def _token_to_float(token):
    """Convert a numeric token such as '2', '0.5' or '1/2' to a float."""
    if '/' in token:
        return float(Fraction(token))
    return float(token)


for recipe in new_data:
    for ingredient in recipe['ingredients']:
        tokens = ingredient['measure'].split()
        amount = _token_to_float(tokens[0])
        if len(tokens) > 1 and '/' in tokens[1]:
            # Mixed number: the second token is the fractional part.
            amount += float(Fraction(tokens[1]))
            remainder = tokens[2:]
        elif amount > 0:
            remainder = tokens[1:]
        else:
            # A zero quantity with no fractional part: measure left untouched.
            continue
        ingredient['measure'] = " ".join([str(round(amount, 3))] + remainder)

The measure and quantity were separated on each ingredient

In [14]:
# Split the combined 'measure' string into a 'quantity' (first token, kept as
# a string) and the unit, which is stored back under 'measure'.
for elem in new_data:
    for ingredient in elem['ingredients']:
        words = ingredient['measure'].split()
        ingredient['quantity'] = words[0]
        if len(words) > 1:
            # Keep every token after the quantity so that multi-word units
            # such as 'fl oz' or 'fluid ounce' are not cut to their first word.
            ingredient['measure'] = ' '.join(words[1:])
        else:
            # No unit given: the ingredient is counted in pieces.
            ingredient['measure'] = 'units'
        

In [15]:
# (singular, plural, cups per unit). The package and box factors were marked
# "random" by the original author, i.e. they are arbitrary guesses.
_CUP_FACTORS = (
    ('cup', 'cups', 1.0),
    ('package', 'packages', 3.0),  # random
    ('box', 'boxes', 4.0),  # random
    ('tablespoon', 'tablespoons', 0.0625),
    ('teaspoon', 'teaspoons', 0.02),
    ('pound', 'pounds', 3.5),
    ('pint', 'pints', 2),
)

# Lookup table: unit spelling -> number of cups one such unit is treated as.
cup_equivalent = {
    spelling: cups
    for singular, plural, cups in _CUP_FACTORS
    for spelling in (singular, plural)
}


def quantity_norm(ingredient):
    """Return the ingredient's quantity expressed in cups.

    :param ingredient: dict with a ``'quantity'`` (anything ``float()``
        accepts) and a ``'measure'`` (unit spelling)
    :return: ``float(quantity)`` multiplied by the unit's cup factor
    :raises KeyError: when the unit has no entry in ``cup_equivalent``; the
        offending ingredient is printed first to help track it down
    """
    unit = ingredient['measure']
    if unit not in cup_equivalent:
        print(ingredient)
    amount = float(ingredient['quantity'])
    return amount * cup_equivalent[unit]

The quantity of each ingredient was normalized to make each recipe have the same quantity of flour. This way, all of the dataset is able to make approximately the same quantity of cookies, since, on average, every cookie recipe has the same amount of flour. Obviously, the proportions vary for each recipe, but they are still similar enough to make the playing field even. By normalizing the quantities the crossover between two recipes won't have as many disproportionate ingredients and will generate more realistic cookie recipes

In [16]:
# Rescale every recipe so that its flour adds up to `flour_value` cups.
flour_value = 1.0
for recipe in new_data:
    # The accumulator is deliberately not called `sum`: rebinding that name
    # at notebook level would hide the builtin from every later cell.
    flour_cups = 0
    for ingredient in recipe['ingredients']:
        if ingredient['role'] == 'flour':
            flour_cups += quantity_norm(ingredient)
    if flour_cups == 0:
        # No flour found: dividing by flour_value gives a scale factor of 1,
        # so the quantities are only rounded.
        flour_cups = flour_value
    for ingredient in recipe['ingredients']:
        ingredient['quantity'] = round(
            float(ingredient['quantity']) * flour_value / flour_cups, 1)


In [17]:
# Serialize the cleaned recipes to data.json. default=str makes json fall
# back to str() for any value it cannot encode natively.
jsonString = json.dumps(new_data, indent=2, default=str)
# The context manager closes the file even if the write raises.
with open("data.json", "w") as jsonFile:
    jsonFile.write(jsonString)