In [1]:
import json
import re

In [2]:
with open("db/full_format_recipes.json") as f:
    recipes = json.load(f)
recipes[:10]
len(recipes)

20130

In [3]:
def flatten(l):
    return [item for sublist in l for item in sublist]


In [4]:
ingredients = flatten([r['ingredients'] for r in filter(lambda r: 'ingredients' in r, recipes)])
ingredients = list(x.lower() for x in ingredients)
ingredients[:10]

['4 cups low-sodium vegetable or chicken stock',
 '1 cup dried brown lentils',
 '1/2 cup dried french green lentils',
 '2 stalks celery, chopped',
 '1 large carrot, peeled and chopped',
 '1 sprig fresh thyme',
 '1 teaspoon kosher salt',
 '1 medium tomato, cored, seeded, and diced',
 '1 small fuji apple, cored and diced',
 '1 tablespoon freshly squeezed lemon juice']

In [5]:
ingredient_words = flatten([re.split(r"\W+", i) for i in ingredients])
ingredient_words[:10]

['4',
 'cups',
 'low',
 'sodium',
 'vegetable',
 'or',
 'chicken',
 'stock',
 '1',
 'cup']

In [None]:
list(filter(lambda x:"sauce" in x, ingredients))

In [7]:
def sanitize(line):
    if 'equipment' in line:
        return None
    line = re.sub(r"\((.+?)\)", r"\1", re.sub(r" ?[0-9/]+ ?", "", line.lower()))
    return line
clean_ingredients = list(filter(lambda x: x, map(sanitize,ingredients)))
clean_ingredients[:10]

['cups low-sodium vegetable or chicken stock',
 'cup dried brown lentils',
 'cup dried french green lentils',
 'stalks celery, chopped',
 'large carrot, peeled and chopped',
 'sprig fresh thyme',
 'teaspoon kosher salt',
 'medium tomato, cored, seeded, and diced',
 'small fuji apple, cored and diced',
 'tablespoon freshly squeezed lemon juice']

In [8]:
_ingredient_id = 0
class Ingredient(object):
    def __init__(self, name, regex=None):
        global _ingredient_id
        _ingredient_id+=1
        self.id = _ingredient_id
        self.name = name
        self.regex = re.compile(regex) if regex else re.compile(f"{name}")
    def __hash__(self):
        return self.id
    def __eq__(self, other):
        return self.id == other.id
    def __repr__(self):
        return f"Ingredient({self.id},{self.name},{self.regex})"
    def test(self,line):
        return self.regex.search(line)
    def clean(self,line):
        return re.sub(self.regex, "", line)

In [9]:
valid_ingredients = r"""
vegetable oil	
white pepper	
orange	
vegetable broth	vegetable (?:broth|stock)
kosher salt	
olive oil	
black pepper	\bpepper\b|\bblack pepper\b|\bpeppercorns\b
seaseme seed	sesame seeds?
mustard seed	mustard seeds?
coriander seed	coriander seeds?
cumin	cumin(?: seeds?)?
baking soda	
baking powder	
red bell pepper	red bell peppers?
heavy cream	heavy cream|half-and-half
bay leaves	bay (?:leaf|leaves)
sea salt	
chicken stock	chicken (?:broth|stock)
stock	\bbroth|stock\b
brown lentils	
green lentils	
lemon	
red wine	
white wine	
scallion	\bscallions?\b|\bgreen onions\b
flank steak
dijon mustard
pork loin
eggs	eggs?
turkey breast
onion	\bonions?\b
coconut milk
chocolate chips	chocolate chips?
steak	\bsteaks?\b
tomato	tomato(?:es)?
olive	olives?
oil	\boil\b
soy sauce
sage	\bsage\b
caper	\bcapers?\b
cranberry	cranberry|cranberries
corn	\bcorn\b
pork	\bpork\b
maple syrup
ginger ale
chilli powder
green beans
black beans
red beans
blackberry	blackberries|blackberry
beef	\bbeef\b
steak	\bsteak\b
""".strip()
valid_ingredients += '\n' + r"""
dates
lime
tortilla
salt
sugar
olive
garlic
cream
flour
chicken
vinegar
butter
wine
parsley
vanilla
cheese
thyme
mustard
rice
lentils
celery
apple
currants
lettuce
baguette
fennel
anchovy
gelatin
mayonnaise
mustard
peas
potatoes
rosemary
shallot
cloves
nutmeg
whiskey
milk
brandy
chives
basil
ginger
lemongrass
candlenuts
cilantro
peach
spinach
mint
bread
breadcrumbs
pesto sauce
broccoli
pomegranate
peanut
crab
cinnamon
honey
cornstarch
oregano
carrot
allspice
coriander
dill
tarragon
walnut
paprika
almond
cayenne
turmeric
bacon
cardamom
parmesan
raisin
pecan
mushroom
ketchup
water
avocado
asparagus
ham
kumquat
chocolate	
zucchini
curry
shrimp
banana
""".strip()

valid_ingredients=valid_ingredients.splitlines()

valid_ingredients = [
    p.split('\t') for p in valid_ingredients
]
_ingredient_id = 0
valid_ingredients = [
    Ingredient(i[0].strip(), i[1] if len(i) > 1 else None)
    for i in valid_ingredients
]

In [10]:
bad_ingredients = """
cup
tablespoons
teaspoon
chopped
fresh
cups
or
tablespoon
and
large
teaspoons
into
cut
ground
sliced
finely
to
grated
minced
ounces
water
unsalted
dried
pound
plus
of
thinly
leaves
pieces
pounds
a
whole
medium
for
coarsely
dry
divided
as
slices
peeled
freshly
small
with
halved
in
baking
drained
taste
room
strips
thin
the
coarse
Special
quartered
from
zest
stalks
hot
,
sprig
squeezed
cored
seeded
diced
extra-virgin
toasted
reduced-sodium
clove
red-
low-fat
into-inch
trimmed
lengthwise
-ounce
lb
if
desired
""".strip().split('\n')

In [11]:
goods=valid_ingredients
bads=set(bad_ingredients)
def parser(line):
    result = []
    cleaned_line = line
    for i in goods:
        if i.test(line):
            result.append(i)
        cleaned_line = i.clean(cleaned_line)
    words = cleaned_line.replace(',','').split()
    return result, list(filter(lambda w: w not in bads, words)), line

In [16]:
print(valid_ingredients)
list(i[0][0].name for i in map(parser, clean_ingredients[:10]))

[Ingredient(1,vegetable oil,re.compile('vegetable oil')), Ingredient(2,white pepper,re.compile('white pepper')), Ingredient(3,orange,re.compile('orange')), Ingredient(4,vegetable broth,re.compile('vegetable (?:broth|stock)')), Ingredient(5,kosher salt,re.compile('kosher salt')), Ingredient(6,olive oil,re.compile('olive oil')), Ingredient(7,black pepper,re.compile('\\bpepper\\b|\\bblack pepper\\b|\\bpeppercorns\\b')), Ingredient(8,seaseme seed,re.compile('sesame seeds?')), Ingredient(9,mustard seed,re.compile('mustard seeds?')), Ingredient(10,coriander seed,re.compile('coriander seeds?')), Ingredient(11,cumin,re.compile('cumin(?: seeds?)?')), Ingredient(12,baking soda,re.compile('baking soda')), Ingredient(13,baking powder,re.compile('baking powder')), Ingredient(14,red bell pepper,re.compile('red bell peppers?')), Ingredient(15,heavy cream,re.compile('heavy cream|half-and-half')), Ingredient(16,bay leaves,re.compile('bay (?:leaf|leaves)')), Ingredient(17,sea salt,re.compile('sea salt')

['chicken stock',
 'brown lentils',
 'green lentils',
 'celery',
 'carrot',
 'thyme',
 'kosher salt',
 'tomato',
 'apple',
 'lemon']

In [None]:
import multiprocessing as mp

num_cores = mp.cpu_count()

def do_task(r):
    start, end = r
    return map(parser, clean_ingredients[start:end])

parse_results=[]
with mp.Pool(num_cores) as pool:
    N = len(clean_ingredients)
    step = N // num_cores
    tasks = []
    for i in range(num_cores):
        tasks.append((i*step, (i+1)*step))
    for result in pool.map(do_task, tasks):
        parse_results+=result
print(len(parse_results))

In [None]:
for i, bad, orig in parse_results:
    if not i:
        print(f"=====\n{orig}\n{i}\n{bad}")

In [None]:
bad_words={}
bad_results=[]
for i, bad, orig in parse_results:
    if not i:
        bad_results.append((i, bad, orig))
        for w in bad:
            if w in bad_words:
                bad_words[w]+=1
            else:
                bad_words[w]=1
len(bad_words)
for w in sorted(list(bad_words.keys()),key=lambda w: bad_words[w], reverse = True):
    print(f"{w}\t{bad_words[w]}")

In [None]:
bad_count = 0
for i,_,_ in parse_results:
    if not i:
        bad_count += 1
print(bad_count / len(parse_results))

In [25]:
bad_count = 0
for recipe in recipes:
    if 'ingredients' not in recipe:
        bad_count += 1
        continue
    bad_flag = False
    local_count = 0
    for ingredient in recipe['ingredients']:
        ing = sanitize(ingredient)
        if not ing:
            continue
        i, bad, orig = parser(ing)
        if not i:
            #print(orig + '\n' + repr(bad))
            bad_flag = True
            local_count += 1
    if local_count >= 2:
        bad_count+=1
    recipe['bad']=local_count
    #if bad_flag: print("================")

In [26]:
print(f"{len(recipes)-bad_count}+{bad_count}/{len(recipes)}\t{bad_count/len(recipes)}")    
print(bad_count / len(recipes))

11046+9084/20130	0.4512667660208644
0.4512667660208644


In [None]:
valid_ingredients

In [48]:
ingredient_json = [
    {"id": i.id, "name": i.name, "pattern": i.regex.pattern} for i in valid_ingredients
]

In [49]:
ingredient_json

[{'id': 1, 'name': 'vegetable oil', 'pattern': 'vegetable oil'},
 {'id': 2, 'name': 'white pepper', 'pattern': 'white pepper'},
 {'id': 3, 'name': 'orange', 'pattern': 'orange'},
 {'id': 4, 'name': 'vegetable broth', 'pattern': 'vegetable (?:broth|stock)'},
 {'id': 5, 'name': 'kosher salt', 'pattern': 'kosher salt'},
 {'id': 6, 'name': 'olive oil', 'pattern': 'olive oil'},
 {'id': 7,
  'name': 'black pepper',
  'pattern': '\\bpepper\\b|\\bblack pepper\\b|\\bpeppercorns\\b'},
 {'id': 8, 'name': 'seaseme seed', 'pattern': 'sesame seeds?'},
 {'id': 9, 'name': 'mustard seed', 'pattern': 'mustard seeds?'},
 {'id': 10, 'name': 'coriander seed', 'pattern': 'coriander seeds?'},
 {'id': 11, 'name': 'cumin', 'pattern': 'cumin(?: seeds?)?'},
 {'id': 12, 'name': 'baking soda', 'pattern': 'baking soda'},
 {'id': 13, 'name': 'baking powder', 'pattern': 'baking powder'},
 {'id': 14, 'name': 'red bell pepper', 'pattern': 'red bell peppers?'},
 {'id': 15, 'name': 'heavy cream', 'pattern': 'heavy cream|

In [58]:
missing_count = 0
for recipe in recipes:
    missing = False
    if 'ingredients' in recipe:
        recipe['ingredient_ids']=set()
        for ingredient in recipe['ingredients']:
            ing = sanitize(ingredient)
            if not ing:
                continue
            i, bad, orig = parser(ing)
            if i:
                recipe['ingredient_ids'].add(i[0].id)
            else:
                missing = True
        if missing:
            missing_count += 1
        recipe['ingredient_ids']=list(recipe['ingredient_ids'])
print(missing_count/len(recipes))

0.776602086438152


In [51]:
xs = {}
for r in recipes:
    if 'bad' in r:
        c = r['bad']
        if c in xs:
            xs[c]+=1
        else:
            xs[c]=1

In [52]:
xs

{0: 4478,
 1: 6568,
 3: 2559,
 4: 1105,
 2: 4681,
 5: 428,
 6: 176,
 7: 66,
 9: 11,
 8: 29,
 11: 2,
 15: 1,
 10: 4,
 17: 1,
 12: 1,
 14: 1}

In [53]:
total = 0
for i in range(3, 15):
    if i in xs:
        total += xs[i]
print(len(recipes)-total)

15748


In [60]:
good_recipes = list(filter(lambda r: 'bad' in r and r['bad']<=2, recipes))

In [61]:
keys = ['directions', 'fat', 'categories', 'calories', 'desc', 'protein', 'rating', 'title', 'ingredients', 'sodium', 'ingredient_ids']

In [62]:
recipes_json = [{ key : gr[key] for key in keys} for gr in good_recipes]

In [63]:
with open('./ingredients.json', 'w') as f:
    json.dump(ingredient_json, f)
with open('./recipes.json', 'w') as f:
    json.dump(recipes_json, f)