In [51]:
import ast
import os
import re
import string
from pprint import pprint

import numpy as np
import pandas as pd
import spacy

from utils.helper import read_xml

In [6]:
# Directory containing the input data files, given as a relative path.
data_path = os.path.join("..", "data")

In [7]:
# Load the cocktail XML via the project helper, then report how many child
# entries the returned object has and what type it is.
cocktails_file = os.path.join(data_path, "ccc_cocktails.xml")
cocktails = read_xml(cocktails_file)
print(len(cocktails), type(cocktails), sep="\n")

108
<class 'xml.etree.ElementTree.Element'>


In [8]:
# Columns whose cells hold Python-literal text; each cell is parsed back into
# a Python object with ast.literal_eval while the file is read.
literal_columns = ("ingredients", "ingredients_raw_str", "steps", "tags", "search_terms")

recipes_file = os.path.join(data_path, "recipes_w_search_terms.csv")
df = pd.read_csv(
    recipes_file,
    encoding="utf-8",
    dtype={"id": int, "name": str, "description": str, "serving_size": str, "servings": np.uint8},
    converters=dict.fromkeys(literal_columns, ast.literal_eval),
)

In [9]:
# Preview the first rows to check how the columns were parsed.
df.head()

Unnamed: 0,id,name,description,ingredients,ingredients_raw_str,serving_size,servings,steps,tags,search_terms
0,96313,Grilled Garlic Cheese Grits,"We love grits, this is another good way to ser...","[water, grits, salt, cheddar cheese, garlic, o...","[4 cups water, 1 cup uncooked old fas...",1 (155 g),8,"[I a sauce pan, bring water to a boil; slowly ...","[time-to-make, course, main-ingredient, prepar...","{side, low-calorie, low-carb, vegetarian, diab..."
1,232037,Simple Shrimp and Andouille Jambalaya,"Simple, easy and very tasty for when you are i...","[onion, red bell pepper, garlic cloves, large ...","[1 medium onion, chopped coarse , 1 med...",1 (366 g),4,"[In a food processor, pulse the onion, red pep...","[60-minutes-or-less, time-to-make, course, mai...","{shrimp, dinner}"
2,41090,black-and-white bean salad,,"[white beans, canned black beans, tomatoes, on...","[1 cup canned white beans, rinsed and drai...",1 (807 g),1,"[In a large bowl, combine beans, tomato, onion...","[15-minutes-or-less, time-to-make, course, mai...","{side, vegan, dinner, vegetarian, salad}"
3,60656,Crock Pot Italian Zucchini,This is a good recipe for weight watchers. It ...,"[zucchini, yellow squash, diced tomatoes, onio...","[2 zucchini, sliced , 2 small yello...",1 (244 g),4,[Put all ingredients in the crock pot and cook...,"[weeknight, time-to-make, course, main-ingredi...","{vegetarian, side, italian}"
4,232047,Beef Stew With Dried Cherries,This is a fabulous stew that came from one of ...,"[beef stew meat, flour, salt, allspice, cinnam...","[3 lbs beef stew meat, 3 tablespoons ...",1 (358 g),8,"[Preheat oven to 350°F., Cut beef into 1 inch ...","[time-to-make, course, main-ingredient, prepar...",{dinner}


In [10]:
# Copy the ingredient column out of the frame as an array, then reduce it to
# its sorted, distinct ingredient lists.
ingredient_column = df["ingredients"].to_numpy(copy=True)
ingredient_lists = np.unique(ingredient_column)
ingredient_lists

array([list([]),
       list(['', '', 'blue curacao', 'cointreau liqueur', 'sugar', 'prosecco']),
       list(['', 'absolut vodka', 'kahlua', 'ice']), ...,
       list(['zwieback toast crumbs', 'butter', 'sugar', 'cream cheese', 'sugar', 'vanilla', 'lemon', 'lemon juice', 'eggs', 'sour cream', 'sugar', 'vanilla', 'lemon juice']),
       list(['zwieback toast crumbs', 'sugar', 'butter', 'cream cheese', 'half-and-half', 'canned pumpkin', 'sugar', 'all-purpose flour', 'vanilla', 'ground cinnamon', 'ground ginger', 'ground nutmeg', 'salt', 'eggs', 'sour cream', 'sugar', 'vanilla']),
       list(['zwieback toast crumbs', 'sugar', 'margarine', 'unflavored gelatin', 'water', 'lime juice', 'eggs', 'sugar', 'lime peel', 'lowfat neufchatel cheese', 'whipped topping'])],
      dtype=object)

In [None]:
# Flatten every ingredient list into one set of distinct ingredient names.
ingredients = {name for sublist in ingredient_lists for name in sublist}

# Drop the empty placeholder name; discard() is a no-op when it is absent,
# whereas remove() would raise KeyError.
ingredients.discard("")

# Show the names that contain a "%xx"-style sequence. re.search is used so the
# sequence is found anywhere in the name, including after a space in
# multi-word names (re.match only looks at the start of the string).
print([name for name in ingredients if re.search(r"%[0-9a-z]{2}", name)])

In [None]:
# Encoded sequences that occur in ingredient names, mapped to the text that
# replaces them. Replacements are applied in insertion order, and that order
# matters: "%26" and "%3b" are decoded to "&" and ";" before "&reg;" is
# stripped, so a percent-encoded "%26reg%3b" is removed as well.
chars_to_replace = {
    "%e2%80%99": " ",  # right single quotation mark
    "%e2%80%93": "-",  # en dash
    "%e2%84%a2": "",  # trade mark sign
    "%c3%89": "é",  # E with acute (encoded as the capital letter)
    "%c3%b1": "ñ",  # n with tilde; kept as a letter, like the other accented letters
    "%c3%ba": "ú",  # u with acute
    "%c2%ae": "",  # registered sign
    "%26": "&",
    "%3b": ";",
    "&reg;": "",  # HTML entity for the registered sign
    "%2f": "/",
    "%2c": ",",
    "%22": "",  # double quote
    "%27": " ",  # apostrophe
    "%25": "",  # percent sign
    "%28": "",  # opening parenthesis
    "%29": "",  # closing parenthesis
    "%3f": "",  # question mark
}

# Apply every replacement, in table order, to each name and collect the result
# in a single new list.
# NOTE(review): different encoded names can decode to the same text, so this
# list may contain duplicates - confirm whether downstream code expects that.
decoded_ingredients = []
for name in ingredients:
    for key, value in chars_to_replace.items():
        name = name.replace(key, value)
    decoded_ingredients.append(name)
ingredients = decoded_ingredients

print(f"Number of ingredients:\n{len(ingredients)}")
print("Ingredients:")
pprint(ingredients)

In [None]:
# Pair each ingredient name with the lemmas of the tokens that spaCy tags as
# nouns, skipping tokens whose text is found in the punctuation string.
nlp = spacy.load("en_core_web_sm")
exclusions = string.punctuation
base_ingredients = []
for ingredient in ingredients:
    nouns = [
        token.lemma_
        for token in nlp(ingredient)
        if token.pos_ == "NOUN" and token.text not in exclusions
    ]
    base_ingredients.append((ingredient, nouns))

In [63]:
# Alias (not a copy) of base_ingredients; despite the name this is a list of
# (ingredient, noun lemmas) tuples rather than a set.
ingredient_set = base_ingredients
# Report how many entries there are, then pretty-print all of them.
print(len(ingredient_set))
pprint(ingredient_set)

18619
[('fruit spread', ['fruit', 'spread']),
 ('croissant dough', ['dough']),
 ('instant vanilla pudding', ['vanilla', 'pudding']),
 ('pepper bacon', ['pepper', 'bacon']),
 ('soy sour cream', ['cream']),
 ('low calorie sweetener', ['calorie', 'sweetener']),
 ('spinach rotini pasta', ['rotini', 'pasta']),
 ('reduced-fat mexican cheese blend', ['cheese', 'blend']),
 ('dark cooking chocolate', ['cooking', 'chocolate']),
 ('betty crocker chicken helper chicken and herb rice',
  ['helper', 'chicken', 'herb', 'rice']),
 ('light olive oil', ['olive', 'oil']),
 ('creamed shortening', ['shortening']),
 ('glass container', ['glass', 'container']),
 ('alpine lace 97% fat-free roast beef', ['lace', 'roast', 'beef']),
 ('white chocolate candies', ['candy']),
 ('clear jel', []),
 ('dry red pepper', ['pepper']),
 ('hong kong noodles', ['noodle']),
 ('chopped pimiento', []),
 ('frito-lay bean dip', ['bean', 'dip']),
 ('italian sweet vermouth', ['vermouth']),
 ('tabasco pepper', ['pepper']),
 ('garden