In [26]:
import pymongo
import pandas as pd
import numpy as np
import ast

In [2]:
# Open a connection to the MongoDB server on localhost and keep handles to the
# database and the two collections referenced by the rest of the notebook.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client.get_database("food_analysis")
urls = db.get_collection("urls")
recipes = db.get_collection("recipes")

In [42]:
# Parser Class
class IngredientParser:
    """
    Rule-based parser that breaks one free-text ingredient line (for example
    ``"1 cup onion, diced"``) into quantity, unit, item, item types and descriptors.

    All matching is plain, case-insensitive substring search against the static
    reference tables built in ``__init__``; no regular expressions are involved.
    """

    # Suffixes tried, in this order, after " <unit>" when looking for a unit:
    # plural "s", plural "es", the bare word, and the abbreviated form with a period.
    _UNIT_SUFFIXES = ("s ", "es ", " ", ".")

    # Single-character vulgar fractions and their ASCII "numerator/denominator" spelling.
    _UNICODE_FRACTIONS = {"↉": "0", "⅒": "1/10", "⅑": "1/9", "⅛": "1/8",
                          "⅐": "1/7", "⅙": "1/6", "⅕": "1/5", "¼": "1/4",
                          "⅓": "1/3", "½": "1/2", "⅖": "2/5", "⅔": "2/3",
                          "⅜": "3/8", "⅗": "3/5", "¾": "3/4", "⅘": "4/5",
                          "⅝": "5/8", "⅚": "5/6", "⅞": "7/8"}

    def __init__(self):
        """
        No params needed; initialization sets up the reference tables used by ``Parse``.

        The ingredient reference (``self.ir``) is crude and static and would need to be
        widely expanded for general use.

        Attributes set here:
            units: unit words recognised in an ingredient line, tried in list order.
            descriptors: preparation/state words reported by ``parse_ing_descriptors``.
            ir: ingredient reference. Maps an item name to a dict with a required
                ``"types"`` list and optional ``"aka"`` (aliases) and ``"stopwords"``
                (substrings that veto a match on the item name). Insertion order matters:
                ``parse_ing_item`` returns the first entry that matches.
        """
        self.units = [
            "cup", "c.", "g", "gram", "lb", "teaspoon", "tsp", "tbsp", "oz", "tablespoon",
            "container", "packet", "bag", "stick", "quart", "pound", "can", "bottle", "pint",
            "package", "ounce", "jars", "heads", "gallons", "drops", "drop", "envelope", "bar",
            "box", "pinch", "dash", "bunch", "recipe", "layer", "slice", "link", "bulb",
            "stalk", "square", "sprig", "fillet", "piece", "leg", "thigh", "cube", "granule",
            "strip", "tray", "leave", "loaves", "halves", "jar",
        ]

        # "packed" used to be listed twice, which made it show up twice in the results.
        self.descriptors = [
            "beaten", "chopped", "cold", "diced", "packed", "hot", "large", "melted", "mini",
            "miniature", "room temperature", "sifted", "softened", "warm", "drained",
            "rinsed", "optional", "mashed", "ripe", "medium", "ground", "pitted", "minced",
        ]

        self.ir = {
            "flour": {
                # "garfava" and "barley" were previously fused into one string by a
                # missing comma, so neither type could ever match.
                "types": ["all purpose", "white", "brown", "spelt", "oat", "millet",
                          "whole wheat", "almond", "tapioca", "sorghum", "rice", "coconut",
                          "garfava", "barley", "cake", "bread", "self-rising", "soy",
                          "pastry"]
            },
            "brown sugar": {
                "types": ["dark", "light"],
                "aka": ["brown-sugar"]
            },
            "sugar": {
                "types": ["granulated", "white", "substitute", "splenda", "cane"],
                "aka": ["splenda"],
                "stopwords": ["free"]
            },
            "butter": {
                "types": ["salted", "unsalted"],
                "stopwords": ["almond", "peanut", "scotch"]
            },
            "water": {
                "types": []
            },
            "baking powder": {
                "types": [],
                "aka": ["baking-powder"]
            },
            "salt": {
                "types": ["kosher", "coarse", "sea", "popcorn", "fine", "himalayan", "table"],
                "aka": ["sea-salt"]
            },
            "chocolate chip": {
                "types": ["semi sweet", "milk", "dark", "white", "bittersweet"],
                "aka": ["chocolate chips"]
            },
            "chocolate chunk": {
                "types": ["semi sweet", "milk", "dark", "white", "bittersweet"],
                "aka": ["chocolate chunks", "chocolate morsels", "chocolate morsel"]
            },
            "baking soda": {
                "types": [],
                "aka": ["baking-soda"]
            },
            "oat": {
                "types": ["old fashioned", "rolled", "quick-cooking"],
                "aka": ["rolled-oats"],
                "stopwords": ["milk", "flour"]
            },
            "vanilla extract": {
                "types": ["madagascar"],
                "aka": ["vanilla"]
            },
            "shortening": {
                # Was misspelled "vegatable", which never matched real ingredient text.
                "types": ["vegetable"]
            },
            "walnut": {
                "types": ["roasted", "toasted"],
                "aka": ["walnuts"],
                "stopwords": [" nuts"]
            },
            "pecan": {
                "types": ["roasted", "toasted"],
                "aka": ["pecans"]
            },
            "oil": {
                "types": ["olive", "vegetable", "avocado", "canola", "virgin", "coconut"]
            },
            "coconut": {
                "types": ["flaked", "unsweetened"],
                "stopwords": ["milk", "pudding"]
            },
            "chickpeas": {
                "types": [],
                # The original (misspelled) aliases are kept; the common spelling is added.
                "aka": ["garbonzo beans", "garbanzo beans", "ghana"]
            },
            "cooking spray": {
                "types": ["nonstick"],
                "aka": ["pam"]
            },
            "milk": {
                "types": ["whole", "2%", "skim", "dairy free", "coconut", "almond", "oat",
                          "soy"]
            },
            "sour cream": {
                "types": ["fat free", "dairy free"]
            },
            "banana": {
                "types": ["cavendish"],
                "stopwords": ["extract"]
            },
            "banana extract": {
                "types": []
            },
            "cornstarch": {
                "types": []
            },
            "cocoa powder": {
                "types": []
            },
            "nuts": {
                "types": ["walnut", "pecan", "peanut"]
            },
            "peanut": {
                "types": [],
                "stopwords": [" nuts", "butter"]
            },
            "honey": {
                "types": []
            },
            "baking mix": {
                "types": []
            },
            "cherry": {
                "types": [],
                "aka": ["cherries"]
            },
            "almond extract": {
                "types": []
            },
            "almond": {
                "types": [],
                "stopwords": ["milk", "extract"]
            },
            "pudding": {
                "types": ["coconut cream", "butterscotch"]
            },
            "heavy cream": {
                "types": []
            },
            "pretzels": {
                "types": []
            },
            "date": {
                "types": ["medjool"]
            },
            "flax seeds": {
                "types": []
            },
            "quinoa": {
                "types": []
            },
            "applesauce": {
                "types": []
            },
            "xanthan gum": {
                "types": []
            },
            "cinnamon": {
                "types": ["ground", "saigon"]
            },
            "instant coffee": {
                "types": []
            },
            "peanut butter": {
                "types": []
            },
            "ginger": {
                "types": []
            },
            "molasses": {
                "types": ["unsulphured", "un-sulphured", "blackstrap"]
            },
            "cloves": {
                "types": [],
                "aka": ["clove"]
            },
            "allspice": {
                "types": [],
                "aka": ["all spice"]
            },
            "black pepper": {
                "types": ["ground pepper"]
            },
            "strawberry": {
                "types": [],
                "aka": ["strawberries"]
            },
            "instant espresso": {
                "types": []
            },
            "pumpkin pie spice": {
                "types": []
            },
            "apple pie spice": {
                "types": []
            },
            "orange peel": {
                "types": []
            },
            "margarine": {
                "types": []
            },
            "chocolate": {
                "types": []
            },
            "egg noodle": {
                "types": []
            },
            "onion": {
                "types": ["yellow", "spanish", "sweet", "red"]
            },
            "ground beef": {
                "types": []
            },
            "mushroom": {
                "types": [],
                "stopwords": ["cream of"]
            },
            "cream of chicken soup": {
                "types": ["undiluted"]
            },
            "dried parsley": {
                "types": []
            },
            "garlic": {
                "types": ["clove"]
            },
            "cream of mushroom soup": {
                "types": []
            },
        }

    def Parse(self, ing):
        """
        Core function for processing an ingredient.

        The line is lower-cased once and every later step works on that text, so
        recognition does not depend on the capitalisation of the input. The first unit
        from ``self.units`` found in the text splits it into a quantity part (before the
        unit) and an item part (after the unit). When no unit is found, the whole line
        is used for both parts.

        :param ing: str, one ingredient line from a recipe
        :return: dict with keys ``ingredient_string`` (lower-cased input), ``quant``
                 (number), ``unit`` (str or None), ``item`` (key of ``self.ir`` or
                 None), ``type`` (list of str) and ``descriptors`` (list of str)
        """
        text = ing.lower()

        # Output schema for parsing
        ing_dict = {"ingredient_string": text,
                    "quant": None,
                    "unit": None,
                    "item": None,
                    "type": None,
                    "descriptors": None}

        # Fallback when no unit is recognised: look for quantity and item in the full line.
        quant_part, item_part = text, text

        # Unit parsing: first unit (in list order) and first suffix (in suffix order) wins.
        for unit in self.units:
            separator = self._find_unit_separator(text, unit)
            if separator is not None:
                ing_dict["unit"] = unit
                # Split on the first occurrence only, so nothing after a repeated
                # unit word is dropped from the item part.
                quant_part, _, item_part = text.partition(separator)
                break

        ing_dict["quant"] = self.parse_quant(quant_part)

        # Ingredient parsing
        ing_dict["item"] = self.parse_ing_item(item_part)
        ing_dict["type"] = self.parse_ing_types(item_part, ing_dict["item"])
        ing_dict["descriptors"] = self.parse_ing_descriptors(item_part)

        return ing_dict

    def _find_unit_separator(self, text, unit):
        """Return the first " <unit><suffix>" spelling present in ``text``, or None."""
        for suffix in self._UNIT_SUFFIXES:
            separator = " " + unit + suffix
            if separator in text:
                return separator
        return None

    @staticmethod
    def _literal_number(token):
        """Return ``token`` as an int/float if it is a plain numeric literal, else None."""
        try:
            value = ast.literal_eval(token)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
        # literal_eval also accepts strings, tuples, booleans, ...; only numbers count.
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return None
        return value

    def parse_quant(self, quant):
        """
        Parse out a numerical quantity from fractional and other representations.

        Whitespace-separated tokens are summed, so ``"1 1/2"`` gives 1.5. Unicode
        vulgar fractions (``"½"``) and the unicode fraction slash (``"⁄"``) are
        converted to their ASCII form first. Tokens that are not numbers contribute
        nothing. Parsing stops at the first token that contains ``"-"`` or ``"to"``
        (and no ``"/"``), so only the lower bound of a range such as ``"2 to 3"`` is
        counted.

        :param quant: str containing the quantity to convert, or None
        :return: int or float quantity; 0 when nothing numeric is found
        """
        if quant is None:
            return 0

        new_quant = quant.replace("⁄", "/")
        for frac, ascii_form in self._UNICODE_FRACTIONS.items():
            # str.replace returns a new string; the result has to be kept. The leading
            # space separates a glued whole number, turning "1½" into "1 1/2".
            new_quant = new_quant.replace(frac, " " + ascii_form)

        quant_num = 0
        for num in new_quant.split():
            if "/" in num:
                pieces = num.split("/")
                numerator = self._literal_number(pieces[0])
                denominator = self._literal_number(pieces[1])
                # Malformed fractions and division by zero are skipped.
                if numerator is not None and denominator:
                    quant_num += numerator / denominator
            elif "-" in num or "to" in num:
                break
            else:
                value = self._literal_number(num)
                if value is not None:
                    quant_num += value

        return quant_num

    def parse_ing_descriptors(self, ing_string):
        """
        Collect every known descriptor that occurs in an ingredient string.

        Multi-word descriptors also match in their hyphenated spelling
        (``"room temperature"`` matches ``"room-temperature"``).

        :param ing_string: str, ingredient text to search
        :return: list of matching entries of ``self.descriptors``, in table order
        """
        descriptors_found = []
        for desc in self.descriptors:
            hyphenated = desc.replace(" ", "-")
            if desc in ing_string or hyphenated in ing_string:
                descriptors_found.append(desc)

        return descriptors_found

    def parse_ing_types(self, ing_string, item):
        """
        Parse out the ingredient types for an ingredient string by substring search.

        Multi-word types also match in their hyphenated spelling.

        :param ing_string: str of an ingredient from a recipe's ingredient list
        :param item: the item in ``self.ir`` to which the ingredient belongs (may be None)
        :return: list of the item's types found in the string; empty when ``item`` is
                 not a key of ``self.ir``
        """
        found_types = []
        if item not in self.ir:
            return found_types

        for ing_type in self.ir[item]["types"]:
            hyphenated = ing_type.replace(" ", "-")
            if ing_type in ing_string or hyphenated in ing_string:
                found_types.append(ing_type)

        return found_types

    def parse_ing_item(self, ing_string):
        """
        Determine which item the ingredient is, using ``self.ir``.

        Entries are tried in insertion order. An entry matches when its name occurs in
        the string and none of its stop words do, or, if the name does not occur, when
        any of its aliases (``"aka"``) does.

        :param ing_string: ingredient string to look in
        :return: the key of ``self.ir`` the ingredient corresponds to, or None
        """
        for item, spec in self.ir.items():
            if item in ing_string:
                vetoed = any(sw in ing_string for sw in spec.get("stopwords", ()))
                if not vetoed:
                    return item
                # The name matched but a stop word vetoed it: move on to the next
                # entry without consulting this entry's aliases.
                continue

            if any(alias in ing_string for alias in spec.get("aka", ())):
                return item

        return None

In [None]:
def unify_units(unit, quant):
    """
    Map a unit spelling onto one canonical name.

    The function previously only built its lookup table and returned None. It now
    resolves ``unit`` against that table. Only the unit *name* is normalised:
    ``quant`` is passed through unchanged and no conversion between different
    measures (for example cups to grams) is attempted.

    :param unit: str unit as found in an ingredient line (e.g. ``"teaspoon"``), or None
    :param quant: the quantity that goes with ``unit``
    :return: tuple ``(canonical_unit, quant)``; ``unit`` is returned as-is when it is
             not a string or has no entry in the table
    """
    unit_ref = {"cup": ["cup", "c."],
                "gram": ["gram", "g", "g."],
                "lb": ["lb", "pound"],
                "tsp": ["tsp", "teaspoon"],
                "tbsp": ["tbsp", "tablespoon"],
                # Plain "oz" was missing although it is itself a common spelling.
                "oz": ["oz", "oz.", "ounce"]}

    if not isinstance(unit, str):
        return unit, quant

    spelling = unit.strip().lower()
    # Also try the singular form so that "cups" or "ounces" resolve as well.
    candidates = [spelling]
    if spelling.endswith("s"):
        candidates.append(spelling[:-1])

    for canonical, aliases in unit_ref.items():
        if any(candidate in aliases for candidate in candidates):
            return canonical, quant

    return unit, quant

In [None]:
# Scratch copy of the unit vocabulary (same entries, same order, as IngredientParser.units).
[
    "cup", "c.", "g", "gram", "lb", "teaspoon", "tsp", "tbsp", "oz", "tablespoon",
    "container", "packet",
    "bag",
    "stick",
    "quart", "pound", "can", "bottle", "pint", "package", "ounce",
    "jars", "heads", "gallons", "drops",
    "drop",
    "envelope", "bar", "box", "pinch", "dash", "bunch", "recipe",
    "layer", "slice", "link", "bulb",
    "stalk",
    "square", "sprig",
    "fillet", "piece", "leg", "thigh", "cube", "granule", "strip",
    "tray", "leave", "loaves", "halves",
    "jar",
]

In [4]:
# Parser Function
def parse_recipe_ingredients(ing_list):
    """
    Run every ingredient line of one recipe through a fresh IngredientParser.

    :param ing_list: iterable of ingredient strings
    :return: list with one parsed dict per ingredient, in input order
    """
    parser = IngredientParser()
    return [parser.Parse(line) for line in ing_list]

In [49]:
# Search term matched against recipe titles in the query further down
rec_name = "mac and cheese"

In [52]:
# Select recipes whose title matches the search term and whose ingredients mention
# white cheddar; both conditions are case-insensitive regex matches.
query = {
    "title": {"$regex": rec_name, "$options": 'i'},
    "ingredients": {"$regex": 'white cheddar', "$options": 'i'},
}
# Materialise the cursor into a list so pandas builds one row per document.
df = pd.DataFrame(list(recipes.find(query)))

In [53]:
# Display the DataFrame built from the query results.
df

Unnamed: 0,_id,title,total_time,yields,ingredients,instructions,image,rating,author,reviews,...,prepTime,cookTime,totalTime,datePublished,recipeYield,recipeCategory,cookingMethod,recipeCuisine,review_count,keywords
0,5f221a6a61c02dddcf3f8378,Longhorn Steakhouse Mac and Cheese,40.0,,"[1 pound cavatappi pasta, 2 tablespoons butter...",,https://copykat.com/wp-content/uploads/2018/10...,4.38 from 8 votes,,,...,,,,,,,,,,
1,5f22f0b003178f767f247bed,The BEST Stovetop Mac and Cheese,20.0,8 servings,"[2 tablespoons butter, 2 tablespoons flour, 3 ...","b""Melt butter in a large stockpot over medium-...",https://www.gimmesomeoven.com/wp-content/uploa...,4.900000,,,...,,,,,,,,,,
2,5f22042b61c02dddcf3f78a0,Mini Mac and Cheese,30.0,24 serving(s),"[1 pound elbow macaroni, 4 tablespoons (1/4 cu...",b'Preheat the oven to 350 degrees F. Butter 2 ...,https://food.fnr.sndimg.com/content/dam/images...,3.000000,sandra-lee,,...,,,,,,,,,,
3,5f25fe9284787bbaa4c45382,KICKED UP MAC AND CHEESE,60.0,6 serving(s),"[2 teaspoons butter, 2 eggs, 2 cups whole milk...",b'Preheat the oven to 350 degrees F. Grease a ...,https://food.fnr.sndimg.com/content/dam/images...,-1,emeril-lagasse,,...,,,,,,,,,,
4,5f288e0e962730a7ad39ebd0,Ultimate Lobster Mac and Cheese,57.0,6 serving(s),"[kosher salt, 1 lb elbow macaroni or 1 lb your...",b'Preheat oven to 375 degrees F.\nAdd pasta to...,"https://img.sndimg.com/food/image/upload/q_92,...",5.000000,,,...,,,,,,,,,,
5,5f289efd937142c226d43387,Company-Worthy Mac and Cheese,,,"[12 slices uncooked bacon, 1 teaspoon minced g...",[Cook bacon according to package directions. C...,,4.8,Sally,"[{'@type': 'Review', 'reviewRating': {'@type':...",...,PT15M,PT35M,PT1H,2019-03-18,8.0,[Dinner],[Cook],[American],10.0,"[macaroni and cheese, pasta]"
6,5f28bc71962730a7ad3a019d,Mac and Cheese Primavera,30.0,6 serving(s),"[1 pound medium pasta shells, 2 tablespoons sa...",b'Cook the pasta according to the package inst...,https://food.fnr.sndimg.com/content/dam/images...,5.000000,ree-drummond,,...,,,,,,,,,,
7,5f28d455962730a7ad3a0cc0,Kicked Up Mac and Cheese,,6 serving(s),"[2 teaspoons butter, 2 eggs, 2 cups of whole m...",b'Preheat the oven to 350 degrees F. Grease a ...,https://food.fnr.sndimg.com/content/dam/images...,-1,emeril-lagasse,,...,,,,,,,,,,
8,5f297a1164170ca38045bc01,Creamy Butternut Squash Mac and Cheese with Kale,,,[4 cups peeled and cubed butternut squash (abo...,"[Combine the squash, broth, milk, and garlic i...",,5,Sally,"[{'@type': 'Review', 'reviewRating': {'@type':...",...,PT45M,PT30M,PT1H10M,2015-10-01,8.0,[Dinner],[Cooking],[American],1.0,"[mac and cheese, butternut squash mac and che..."
9,5f2c7966bece05c4910accc2,Crawfish Mac and Cheese,75.0,8 serving(s),"[2 tablespoons olive oil, 1⁄3 cup diced pancet...",b'Preheat the oven to 350 degrees F.\nSet a la...,"https://img.sndimg.com/food/image/upload/q_92,...",-1,,,...,,,,,,,,,,


In [40]:
# Parse each recipe's ingredient list and store the result in a new column.
df["parsed_ings"] = df["ingredients"].apply(parse_recipe_ingredients)

In [41]:
# Inspect the parsed ingredients of the recipe with index label 1.
df.loc[1, "parsed_ings"]

[{'ingredient_string': '1 cup onion, diced',
  'quant': 1,
  'unit': 'cup',
  'item': 'onion',
  'type': [],
  'descriptors': ['diced']},
 {'ingredient_string': '1⁄4 cup sour cream',
  'quant': 0.25,
  'unit': 'cup',
  'item': 'sour cream',
  'type': [],
  'descriptors': []},
 {'ingredient_string': '2 garlic cloves, minced (or 1 tsp of pre-minced garlic from a jar)',
  'quant': 3,
  'unit': 'tsp',
  'item': 'garlic',
  'type': [],
  'descriptors': ['minced']},
 {'ingredient_string': '1 tablespoon olive oil',
  'quant': 1,
  'unit': 'tablespoon',
  'item': 'oil',
  'type': ['olive'],
  'descriptors': []},
 {'ingredient_string': '1 lb ground beef',
  'quant': 1,
  'unit': 'lb',
  'item': 'ground beef',
  'type': [],
  'descriptors': ['ground']},
 {'ingredient_string': '1 tablespoon flour',
  'quant': 1,
  'unit': 'tablespoon',
  'item': 'flour',
  'type': [],
  'descriptors': []},
 {'ingredient_string': '1 teaspoon salt',
  'quant': 1,
  'unit': 'teaspoon',
  'item': 'salt',
  'type': []