In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import ast
import pymongo

In [2]:
# Database setup
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client['food_analysis']
recipes = db['recipes']

In [3]:
choco_chip = pd.DataFrame(list(recipes.find({"title":{"$regex":"chocolate chip", "$options" :'i'}})))

In [87]:
# Ingredient Ref
ir = {"flour":{
           "types":['all purpose', "white", "brown", "spelt", "oat", "millet",
                 "whole wheat", "almond", "tapioca", "sorghum", "rice", "coconut", "garfava"
                 "barley", "cake", "bread", "self-rising", "soy", "pastry"]
      },
      "brown sugar":{
          "types":["dark", "light"],
          "aka":["brown-sugar"]
      },
      "sugar":{
          "types":["granulated", "white", "substitute", "splenda", "cane"],
          "aka":["splenda"]
      },
      "butter":{
          "types":["salted", "unsalted"]
          "stopwords":["almond", "peanut"]
      },
      "water":{
          "types":[]
      },
      "baking powder":{
          "types":[],
          "aka":["baking-powder"]
      },
      "salt":{
          "types":["kosher", "coarse", "sea", "popcorn", "fine", "himalayan", "table"],
          "aka":["sea-salt"]
      },
      "chocolate chip":{
          "types":["semi sweet", "milk", "dark", "white", "bittersweet"],
          "aka":["chocolate chips"]
      },
      "chocolate chunk":{
          "types":["semi sweet", "milk", "dark", "white", "bittersweet"],
          "aka":["chocolate chunks"]
      },
      "baking soda":{
          "types":[],
          "aka":["baking-soda"]
      },
      "rolled oats":{
          "types":["old fashioned"],
          "aka":["rolled-oats"]
      },
      "vanilla extract":{
          "types":['madagascar'],
          "aka":["vanilla"]
      },
      "shortening":{
          "types":['vegatable']
      },
      "walnut":{
          "types":["roasted", "toasted"],
          "aka":["walnuts"],
          "stopwords":[" nuts"]
      },
      "pecan":{
         "types":["roasted", "toasted"],
         "aka":["pecans" ]
      },
      "oil":{
          "types":["olive", "vegetable", "avocado", "canola", "virgin", "coconut"]
      },
      "coconut":{
          "types":["flaked", "unsweetened"],
          "stopwords":["milk", "pudding"]
      },
      "chickpeas":{
          "types":[],
          "aka":["garbonzo beans", "ghana"]
      },
      "cooking spray":{
          "types":["nonstick"],
          "aka":["pam"]
      },
      "milk":{
          "types":["whole", "2%", "skim", "dairy free", "coconut", "almond", "oat", "soy"]
      },
      "sour cream":{
          "types":["fat free", "dairy free"]
      },
      "banana":{
          "types":["cavendish"],
          "stopwords":["extract"]
      },
      "banana extract":{
          "types":[]
      },
      "cornstarch":{
          "types":[]
      },
      "cocoa powder":{
          "types":[]
      },
      "nuts":{
          "types":["walnut", "pecan", "peanut"]
      },
      "peanut":{
          "types":[],
          "stopwords":[" nuts"]
      },
      "honey":{
          "types":[]
      },
      "baking mix":{
          "types":[]
      },
      "cherry":{
          "types":[],
          "aka":["cherries"]
      },
      "almond extract":{
          "types":[]
      },
      "almond":{
          "types":[],
          "stopwords":["milk", "extract"]
      },
      "pudding mix":{
          "types":["coconut cream"]
      },
      "heavy cream":{
          "types":[]
      },
      "pretzels":{
          "types":[]
      },
      "date":{
          "types":["medjool"]
      },
      "flax seeds":{
          "types":[]
      },
      "quinoa":{
          "types":[]
      },
      "applesauce":{
          "types":[]
      },
      "xanthan gum":{
          "types":[]
      },
      "cinnamon":{
          "types":["ground", "saigon"]
      },
      "instant coffee":{
          "types":[]
      },
      "peanut butter":{
          "types":[]
      }
    }

descriptors = ['beaten', 'chopped', 'cold', 'diced', 'packed', 'hot', 'large', 'melted', 'mini', 
               'miniature', 'packed', 'room temperature', 'sifted', 'softened', 'warm', "drained", "rinsed", "optional",
               "mashed", "ripe", "medium", "ground", "pitted", "ground"]

In [89]:
num = 21
#[x if x["item"] is None else None for x in choco_chip.ingredients[:50].apply(parse_recipe_ingredients)[num]]
choco_chip.ingredients[:50].apply(parse_recipe_ingredients)[num]

[{'ingredient_string': '3⁄4 cup coconut flour',
  'quant': 0.75,
  'unit': 'cup',
  'item': 'flour',
  'type': ['coconut'],
  'descriptors': []},
 {'ingredient_string': '1⁄4 cup almond flour',
  'quant': 0.25,
  'unit': 'cup',
  'item': 'flour',
  'type': ['almond'],
  'descriptors': []},
 {'ingredient_string': '1⁄2 cup swerve granulated sugar',
  'quant': 0.5,
  'unit': 'cup',
  'item': 'sugar',
  'type': ['granulated'],
  'descriptors': []},
 {'ingredient_string': '1⁄2 cup swerve brown sugar',
  'quant': 0.5,
  'unit': 'cup',
  'item': 'brown sugar',
  'type': [],
  'descriptors': []},
 {'ingredient_string': '3⁄4 cup non sweetened flaked coconut',
  'quant': 0.75,
  'unit': 'cup',
  'item': 'coconut',
  'type': ['flaked'],
  'descriptors': []},
 {'ingredient_string': '3⁄4 cup peanut butter (i use skippy)',
  'quant': 0.75,
  'unit': 'cup',
  'item': 'butter',
  'type': [],
  'descriptors': []},
 {'ingredient_string': '1⁄2 cup butter, at room temperature',
  'quant': 0.5,
  'unit': 'c

In [11]:
def parse_quant(quant):
    
    if quant is None:
        return 0
    
    fractions = {"↉":"0", "⅒":"1/10", "⅑":"1/9", "⅛":"1/8", 
                 "⅐":"1/7", "⅙":"1/6", "⅕":"1/5", "¼":"1/4", 
                 "⅓":"1/3", "½":"1/2", "⅖":"2/3", "⅔":"2/3", 
                 "⅜":"3/8", "⅗":"3/5", "¾":"3/4", "⅘":"4/5", 
                 "⅝":"5/8", "⅚":"5/6", "⅞":"7/8"}
    
    new_quant = quant.replace("⁄", "/")
    for frac in fractions:
        if frac in quant:
            new_quant.replace(frac, fractions[frac])
            
    quant_num = 0
    for num in new_quant.split(" "):
        try:
            if "/" in num:
                try:
                    quant_num += ast.literal_eval(num.split("/")[0])/ast.literal_eval(num.split("/")[1])
                except:
                    quant_num += 0
            elif ("-" in num) | ("to" in num):
                break
            else:
                quant_num += ast.literal_eval(num)
        except:
            quant_num += 0
    
    return quant_num

In [12]:
def parse_ing_descriptors(ing_string):
    
    descriptors_found = []
    for desc in descriptors:
        
        if desc in ing_string:
            descriptors_found.append(desc)
        elif " " in desc:
            if "-".join(desc.split(" ")) in ing_string:
                descriptors_found.append(desc)
                
    return descriptors_found

In [13]:
def parse_ing_types(ing_string, item):
    
    found_types = []
    if item not in ir.keys():
        return found_types

    for ing_type in ir[item]["types"]:
        
        if ing_type in ing_string:
            found_types.append(ing_type)
        elif " " in ing_type:
            if "-".join(ing_type.split(" ")) in ing_string:
                found_types.append(ing_type)
                
    return found_types

In [14]:
def parse_ing_item(ing_string):
    
    for item in ir:
        
        # Item name
        if item in ing_string:
            return item
                        
        elif "aka" in ir[item].keys():
            for name in ir[item]["aka"]:
                if name in ing_string:
                    return item

In [84]:
def parse_ingredient(ing):
    
    ing_dict = {"ingredient_string":ing.lower(),
                    "quant":None,
                    "unit":None,
                    "item":None,
                    "type":None,
                    "descriptors":None}
    split_ing = ['None', 'None']
    
    units = ["cup", "c." "g", "gram", 'lb', 'teaspoon', "tsp", "tbsp", "oz", 'tablespoon','container','packet','bag', "stick",
         'quart','pound','can','bottle', 'pint','package','ounce','jars','heads','gallons','drops', "drop",
         'envelope','bar','box','pinch', 'dash','bunch','recipe','layer','slice','link','bulb','stalk','square','sprig',
        'fillet','piece','leg','thigh','cube','granule','strip','tray','leave','loaves','halves']
    
    # Unit Parsing
    for unit in units:
        if " "+unit+"s " in ing:
            ing_dict["unit"] = unit
            split_ing = ing.lower().split(" "+unit+"s ")
            break
        elif " "+unit+"es " in ing:
            ing_dict["unit"] = unit
            split_ing = ing.lower().split(" "+unit+"es ")
            break
        elif " "+unit+" " in ing:
            ing_dict["unit"] = unit
            split_ing = ing.lower().split(" "+unit+" ")
            break
        else:
            continue
    ing_dict["quant"] = parse_quant(split_ing[0])
    
    # ING Parsing    
    ing_dict["item"] = parse_ing_item(split_ing[1])
    ing_dict["type"] = parse_ing_types(split_ing[1], ing_dict["item"])
    ing_dict["descriptors"] = parse_ing_descriptors(split_ing[1])
            
    # Special
    if "egg" in ing:
        ing_dict['item'] = "egg"
        ing_dict["unit"] = "eggs"
        ing_dict["quant"] = parse_quant(ing.split(" ")[0])
    
    return ing_dict

In [37]:
def parse_recipe_ingredients(ing_list):
    
    parsed = []
    for x in ing_list:
        parsed.append(parse_ingredient(x))
        
    return parsed

In [49]:
words = {}
puncs = [",", "(", ")", " - ", ".", "!"]
for rec in test:
    for ing in rec:
        for punc in puncs:
            if punc in ing["remaining_info"]:
                ing["remaining_info"] = ing["remaining_info"].replace(punc, "")
                
        rem_words = set(ing["remaining_info"].split(" "))
        if ing["item"] is not None:
            rem_words = rem_words - set([ing["item"]])
        if len(ing["descriptors"])>0:
            for desc in ing["descriptors"]:
                rem_words = set(rem_words) - set(ing["descriptors"])
        if len(ing["type"])>0:
            for desc in ing["type"]:
                rem_words = set(rem_words) - set(ing["type"])
        
        for word in rem_words:
            if word not in words:
                words[word] = 0
            words[word] += 1

{'brown': 59,
 'sugar': 59,
 'lightly': 1,
 'None': 119,
 'all-purpose': 41,
 'powder': 33,
 'baking': 104,
 'chocolate': 103,
 'chips': 91,
 'soda': 72,
 'old-fashioned': 3,
 'rolled': 7,
 'oats': 15,
 '1': 14,
 'cup': 1,
 'firmly': 13,
 "nestle's": 1,
 'quik': 1,
 'vanilla': 87,
 'semisweet': 22,
 'no': 4,
 'substitutes': 4,
 'extract': 57,
 '2': 11,
 'walnuts': 12,
 'or': 36,
 'chunks': 2,
 'use': 5,
 'room': 13,
 'temperature': 13,
 'i': 7,
 'sweet': 3,
 'cream': 15,
 'stick': 6,
 'pecans': 9,
 '2/3': 1,
 'nuts': 2,
 'recommended': 1,
 'semi-sweet': 30,
 'unsalted': 4,
 'roughly': 2,
 'macadamia': 1,
 'of': 4,
 'at': 6,
 'pure': 7,
 'and': 3,
 'also': 1,
 'known': 1,
 'beans': 1,
 'ghana': 1,
 'garbanzo': 1,
 'as': 4,
 '3/4': 3,
 'crisco': 2,
 'flavor': 3,
 'wheat': 12,
 'whole': 15,
 'grain': 1,
 'free': 3,
 'sour': 4,
 'fat': 2,
 'very': 1,
 'approximately': 1,
 'chip': 5,
 'not': 5,
 'tub': 1,
 'margarine': 5,
 'cocoa': 4,
 'unbleached': 7,
 'hershey': 2,
 'medjool': 1,
 'to': 7

In [27]:
[test[0][0]['item']]

['butter']

In [21]:
choco_chip['parsed_ingredients'] = choco_chip.ingredients.apply(parse_recipe_ingredients)

In [22]:
choco_chip['parsed_ingredients']

0       [{'ingredient_string': '2⁄3 cup butter, melted...
1       [{'ingredient_string': '1 1/2 cups all-purpose...
2       [{'ingredient_string': '1 1⁄2 cups brown sugar...
3       [{'ingredient_string': '1⁄2 cup room temperatu...
4       [{'ingredient_string': '2⁄3 cup shortening', '...
5       [{'ingredient_string': '3⁄4 cup brown sugar', ...
6       [{'ingredient_string': '1 cup unsalted macadam...
7       [{'ingredient_string': '1 cup butter', 'quant'...
8       [{'ingredient_string': 'nonstick cooking spray...
9       [{'ingredient_string': '3⁄4 cup Butter Flavor ...
10      [{'ingredient_string': '1⁄2 cup butter, room t...
11      [{'ingredient_string': '1 cup whole wheat past...
12      [{'ingredient_string': '1 cup butter', 'quant'...
13      [{'ingredient_string': '1⁄2 cup packed brown s...
14      [{'ingredient_string': '2 cups unbleached all-...
15      [{'ingredient_string': '2 1⁄4 cups all-purpose...
16      [{'ingredient_string': '1⁄2 cup packed pitted ...
17      [{'ing