In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import ast
import pymongo
import tensorflow as tf

import spacy
sp = spacy.load('en_core_web_sm')

In [2]:
# MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client['food_analysis']
recipes = db['recipes']

In [3]:
df = pd.DataFrame(list(recipes.find({})))

In [4]:
nyt = pd.read_csv('nyt_ingredients_training.csv')

# Text Processing in Spacy

In [26]:
unit_map_dict = {'cup':['cup', 'cups', 'c.', 'c'],
            'tbsp':['tbsp', 'tbsp.', 'tablespoon', 'tablespoons'],
            'tsp':['tsp', 'tsp.', 'teaspoon', 'teaspoons'],
            'lb':['lb', 'pound', 'lbs', 'lb.', 'lbs.', 'pounds'],
            'oz':['ounce', 'oz.', 'oz', 'ozs', 'ozs.', 'ounces'],
            'g':['g', 'gram', 'grams'],
            'quart':['quart', 'qt', 'qrt', 'quarts'],
            'pint':['pint', 'pints', 'pt'],
            'gallon':['gallon', 'gallons']}

def map_units(x):
    
    out = x
    for i, token in enumerate(x):
        for key in unit_map_dict:
            if token in unit_map_dict[key]:
                out[i] = key
        
    return out

In [39]:
def text_proc_pipeline(df, col):
    
    odf = df.copy()
    odf[col] = odf[col].str.lower()
    
    print("Replacing bad characters")
    replacements = {
                # Bad fractions
                "↉": "0", "⅒": "1/10", "⅑": "1/9", "⅛": "1/8",
                     "⅐": "1/7", "⅙": "1/6", "⅕": "1/5", "¼": "1/4",
                     "⅓": "1/3", "½": "1/2", "⅖": "2/3", "⅔": "2/3",
                     "⅜": "3/8", "⅗": "3/5", "¾": "3/4", "⅘": "4/5",
                     "⅝": "5/8", "⅚": "5/6", "⅞": "7/8", 
                
                # Bad punnctuation
                "⁄":"/", "-":"", "'":"", '"':""}
    
    for key in replacements:
        odf[col] = odf[col].str.replace(key, replacements[key])
        
    print("Tokenizing and Stemming (Saves as token list)")
    odf[col] = odf[col].apply(lambda x: [y.lemma_ for y in sp(str(x))])
    
    print("Cleaning units")
    odf[col] = odf[col].apply(map_units)
    
    return odf[col]

In [42]:
def df_proc_pipeline(df, proc_cols, keep_cols):
    
    odf = df[keep_cols].copy()
    
    for col in proc_cols:
        print(f"Processing {col}")
        odf[col+'_sp'] = text_proc_pipeline(df, col)
        print()
        
    return odf

In [43]:
nyt_sp = df_proc_pipeline(nyt, ['input', 'unit'], ['qty', 'comment', 'input'])

Processing input
Replacing bad characters
Tokenizing and Stemming (Saves as token list)
Cleaning units

Processing unit
Replacing bad characters
Tokenizing and Stemming (Saves as token list)
Cleaning units



# Unit Parser

In [12]:
gen_vocab(nyt, 'input_sp')

'{"counts":{",":65971,"1":60198,"cup":43041,"2":36133,"tbsp":32974,"1\\/2":28840,"tsp":25756,"or":20132,"and":18944,"to":16241,"chop":16092,"(":15843,")":15662,"salt":15324,"pepper":13924,"1\\/4":13247,"lb":12487,"3":12141,"oil":11695,"fresh":10933,"4":10918,"ground":10663,"taste":10484,"freshly":9204,"olive":7753,"oz":7657,"peel":7309,"garlic":7091,"finely":6943,"large":6642,"mince":6621,"butter":6290,"cut":6095,"of":6078,"clove":6049,"dry":5932,"black":5926,"sugar":5910,"onion":5787,"for":5695,"slice":5685,"about":5648,"6":5244,"juice":5119,"white":4953,"into":4855,"lemon":4813,"3\\/4":4795,"red":4602,"egg":4501,"leave":4442,"small":4182,"in":4174,"grate":3833,"flour":3781,"tomato":3698,"8":3611,"a":3598,"water":3583,"dice":3523,"parsley":3508,"chicken":3398,"plus":3341,"medium":3129,"vinegar":3122,"seed":3088,"more":2972,"wine":2970,"1\\/3":2838,"cream":2779,"green":2720,"optional":2680,"unsalted":2592,"5":2502,"sauce":2502,"kosher":2430,"piece":2403,"whole":2318,"g":2261,"vegetable

In [11]:
def gen_vocab(df, colname, encoding='descending', vocab_cap=5000):
    """
    Function to take dataframe's token columns and generate a vocab dict.
    Returned vocab dict will have unique mapping from token to int.
    
    :param df: pandas DataFrame, df that has the tokens
    :param colname: str, column in dataframe to gen vocab from
    :param encoding: str, encoding type, one of ['arbitrary', 'descending']
    :returns: 
    """
    
    unique_tokens = {}
    for x in nyt[colname]:
        for token in x:

            token_ = token

            # check if a unit needs mapping to simplified version
            for k in unit_map_dict:
                if token in unit_map_dict[k]:
                    token_ = k

            if token_ not in unique_tokens.keys():
                unique_tokens[token_] = 1
            else:
                unique_tokens[token_] += 1
                
    df = (pd.DataFrame.from_dict(unique_tokens, orient='index')
                  .rename(columns={0:'counts'})
                  .sort_values('counts', ascending=False))
        
    return df[:vocab_cap].to_json()['counts']

def gen_training_data(vocab, tokens, split=.3):
    """
    Loading data function for training
    """
    # 