# Really tho'... how many ingredients could there possibly be???

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

import re
import string

In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [3]:
# Pulling in df of recipes to test modeling

df_path = '../data/clean_recipes.csv'
df = pd.read_csv(df_path, sep=';')

In [4]:
def parse_ingredients(dfcolumn):
    '''Parses a column of ingredients and return a list of unique values'''

    # If instance is one string with all ingredients, split by comma
    if type(dfcolumn[0]) == str:
        dfcolumn = dfcolumn.apply(lambda x: x.split(','))

    # Create set of unique values, the convert to list
    ingredients = set(x for l in dfcolumn for x in l)
    ingredient_list = list(ingredients)

    return ingredient_list

In [5]:
print(df.shape)
df.head()

(12351, 10)


Unnamed: 0,Recipe Name,Review Count,Recipe Photo,Author,Prepare Time,Cook Time,Total Time,Ingredients,Directions,RecipeID
0,Golden Crescent Rolls Recipe,304,https://images.media-allrecipes.com/userphotos...,Mike A.,25 m,15 m,3 h 10 m,"yeast,water,white sugar,salt,egg,butter,flour,...","Dissolve yeast in warm water.**Stir in sugar, ...",7000
1,Poppy Seed Bread with Glaze Recipe,137,https://images.media-allrecipes.com/userphotos...,Christina Jun,15 m,1 h,1 h 20 m,"flour,salt,baking powder,poppy,butter,vegetabl...",'Preheat oven to 350 degrees F (175 degrees C)...,7001
2,Applesauce Bread I Recipe,124,https://images.media-allrecipes.com/userphotos...,GAF55,10 m,1 h 20 m,1 h 30 m,"flour,egg,white sugar,vegetable oil,applesauce...",Preheat oven to 350 degrees F (175 degrees C)....,7003
3,Apple Raisin Bread Recipe,39,https://images.media-allrecipes.com/userphotos...,Helen Hanson,15 m,1 h,1 h 15 m,"flour,baking powder,baking soda,salt,cinnamon,...",Preheat oven to 350 degrees F (175 degrees C)....,7006
4,Buttermilk Oatmeal Bread Recipe,41,https://images.media-allrecipes.com/userphotos...,Helen Hanson,10 m,1 h,1 h 40 m,"oat,buttermilk,vegetable oil,egg,brown sugar,f...",Mix oats with buttermilk. Let stand for 1/2 h...,7007


In [6]:
kgl_df = pd.read_json('../data/kgl_ingredient_train.json')
print(kgl_df.shape)
kgl_df.head()

(39774, 3)


Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [7]:
kgl2_df = pd.read_json('../data/kgl_ingredient_test.json')
print(kgl2_df.shape)
kgl2_df.head()

(9944, 2)


Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [8]:
# Isolate ingredients from each set via parse_ingredients function
df_ing = parse_ingredients(df.Ingredients)
kgl_ing = parse_ingredients(kgl_df.ingredients)
kgl2_ing = parse_ingredients(kgl2_df.ingredients)

In [9]:
ingredients = sorted(list(set().union(df_ing, kgl_ing, kgl2_ing)))
ingredients[:50]

['',
 ' 2',
 ' 2 inches thick',
 ' and 3:',
 ' and dried',
 ' casings removed',
 ' chopped',
 ' cleaned',
 ' cubed',
 ' cut into 1/2 inch thick circles',
 ' divided',
 ' drained',
 ' drained and chopped',
 ' drained and finely chopped',
 ' drained and mashed',
 ' for topping',
 ' ground',
 ' halved',
 ' julienned',
 ' lean',
 ' mashed',
 ' peeled and cubed',
 ' peeled and julienned',
 ' peeled and segmented',
 ' peeled and shredded',
 ' rinsed',
 ' rinsed and dried',
 ' rinsed and torn',
 ' seasoned croutons',
 ' soaked',
 ' split',
 ' split and toasted',
 ' stemmed and rinsed',
 ' thawed',
 ' to taste',
 ' warmed',
 ' washed and cubed',
 ' without shells',
 "'1 pound calves'' brains",
 "'2 cups crushed zwieback toast",
 "'3 tablespoons meringue powder",
 "'5 pig''s feet",
 "'CRUST:",
 "'Crust:",
 "'LADYFINGERS",
 "'Streusel:",
 '(    oz.) tomato sauce',
 '(   oz.) tomato paste',
 '(10 oz.) frozen chopped spinach',
 '(10 oz.) frozen chopped spinach, thawed and squeezed dry']

In [10]:
ingredients = [i.lower() for i in ingredients]
print(len(ingredients))
ingredients[:20]

7800


['',
 ' 2',
 ' 2 inches thick',
 ' and 3:',
 ' and dried',
 ' casings removed',
 ' chopped',
 ' cleaned',
 ' cubed',
 ' cut into 1/2 inch thick circles',
 ' divided',
 ' drained',
 ' drained and chopped',
 ' drained and finely chopped',
 ' drained and mashed',
 ' for topping',
 ' ground',
 ' halved',
 ' julienned',
 ' lean']

In [11]:
def scrub_ingredients(string, dropwords):
    if string:
        split_string = string.split()
        trimmed_words = [word for word in split_string if word.lower().strip(' .-:"()1234567890%/\\\'') not in dropwords]
        return ' '.join(trimmed_words)

In [12]:
# Cleaning up/trimming down that list
dropwords = ['ounce', 'ounces', 'oz', 'oz.', 'oz.', 'lb', 'lb.', ' pound', 'package', 'packages', 'inches', 'ground', 'and', 'for']
ingredients = [i.lower() for i in ingredients]
ingredients_trimmed = [scrub_ingredients(x, dropwords) for x in ingredients if x]
ingredients_trimmed[:50]

['2',
 '2 thick',
 '3:',
 'dried',
 'casings removed',
 'chopped',
 'cleaned',
 'cubed',
 'cut into 1/2 inch thick circles',
 'divided',
 'drained',
 'drained chopped',
 'drained finely chopped',
 'drained mashed',
 'topping',
 '',
 'halved',
 'julienned',
 'lean',
 'mashed',
 'peeled cubed',
 'peeled julienned',
 'peeled segmented',
 'peeled shredded',
 'rinsed',
 'rinsed dried',
 'rinsed torn',
 'seasoned croutons',
 'soaked',
 'split',
 'split toasted',
 'stemmed rinsed',
 'thawed',
 'to taste',
 'warmed',
 'washed cubed',
 'without shells',
 "'1 pound calves'' brains",
 "'2 cups crushed zwieback toast",
 "'3 tablespoons meringue powder",
 "'5 pig''s feet",
 "'crust:",
 "'crust:",
 "'ladyfingers",
 "'streusel:",
 '( tomato sauce',
 '( tomato paste',
 '(10 frozen chopped spinach',
 '(10 frozen chopped spinach, thawed squeezed dry',
 '(14 sweetened condensed milk']

In [13]:
ingredients_stripped = [x.strip(' .:"()1234567890%/\\\'') for x in ingredients_trimmed]
ingredients_stripped = sorted([i.lower() for i in ingredients_stripped if i])
ingredients_stripped[:50]

['a taste of thai rice noodles',
 'abalone',
 'abbamele',
 'absinthe',
 'abura age',
 'acai juice',
 'accent',
 'accent seasoning',
 'accompaniment',
 'achiote',
 'achiote paste',
 'achiote powder',
 'acini di pepe',
 'ackee',
 'acorn squash',
 'activ dry quick rise yeast',
 'active dry yeast',
 'adobo',
 'adobo all purpose seasoning',
 'adobo sauce',
 'adobo seasoning',
 'adobo seasoning to taste',
 'adobo style seasoning',
 'adzuki beans',
 'agar',
 'agar agar flakes',
 'agave nectar',
 'agave tequila',
 'aged balsamic vinegar',
 'aged cheddar cheese',
 'aged gouda',
 'aged manchego cheese',
 'ahi',
 'ahi tuna steaks',
 'aioli',
 'ajinomoto',
 'ajwain',
 'aka miso',
 'alaskan halibut',
 'alaskan king crab legs',
 'alaskan king salmon',
 'albacore',
 'albacore tuna in water',
 'alcohol',
 'alcoholic',
 'ale',
 'aleppo',
 'aleppo pepper',
 'alexia waffle fries',
 'alfalfa sprouts']

In [14]:
len(ingredients_stripped)

7796

In [62]:
vals = ingredients_stripped

def ngrams_analyzer(string):
    string = re.sub(r'[,-./]', r'', string)
    ngrams = zip(*[string[i:] for i in range(5)])  # N-Gram length is 5
    return [''.join(ngram) for ngram in ngrams]

def get_lemz(text):

    stopwords = []
    lemmas = []

    doc = nlp(text)

    for token in doc:
        if (token.text.lower() not in stopwords):
            if ((token.is_stop == False) & (token.is_punct == False)) & (token.pos_!= 'PRON'):
                lemmas.append(token.lemma_)

    return lemmas

vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer, stop_words='english', max_df=.97)

In [63]:
tfidf = TfidfVectorizer(stop_words='english',
                        ngram_range=(1,2),
                        max_df=.97,
                        min_df=3,
                        tokenizer=get_lemz)

dtm = tfidf.fit_transform(vals)
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
dtm.head()

Unnamed: 0,2,achiote,acid,add,adobo,agave,aged,alaskan,ale,alfredo,...,zwieback,zwieback toast,, seasoning,®,® greek,® original,® pasta,® traditional,™
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


-----

## Let's considering going about this a different way...
If we keep the count of each ingredient, it's likely that ingredients that only show up once are the misspelled or incorrectly categorized items?!

In [64]:
df_ing.extend(kgl_ing)

In [65]:
df_ing.extend(kgl2_ing)

In [66]:
len(df_ing)

72644

In [67]:
def ingredient_count(dfcolumn):
    '''Parses a column of word lists and returns a dictionary of words/wordcounts'''

    # If observation is one long string of ingredients, split by comma
    if type(dfcolumn[0]) == str:
        dfcolumn = dfcolumn.apply(lambda x: x.split(','))

    # Keep only letters (and ' for potential possessives)
    char_table = str.maketrans('', '', string.punctuation)
    dfcolumn = [[word.translate(char_table) for word in word_list] for word_list in dfcolumn if word_list]
    
    # Drop measurement and conjunction words
    dropwords = ['ounce', 'ounces', 'oz', 'oz.', 'oz.', 'lb', 'lb.', ' pound', 'package', 'packages', 'inches',
                 'ground', 'and', 'for', 'teaspoon', 'tablespoon', 'tbsp', 'tsp']
    dfcolumn = [[word.lower().strip(' .:"()1234567890%/\\\'') for word in word_list if word.lower().strip(' .:"()1234567890%/\\\'') not in dropwords] for word_list in dfcolumn if word_list]

    # Create dictionary with word count
    d = dict()

    for word_list in dfcolumn:
        for word in word_list:
            if word in d:
                d[word] = d[word] + 1
            else:
                d[word] = 1

    return d

In [68]:
derp = ingredient_count(df.Ingredients)

In [69]:
derp.keys()

dict_keys(['yeast', 'water', 'white sugar', 'salt', 'egg', 'butter', 'flour', 'baking powder', 'poppy', 'vegetable oil', 'milk', 'vanilla', 'almond', 'orange juice', 'sugar', 'applesauce', 'raisin', 'cinnamon', 'baking soda', 'sour cream', 'nutmeg', 'brown sugar', 'oat', 'apple', 'walnut', 'buttermilk', 'shortening', 'lemon', 'fruit', 'bread', 'whole wheat', 'cottage cheese', 'margarine', 'banana', 'coffee', 'chocolate', 'cornmeal', 'bell pepper', 'onion', 'garlic', 'corn', 'pimento', 'pepper', 'maple syrup', 'cocoa powder', 'molasses', 'ghee', 'yogurt', 'corn syrup', 'cardamom', 'clove', 'pecan', 'wheat germ', 'honey', 'carrot', 'pineapple', 'vinegar', 'caraway', 'fennel', 'zucchini', 'cream cheese', 'pumpkin', 'cherry', 'citron', 'coconut', 'beer', 'bran', 'beef', 'chile', 'olive', 'cumin', 'chili', 'thyme', 'apple juice', 'tapioca', 'parsley', 'brown rice', 'tea', 'gelatin', 'lemon juice', 'pastry', 'lard', 'currant', 'sourdough', 'basil', 'dill', 'celery', 'ginger', 'spice', 'mace'

In [70]:
ingrDF = pd.DataFrame(derp.items(), columns=['item', 'count'])
ingrDF.head(15)

Unnamed: 0,item,count
0,yeast,190
1,water,2606
2,white sugar,4986
3,salt,4516
4,egg,4880
5,butter,4456
6,flour,4158
7,baking powder,1292
8,poppy,61
9,vegetable oil,1358


In [86]:
derp2 = ingredient_count(kgl_df.ingredients)
derp3 = ingredient_count(kgl2_df.ingredients)

ingrDF2 = pd.DataFrame(derp2.items(), columns=['item', 'count'])
ingrDF3 = pd.DataFrame(derp3.items(), columns=['item', 'count'])

In [87]:
merged_dfs = ingrDF.merge(ingrDF2, how='outer', on='item', suffixes=('', '_2'))
merged_dfs = merged_dfs.merge(ingrDF3, how='outer', on='item', suffixes=('', '_3'))
merged_dfs = merged_dfs.fillna(0)
merged_dfs.head()

Unnamed: 0,item,count,count_2,count_3
0,yeast,190.0,109.0,29.0
1,water,2606.0,7457.0,1836.0
2,white sugar,4986.0,1093.0,299.0
3,salt,4516.0,18049.0,4485.0
4,egg,4880.0,0.0,0.0


In [88]:
cols = ['count', 'count_2', 'count_3']
merged_dfs[cols] = merged_dfs[cols].astype('int32')
merged_dfs.head()

Unnamed: 0,item,count,count_2,count_3
0,yeast,190,109,29
1,water,2606,7457,1836
2,white sugar,4986,1093,299
3,salt,4516,18049,4485
4,egg,4880,0,0


In [89]:
merged_dfs['total_counts'] = merged_dfs.sum(axis=1, skipna=True)
final_ingr_df = merged_dfs[['item', 'total_counts']].sort_values('total_counts', ascending=False).reset_index(drop = True)

final_ingr_df.head(15)

Unnamed: 0,item,total_counts
0,salt,27050
1,water,11899
2,garlic,11287
3,butter,10534
4,onions,10008
5,olive oil,9889
6,sugar,9319
7,garlic cloves,7772
8,pepper,7080
9,vegetable oil,6874


In [91]:
recipe_count = len(df) + len(kgl_df) + len(kgl2_df)
recipe_count

62069

In [92]:
len(final_ingr_df)

7654

In [93]:
final_ingr_df = final_ingr_df[~final_ingr_df.item.str.contains(r'[^\w\s]')]
len(final_ingr_df)

7561

In [94]:
final_ingr_df.head(15)

Unnamed: 0,item,total_counts
0,salt,27050
1,water,11899
2,garlic,11287
3,butter,10534
4,onions,10008
5,olive oil,9889
6,sugar,9319
7,garlic cloves,7772
8,pepper,7080
9,vegetable oil,6874


In [95]:
final_ingr_df.iloc[775]

item            orzo
total_counts     108
Name: 775, dtype: object

In [96]:
final_ingr_df.iloc[456]

item            coffee
total_counts       235
Name: 456, dtype: object

In [242]:
# Next steps, remove single instances and find duplicates...

In [277]:
top_ingr = final_ingr_df[final_ingr_df.total_counts > 2]
len(top_ingr)

4634