In [1]:
import nltk
import pandas as pd
import unidecode, ast
import string
import re

from sklearn.neighbors import NearestNeighbors
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import tensorflow as tf
import tensorflow_hub as hub

In [75]:
vocabulary = nltk.FreqDist()
recipe_df = pd.read_csv("df_recipes.csv")
print(recipe_df.recipe_name + " "+ recipe_df.ingredients)

0       Roast duck with Marsala gravy ['1 x 1.6kg whol...
1       Best-ever Brussels sprouts ['800 g Brussels sp...
2       Beautiful courgette carbonara ['6 medium green...
3       Roasted black bean burgers ['1½ red onions', '...
4       Chicken & tofu noodle soup ['2 shallots', '2 c...
                              ...                        
4642    Thai-Style Steamed Pumpkin Cake Recipe - Allre...
4643    Delicious Spicy Tomato Salad Recipe - Allrecip...
4644    Spicy Sriracha Meatballs Recipe - Allrecipes.c...
4645    Ajad (Authentic Thai Cucumber Salad) Recipe - ...
4646    Sweet Thai-Style Chicken Bowl Recipe - Allreci...
Length: 4647, dtype: object


In [None]:
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(model_url)

In [51]:
for ingredients in recipe_df['ingredients']:
    ingredients = ingredients.split()
    vocabulary.update(ingredients)
for word, frequency in vocabulary.most_common(200):
    print(f'{word};{frequency}')

cup;7546
teaspoon;6254
tablespoons;4510
fresh;3766
["1;3665
'1;3163
tablespoon;3061
ground;3048
ounce);3037
and;2687
cups;2584
to;2548
of;2317
or;2144
,;2016
g;1991
chopped;1983
olive;1853
red;1830
teaspoons;1715
black;1701
'2;1624
1/2;1589
dried;1582
white;1529
cloves;1470
garlic,;1278
pepper;1243
taste"];1183
pound;1174
large;1166
can;1096
into;1086
onion,;1063
grated;1057
chicken;1043
cut;987
Parmesan;910
green;907
oil',;899
package;882
chopped","1;873
vegetable;828
finely;817
small;785
ounces;760
bunch;733
as;729
sliced;679
pinch;671
peeled;640
tomato;619
'½;614
Italian;606
shredded;592
whole;567
minced;554
'4;550
all-purpose;517
garlic;503
pepper,;496
pounds;488
for;484
(16;484
a;474
dry;470
more;466
cheese","1;465
mozzarella;460
bell;456
tomatoes,;453
'olive;444
lemon;431
(8;423
garlic',;420
cheese,;420
'3;419
diced;418
thinly;415
x;406
oil","1;404
sprigs;403
clove;403
boneless;397
chile;391
sauce","1;387
freshly;385
ml;366
chopped","2;364
oil,;363
oil","2;361
virgin;356
medium;3

In [52]:
def ingredient_parser(ingreds):
    '''
    
    This function takes in a list (but it is a string as it comes from pandas dataframe) of 
       ingredients and performs some preprocessing. 
       For example:
       input = '['1 x 1.6kg whole duck', '2 heaped teaspoons Chinese five-spice powder', '1 clementine',
                 '6 fresh bay leaves', 'GRAVY', '', '1 bulb of garlic', '2 carrots', '2 red onions', 
                 '3 tablespoons plain flour', '100 ml Marsala', '1 litre organic chicken stock']'
       
       output = ['duck', 'chinese five spice powder', 'clementine', 'fresh bay leaf', 'gravy', 'garlic',
                 'carrot', 'red onion', 'plain flour', 'marsala', 'organic chicken stock']
    '''
    measures = ['teaspoon', 't', 'tsp.', 'tablespoon', 'T', 'tbl.', 'tb', 'tbsp.', 'fluid ounce', 'fl oz', 'gill', 'cup', 'c', 'pint', 'p', 'pt', 'fl pt', 'quart', 'q', 'qt', 'fl qt', 'gallon', 'g', 'gal', 'ml', 'milliliter', 'millilitre', 'cc', 'mL', 'l', 'liter', 'litre', 'L', 'dl', 'deciliter', 'decilitre', 'dL', 'bulb', 'level', 'heaped', 'rounded', 'whole', 'pinch', 'medium', 'slice', 'pound', 'lb', '#', 'ounce', 'oz', 'mg', 'milligram', 'milligramme', 'g', 'gram', 'gramme', 'kg', 'kilogram', 'kilogramme', 'x', 'of', 'mm', 'millimetre', 'millimeter', 'cm', 'centimeter', 'centimetre', 'm', 'meter', 'metre', 'inch', 'in', 'milli', 'centi', 'deci', 'hecto', 'kilo']
    words_to_remove = ['fresh', 'oil', 'a', 'red', 'bunch', 'and', 'clove', 'or', 'leaf', 'chilli', 'large', 'extra', 'sprig', 'ground', 'handful', 'free', 'small', 'pepper', 'virgin', 'range', 'from', 'dried', 'sustainable', 'black', 'peeled', 'higher', 'welfare', 'seed', 'for', 'finely', 'freshly', 'sea', 'quality', 'white', 'ripe', 'few', 'piece', 'source', 'to', 'organic', 'flat', 'smoked', 'ginger', 'sliced', 'green', 'picked', 'the', 'stick', 'plain', 'plus', 'mixed', 'mint', 'bay', 'basil', 'your', 'cumin', 'optional', 'fennel', 'serve', 'mustard', 'unsalted', 'baby', 'paprika', 'fat', 'ask', 'natural', 'skin', 'roughly', 'into', 'such', 'cut', 'good', 'brown', 'grated', 'trimmed', 'oregano', 'powder', 'yellow', 'dusting', 'knob', 'frozen', 'on', 'deseeded', 'low', 'runny', 'balsamic', 'cooked', 'streaky', 'nutmeg', 'sage', 'rasher', 'zest', 'pin', 'groundnut', 'breadcrumb', 'turmeric', 'halved', 'grating', 'stalk', 'light', 'tinned', 'dry', 'soft', 'rocket', 'bone', 'colour', 'washed', 'skinless', 'leftover', 'splash', 'removed', 'dijon', 'thick', 'big', 'hot', 'drained', 'sized', 'chestnut', 'watercress', 'fishmonger', 'english', 'dill', 'caper', 'raw', 'worcestershire', 'flake', 'cider', 'cayenne', 'tbsp', 'leg', 'pine', 'wild', 'if', 'fine', 'herb', 'almond', 'shoulder', 'cube', 'dressing', 'with', 'chunk', 'spice', 'thumb', 'garam', 'new', 'little', 'punnet', 'peppercorn', 'shelled', 'saffron', 'other''chopped', 'salt', 'olive', 'taste', 'can', 'sauce', 'water', 'diced', 'package', 'italian', 'shredded', 'divided', 'parsley', 'vinegar', 'all', 'purpose', 'crushed', 'juice', 'more', 'coriander', 'bell', 'needed', 'thinly', 'boneless', 'half', 'thyme', 'cubed', 'cinnamon', 'cilantro', 'jar', 'seasoning', 'rosemary', 'extract', 'sweet', 'baking', 'beaten', 'heavy', 'seeded', 'tin', 'vanilla', 'uncooked', 'crumb', 'style', 'thin', 'nut', 'coarsely', 'spring', 'chili', 'cornstarch', 'strip', 'cardamom', 'rinsed', 'honey', 'cherry', 'root', 'quartered', 'head', 'softened', 'container', 'crumbled', 'frying', 'lean', 'cooking', 'roasted', 'warm', 'whipping', 'thawed', 'corn', 'pitted', 'sun', 'kosher', 'bite', 'toasted', 'lasagna', 'split', 'melted', 'degree', 'lengthwise', 'romano', 'packed', 'pod', 'anchovy', 'rom', 'prepared', 'juiced', 'fluid', 'floret', 'room', 'active', 'seasoned', 'mix', 'deveined', 'lightly', 'anise', 'thai', 'size', 'unsweetened', 'torn', 'wedge', 'sour', 'basmati', 'marinara', 'dark', 'temperature', 'garnish', 'bouillon', 'loaf', 'shell', 'reggiano', 'canola', 'parmigiano', 'round', 'canned', 'ghee', 'crust', 'long', 'broken', 'ketchup', 'bulk', 'cleaned', 'condensed', 'sherry', 'provolone', 'cold', 'soda', 'cottage', 'spray', 'tamarind', 'pecorino', 'shortening', 'part', 'bottle', 'sodium', 'cocoa', 'grain', 'french', 'roast', 'stem', 'link', 'firm', 'asafoetida', 'mild', 'dash', 'boiling']
   
    if isinstance(ingreds, list):
        ingredients = ingreds
    else:
        ingredients = ast.literal_eval(ingreds)
   
    translator = str.maketrans('', '', string.punctuation)
    lemmatizer = WordNetLemmatizer()
    ingred_list = []
    
    
    for i in ingredients:
        i.translate(translator)
        items = re.split(' |-', i)
        items = [word for word in items if word.isalpha()]
        items = [word.lower() for word in items]
        items = [unidecode.unidecode(word) for word in items]
        items = [lemmatizer.lemmatize(word) for word in items]
        items = [word for word in items if word not in measures]
        items = [word for word in items if word not in words_to_remove]
        if items:
            ingred_list.append(' '.join(items)) 
    ingred_list = " ".join(ingred_list)
    return ingred_list


recipe_df['ingredients_parsed'] = recipe_df['ingredients'].apply(lambda x: ingredient_parser(x))
df = recipe_df[['recipe_name', 'ingredients_parsed', 'ingredients', 'recipe_urls']]
df = recipe_df.dropna()

m = df.recipe_name.str.endswith('Recipe - Allrecipes.com')
df['recipe_name'].loc[m] = df.recipe_name.loc[m].str[:-23]      

In [53]:
def embed(texts):
    return model(texts)


In [54]:
embeddings = embed(recipe_df['ingredients_parsed'])
embeddings.shape

TensorShape([4647, 512])

In [55]:
nn = NearestNeighbors(n_neighbors=10)
nn.fit(embeddings)

In [56]:
def recommend(text):
    emb = embed(text)
    neighbors = nn.kneighbors(emb,return_distance=False)[0]
    return recipe_df['recipe_name'].iloc[neighbors].tolist()

In [70]:
recommend(['chickpea'])

['Italian Chickpea Bread Recipe - Allrecipes.com',
 'Capsicum Zunka Recipe - Allrecipes.com',
 'Simple Besan Ladoo Recipe - Allrecipes.com',
 'Yellow Tarka Dal Recipe - Allrecipes.com',
 'Masur Dahl Recipe - Allrecipes.com',
 'Italian Beef for Sandwiches Recipe - Allrecipes.com',
 'Black Chana with Potato Recipe - Allrecipes.com',
 'Baked Onion Bhajis Recipe - Allrecipes.com',
 'Spaghetti Olio Recipe - Allrecipes.com',
 'Insalata Caprese I Recipe - Allrecipes.com']

In [67]:
import pickle
filename="rec.sav"
pickle.dump(nn,open(filename,"wb"))

In [68]:
load_model = pickle.load(open(filename,"rb"))

In [69]:
neighbors = load_model.kneighbors(e,return_distance=False)[0]
print(recipe_df['recipe_name'].iloc[neighbors].tolist())

['New York Italian Pizza Dough Recipe - Allrecipes.com', 'Roomali Roti Recipe - Allrecipes.com', 'Mediterranean Casserole Recipe - Allrecipes.com', 'Caprese on Toast Recipe - Allrecipes.com', 'Biga Recipe - Allrecipes.com', 'Fried Squash Blossoms Recipe - Allrecipes.com', "Chef John's Rustic Italian Corn Bread  Recipe - Allrecipes.com", 'Sfincione Siciliano Pizza Recipe - Allrecipes.com', 'Strawberry Goat Cheese Bruschetta Recipe - Allrecipes.com', "Lisa's Best Ever Garlic Bread Recipe - Allrecipes.com"]
