In [1]:
import numpy as np
import pandas as pd
import ast
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel
#Import TfIdfVectorizer (scikit-learn)
from sklearn.feature_extraction.text import TfidfVectorizer

# Data
- the data was downloaded from:

https://www.kaggle.com/shuyangli94/food-com-recipes-and-user-interactions

**Data description:**
This dataset consists of 180K+ recipes and 700K+ recipe reviews covering 18 years of user interactions and uploads on Food.com (formerly GeniusKitchen). used in the following paper:

Generating Personalized Recipes from Historical User Preferences
Bodhisattwa Prasad Majumder*, Shuyang Li*, Jianmo Ni, Julian McAuley
EMNLP, 2019
https://www.aclweb.org/anthology/D19-1613/


**License:**

Kaggle: Data files © Original Authors

Paper: 
ACL materials are Copyright © 1963–2022 ACL; other materials are copyrighted by their respective copyright holders. Materials prior to 2016 here are licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 3.0 International License. Permission is granted to make copies for the purposes of teaching and research. Materials published in or after 2016 are licensed on a Creative Commons Attribution 4.0 International License.

The ACL Anthology is managed and built by the ACL Anthology team of volunteers.

Site last built on 20 January 2022 at 01:07 UTC with commit 7c68786d.

## Data exploration

In [7]:
data_path = "/home/david/Projects/Hackathons/DeveloperWeek2022/RecipeSuggestions/data/"

In [None]:
# need to convert string representation of ingredients list to list using ast
raw_recipes = pd.read_csv(data_path + "RAW_recipes.csv", converters={'ingredients': ast.literal_eval})
raw_recipes.head(5)

In [None]:
raw_interactions = pd.read_csv(data_path + "RAW_interactions.csv")
raw_interactions.head(5)

### Number of recipes

In [None]:
nr_recipes = len(raw_recipes)
print("There are ", nr_recipes, " in the database.")

### Distribution of number of ingredients

In [None]:
raw_recipes.hist(column="n_ingredients")

In [None]:
print("The average number of ingredients for a recipe is ", raw_recipes["n_ingredients"].mean())

### Time of preparation for recipies

In [None]:
raw_recipes.hist(column="minutes")

In [None]:
print("The average time for preparation of a recipe is ", raw_recipes["minutes"].mean())

In [None]:
raw_recipes["minutes"].max()

In [None]:
raw_recipes["minutes"].min()

In [None]:
raw_recipes["minutes"].median()

# Data filtering

In [None]:
def filter_by_ingredients(df, ingredients):
    '''
    function returns only those recipes which consist of not more than the specified ingredients
    '''
    
    print("Search for recipes using ", ingredients)
    
    # return recipes in which ingredients occur
    mask = [set(ingredients).issubset(x) for x in df['ingredients']]
    
    # return recipes which consist of not more than the mentioned ingredients
    mask = [set(x).issubset(ingredients) for x in df['ingredients']]
    
    return df[mask]

In [None]:
my_ingredients = ['winter squash', 'mexican seasoning', 'mixed spice', 'honey', 'butter', 'olive oil', 'salt']
#my_ingredients = ['salt']
my_ingredients = ['salt', 'pasta', 'tomato']
my_df = filter_by_ingredients(df=raw_recipes, ingredients=my_ingredients)
print("Found ", len(my_df), " recipes.")
my_df.head(3)

In [None]:
raw_recipes["ingredients"][20]

# Content Based Recommender

## Define functions

In [2]:
def concat_ingredients(df):
    '''
    concatenates list of ingredients strings to one string
    '''
    
    df['ingredients'] = [' '.join(x) for x in df['ingredients']]
    
    return df
    

In [3]:
def add_dummy_recipe(df, ingredients):
    '''
    - adds a dummy recipe to the dataframe of recipes consisting of the input ingredients
    - concatenates list of ingredient strings to one string
    
    Arguments:
    df -- dataframe of recipes
    ingredients -- ingredients to be used for dummy recipe
    
    Returns:
    df -- df with included dummy recipe
    idx_dummy -- value of the id column of the dummy recipe
    '''
    id_dummy = df["id"].max()+1
    dummy = {"id":id_dummy,
             "name":"dummy", 
             "minutes":30, 
             "ingredients":ingredients}
    
    dummy_data = pd.Series(dummy)
    
    df = df.append(dummy_data, ignore_index=True)
    
    idx_dummy = len(df) - 1
    
    # concatenate strings of ingredients lists
    df = concat_ingredients(df)
    
    return df, idx_dummy

In [45]:
def get_recommendations(my_ingredients):
    
    # 1) Load recepe data
    raw_recipes = pd.read_csv(data_path + "RAW_recipes.csv", converters={'ingredients': ast.literal_eval})
    print("Total number of recipes: ", len(raw_recipes))
    raw_recipes.head(5)
    
    
    # 1.1) Exclude recipes with to many ingredients
    raw_recipes = raw_recipes[raw_recipes["n_ingredients"] < 1.5*len(my_ingredients)]
    
    
    
    # 1.1) get random subset of data
    selection = ["name", "id", "minutes", "ingredients"]
    if len(df) > 30000:
        df = raw_recipes[selection].sample(30000)
    
    # 2) add dummy recipe built from input to dataframe
    df, idx_dummy = add_dummy_recipe(df=df, ingredients=my_ingredients)
    #print("max id = ", df["id"].max())
    #print("idx_dummy = ", idx_dummy)
    #print("Length of dataframe: ", len(df))
    
    # 3) calculate cosine similarities
    tfidf = TfidfVectorizer()
    #Construct the required TF-IDF matrix by fitting and transforming the data
    tfidf_matrix = tfidf.fit_transform(df['ingredients'])
    # Compute the cosine similarity matrix
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    #print("Shape of cosine sim matrix: ", cosine_sim.shape)
    
    # 4) get reverse mapping of indices
    indices = pd.Series(df.index, index=df['ingredients']).drop_duplicates()
    
    # 5) get pairwise similarity scores and sort recipes based on them
    sim_scores = list(enumerate(cosine_sim[idx_dummy]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # 6) Select top 10 recommendations 
    sim_scores = sim_scores[1:11] # first one is dummy recipe itself
    # Get the recipe indices
    recipe_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar recipes
    recommendation_df = raw_recipes.iloc[recipe_indices]
    
    return recommendation_df
    
    

## Get recommendations

In [51]:
my_ingredients = ['Sugar', 'Butter', 'corn', 'Salt', 'Pepper']
recommendations = get_recommendations(my_ingredients)

Total number of recipes:  231637


In [52]:
for ingredients in recommendations['ingredients']:
    print(ingredients)
    print("")
    #print(recommendations.iloc[idx]['ingredients'])
    print("-------------------------")

['skim milk', 'cinnamon', 'bananas', 'coffee ice cubes', 'splenda sugar substitute']

-------------------------
['semi-sweet chocolate chips', 'vanilla chip', 'butter', 'powdered sugar', 'baileys irish cream', 'nuts']

-------------------------
['frozen strawberries', 'milk', 'heavy cream', 'sour cream', 'sugar', 'mint']

-------------------------
['96% lean ground beef', 'salt', 'center-cut bacon', 'light hamburger bun', 'reduced-fat swiss cheese', 'red onion', 'barbecue sauce']

-------------------------
['vanilla ice cream', 'milk', 'cream', 'cinnamon extract', 'cinnamon', 'nutmeg', 'whipped cream']

-------------------------
['semisweet baking chocolate', 'butter', 'eggs', 'sugar', 'brandy', 'peak freen sweet-meal biscuits', 'whole toasted hazelnuts']

-------------------------
['banana', 'strawberry', 'raspberries', 'orange', 'soymilk', 'slivered almonds']

-------------------------
['lean stewing beef', 'condensed golden mushroom soup', 'cream of mushroom soup', 'sliced mushrooms

In [68]:
recommendations.iloc[0]

name                                          brazilian shake it up
id                                                           315558
minutes                                                           5
contributor_id                                               169430
submitted                                                2008-07-24
tags              ['15-minutes-or-less', 'time-to-make', 'course...
nutrition                  [158.7, 1.0, 57.0, 3.0, 12.0, 1.0, 11.0]
n_steps                                                           1
steps             ['blend all the ingredients in a blender and s...
description       a delicious shake drink without the ice cream....
ingredients       [skim milk, cinnamon, bananas, coffee ice cube...
n_ingredients                                                     5
Name: 28018, dtype: object

In [64]:
recommendations.iloc[0]["name"]

'brazilian shake it up'

In [66]:
recommendations.iloc[0]['ingredients']

['skim milk',
 'cinnamon',
 'bananas',
 'coffee ice cubes',
 'splenda sugar substitute']

In [69]:
recommendations.iloc[0]['steps']

"['blend all the ingredients in a blender and serve in two frosty glasses']"

## Test function step wise

### Load dataframe

In [53]:
# need to convert string representation of ingredients list to list using ast
#raw_recipes = pd.read_csv(data_path + "RAW_recipes.csv")
raw_recipes = pd.read_csv(data_path + "RAW_recipes.csv", converters={'ingredients': ast.literal_eval})
print("Total number of recipes: ", len(raw_recipes))
raw_recipes.head(5)

Total number of recipes:  231637


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8


### reduce dataframe

In [54]:
selection = ["name", "id", "minutes", "ingredients"]
df = raw_recipes[selection].sample(30000)
#df = raw_recipes[selection]

### add dummy recipe from ingredients to dataframe

In [55]:
my_ingredients=['squash', 'mexican seasoning']
my_ingredients=['tomato', 'onion', 'garlic']

In [56]:
df, idx_dummy = add_dummy_recipe(df=df, ingredients=my_ingredients)

print("max id = ", df["id"].max())
print("idx_dummy = ", idx_dummy)
print("Length of dataframe: ", len(df))

#df[df["id"] == df["id"].max()]
df.tail(6)

max id =  537486
idx_dummy =  30000
Length of dataframe:  30001


Unnamed: 0,name,id,minutes,ingredients
29995,steak mushroom hoagies,37184,25,steak whole mushrooms vidalia onion green pepp...
29996,slow cooker arroz con pollo,278551,485,whole chickens italian-style stewed tomatoes f...
29997,pf chang s chicken lettuce wraps,137175,55,cornstarch sherry wine water soy sauce boneles...
29998,peanut butter bites,47451,75,whole wheat bread creamy peanut butter vegetab...
29999,cranberry croquettes or fritters,395139,45,flour baking powder salt egg milk lemon juice ...
30000,dummy,537486,30,tomato onion garlic


In [58]:
df.iloc[110]["ingredients"]

'tomatoes herbs fresh parmesan cheese garlic cloves olive oil pepper'

### calculate cosine similarities

In [12]:
tfidf = TfidfVectorizer()
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['ingredients'])
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print("Shape of cosine sim matrix: ", cosine_sim.shape)

### get reverse mapping of indices

In [14]:
#indices = pd.Series(df.index, index=df['ingredients']).drop_duplicates()
indices = pd.Series(df.index, index=df['ingredients'])
#indices

### get pairwise similarity scores of all recipes with dummy recipe

In [15]:
sim_scores = list(enumerate(cosine_sim[idx_dummy]))

### sort recipes based on the similarity scores

In [16]:
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

### print out top recommendations

In [17]:
# Get the scores of the 10 most similar recipes
sim_scores = sim_scores[1:11]
    
# Get the recipe indices
recipe_indices = [i[0] for i in sim_scores]
    
# Return the top 10 most similar recipes
df.iloc[recipe_indices]

Unnamed: 0,name,id,minutes,ingredients
9522,no meat baked spaghetti,140073,75,spaghetti green pepper onion tomato soup tomat...
22797,texas style salsa,16821,10,diced tomato onion jalapeno pepper tomato sauc...
12280,spaghetti skillet,37581,70,ground beef onion garlic tomato juice water to...
23585,off the boat sicilian pasta sauce,174469,85,fresh onion olive oil tomato sauce salt pepper
17188,40 minute chili,208407,60,hamburger onion beans tomato soup tomato juice...
17410,mamma mia meat sauce,113813,165,ground beef tomato sauce tomato paste olive oi...
7703,roast pepper and chicken,355339,80,chicken legs bell peppers red onion garlic clo...
16679,christie s spaghetti and meatballs,223649,30,minced beef onion breadcrumbs nutmeg salt pepp...
2132,garlicky pasta sauce,179913,40,olive oil garlic cloves tomato sauce tomato pa...
11472,frank s spaghetti,279272,45,onion olive oil garlic basil sliced mushrooms ...


In [18]:
my_ingredients

['tomato', 'onion', 'garlic']

In [19]:
for idx in recipe_indices:
    print("**Recommended recipe ingredients:**")
    print("")
    print(df.iloc[idx]['ingredients'])
    print("-------------------------")

**Recommended recipe ingredients:**

spaghetti green pepper onion tomato soup tomato sauce cheddar cheese garlic powder
-------------------------
**Recommended recipe ingredients:**

diced tomato onion jalapeno pepper tomato sauce garlic salt black pepper cumin chili powder
-------------------------
**Recommended recipe ingredients:**

ground beef onion garlic tomato juice water tomato sauce tomato paste chili powder oregano basil sugar salt pepper thin spaghetti parmesan cheese
-------------------------
**Recommended recipe ingredients:**

fresh onion olive oil tomato sauce salt pepper
-------------------------
**Recommended recipe ingredients:**

hamburger onion beans tomato soup tomato juice water chili powder salt garlic salt pepper paprika
-------------------------
**Recommended recipe ingredients:**

ground beef tomato sauce tomato paste olive oil onion dried parsley dried oregano garlic parmesan cheese salt pepper sugar
-------------------------
**Recommended recipe ingredients:

In [None]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
#tfidf = TfidfVectorizer(stop_words='english')
tfidf = TfidfVectorizer()

In [None]:
#Replace NaN with an empty string
#raw_recipes["ingredients"] = raw_recipes["ingredients"].fillna('')
raw_recipes["ingredients"].isna().sum()
# we don't have nan values

In [None]:
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(raw_recipes["ingredients"].sample(100))

# Output the shape of the matrix
tfidf_matrix.shape

## Calculate similarities

In [None]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
cosine_sim.shape

We are going to define a function, that constructs a dummy-recipe of the inserted ingredients, calculates the similarities to all the other recipes in the DB and outputs the 10 most similar recipes based on the used ingredients.

For this we need a reverse mapping of a recipes ingredients and the DataFrame indices, i.e. a mechanism to identify the index of a recipe in our DataFrame, given its ingredients.

In [None]:
indices = pd.Series(raw_recipes.index, index=raw_recipes['ingredients']).drop_duplicates()
#indices

In [None]:
df