**imports**

In [1]:
import pandas as pd
import string
from nltk.stem import WordNetLemmatizer
import os

# Preparing the dataset

## Importing the recipe-data
- drop columns
- split string columns (ingredients, steps,..) to list

In [2]:
data_path = (os.getcwd())+"/data/RAW_recipes.csv"

df = pd.read_csv(data_path)

In [3]:
df.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')

In [4]:
for col in df.columns:
    print(f'{col} has type {type(df[col][0])}')

name has type <class 'str'>
id has type <class 'numpy.int64'>
minutes has type <class 'numpy.int64'>
contributor_id has type <class 'numpy.int64'>
submitted has type <class 'str'>
tags has type <class 'str'>
nutrition has type <class 'str'>
n_steps has type <class 'numpy.int64'>
steps has type <class 'str'>
description has type <class 'str'>
ingredients has type <class 'str'>
n_ingredients has type <class 'numpy.int64'>


In [5]:
#dropping columns we don't need

df.drop(columns=['contributor_id', 'submitted'], inplace = True)

In [6]:
# changing the list columns into a list

for col in ['tags', 'nutrition', 'ingredients']:
    df[col] = df[col].apply(lambda x: x.strip('[]').split(','))

In [7]:
# split steps column on ', to split only after the complete step

df['steps'] = df['steps'].apply(lambda x: x.strip('[]').split("',"))

## Clean the data

New column for ingredients to search

1. lowercase
2. remove numbers
3. remove spaces
4. remove special chars
5. lemmatize
6. join words with underscores

Ingredients:

1. capitalize
2. connect to string with /n to be displayed correctly

Tags, steps:
1. remove extra '
2. remove spaces
3. capitalize first letter
4. - for steps - add step number & connect to string with /n to be displayed correctly

Nutrition
1. change to float

Name:
1. capitalize first letter

In [8]:
# cleaning functions

def remove_num(text):
    return ''.join(char for char in text if not char.isdigit())

def remove_punctuation(ingredients):
    cleaned_list = []
    for word in ingredients:
        for punctuation in string.punctuation:
            word = word.replace(punctuation, '')
        cleaned_list.append(word)
    return cleaned_list

def lemmatize_ingredients(ingredients):
    lemmatized = []
    for ingredient in ingredients:
        ingredient_words = ingredient.split()
        ingredient_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n") for word in ingredient_words]
        lemmatized.append('_'.join(ingredient_lemmatized))
    return lemmatized
    

In [9]:
# full preproc function 

def preproc_ingredients(user_input):
    #preproces input same way as df

    #1.lowercase
    user_input = [i.lower() for i in user_input]

    #2. remove numbers

    user_input = [remove_num(i) for i in user_input]
    
    #3. remove spaces

    user_input =  [i.strip() for i in user_input]

    #4. remove special chars

    user_input = remove_punctuation(user_input)

    #5. lemmatize and join words with underscores

    user_input = lemmatize_ingredients(user_input)

    #6. change to set

    user_input = set(user_input)
    
    return user_input

In [10]:
df['search_ingredients'] = df['ingredients'].apply(preproc_ingredients)

In [11]:
#clean tags and steps

df['ingredients'] = df['ingredients'].apply(lambda x: [i.strip().strip("'").capitalize() for i in x])
df['tags'] = df['tags'].apply(lambda x: [i.strip().strip("'").capitalize() for i in x])
df['steps'] = df['steps'].apply(lambda x: [i.strip().strip("'").capitalize() for i in x])


In [12]:
df['name'] = df['name'].str.capitalize()
df['description'] = df['description'].str.capitalize()

## change ingredients and steps to presentation

In [13]:
def new_line(input_list):
    result= ""
    for element in input_list:
        result += f'{element} \n'
    return result

In [14]:
# fn to enumerate steps

def enumerate_steps(input_list):
    steps_string = []
    for x, y in enumerate(input_list):
        steps_string.append(f"{x+1}. {y}")
    return steps_string

In [15]:
# enumerate steps on separated lines

df['steps'] = df['steps'].apply(enumerate_steps).apply(new_line)

In [16]:
# 'list' ingredients on separated lines

df['ingredients'] = df['ingredients'].apply(lambda x: ['- '+ i for i in x]).apply(new_line)

In [17]:
#clean nutrition

df['nutrition'] = df['nutrition'].apply(lambda x: [float(i) for i in x])

## Add Reviews

1. import reviews
2. get avg review per recipe
3. merge the reviews with the recipe df
4. sort by avg review score

In [18]:
#import reviews

review_path = (os.getcwd())+"/data/RAW_interactions.csv"
review_df = pd.read_csv(review_path)

review_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [19]:
#get avg rating & rename columns

avg_review_df = review_df[['recipe_id', 'rating']].groupby('recipe_id', as_index=False).mean().round(2)

avg_review_df.rename(columns = {'recipe_id': 'id', 'rating':'avg_rating'}, inplace=True)

avg_review_df.head()

Unnamed: 0,id,avg_rating
0,38,4.25
1,39,3.0
2,40,4.33
3,41,4.5
4,43,1.0


In [20]:
#merge reviews with the recipe df

df = df.merge(avg_review_df, on='id')

In [21]:
df = df.sort_values(by=['avg_rating'], ascending=False)

In [22]:
#removing recipes for "hot boiling water"
df = df.drop(df[df.name == "Brining solution for poultry and meat"].index)
df = df.drop(df[df.name == "Salted boiling water   what does it mean"].index)

## save cleaned recipes as local pkl

In [23]:
# df ready! -> save to local pkl
saved_pkl_path = os.getcwd()+"/data/clean_df.pkl"

In [24]:
df.to_pickle(saved_pkl_path)
del df
del review_df
del avg_review_df

In [25]:
pkl_df = pd.read_pickle(saved_pkl_path)
pkl_df.head()

Unnamed: 0,name,id,minutes,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,search_ingredients,avg_rating
0,Arriba baked winter squash mexican style,137739,55,"[60-minutes-or-less, Time-to-make, Course, Mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,1. Make a choice and proceed with recipe \n2. ...,Autumn is my favorite time of year to cook! th...,- Winter squash \n- Mexican seasoning \n- Mixe...,7,"{salt, mexican_seasoning, honey, olive_oil, bu...",5.0
125098,Littlemafia s apple cider,395272,10,"[15-minutes-or-less, Time-to-make, Course, Cui...","[12.1, 0.0, 12.0, 0.0, 0.0, 0.0, 1.0]",1,"1. Mix together and boil for 10 minutes , then...",One of my favorite drinks.,- Apple cider \n- Cinnamon sticks \n- Cloves \...,4,"{clove, apple_cider, cinnamon_stick, sugar}",5.0
125089,Little vanilla pound cake for 2,369222,35,"[60-minutes-or-less, Time-to-make, Course, Mai...","[545.1, 38.0, 204.0, 11.0, 14.0, 72.0, 24.0]",17,1. Preheat oven to 350 \n2. Grease and flour p...,This is my favorite little vanilla pound cake....,- Flour \n- Sugar \n- Sour cream \n- Baking so...,7,"{egg, sour_cream, butter, flour, vanilla_extra...",5.0
125090,Little veronica s healthy macaroni and cheese ...,177041,17,"[30-minutes-or-less, Time-to-make, Course, Mai...","[694.1, 49.0, 14.0, 26.0, 53.0, 64.0, 24.0]",9,1. Boil pasta according to package directions ...,My daughter loves cheese but to avoid giving h...,- Pasta \n- Carrot \n- Broccoli \n- Olive oil ...,6,"{monterey_jack_cheese, broccoli, olive_oil, ca...",5.0
125093,Little west end,228449,6,"[15-minutes-or-less, Time-to-make, Course, Cui...","[67.8, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",3,1. Shake with ice \n2. Serve in an iced glass ...,From a pub in the bahamas.,- Lemon juice \n- White rum \n- Triple sec \n-...,4,"{simple_syrup, white_rum, triple_sec, lemon_ju...",5.0


# Testing Food2Vec Model

First we try to use a pretrained "Food2Vec" model - a Word2Vec model trained on recipes/ingredients from github.com/ChantalMP

In [26]:
#load the pretrained model

#download the data from github
#!wget https://github.com/ChantalMP/Exploiting-Food-Embeddings-for-Ingredient-Substitution/releases/download/0.1/food2vec_models.zip

#unzip it and save in data directory

In [27]:
from gensim.models import Word2Vec

#load the model
model_path = os.getcwd()+'/data/model.bin'
model = Word2Vec.load(model_path)

## Dealing with ingredients missing in pretrained Word2Vec space

In [28]:
#exploring dataset ingredients and their embedding in the pretrained model

missing_ingredients = []

all_ingredients = pkl_df['search_ingredients'].explode().value_counts().index

for ingredient in all_ingredients:
    try:
        model.wv[ingredient]
    except:
        missing_ingredients.append(ingredient)
    

In [29]:
missing_ingredients

['allpurpose_flour',
 'salt_and_pepper',
 'extra_virgin_olive_oil',
 'fresh_ground_black_pepper',
 'red_onion',
 'boneless_skinless_chicken_breast',
 'juice_of',
 'fresh_ground_pepper',
 'salt_freshly_ground_black_pepper',
 'crushed_red_pepper_flake',
 'semisweet_chocolate_chip',
 'monterey_jack_cheese',
 'halfandhalf',
 'tabasco_sauce',
 'jalapeno_pepper',
 'salt_pepper',
 'cream_of_mushroom_soup',
 'swiss_cheese',
 'boneless_skinless_chicken_breast_half',
 'green_chilies',
 'zest_of',
 'pine_nut',
 'lowfat_milk',
 'ice_cube',
 'cream_of_chicken_soup',
 'hardboiled_egg',
 'fresh_basil_leaf',
 'of_fresh_mint',
 'icing_sugar',
 'low_sodium_chicken_broth',
 'fresh_strawberry',
 'kalamata_olive',
 'sundried_tomato',
 'spaghetti_sauce',
 'splenda_sugar_substitute',
 'dry_breadcrumb',
 'basil_leaf',
 'fresh_garlic',
 'minced_garlic_clove',
 'graham_cracker_crumb',
 'salt_and_black_pepper',
 'velveeta_cheese',
 'maraschino_cherry',
 'low_sodium_soy_sauce',
 'sweet_red_pepper',
 'halfandhalf_

In [30]:
#percentage of missing ingredients

len(missing_ingredients)/len(all_ingredients)

0.7819769575108241

In [31]:
# checking if the missing ingredients contain more than one word

len([i for i in missing_ingredients if '_' in i])/len(missing_ingredients)

0.9847972972972973

In [32]:
# since all ingredients contain more words -> try searching for reduced ingredient name
# -> removing from the front, since lot of the ingredients begin with adjectives

ingredients_dict = {}
not_existent_ingredients = []

for ingredient in missing_ingredients:
    search_ingredient = ingredient
    in_space = False
    while not in_space:
        try:
            model.wv[search_ingredient]
            ingredients_dict[ingredient] = search_ingredient
            in_space = True
        except KeyError:
            try:
                search_ingredient = search_ingredient.split("_", maxsplit=1)[1]
            except:
                not_existent_ingredients.append(ingredient)
                break
    

In [33]:
ingredients_dict

{'allpurpose_flour': 'flour',
 'salt_and_pepper': 'pepper',
 'extra_virgin_olive_oil': 'virgin_olive_oil',
 'fresh_ground_black_pepper': 'ground_black_pepper',
 'red_onion': 'onion',
 'boneless_skinless_chicken_breast': 'skinless_chicken_breast',
 'juice_of': 'of',
 'fresh_ground_pepper': 'ground_pepper',
 'salt_freshly_ground_black_pepper': 'ground_black_pepper',
 'crushed_red_pepper_flake': 'red_pepper_flake',
 'semisweet_chocolate_chip': 'chocolate_chip',
 'monterey_jack_cheese': 'jack_cheese',
 'tabasco_sauce': 'sauce',
 'jalapeno_pepper': 'pepper',
 'salt_pepper': 'pepper',
 'cream_of_mushroom_soup': 'mushroom_soup',
 'swiss_cheese': 'cheese',
 'boneless_skinless_chicken_breast_half': 'chicken_breast_half',
 'zest_of': 'of',
 'pine_nut': 'nut',
 'lowfat_milk': 'milk',
 'ice_cube': 'cube',
 'cream_of_chicken_soup': 'soup',
 'hardboiled_egg': 'egg',
 'fresh_basil_leaf': 'leaf',
 'of_fresh_mint': 'fresh_mint',
 'icing_sugar': 'sugar',
 'low_sodium_chicken_broth': 'chicken_broth',
 'f

In [34]:
#percentage of still missing ingredients

len(not_existent_ingredients)/len(all_ingredients)

0.0270052102443678

In [35]:
#create column containing only ingredients from the space

def move_ingredients_to_space(ingredients):
    ingredients = ingredients
    in_space_ingredients = set()
    for ingredient in ingredients:
        in_space_ingredients.add(ingredients_dict.get(ingredient, ingredient))
    return in_space_ingredients

In [36]:
pkl_df['search_in_space_ingredients'] = pkl_df['search_ingredients'].apply(move_ingredients_to_space)

In [37]:
#saving the new df

pkl_df.to_pickle(saved_pkl_path)

# Searching recipes!

Searching for recipe!

1. load the df
2. load the pretrained model
3. get user input as a list
4. preproces input same way as df
5. extend user input by the nearest ingredients found in the model
6. search all "subset recipes" of the extended input in the df

In [38]:
#get user input -> fix for now (taking ingredients from the first recipe, making sure a match exists)

user_input = ['winter squash',
 'mexican seasoning',
 'mixed spice',
 'honey',
 'butter',
 'olive oil',
 'salt']

In [39]:
#extend user input by the k most similar ingredients found in the given model


def extend_input(user_input, model, k):
    extended_input = set()
    for ingredient in user_input:
        extended_input.add(ingredient)
        search_ingredient = ingredient
        found = False
        while not found:
            try:
                k_nearest_ingredients = model.wv.most_similar(search_ingredient, topn=k)
                extended_input.add(search_ingredient)
                extended_input = extended_input.union({i[0] for i in k_nearest_ingredients})
                found = True
            except KeyError:
                try:
                    search_ingredient = search_ingredient.split("_", maxsplit=1)[1]
                except:
                    break     
    return extended_input

In [40]:
#search all "subset recipes" of the extended input in the df

def recipe_search(model, df, user_input, k):
    #preprocess
    preprocced_input = preproc_ingredients(user_input)
    
    if k == 0:
        #search without extending
        recipe_df = df[df['search_ingredients'].apply(lambda x: x.issubset(preprocced_input))]     
    else:
        #extend
        k_input = extend_input(preprocced_input, model, k)
        recipe_df = df[df['search_in_space_ingredients'].apply(lambda x: x.issubset(k_input))]
        
    recipe_df['input_matching_rate'] = recipe_df['search_ingredients'].apply(lambda x: len(x.intersection(preprocced_input)))/len(preprocced_input)   
    return recipe_df.sort_values(['input_matching_rate', 'avg_rating'], ascending=[False, False]).iloc[:16]

In [41]:
recipe_search(model, pkl_df, user_input,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recipe_df['input_matching_rate'] = recipe_df['search_ingredients'].apply(lambda x: len(x.intersection(preprocced_input)))/len(preprocced_input)


Unnamed: 0,name,id,minutes,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,search_ingredients,avg_rating,search_in_space_ingredients,input_matching_rate
0,Arriba baked winter squash mexican style,137739,55,"[60-minutes-or-less, Time-to-make, Course, Mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,1. Make a choice and proceed with recipe \n2. ...,Autumn is my favorite time of year to cook! th...,- Winter squash \n- Mexican seasoning \n- Mixe...,7,"{salt, mexican_seasoning, honey, olive_oil, bu...",5.0,"{salt, honey, olive_oil, butter, seasoning, wi...",1.0
107271,Honey butter shanachie,368529,25,"[30-minutes-or-less, Time-to-make, Course, Mai...","[150.0, 23.0, 15.0, 5.0, 0.0, 48.0, 1.0]",4,"1. ""warm the butter -- don't need to melt it"",...",This was a demonstration recipe at charlotte a...,- Butter \n- Honey \n,2,"{butter, honey}",5.0,"{butter, honey}",0.285714
107272,Honey butter spread,357691,5,"[15-minutes-or-less, Time-to-make, Course, Mai...","[94.2, 14.0, 13.0, 2.0, 0.0, 29.0, 1.0]",4,1. Place butter and honey in a small mixing bo...,This is a quick and easy recipe for honey butt...,- Butter \n- Honey \n,2,"{butter, honey}",5.0,"{butter, honey}",0.285714
107267,Honey butter 101,497916,20,"[30-minutes-or-less, Time-to-make, Course, Cui...","[166.1, 17.0, 69.0, 4.0, 0.0, 36.0, 5.0]",9,1. 1 \n2. Place butter in a small bowl \n3. Gr...,Honey butter is very good on homemade white br...,- Butter \n- Honey \n,2,"{butter, honey}",5.0,"{butter, honey}",0.285714
107508,Honey hair conditioner,492846,32,"[60-minutes-or-less, Time-to-make, Cuisine, Pr...","[1323.5, 110.0, 742.0, 0.0, 1.0, 49.0, 62.0]",4,1. Using a small amount at a time work mixture...,Add a beautiful shine to your hair,- Honey \n- Olive oil \n,2,"{olive_oil, honey}",5.0,"{olive_oil, honey}",0.285714
204792,Susan s honey bee butter,257753,2,"[15-minutes-or-less, Time-to-make, Course, Mai...","[117.7, 17.0, 17.0, 3.0, 0.0, 36.0, 1.0]",2,1. Mix by hand or in a small food chopper \n2....,"Since we have our own beehives, i'm always on ...",- Butter \n- Honey \n,2,"{butter, honey}",5.0,"{butter, honey}",0.285714
225746,Whipped honey butter with varieties,131054,10,"[15-minutes-or-less, Time-to-make, Course, Pre...","[2142.9, 283.0, 557.0, 67.0, 4.0, 583.0, 46.0]",6,1. Whip butter with mixer until fluffy \n2. Gr...,A basic honey butter that's delicious on cornb...,- Butter \n- Honey \n,2,"{butter, honey}",5.0,"{butter, honey}",0.285714
130230,Maple spread,121563,15,"[15-minutes-or-less, Time-to-make, Course, Pre...","[343.5, 35.0, 127.0, 7.0, 0.0, 72.0, 12.0]",3,"1. In a heavy-bottomed saucepan , heat syrup u...","2 ingredients! serve with easy-does-it rolls, ...",- Real maple syrup \n- Butter \n,2,"{butter, real_maple_syrup}",5.0,"{maple_syrup, butter}",0.142857
78178,Easy squeeze honey butter,382927,10,"[15-minutes-or-less, Time-to-make, Course, Mai...","[32.2, 0.0, 34.0, 0.0, 0.0, 0.0, 2.0]",1,"1. Empty 1 / 2 of margarine out of bottle , ad...",Could anything be simpler than this? you might...,- Margarine \n- Honey \n,2,"{margarine, honey}",5.0,"{margarine, honey}",0.142857
172919,Relaxing honey bath,100746,1,"[15-minutes-or-less, Time-to-make, Preparation...","[172.4, 0.0, 186.0, 0.0, 0.0, 0.0, 15.0]",5,1. Put honey in a glass with lavender oil \n2....,My dad has honey bees so i always have lots ar...,- Honey \n- Lavender oil \n,2,"{lavender_oil, honey}",5.0,"{oil, honey}",0.142857
