**imports**

In [1]:
import nltk

# WordNet is stored in the project-local "data" directory. NLTK only searches the
# directories listed in nltk.data.path, so the download directory has to be registered
# there, otherwise the lemmatizer cannot find the corpus on a fresh machine.
NLTK_DATA_DIR = "data"
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

nltk.download("wordnet", download_dir=NLTK_DATA_DIR)

[nltk_data] Downloading package wordnet to data...


True

In [None]:
import ast
import os
import string

import pandas as pd
from nltk.stem import WordNetLemmatizer

# Preparing the dataset

## Importing the recipe-data
- drop columns
- split string columns (ingredients, steps,..) to list

In [None]:
# Read the raw recipes from ./data, resolved against the current working directory
data_path = f"{os.getcwd()}/data/RAW_recipes.csv"

df = pd.read_csv(data_path)

In [None]:
# Show the column names of the raw recipe table
df.columns

In [None]:
# Print the Python type of the value stored under index label 0 for every column
for col in df.columns:
    print(f'{col} has type {type(df[col][0])}')

In [None]:
# Drop the columns we don't need.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the cell
# safe to re-run once the columns are already gone.

df = df.drop(columns=['contributor_id', 'submitted'], errors='ignore')

In [None]:
# Turn the stringified list columns into real Python lists.
# ast.literal_eval honours the quoting, so items that contain commas or apostrophes stay
# intact and no quote characters are left behind. Values that are no longer strings are
# passed through unchanged so the cell can be re-run.

for col in ['tags', 'nutrition', 'ingredients']:
    df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [None]:
# Turn the stringified steps list into a real Python list.
# Splitting on "'," misses the boundary after any step that is quoted with double quotes
# (i.e. every step containing an apostrophe), so the literal is parsed instead.
# Values that are no longer strings are passed through unchanged so the cell can be re-run.

df['steps'] = df['steps'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

## Clean the data

New column for ingredients to search

1. lowercase
2. remove numbers
3. remove spaces
4. remove special chars
5. lemmatize
6. join words with underscores

Ingredients:

1. capitalize
2. join into a single string with `\n` so each ingredient is displayed on its own line

Tags, steps:
1. remove extra '
2. remove spaces
3. capitalize first letter
4. for steps only: add the step number and join into a single string with `\n` so each step is displayed on its own line

Nutrition
1. change to float

Name:
1. capitalize first letter

In [None]:
# cleaning functions

def remove_num(text):
    """Return ``text`` with every digit character removed; all other characters are kept."""
    non_digits = filter(lambda symbol: not symbol.isdigit(), text)
    return ''.join(non_digits)

def remove_punctuation(ingredients):
    """Return a new list in which every ``string.punctuation`` character is deleted from each item.

    Punctuation is removed, not replaced by a space, so ``"half-and-half"`` becomes
    ``"halfandhalf"``.
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    return [entry.translate(deletion_table) for entry in ingredients]

def lemmatize_ingredients(ingredients):
    """Lemmatize every word of every ingredient and join the words with underscores.

    Args:
        ingredients: iterable of ingredient strings; words are separated by whitespace.

    Returns:
        A list with one entry per input ingredient, e.g. ``"olive oils"`` -> ``"olive_oil"``.
        An ingredient without any words yields an empty string.
    """
    # One lemmatizer for the whole call instead of a new instance per word.
    lemmatizer = WordNetLemmatizer()
    return [
        # pos="n": every word is lemmatized as a noun
        '_'.join(lemmatizer.lemmatize(word, pos="n") for word in ingredient.split())
        for ingredient in ingredients
    ]
    

In [None]:
# full preproc function 

def preproc_ingredients(user_input):
    """Normalise a collection of ingredient strings into a set of search tokens.

    The recipe table and the user's query both go through this function, so they end up
    in the same representation.

    Args:
        user_input: iterable of ingredient strings.

    Returns:
        A set of lowercase, digit-free, punctuation-free ingredient names whose words are
        lemmatized and joined with underscores.
    """
    # 1.-3. lowercase, remove digits, strip surrounding whitespace
    normalised = [remove_num(item.lower()).strip() for item in user_input]

    # 4. remove special chars
    without_punctuation = remove_punctuation(normalised)

    # 5. lemmatize and join words with underscores
    lemmatized = lemmatize_ingredients(without_punctuation)

    # 6. duplicates collapse when converting to a set
    return set(lemmatized)

In [None]:
# Build the normalised ingredient set used for matching; this has to run before the
# 'ingredients' column is reformatted for display further down
df['search_ingredients'] = df['ingredients'].apply(preproc_ingredients)

In [None]:
# Tidy the display columns: trim whitespace and stray quotes, then capitalise every item

for col in ['ingredients', 'tags', 'steps']:
    df[col] = df[col].apply(lambda items: [item.strip().strip("'").capitalize() for item in items])


In [None]:
# Capitalise the first letter of the free-text columns
for text_col in ('name', 'description'):
    df[text_col] = df[text_col].str.capitalize()

## Format ingredients and steps for presentation

In [None]:
def new_line(input_list):
    """Concatenate the elements into one string, each followed by a space and a newline."""
    return ''.join(f'{element} \n' for element in input_list)

In [None]:
# fn to enumerate steps

def enumerate_steps(input_list):
    """Prefix every step with its 1-based position, e.g. ``"Mix"`` -> ``"1. Mix"``."""
    return [f"{number}. {step}" for number, step in enumerate(input_list, start=1)]

In [None]:
# Number the steps and join them into a single string, one step per line

df['steps'] = df['steps'].apply(lambda steps: new_line(enumerate_steps(steps)))

In [None]:
# Render the ingredients as a dash-prefixed list, one ingredient per line

df['ingredients'] = df['ingredients'].apply(lambda items: new_line(['- ' + item for item in items]))

In [None]:
# Convert every nutrition entry to float

df['nutrition'] = df['nutrition'].apply(lambda values: list(map(float, values)))

## Add Reviews

1. import reviews
2. get avg review per recipe
3. merge the reviews with the recipe df
4. sort by avg review score

In [None]:
# Load the raw user interactions (reviews) from ./data

review_path = f"{os.getcwd()}/data/RAW_interactions.csv"
review_df = pd.read_csv(review_path)

review_df.head()

In [None]:
# Mean rating per recipe (rounded to 2 decimals), with the columns renamed so that
# 'id' lines up with the recipe table

avg_review_df = (
    review_df[['recipe_id', 'rating']]
    .groupby('recipe_id', as_index=False)
    .mean()
    .round(2)
    .rename(columns={'recipe_id': 'id', 'rating': 'avg_rating'})
)

avg_review_df.head()

In [None]:
# Attach the average rating to every recipe (inner join: recipes without any review are dropped)

df = pd.merge(df, avg_review_df, how='inner', on='id')

In [None]:
# Best-rated recipes first
df = df.sort_values('avg_rating', ascending=False)

In [None]:
# Remove the two entries for salted boiling water / brining solution by exact name
# (names are already capitalised at this point)
not_recipes = [
    "Brining solution for poultry and meat",
    "Salted boiling water   what does it mean",
]
df = df[~df['name'].isin(not_recipes)]

## save cleaned recipes as local pkl

In [None]:
# df ready! -> location of the local pickle holding the cleaned recipe table
saved_pkl_path = f"{os.getcwd()}/data/clean_df.pkl"

In [None]:
# Persist the cleaned frame, then drop the in-memory frames that are no longer needed
df.to_pickle(saved_pkl_path)
del df, review_df, avg_review_df

In [None]:
# Reload the cleaned recipe table from the local pickle and preview the first rows
pkl_df = pd.read_pickle(saved_pkl_path)
pkl_df.head()

# Testing Food2Vec Model

First we try to use a pretrained "Food2Vec" model - a Word2Vec model trained on recipes/ingredients from github.com/ChantalMP

In [None]:
#load the pretrained model

#download the data from github
#!wget https://github.com/ChantalMP/Exploiting-Food-Embeddings-for-Ingredient-Substitution/releases/download/0.1/food2vec_models.zip

#unzip it and save in data directory

In [None]:
from gensim.models import Word2Vec

# Load the pretrained Word2Vec model; expects the unzipped model.bin inside ./data
# (relative to the current working directory)
model_path = os.getcwd()+'/data/model.bin'
model = Word2Vec.load(model_path)

## Dealing with ingredients missing in pretrained Word2Vec space

In [None]:
# Explore which dataset ingredients have no embedding in the pretrained model

missing_ingredients = []

# distinct ingredients of the dataset, most frequent first
all_ingredients = pkl_df['search_ingredients'].explode().value_counts().index

for ingredient in all_ingredients:
    try:
        model.wv[ingredient]
    except KeyError:
        # Only "key not in the vocabulary" counts as missing; any other error should
        # surface instead of being silently recorded as a missing ingredient.
        missing_ingredients.append(ingredient)
    

In [None]:
# Display the ingredients that have no vector in the pretrained model
missing_ingredients

In [None]:
# Fraction (0-1, not a percentage) of the distinct ingredients that have no vector in the model

len(missing_ingredients)/len(all_ingredients)

In [None]:
# Fraction of the missing ingredients whose name consists of more than one word
# (preprocessing joins the words of an ingredient with '_')

len([i for i in missing_ingredients if '_' in i])/len(missing_ingredients)

In [None]:
# The missing ingredients are multi-word names -> retry the lookup with a reduced name.
# Words are removed from the front, since many ingredients begin with adjectives
# (e.g. "fresh_ground_pepper" -> "ground_pepper" -> "pepper").

ingredients_dict = {}            # original name -> reduced name that exists in the model
not_existent_ingredients = []    # names for which no reduced form exists in the model

for ingredient in missing_ingredients:
    search_ingredient = ingredient
    while True:
        try:
            model.wv[search_ingredient]
        except KeyError:
            if "_" not in search_ingredient:
                # nothing left to strip: give up on this ingredient
                not_existent_ingredients.append(ingredient)
                break
            # drop the leading word and try again with the remainder
            search_ingredient = search_ingredient.split("_", maxsplit=1)[1]
        else:
            ingredients_dict[ingredient] = search_ingredient
            break
    

In [None]:
# Display the mapping: original ingredient name -> reduced name found in the model
ingredients_dict

In [None]:
# Fraction (0-1, not a percentage) of all distinct ingredients that still have no vector
# after trying the reduced names

len(not_existent_ingredients)/len(all_ingredients)

In [None]:
#create column containing only ingredients from the space

def move_ingredients_to_space(ingredients, mapping=None):
    """Replace each ingredient by its in-model name where a mapping exists.

    Args:
        ingredients: iterable of preprocessed ingredient names.
        mapping: dict from original name to the name to use instead. Defaults to the
            notebook-level ``ingredients_dict``.

    Returns:
        A set containing the mapped name for every ingredient that has an entry in
        ``mapping`` and the unchanged name for every other ingredient.
    """
    if mapping is None:
        # Resolved at call time so the function always follows the current dictionary.
        mapping = ingredients_dict
    return {mapping.get(ingredient, ingredient) for ingredient in ingredients}

In [None]:
# New column: the search ingredients with each name replaced by its in-model form where one was found
pkl_df['search_in_space_ingredients'] = pkl_df['search_ingredients'].apply(move_ingredients_to_space)

In [None]:
# Overwrite the local pickle so it also carries the 'search_in_space_ingredients' column

pkl_df.to_pickle(saved_pkl_path)

# Searching recipes!

Searching for recipe!

1. load the df
2. load the pretrained model
3. get user input as a list
4. preprocess the input the same way as the df
5. extend user input by the nearest ingredients found in the model
6. search all "subset recipes" of the extended input in the df

In [None]:
# User input is fixed for now: the ingredients are taken from the first recipe,
# which makes sure that at least one match exists

user_input = ['winter squash',
 'mexican seasoning',
 'mixed spice',
 'honey',
 'butter',
 'olive oil',
 'salt']

In [None]:
#extend user input by the k most similar ingredients found in the given model


def extend_input(user_input, model, k):
    """Extend the ingredients by their ``k`` most similar ingredients in ``model``.

    If an ingredient is unknown to the model, leading words (separated by ``_``) are
    dropped one at a time until a known name is found or no words are left.

    Args:
        user_input: iterable of preprocessed ingredient names.
        model: object exposing ``model.wv.most_similar(key, topn=k)`` that raises
            ``KeyError`` for unknown keys.
        k: number of nearest neighbours to add per ingredient.

    Returns:
        A set with every original ingredient, the reduced name that was found in the
        model (if any) and that name's ``k`` nearest neighbours.
    """
    extended_input = set()
    for ingredient in user_input:
        extended_input.add(ingredient)
        search_ingredient = ingredient
        while True:
            try:
                k_nearest_ingredients = model.wv.most_similar(search_ingredient, topn=k)
            except KeyError:
                if "_" not in search_ingredient:
                    # no shorter form left: keep only the original ingredient
                    break
                # drop the leading word and retry with the remainder
                search_ingredient = search_ingredient.split("_", maxsplit=1)[1]
                continue
            extended_input.add(search_ingredient)
            # most_similar yields (name, similarity) pairs; only the names are kept
            extended_input.update(neighbour[0] for neighbour in k_nearest_ingredients)
            break
    return extended_input

In [None]:
#search all "subset recipes" of the extended input in the df

def recipe_search(model, df, user_input, k, max_results=16):
    """Find the recipes that can be cooked with (an extension of) the given ingredients.

    Args:
        model: Word2Vec-style model passed on to ``extend_input``; unused when ``k == 0``.
        df: recipe table with the columns ``search_ingredients``,
            ``search_in_space_ingredients`` (sets of ingredient names) and ``avg_rating``.
        user_input: iterable of raw ingredient strings.
        k: number of similar ingredients added per input ingredient; ``0`` searches with
            the plain input only.
        max_results: maximum number of recipes returned (default 16).

    Returns:
        A new DataFrame with the matching recipes plus an ``input_matching_rate`` column
        (share of the input ingredients the recipe uses), sorted by matching rate and
        then by average rating, both descending. ``df`` itself is not modified.
    """
    query = preproc_ingredients(user_input)

    if k == 0:
        # search without extending
        allowed = query
        column = 'search_ingredients'
    else:
        # extend the query by the k nearest ingredients of every input ingredient
        allowed = extend_input(query, model, k)
        column = 'search_in_space_ingredients'

    # a recipe matches when all of its ingredients are available
    is_match = df[column].apply(lambda x: x.issubset(allowed))
    # copy so that adding a column does not write into a slice of df (SettingWithCopyWarning)
    recipe_df = df[is_match].copy()

    # guard against division by zero for an empty query
    n_query = max(len(query), 1)
    recipe_df['input_matching_rate'] = (
        recipe_df['search_ingredients'].apply(lambda x: len(x.intersection(query))) / n_query
    )
    ranked = recipe_df.sort_values(['input_matching_rate', 'avg_rating'], ascending=[False, False])
    return ranked.iloc[:max_results]

In [None]:
# Example search: extend every input ingredient by its single nearest neighbour
recipe_search(model, pkl_df, user_input, k=1)