In [1]:
import ast
import os

import nltk
import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Columns that are not needed for this analysis
unneeded_columns = [
    'id',
    'minutes',
    'contributor_id',
    'submitted',
    'tags',
    'nutrition',
    'n_steps',
    'steps',
    'n_ingredients',
    'description',
]

# Load the recipe data, then clean it in one pipeline:
#   1. drop rows with empty data,
#   2. keep only recipes whose name contains 'indian' or 'greek' (case-insensitive),
#   3. drop the columns listed above.
# Note: dropna() runs before the column drop, so a missing value in any column
# (including the ones discarded in step 3) removes the whole row.
recipes_df = (
    pd.read_csv('RAW_recipes.csv')
    .dropna()
    .loc[lambda frame: frame['name'].str.contains('indian|greek', case=False)]
    .drop(columns=unneeded_columns)
)

In [3]:
# Inspect the 'ingredients' column of the filtered frame
recipes_df.ingredients

100       ['cucumber', 'potato', 'chopped tomato', 'plai...
170       ['seedless cucumber', 'red ripe tomatoes', 're...
272       ['ouzo', 'orange juice', 'ice cube', 'orange s...
347       ['romaine lettuce', 'feta cheese', 'pepperonci...
370       ['romaine lettuce', 'salt and pepper', 'red on...
                                ...                        
230321    ['roma tomatoes', 'red bell pepper', 'red onio...
230466    ['of fresh mint', 'lemon peel', 'lemon juice',...
230467    ['couscous', 'black pepper', 'lemon juice', 'o...
230468    ['olive oil', 'white wine vinegar', 'garlic cl...
231575    ['buttermilk', 'egg', 'butter', 'molasses', 'c...
Name: ingredients, Length: 1932, dtype: object

In [4]:
# tokenize df column 'name'
# NOTE: this overwrites 'name' in place, so the cell cannot be re-run without
# re-running the loading cell first (word_tokenize expects a string, not a list).
recipes_df['name'] = recipes_df['name'].apply(nltk.word_tokenize)

# The 'ingredients' column stores each list as its string representation.
# Parse it with ast.literal_eval rather than eval(): literal_eval only accepts
# Python literals, so text coming from the CSV can never execute code, and it
# matches how the same column is parsed later in this notebook.
recipes_df['tokens'] = [ast.literal_eval(recipe) for recipe in recipes_df['ingredients']]

recipes_df['tokens']

100       [cucumber, potato, chopped tomato, plain yogur...
170       [seedless cucumber, red ripe tomatoes, red oni...
272            [ouzo, orange juice, ice cube, orange slice]
347       [romaine lettuce, feta cheese, pepperoncini pe...
370       [romaine lettuce, salt and pepper, red onion, ...
                                ...                        
230321    [roma tomatoes, red bell pepper, red onion, en...
230466    [of fresh mint, lemon peel, lemon juice, olive...
230467    [couscous, black pepper, lemon juice, olive oi...
230468    [olive oil, white wine vinegar, garlic cloves,...
231575    [buttermilk, egg, butter, molasses, cornmeal, ...
Name: tokens, Length: 1932, dtype: object

## Named Entity Recognition

We use NER to extract the bare ingredient name, e.g. 'tomato' instead of 'chopped tomato'.
We also know that the ingredient itself will be a noun.

In [5]:
# Print every ingredient string, recipe by recipe, to eyeball the raw values
for recipe_tokens in recipes_df['tokens']:
    for ingredient in recipe_tokens:
        print(ingredient)

cucumber
potato
chopped tomato
plain yogurt
chat masala
paprika
salt
seedless cucumber
red ripe tomatoes
red onion
red bell pepper
water-packed artichoke hearts
italian parsley
kosher salt & freshly ground black pepper
lemon, juice of
shallot
red wine vinegar
oregano
extra virgin olive oil
tuna in vegetable oil
pepperoncini peppers
kalamata olive
ouzo
orange juice
ice cube
orange slice
romaine lettuce
feta cheese
pepperoncini peppers
red onion
black olives
romaine lettuce
salt and pepper
red onion
cherry tomatoes
carrots
green peppers
black olives
feta cheese
elbow macaroni
cheese
milk
salt and pepper
chicken breasts
olive oil
lemon juice
dried oregano
dried dill
kosher salt
black pepper
garlic powder
red onion
rosemary
kalamata olive
feta cheese
lemon
garbanzo beans
scallion
garlic cloves
tomatoes
celery ribs
fresh lemon juice
of fresh mint
fresh parsley
extra virgin olive oil
feta cheese
romaine lettuce
salt
cracked black pepper
garbanzo beans
garlic cloves
tomatoes
red onion
parsley

We can see above that our data is not clean because it contains additional information that we do not need.
<br>
For example the "tuna in vegetable oil" can be simplified to "tuna" or the "lemon, juice of" could be simplified to "lemon juice" or even "lemon"

Let's perform Data Exploration to see if we can simplify and normalize some of the ingredients' names

Let's count the total occurrences of each ingredient to find patterns we can use for simplification

In [6]:
# Flatten the per-recipe token lists into a single list of ingredient names.
# A nested comprehension is linear in the total number of ingredients, whereas
# sum(lists, []) copies the growing accumulator once per recipe (quadratic).
all_items = [item for recipe_tokens in recipes_df['tokens'] for item in recipe_tokens]

# Get the counts of each item (most frequent first)
counts = pd.Series(all_items).value_counts()

# Create a DataFrame from the counts
counts_df = pd.DataFrame({'name': counts.index, 'number': counts.values})

# Save the DataFrame to a CSV file
counts_df.to_csv('preprocessed_ingredients_counts.csv', index=False)

counts

salt                        870
olive oil                   691
feta cheese                 532
garlic cloves               413
onion                       389
                           ... 
mixed sprouts                 1
sumac                         1
cooked chicken breast         1
frozen pizza                  1
unsalted sunflower seeds      1
Name: count, Length: 2012, dtype: int64

We are seeing 2012 different ingredients

In [7]:
# Load the English language models for spaCy
from spacy.cli import download


def _load_spacy_model(name):
    """Return the spaCy pipeline `name`, downloading its package only if needed.

    Trying spacy.load first avoids re-downloading and re-installing the model
    packages on every run of the notebook.
    """
    try:
        return spacy.load(name)
    except OSError:
        # spacy.load raises OSError when the model package is not installed
        download(name)
        return spacy.load(name)


base_model = _load_spacy_model('en_core_web_sm')
large_model = _load_spacy_model('en_core_web_lg')


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [8]:
import ast
import regex as re


# Units / containers that should never be kept as (part of) an ingredient name.
# NOTE: the pattern is applied with .match(), which anchors only at the start of
# the text, so any word that merely begins with one of these (e.g. 'cupcake')
# is treated as a measurement too.
measurements = re.compile(r'(bowl|bulb|cube|clove|cup|drop|ounce|oz|pinch|pound|teaspoon|tablespoon|juice of)s?')


# Define a function to clean and simplify ingredient names using Named Entity Recognition/POS-tagging
def clean_ingredients(ingredients, nlp=None):
    """Simplify a list of raw ingredient strings.

    Each ingredient is lower-cased and run through a spaCy pipeline. If the
    pipeline reports entities labelled "FOOD", their texts are used as-is.
    Otherwise the name is rebuilt from every noun / proper noun whose dependency
    is 'nsubj' or 'ROOT', preceded by its 'amod' / 'compound' children; tokens
    that match `measurements` are skipped.

    Parameters
    ----------
    ingredients : iterable of str
        Raw ingredient names of one recipe.
    nlp : callable, optional
        spaCy pipeline (or any callable returning a Doc-like object) applied to
        each ingredient. Defaults to the notebook-level `large_model`.

    Returns
    -------
    list of str
        Cleaned ingredient names. Ingredients for which nothing usable is found
        (for example a bare measurement word) are omitted.
    """
    if nlp is None:
        nlp = large_model

    cleaned = []
    for ingredient in ingredients:
        doc = nlp(ingredient.lower())  # Process the ingredient text with spaCy

        # NOTE(review): the stock en_core_web_* NER label set does not appear to
        # include "FOOD", so this branch presumably only fires with a custom
        # NER model -- confirm.
        food_entities = [ent.text for ent in doc.ents if ent.label_ == "FOOD"]
        if food_entities:
            cleaned.extend(food_entities)
            continue

        parts = []
        for token in doc:
            is_head = token.dep_ in ['nsubj', 'ROOT']
            is_noun = token.pos_ in ['NOUN', 'PROPN']
            if not (is_head and is_noun) or measurements.match(token.text):
                continue
            # Keep the head's adjectival / compound modifiers in front of it
            for child in token.children:
                if (not measurements.match(child.text)) and (child.dep_ in ['amod', 'compound']):
                    parts.append(child.text)
            parts.append(token.text)

        normalized_ingredient = ' '.join(parts).rstrip()
        if normalized_ingredient:
            cleaned.append(normalized_ingredient)

    return cleaned


# Initialize an empty list to store the cleaned ingredients
cleaned_ingredients_list = []

# Loop over the ingredients in the original DataFrame
for index, row in enumerate(recipes_df['ingredients']):
    # if index == 300: break
    # Clean the ingredients using the clean_ingredients() function
    cleaned_ingredients = clean_ingredients(ast.literal_eval(row))
    # Append the cleaned ingredients to the cleaned_ingredients_list

    cleaned_ingredients_list.append(cleaned_ingredients)




# recipes_df['cleaned_ing0redients'] = pd.Series(cleaned_ingredients_list)
recipes_df.insert(3, "cleaned_ingredients", cleaned_ingredients_list, True)
recipes_df['cleaned_ingredients']

100       [cucumber, potato, chopped tomato, plain yogur...
170       [red ripe tomatoes, red onion, red bell pepper...
272                      [ouzo, orange juice, orange slice]
347       [romaine lettuce, feta cheese, pepperoncini pe...
370       [romaine lettuce, salt, red onion, cherry toma...
                                ...                        
230321    [roma tomatoes, red bell pepper, red onion, en...
230466    [lemon, lemon juice, olive oil, salt, fresh gr...
230467    [couscous, black pepper, lemon juice, olive oi...
230468    [olive oil, white wine vinegar, dried oregano,...
231575    [buttermilk, egg, butter, molasses, cornmeal, ...
Name: cleaned_ingredients, Length: 1932, dtype: object

Let's see again how many ingredients we have in total after cleaning and 'fixing' all the available data

In [9]:
# Display the frame, which now includes the 'cleaned_ingredients' column
recipes_df

Unnamed: 0,name,ingredients,tokens,cleaned_ingredients
100,"[tide, me, over, indian, chaat, simple, veggie...","['cucumber', 'potato', 'chopped tomato', 'plai...","[cucumber, potato, chopped tomato, plain yogur...","[cucumber, potato, chopped tomato, plain yogur..."
170,"[chic, greek, salad]","['seedless cucumber', 'red ripe tomatoes', 're...","[seedless cucumber, red ripe tomatoes, red oni...","[red ripe tomatoes, red onion, red bell pepper..."
272,"[sexy, greek, cocktail]","['ouzo', 'orange juice', 'ice cube', 'orange s...","[ouzo, orange juice, ice cube, orange slice]","[ouzo, orange juice, orange slice]"
347,"[chic, to, be, greek, salad, from, salad, crea...","['romaine lettuce', 'feta cheese', 'pepperonci...","[romaine lettuce, feta cheese, pepperoncini pe...","[romaine lettuce, feta cheese, pepperoncini pe..."
370,"[dressed, up, greek, style, salad]","['romaine lettuce', 'salt and pepper', 'red on...","[romaine lettuce, salt and pepper, red onion, ...","[romaine lettuce, salt, red onion, cherry toma..."
...,...,...,...,...
230321,"[zee, ultimate, greek, salad]","['roma tomatoes', 'red bell pepper', 'red onio...","[roma tomatoes, red bell pepper, red onion, en...","[roma tomatoes, red bell pepper, red onion, en..."
230466,"[zesty, greek, chicken, with, mint]","['of fresh mint', 'lemon peel', 'lemon juice',...","[of fresh mint, lemon peel, lemon juice, olive...","[lemon, lemon juice, olive oil, salt, fresh gr..."
230467,"[zesty, greek, couscous, salad]","['couscous', 'black pepper', 'lemon juice', 'o...","[couscous, black pepper, lemon juice, olive oi...","[couscous, black pepper, lemon juice, olive oi..."
230468,"[zesty, greek, pasta, salad]","['olive oil', 'white wine vinegar', 'garlic cl...","[olive oil, white wine vinegar, garlic cloves,...","[olive oil, white wine vinegar, dried oregano,..."


In [10]:
# len(recipes_df['cleaned_ingredients'])

# recipes_df['cleaned_ingredients'].squeeze().tolist()

# recipes_df['modified_tokens'] = [eval(recipe) for recipe in recipes_df['cleaned_ingredients']]

# recipes_df['modified_tokens']

In [12]:
# Flatten the per-recipe cleaned ingredient lists into a single list.
# A nested comprehension is linear in the total number of ingredients, whereas
# sum(lists, []) copies the growing accumulator once per recipe (quadratic).
all_items = [
    item
    for recipe_ingredients in recipes_df['cleaned_ingredients']
    for item in recipe_ingredients
]

# Get the counts of each item (most frequent first)
counts = pd.Series(all_items).value_counts()

# Create a DataFrame from the counts
counts_df = pd.DataFrame({'name': counts.index, 'number': counts.values})

counts

salt                        1122
olive oil                    691
feta cheese                  537
onion                        389
tomatoes                     364
                            ... 
shell pasta                    1
radicchio                      1
smoked turkey                  1
roma                           1
unsalted sunflower seeds       1
Name: count, Length: 1671, dtype: int64

The results above show that the number of distinct ingredients has dropped (from 2012 to 1671), which means that some ingredient variants were merged with others. This is what we wanted, because the counts now give a more representative picture of our ingredients.

## Word Embeddings

We can now train a Word2Vec model with our modified data

In [13]:
# Train Word2Vec with each recipe's cleaned ingredient list as one "sentence",
# so every (possibly multi-word) ingredient name is a single vocabulary token.
word2vec_model = Word2Vec(
    sentences=recipes_df['cleaned_ingredients'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
