# Group 3 - Kitchen Assistant : A Recipe Prediction Tool

### About: Our tool is designed to provide suggested recipes based on a user's input of available ingredients

### User Directions: Provide recipe ingredients, separated by commas, and receive five recipe suggestions

In [13]:
import warnings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')
import nltk
# Fetch the NLTK resources by name; a package that is already current is reported as up to date.
# NOTE(review): the stopwords and WordNet data used by later cells are not downloaded here -
# presumably they are already installed; confirm on a fresh environment.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\18502\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\18502\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [3]:
"""
1. READ DATA

Data source: Food Ingredients and Recipe Dataset.csv

Columns:
    Title
    Ingredients
    Instructions
    Image_Name
    Cleaned_Ingredients
    
Columns Used:
    Title
    Ingredients
    
Purpose: Read data, remove unnecessary columns
"""

# Import Data

# Import libraries
import pandas as pd

def read_data(file):
    """Load the recipe dataset and keep only the columns used downstream.

    Parameters:
        file: path (or file-like object) of the CSV to read; it must contain
            "Title" and "Ingredients" columns.

    Returns:
        A DataFrame holding just the "Title" and "Ingredients" columns.

    Raises:
        KeyError: if either required column is missing from the CSV.
    """
    data = pd.read_csv(file)

    # Return an independent copy: the preprocessing steps add columns to this
    # frame, and writing into a slice of `data` is what triggers pandas'
    # SettingWithCopy behaviour.
    return data[["Title", "Ingredients"]].copy()

In [4]:
"""
TOKENIZE DATA
Purpose: Tokenize data, create column to maintain original data for comparison
"""

# Tokenize sentences and words. Print example for testing
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np

def tokenize_recipes(dataframe):
    """Add a 'Tokenized Ingredients' column of lower-cased word tokens.

    Parameters:
        dataframe: DataFrame with an 'Ingredients' column of strings.

    Returns:
        The same DataFrame object, with 'Tokenized Ingredients' added; each
        cell is the list produced by word_tokenize for that row.
    """
    # Lower-case the text that is actually tokenized (str.lower() returns a new
    # string, so calling it without using the result has no effect), and assign
    # the whole column at once rather than cell by cell through chained
    # indexing, which pandas does not guarantee to write back.
    dataframe['Tokenized Ingredients'] = dataframe['Ingredients'].map(
        lambda text: word_tokenize(text.lower())
    )

    return dataframe

In [5]:
"""
REMOVE PUNCTUATION
Purpose: Remove punctuation, create column to maintain original data for comparison
"""
#Remove punctuations. Only print the first 20 words.
import string

def remove_punct(text):
    """Return *text* with every character found in string.punctuation dropped."""
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)

def remove_pucntuation_from_recipes(dataframe):
    """Add a 'Remove Punct' column of lower-cased, punctuation-free tokens.

    Parameters:
        dataframe: DataFrame with a 'Tokenized Ingredients' column whose cells
            are lists of string tokens.

    Returns:
        The same DataFrame object, with 'Remove Punct' added. Tokens that are
        empty once punctuation is stripped are left out.
    """
    def _clean(tokens):
        # Strip each token once, then drop the ones that were pure punctuation.
        stripped = (remove_punct(token.lower()) for token in tokens)
        return [token for token in stripped if token != '']

    # Whole-column assignment: per-cell chained indexing (df[col][i] = ...) is
    # not guaranteed to write back and assumes index labels 0..n-1.
    dataframe['Remove Punct'] = dataframe['Tokenized Ingredients'].map(_clean)

    return dataframe

In [6]:
"""
REMOVE STOPWORDS
Purpose: Remove stopwords, create column to maintain original data for comparison
"""
#Remove stop words and count the distinct cleaned words
from nltk.corpus import stopwords

# English stop-word list from NLTK, loaded once at module level; read by remove_stopwords.
stop_words = stopwords.words("english")

def remove_stopwords(dataframe):
    """Add a 'Stop Words Removed' column with stop words filtered out.

    Parameters:
        dataframe: DataFrame with a 'Remove Punct' column whose cells are
            lists of string tokens.

    Returns:
        The same DataFrame object, with 'Stop Words Removed' added; each cell
        keeps the tokens that are not in the module-level stop_words list.
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # scan of the stop-word list.
    blocked = set(stop_words)

    # Whole-column assignment: per-cell chained indexing (df[col][i] = ...) is
    # not guaranteed to write back and assumes index labels 0..n-1.
    dataframe['Stop Words Removed'] = dataframe['Remove Punct'].map(
        lambda tokens: [word for word in tokens if word not in blocked]
    )

    return dataframe

In [7]:
"""
LEMMATIZE INGREDIENTS
Purpose: Lemmatize ingredients, create column to maintain original data for comparison
"""
#Lemmatize the cleaned words
from nltk.stem import WordNetLemmatizer

def lemmatize_data(dataframe):
    """Add a 'Lemmatized' column with every token passed through WordNetLemmatizer.

    Parameters:
        dataframe: DataFrame with a 'Stop Words Removed' column whose cells
            are lists of string tokens.

    Returns:
        The same DataFrame object, with 'Lemmatized' added.
    """
    lemmatizer = WordNetLemmatizer()

    # Whole-column assignment: per-cell chained indexing (df[col][i] = ...) is
    # not guaranteed to write back and assumes index labels 0..n-1.
    dataframe['Lemmatized'] = dataframe['Stop Words Removed'].map(
        lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
    )

    return dataframe

In [8]:
"""
REMOVE NUMERICAL STRINGS
Purpose: Remove numerical strings caused by ingredient measurements
Source: Remove strings from a list that contains numbers in python. (n.d.). Stack Overflow. https://stackoverflow.com/questions/16084642/remove-strings-from-a-list-that-contains-numbers-in-python
"""

def remove_numerics(dataframe):
    """Add a 'Remove Numerics' column that keeps only purely alphabetic tokens.

    Parameters:
        dataframe: DataFrame with a 'Lemmatized' column whose cells are lists
            of string tokens.

    Returns:
        The same DataFrame object, with 'Remove Numerics' added. A token is
        kept only when str.isalpha() is true for it, so numbers and tokens
        that mix letters with digits are dropped.
    """
    # Whole-column assignment: per-cell chained indexing (df[col][i] = ...) is
    # not guaranteed to write back and fails outright when the index labels
    # are not 0..n-1.
    dataframe['Remove Numerics'] = dataframe['Lemmatized'].map(
        lambda tokens: [item for item in tokens if item.isalpha()]
    )

    return dataframe

In [9]:
"""
POS TAGGING
Remove excess columns up to this point

Purpose: Remove excess columns created up to this point, use pos-tagging to only maintain noun type words
Source: How to use pos_tag in NLTK? (n.d.). Stack Overflow. https://stackoverflow.com/questions/47519987/how-to-use-pos-tag-in-nltk
"""

from nltk import pos_tag
    
def pos_tagging(dataframe):
    """Trim the frame to the needed columns and keep only noun tokens.

    Parameters:
        dataframe: DataFrame with 'Title', 'Ingredients' and 'Remove Numerics'
            columns; 'Remove Numerics' cells are lists of string tokens.

    Returns:
        A new DataFrame with those three columns plus 'Ingredient Nouns',
        whose cells list the tokens that pos_tag labels NN, NNP, NNS or NNPS.
        The input frame is not modified.
    """
    noun_tags = {'NN', 'NNP', 'NNS', 'NNPS'}

    # Drop the intermediate preprocessing columns. Copy so the new column is
    # added to an independent frame rather than to a slice of the input.
    recipe_data = dataframe[["Title", "Ingredients", "Remove Numerics"]].copy()

    # Whole-column assignment: per-cell chained indexing (df[col][i] = ...) is
    # not guaranteed to write back and assumes index labels 0..n-1.
    recipe_data['Ingredient Nouns'] = recipe_data['Remove Numerics'].map(
        lambda tokens: [word for word, pos in pos_tag(tokens) if pos in noun_tags]
    )

    return recipe_data

In [10]:
"""
CONVERT LIST OF WORDS BACK INTO STRINGS OF RECIPE INGREDIENTS
Purpose: Ingredients exist as list of words, convert each row back to a string of all ingredients
"""

def convert_to_strings(dataframe):
    """Join each row's noun tokens into one space-separated string.

    Parameters:
        dataframe: DataFrame with 'Title', 'Ingredients' and 'Ingredient Nouns'
            columns; 'Ingredient Nouns' cells are lists of strings.

    Returns:
        A new DataFrame with 'Title', 'Ingredients' and 'Ingredient Input'
        columns. The input frame also gains the 'Ingredient Input' column.
    """
    # Whole-column assignment: per-cell chained indexing (df[col][i] = ...) is
    # not guaranteed to write back and fails when the index labels are not
    # 0..n-1.
    dataframe['Ingredient Input'] = dataframe['Ingredient Nouns'].map(
        lambda nouns: " ".join(nouns)
    )

    # Final frame used for similarity scoring; copied so it is independent of
    # the working frame.
    return dataframe[["Title", "Ingredients", "Ingredient Input"]].copy()

In [11]:
"""
COSINE SIMILARITY FUNCTIONS
Purpose: Turn text into word-count vectors and score two such vectors by cosine similarity
"""
# Calculate cosine similarity given 2 sentence strings. (n.d.). Stack Overflow. https://stackoverflow.com/questions/15173225/calculate-cosine-similarity-given-2-sentence-strings

import math
import re
from collections import Counter

# Matches runs of word characters; text_to_vector uses it to pull the words out of a string.
WORD = re.compile(r"\w+")

def get_cosine(vec1, vec2):
    """Return the cosine similarity of two term -> count mappings.

    Only terms present in both mappings contribute to the dot product. When
    either mapping has zero magnitude the result is 0.0.
    """
    shared_terms = set(vec1.keys()) & set(vec2.keys())

    dot_product = 0
    for term in shared_terms:
        dot_product += vec1[term] * vec2[term]

    squares1 = sum(vec1[term] ** 2 for term in vec1.keys())
    squares2 = sum(vec2[term] ** 2 for term in vec2.keys())
    denominator = math.sqrt(squares1) * math.sqrt(squares2)

    # Guard clause first: a zero magnitude would otherwise divide by zero.
    if denominator:
        return float(dot_product) / denominator
    return 0.0

def text_to_vector(text):
    """Count how often each word (as matched by WORD) occurs in *text*."""
    counts = Counter()
    for match in WORD.finditer(text):
        counts[match.group()] += 1

    return counts

In [14]:
def main():
    """Prompt for ingredients, score every recipe against them, report the best five.

    Steps:
        1. Read the ingredient list typed by the user.
        2. Load the recipe CSV and run the preprocessing pipeline.
        3. Compute the cosine similarity between the user's words and each
           recipe's 'Ingredient Input' string.
        4. Print the titles of the highest-scoring recipes and write their
           titles and ingredients to 'recipe_recommendations.xlsx'.

    Returns:
        None. Fewer than five recipes are reported when the dataset is smaller
        than that; nothing is exported when the dataset is empty.
    """
    top_n = 5

    user_ingredients = input("Enter ingredients, seperated by commas:")
    # The corpus is lower-cased during preprocessing and the word counts are
    # case-sensitive, so fold the query too or "Rice" never matches "rice".
    # NOTE(review): the query is not lemmatized, so plural input such as
    # "tomatoes" may still miss a lemmatized corpus term.
    ingredients = user_ingredients.lower()

    csv_path = 'Food Ingredients and Recipe Dataset.csv'

    # Data preprocessing
    data = read_data(csv_path)
    tokenized_recipes = tokenize_recipes(data)
    punct_removed = remove_pucntuation_from_recipes(tokenized_recipes)
    remove_stop = remove_stopwords(punct_removed)
    lemma_recipes = lemmatize_data(remove_stop)
    remove_num = remove_numerics(lemma_recipes)
    tagged = pos_tagging(remove_num)
    final_data = convert_to_strings(tagged)

    # Vectorize the user input once, then score it against every recipe.
    # Iterating the column values avoids label lookups that assume a 0..n-1 index.
    query_vector = text_to_vector(ingredients)
    sim_scores = [
        get_cosine(query_vector, text_to_vector(recipe_text))
        for recipe_text in final_data['Ingredient Input']
    ]

    # Rank row positions by score, best first. sorted() is stable even with
    # reverse=True, so equally scored recipes keep their dataset order.
    ranked_positions = sorted(
        range(len(sim_scores)), key=sim_scores.__getitem__, reverse=True
    )
    top_positions = ranked_positions[:top_n]

    if not top_positions:
        print('No recipes are available to suggest.')
        return

    # Select by row position so the export holds exactly the suggested rows in
    # rank order; matching on title would reorder them and pull in every row
    # that happens to share a title.
    output = final_data.iloc[top_positions][['Title', 'Ingredients']]
    recipes = [str(title) for title in output['Title']]

    # Print the suggestions (same layout as before when five are available).
    if len(recipes) == 1:
        print('Your suggested recipes are: ', recipes[0])
    else:
        print('Your suggested recipes are: ', ' , '.join(recipes[:-1]),
              ', and ', recipes[-1])

    output.to_excel('recipe_recommendations.xlsx', index=False) # export recipes with title and ingredients
    
# Run the recommender only when this module is the entry point, not when it is imported.
if __name__ == '__main__':
    main()

Enter ingredients, seperated by commas:strawberry, rice, tomato
Your suggested recipes are:  Parboiled Rice , Yogurt Granita , Paella with Tomatoes and Eggs , Sweet and Spicy Chicken Drumsticks , and  Stir-Fried Egg and Tomato
