In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
import os
import pandas as pd
import nltk
import string
import ast
import re
import unidecode

# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from collections import Counter



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
def ingredient_parser(ingreds):
    """

    This function takes in a list (but it is a string as it comes from pandas dataframe) of
       ingredients and performs some preprocessing.
       For example:

       input = '['1 x 1.6kg whole duck', '2 heaped teaspoons Chinese five-spice powder', '1 clementine',
                 '6 fresh bay leaves', 'GRAVY', '', '1 bulb of garlic', '2 carrots', '2 red onions',
                 '3 tablespoons plain flour', '100 ml Marsala', '1 litre organic chicken stock']'

       output = ['duck', 'chinese five spice powder', 'clementine', 'fresh bay leaf', 'gravy', 'garlic',
                 'carrot', 'red onion', 'plain flour', 'marsala', 'organic chicken stock']

    """
    measures = [
        "teaspoon",
        "t",
        "tsp.",
        "tablespoon",
        "T",
        "tbl.",
        "tb",
        "tbsp.",
        "fluid ounce",
        "fl oz",
        "gill",
        "cup",
        "c",
        "pint",
        "p",
        "pt",
        "fl pt",
        "quart",
        "q",
        "qt",
        "fl qt",
        "gallon",
        "g",
        "gal",
        "ml",
        "milliliter",
        "millilitre",
        "cc",
        "mL",
        "l",
        "liter",
        "litre",
        "L",
        "dl",
        "deciliter",
        "decilitre",
        "dL",
        "bulb",
        "level",
        "heaped",
        "rounded",
        "whole",
        "pinch",
        "medium",
        "slice",
        "pound",
        "lb",
        "#",
        "ounce",
        "oz",
        "mg",
        "milligram",
        "milligramme",
        "g",
        "gram",
        "gramme",
        "kg",
        "kilogram",
        "kilogramme",
        "x",
        "of",
        "mm",
        "millimetre",
        "millimeter",
        "cm",
        "centimeter",
        "centimetre",
        "m",
        "meter",
        "metre",
        "inch",
        "in",
        "milli",
        "centi",
        "deci",
        "hecto",
        "kilo",
    ]
    words_to_remove = [
        "fresh",
        "minced",
        "chopped" "oil",
        "a",
        "red",
        "bunch",
        "and",
        "clove",
        "or",
        "leaf",
        "chilli",
        "large",
        "extra",
        "sprig",
        "ground",
        "handful",
        "free",
        "small",
        "pepper",
        "virgin",
        "range",
        "from",
        "dried",
        "sustainable",
        "black",
        "peeled",
        "higher",
        "welfare",
        "seed",
        "for",
        "finely",
        "freshly",
        "sea",
        "quality",
        "white",
        "ripe",
        "few",
        "piece",
        "source",
        "to",
        "organic",
        "flat",
        "smoked",
        "ginger",
        "sliced",
        "green",
        "picked",
        "the",
        "stick",
        "plain",
        "plus",
        "mixed",
        "mint",
        "bay",
        "your",
        "cumin",
        "optional",
        "fennel",
        "serve",
        "mustard",
        "unsalted",
        "baby",
        "paprika",
        "fat",
        "ask",
        "natural",
        "skin",
        "roughly",
        "into",
        "such",
        "cut",
        "good",
        "firmly",
        "grated",
        "trimmed",
        "powder",
        "yellow",
        "dusting",
        "knob",
        "frozen",
        "on",
        "deseeded",
        "low",
        "runny",
        "balsamic",
        "cooked",
        "streaky",
        "nutmeg",
        "sage",
        "rasher",
        "zest",
        "pin",
        "halved",
        "grating",
        "stalk",
        "light",
        "tinned",
        "dry",
        "soft",
        "rocket",
        "bone",
        "colour",
        "washed",
        "skinless",
        "leftover",
        "splash",
        "removed",
        "dijon",
        "thick",
        "big",
        "hot",
        "drained",
        "sized",
        "english",
        "raw",
        "flake",
        "cider",
        "cayenne",
        "tbsp",
        "leg",
        "pine",
        "wild",
        "if",
        "fine",
        "chipped",
   
        "shoulder",
        "cube",
        "dressing",
        "with",
        "chunk",
        "spice",
        "thumb",
       
        "new",
        "little",
        "punnet",
       
        "shelled",
    
        "other" "chopped",
        "salt",
      
        "taste",
        "can",
        "sauce",
        "water",
        "diced",
        "package",
        "italian",
        "shredded",
        "divided",
      
      
        "all",
        "purpose",
        "crushed",
        "juice",
        "more",
       
        "bell",
        "needed",
        "thinly",
        "boneless",
        "half",
        "thyme",
        "cubed",
       
        
        "jar",
        "seasoning",
      
        "extract",
        "sweet",
        "baking",
        "beaten",
        "heavy",
        "seeded",
        "tin",
       
        "uncooked",
        "crumb",
        "style",
        "thin",
        "nut",
        "coarsely",
        "spring",
        "chili",
       
        "strip",
        
        "rinsed",
        
       
        "root",
        "quartered",
        "head",
        "softened",
        "container",
        "crumbled",
        "frying",
        "lean",
        "cooking",
        "roasted",
        "warm",
        "whipping",
        "thawed",
       
        "pitted",
        "sun",
        "kosher",
        "bite",
        "toasted",
       
        "split",
        "melted",
        "degree",
        "lengthwise",
     
        "packed",
        "pod",
        "anchovy",
        "rom",
        "prepared",
        "juiced",
        "fluid",
        "floret",
        "room",
        "active",
        "seasoned",
        "mix",
        "deveined",
        "lightly",
        "anise",
     
        "size",
        "unsweetened",
        "torn",
        "wedge",
        "sour",
        "one",

      
        "dark",
        "temperature",
        "garnish",
        "at",
        "loaf",
        "shell",
       
      
    
        "round",
        "canned",
       
        "crust",
        "long",
        "broken",
        "ketchup",
        "bulk",
        "cleaned",
        "condensed",
        "sherry",
        "up",
        "cold",
        "soda",
       
        "spray",
        "partially",

        
        "shortening",
        "part",
        "bottle",
        "sodium",
       
        "grain",
        "french",
        "roast",
        "stem",
        "link",
        "firm",
        
        "mild",
        "dash",
        "boiling",
        "oil",
        "chopped",
        "vegetable oil",
        "chopped oil",
        "garlic",
        "skin off",
        "bone out" "from sustrainable sources",
    ]
    # The ingredient list is now a string so we need to turn it back into a list. We use ast.literal_eval
    if isinstance(ingreds, list):
        ingredients = ingreds
    else:
        ingredients = ast.literal_eval(ingreds)
    # We first get rid of all the punctuation. We make use of str.maketrans. It takes three input
    # arguments 'x', 'y', 'z'. 'x' and 'y' must be equal-length strings and characters in 'x'
    # are replaced by characters in 'y'. 'z' is a string (string.punctuation here) where each character
    #  in the string is mapped to None.
    translator = str.maketrans("", "", string.punctuation)
    lemmatizer = WordNetLemmatizer()
    ingred_list = []
    for i in ingredients:
        i.translate(translator)
        # We split up with hyphens as well as spaces
        items = re.split(" |-", i)
        # Get rid of words containing non alphabet letters
        items = [word for word in items if word.isalpha()]
        # Turn everything to lowercase
        items = [word.lower() for word in items]
        # remove accents
        items = [
            unidecode.unidecode(word) for word in items
        ]  #''.join((c for c in unicodedata.normalize('NFD', items) if unicodedata.category(c) != 'Mn'))
        # Lemmatize words so we can compare words to measuring words
        items = [lemmatizer.lemmatize(word) for word in items]
        # Gets rid of measuring words/phrases, e.g. heaped teaspoon
        items = [word for word in items if word not in measures]
        # Get rid of common easy words
        items = [word for word in items if word not in words_to_remove]
        if items:
            ingred_list.append(" ".join(items))
    # ingred_list = " ".join(ingred_list)
    return ingred_list

In [5]:
import nltk
nltk.download('wordnet')
data_path = 'C:/Users/Hp/OneDrive/Desktop/ISTE 612/project/top_5000_rows.csv'
recipe_df = pd.read_csv(data_path)

recipe_df["ingredients_parsed"] = recipe_df["ingredients"].apply(
        lambda x: ingredient_parser(x))

df = recipe_df[["title", "ingredients", "directions", "link", "source", "NER"]]

df = recipe_df.dropna()

df.to_csv('C:/Users/Hp/OneDrive/Desktop/ISTE 612/project/final.csv', index=False)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
print(recipe_df.head())

   Unnamed: 0                  title  \
0           0    No-Bake Nut Cookies   
1           1  Jewell Ball'S Chicken   
2           2            Creamy Corn   
3           3          Chicken Funny   
4           4   Reeses Cups(Candy)     

                                         ingredients  \
0  ["1 c. firmly packed brown sugar", "1/2 c. eva...   
1  ["1 small jar chipped beef, cut up", "4 boned ...   
2  ["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg...   
3  ["1 large whole chicken", "2 (10 1/2 oz.) cans...   
4  ["1 c. peanut butter", "3/4 c. graham cracker ...   

                                          directions  \
0  ["In a heavy 2-quart saucepan, mix brown sugar...   
1  ["Place chipped beef on bottom of baking dish....   
2  ["In a slow cooker, combine all ingredients. C...   
3  ["Boil and debone chicken.", "Put bite size pi...   
4  ["Combine first four ingredients and press in ...   

                                              link    source  \
0   www.cookbooks.com

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

newFile = 'C:/Users/Hp/OneDrive/Desktop/ISTE 612/project/final.csv'

parsed_df = pd.read_csv(newFile)

# Tfidf needs unicode or string types
#parsed_df['ingredients_parsed'] = parsed_df.ingredients_parsed.values.astype('U')

# TF-IDF feature extractor
tfidf = TfidfVectorizer()
tfidf.fit(parsed_df['ingredients_parsed'])
tfidf_recipe = tfidf.transform(parsed_df['ingredients_parsed'])

In [8]:
print(tfidf_recipe.shape)
print(tfidf_recipe)

(5000, 1643)
  (0, 1556)	0.24300200953020426
  (0, 1415)	0.16600397458663219
  (0, 1179)	0.37502130111994714
  (0, 899)	0.22569224037848684
  (0, 853)	0.28488368673569237
  (0, 515)	0.45930335246882414
  (0, 192)	0.2250603914081592
  (0, 174)	0.3072210780253344
  (0, 117)	0.5335348957696217
  (1, 1338)	0.2738798331522013
  (1, 933)	0.30819379848125317
  (1, 375)	0.4093382183942384
  (1, 263)	0.27290934150555957
  (1, 233)	0.4273703538537212
  (1, 163)	0.386301334874732
  (1, 139)	0.5060582902907501
  (2, 375)	0.5300725364961731
  (2, 351)	0.8479523017555369
  (3, 1498)	0.38593551146989197
  (3, 1408)	0.3485211176181426
  (3, 1400)	0.4208537034883156
  (3, 1338)	0.2043598735684727
  (3, 933)	0.22996379458583743
  (3, 637)	0.4338097196805301
  (3, 375)	0.1527171708756353
  :	:
  (4998, 1490)	0.12143686301448937
  (4998, 1338)	0.11880292894633826
  (4998, 1226)	0.2162661608120122
  (4998, 1052)	0.24246074673828202
  (4998, 975)	0.09652214321576173
  (4998, 973)	0.16396423557472165
  (4998

In [9]:
# calculate cosine similarity between actual recipe ingreds and test ingreds

from sklearn.metrics.pairwise import cosine_similarity

cos_sim = map(lambda x: cosine_similarity(tfidf_recipe, x), tfidf_recipe)
scores = list(cos_sim)

In [10]:
print(scores[1].shape)

(5000, 1)


In [25]:
# neaten the ingredients being outputted

def ingredient_parser_final(ingredient):
    if isinstance(ingredient, list):
        ingredients = ingredient
    else:
        ingredients = ast.literal_eval(ingredient)

    ingredients = ",".join(ingredients)
    ingredients = unidecode.unidecode(ingredients)
    return ingredients


def title_parser(title):
    title = unidecode.unidecode(title)
    return title


In [26]:
# Top-N recomendations order by score
def get_recommendations(N, scores):
    
    # order the scores with and filter to get the highest N scores
    top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:N]
    # create dataframe to load in recommendations
    recommendation = pd.DataFrame(columns=["recipe", "ingredients", "score", "url"])
    count = 0
    for i in top:
        recommendation.at[count, "recipe"] = title_parser(parsed_df["title"][i])
        recommendation.at[count, "ingredients"] = ingredient_parser_final(
            parsed_df["ingredients_parsed"][i]
        )
        recommendation.at[count, "url"] = parsed_df["link"][i]
        recommendation.at[count, "score"] = "{:.3f}".format(float(scores[i]))
        count += 1
    return recommendation

In [27]:
def RecSys(ingredients, N=5):
    """
    The reccomendation system takes in a list of ingredients and returns a list of top 5 
    recipes based of of cosine similarity. 
    :param ingredients: a list of ingredients
    :param N: the number of reccomendations returned 
    :return: top 5 reccomendations for cooking recipes
    """

    # load in tdidf model and encodings
#     with open(config.TFIDF_ENCODING_PATH, "rb") as f:
#         tfidf_encodings = pickle.load(f)

#     with open(config.TFIDF_MODEL_PATH, "rb") as f:
#         tfidf = pickle.load(f)

    # parse the ingredients using my ingredient_parser
    try:
        ingredients_parsed = ingredient_parser(ingredients)
    except:
        ingredients_parsed = ingredient_parser([ingredients])

    # use our pretrained tfidf model to encode our input ingredients
    ingredients_parsed = " ".join(ingredients_parsed)
    ingredients_tfidf = tfidf.transform([ingredients_parsed])

    # calculate cosine similarity between actual recipe ingreds and test ingreds
    cos_sim = map(lambda x: cosine_similarity(ingredients_tfidf, x), tfidf_recipe)
    scores = list(cos_sim)

    # Filter top N recommendations
    recommendations = get_recommendations(N, scores)
    return recommendations

In [42]:
# test ingredients
test_ingredients = "cherry , ale , jell o cherry flavor gelatin  , almond , miniature marshmallow"
recs = RecSys(test_ingredients)
print(recs)
# print(recs.scores)

                  recipe                                        ingredients  \
0  Double Cherry Delight  cherry,ale,jell o cherry flavor gelatin,almond...   
1     Cherry Jello Salad  box cherry jell o,cherry pie filling,cream che...   
2           Cherry Salad     cherry,box cherry jell o,sugar,pineapple,pecan   
3        Cranberry Punch               cranberry,ale,cherry jell o,lemonade   
4            Penny Punch  cherry flavored gelatin,lemon flavored gelatin...   

   score                                              url  
0  1.000  www.cookbooks.com/Recipe-Details.aspx?id=703381  
1  0.517  www.cookbooks.com/Recipe-Details.aspx?id=662334  
2  0.508  www.cookbooks.com/Recipe-Details.aspx?id=677459  
3  0.495  www.cookbooks.com/Recipe-Details.aspx?id=321508  
4  0.454  www.cookbooks.com/Recipe-Details.aspx?id=138148  


  recommendation.at[count, "score"] = "{:.3f}".format(float(scores[i]))
  recommendation.at[count, "score"] = "{:.3f}".format(float(scores[i]))
  recommendation.at[count, "score"] = "{:.3f}".format(float(scores[i]))
  recommendation.at[count, "score"] = "{:.3f}".format(float(scores[i]))
  recommendation.at[count, "score"] = "{:.3f}".format(float(scores[i]))
