In [43]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import random
import string
from sklearn import linear_model
import pandas as pd
import matplotlib.pyplot as plt

In [44]:
# Import nltk package 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import re

from sklearn.feature_extraction.text import TfidfVectorizer

In [45]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bblu2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bblu2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bblu2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
import warnings
warnings.filterwarnings("ignore")

In [47]:
# import the datasets
dfInteract = pd.read_csv("RAW_interactions.csv")
dfRecipes = pd.read_csv("RAW_recipes.csv")

In [48]:
# correcting data type
dfInteract['date'] = pd.to_datetime(dfInteract.date)

# dropping columns with nan values in review
dfInteract.dropna(axis = 0, inplace = True)

In [49]:
# correcting data types
dfRecipes['submitted'] = pd.to_datetime(dfRecipes.submitted)
dfRecipes['n_steps'] = dfRecipes.n_steps.astype('int')

# dropping columns with nan values in description and name
dfRecipes.dropna(axis=0, inplace=True)

# dropping an outlier
dfRecipes = dfRecipes[dfRecipes.minutes < max(dfRecipes.minutes)]

In [50]:
dfInteractionsNew = dfInteract[['recipe_id', 'review']]
dfInteractionsNew.rename(columns={'recipe_id': 'id'}, inplace=True)

In [51]:
# clean the interactions dataset, combining all the reviews for a recipe into one review so we can run through it with tfidf
combinedReviewsDict = {}
for i in range(0,len(dfInteract)):
    row = dfInteractionsNew.iloc[i,:]
    foodID = row['id']
    foodReview = row['review']
    if foodID not in combinedReviewsDict:
        combinedReviewsDict[foodID] = ""
    if foodReview == foodReview:
        combinedReviewsDict[foodID] = combinedReviewsDict[foodID] + " " + foodReview

combinedReviewsList = []
for id in combinedReviewsDict:
    combinedReviewsList.append([id, combinedReviewsDict[id]])

dfInteractionsNewNew = pd.DataFrame(combinedReviewsList, columns=["id", "review"])

In [52]:
dfRecipesNew = dfRecipes[['name', 'id', 'description', 'ingredients', 'tags']]

In [53]:
merged_df = pd.merge(dfRecipesNew, dfInteractionsNewNew, on='id', how='inner')

In [54]:
merged_df.iloc[0]

name                  arriba   baked winter squash mexican style
id                                                        137739
description    autumn is my favorite time of year to cook! th...
ingredients    ['winter squash', 'mexican seasoning', 'mixed ...
tags           ['60-minutes-or-less', 'time-to-make', 'course...
review           I used an acorn squash and recipe#137681 Swe...
Name: 0, dtype: object

In [55]:
stop_words = set(stopwords.words('english'))

def preprocess(string): 
    string = string.replace('\n', ' ')
    string = re.sub(r'[^\w\s]|_', ' ', string)
    string = re.sub(r'\s+', ' ', string).lower().strip()

    string = word_tokenize(string)
    string = [word for word in string if word.lower() not in stop_words]
    
    return string

In [56]:
def preprocesslist(string): 
    string = string.replace("'", '')
    string = string.replace("[", '')
    string = string.replace("]", '')
    string = string.replace(",", '')

    string = word_tokenize(string)
    
    return string

In [57]:
merged_df['description'] = merged_df.description.transform(preprocess)
merged_df['review'] = merged_df.review.transform(preprocess)
merged_df['ingredients'] = merged_df.ingredients.transform(preprocesslist)
merged_df['tags'] = merged_df.tags.transform(preprocesslist)

In [58]:
merged_df.iloc[0]

name                  arriba   baked winter squash mexican style
id                                                        137739
description    [autumn, favorite, time, year, cook, recipe, p...
ingredients    [winter, squash, mexican, seasoning, mixed, sp...
tags           [60-minutes-or-less, time-to-make, course, mai...
review         [used, acorn, squash, recipe, 137681, sweet, m...
Name: 0, dtype: object

In [59]:
flavor_map = {
    "Malty": ["pork", "chicken", "sausage", "beet", "carrot", "caramel", "honey", "syrup", "brown sugar", "raisin", "date", "nutmeg", "cinnamon", "allspice", "gouda", "swiss", "mild", "cheddar"],
    "Hoppy": ["beef", "lamb", "sausage", "arugula", "endive", "lime", "lemon", "zest", "chili", "pepper", "jalapeno", "coriander", "parsley", "blue", "sharp", "cheddar", "parmesan"],
    "Crisp": ["shrimp", "fish", "oyster", "lobster", "clam", "lettuce", "vinaigrette", "grill", "chicken", "cucumber", "lime", "avocado", "mint", "dill", "pretzel", "cracker", "popcorn", "brie", "mozarella", "ricotta"],
    "Roasty": ["smoke", "brisket", "roast", "beef", "mushroom", "bbq", "dark", "chocolate", "coffee", "molasses", "toast", "nut", "pecan", "hazelnut", "clove", "star", "anise", "cardamom", "cheddar", "age", "gruyere"],
    "Fruity/Spicy": ["roast", "chicken", "salmon", "scallops", "pasta", "sphagetti", "fettucine", "orange", "lemon", "berry", "berries", "raspeberries", "raspberry", "blackberry", "blackberries", "ginger", "basil", "thyme", "cinnamon", "feta", "goat", "cheese"],
    "Sour": ["salami", "prosciutto", "shellfish", "clam", "oyster", "tart", "cranberry", "cranberries", "cherry", "cherries", "passionfruit", "nut", "pickle", "olive", "chevre", "feta"],
    "Rich/Hearty": ["duck", "short", "rib" "lamb", "roast", "vegetable", "butternut", "squash", "carrot", "caramel", "toffee", "fig", "prune", "rosemary", "sage", "bay", "leaf", "stilton", "age", "cheddar", "camembert"],
    "Smoky": ["smoke", "smoked", "salmon", "charred", "char" "vegetables", "eggplant", "bacon", "bell", "peppers", "bbq", "paprika", "chipotle", "thyme", "toast", "almonds", "peanuts", "gouda", "manchego"],
    "Specialty": ["turkey", "holiday", "fall", "winter", "ham", "squash", "spice", "pumpkin", "cranberry", "cranberries", "chocolate", "coffee", "cinnamon", "nutmeg", "clove", "vanilla", "cream", "cheese", "dessert", "havarti"]
}

In [60]:
flavor_dict = {}

In [61]:
def labelrecipe(row):
    flavor_count = {flavor: 0 for flavor in flavor_map.keys()}
    for word in row['ingredients']:
        for flavor, thing in flavor_map.items():
            if word.lower() in thing:
                flavor_dict[flavor] += 1
    for word in row['description']:
        for flavor, thing in flavor_map.items():
            if word.lower() in thing:
                flavor_dict[flavor] += 1
    for word in row['tags']:
        for flavor, thing in flavor_map.items():
            if word.lower() in thing:
                flavor_dict[flavor] += 1

    return max(flavor_count, key=flavor_count.get)

In [62]:
merged_df['Flavor'] = merged_df.apply(labelrecipe)

KeyError: 'ingredients'