# Import and loading data

In [92]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import sent_tokenize, word_tokenize
from random import randint
import random
import gensim
from gensim.models import Word2Vec

In [93]:
file_name_filtered = "filtered_df"
file_name_recipes = "RAW_recipes"
csv_extension = ".csv"
directory = "datasets/" 

In [94]:
id = 'id'
tags = 'tags'
tags_str = 'tags_string'
tags_final = 'tags_final'
tags_vec = 'tags_vectorized'
FILTERED_COLUMNS = ["user_id", "recipe_id", "date", "rating", "review"]
RECIPE_COLUMNS = ["name", id, tags]

In [95]:
df_filtered = pd.read_csv(directory + file_name_filtered + csv_extension)
df_recipe = pd.read_csv(directory + file_name_recipes + csv_extension)
df_filtered = df_filtered[FILTERED_COLUMNS]
df_recipe = df_recipe[RECIPE_COLUMNS]

# Fixing tags datatype and structure

In [96]:
forbidden_characters = [',', '[', ']', ' ', '', '\'']
all_tags = []
df_recipe[tags_str] = ''
for id, value in df_recipe.iterrows():
    curr_tag = value[tags]
    curr_word = str()
    for curr_char in curr_tag:
        if curr_char not in forbidden_characters:
            curr_word+=curr_char
        else:
            if curr_word != '':
                all_tags.append(curr_word.lower())
                # this method might create an error if you have a different
                # version of pandas than me:
                df_recipe.set_value(id, tags_str, all_tags)
                curr_word = str()
    all_tags = []

  from ipykernel import kernelapp as app


# Remove most 15% frequent and less 15% frequent tags

In [97]:
df_recipe.head()

Unnamed: 0,name,id,tags,tags_string
0,arriba baked winter squash mexican style,137739,"['60-minutes-or-less', 'time-to-make', 'course...","[60-minutes-or-less, time-to-make, course, mai..."
1,a bit different breakfast pizza,31490,"['30-minutes-or-less', 'time-to-make', 'course...","[30-minutes-or-less, time-to-make, course, mai..."
2,all in the kitchen chili,112140,"['time-to-make', 'course', 'preparation', 'mai...","[time-to-make, course, preparation, main-dish,..."
3,alouette potatoes,59389,"['60-minutes-or-less', 'time-to-make', 'course...","[60-minutes-or-less, time-to-make, course, mai..."
4,amish tomato ketchup for canning,44061,"['weeknight', 'time-to-make', 'course', 'main-...","[weeknight, time-to-make, course, main-ingredi..."


In [98]:
tag_frequency = {}
for index, value in df_recipe.iterrows():
    for tag in value[tags_str]:
        if tag in tag_frequency:
            tag_frequency[tag]+=1
        else:
            tag_frequency[tag]=1
# remove tags that appear less than 5 times:
tag_filter1 = {key:val for key, val in tag_frequency.items() if (val >= 5)}
tag_value = list(tag_filter1.values())
tag_value = sorted(tag_value, reverse=True)
#remove the top 15% most frequent tags:
threshold = tag_value[int(len(tag_frequency) * 0.15)]
tag_filter2 = {key:val for key, val in tag_filter1.items() if (val <= threshold)}   #left with 413 tags

In [99]:
#now remove the tags we removed in the tags_normalized
new_tag = []
df_recipe[tags_final] = ''
for index, value in df_recipe.iterrows():
    for tag in value[tags_str]:
        if tag in tag_filter2:
            new_tag.append(tag)
    df_recipe.set_value(index, tags_final, new_tag)
    new_tag = []
df_recipe.head()

  


Unnamed: 0,name,id,tags,tags_string,tags_final
0,arriba baked winter squash mexican style,137739,"['60-minutes-or-less', 'time-to-make', 'course...","[60-minutes-or-less, time-to-make, course, mai...","[mexican, fall, winter, christmas, squash]"
1,a bit different breakfast pizza,31490,"['30-minutes-or-less', 'time-to-make', 'course...","[30-minutes-or-less, time-to-make, course, mai...","[pizza, northeastern-united-states]"
2,all in the kitchen chili,112140,"['time-to-make', 'course', 'preparation', 'mai...","[time-to-make, course, preparation, main-dish,...","[chili, crock-pot-slow-cooker]"
3,alouette potatoes,59389,"['60-minutes-or-less', 'time-to-make', 'course...","[60-minutes-or-less, time-to-make, course, mai...","[easter, christmas, new-years, thanksgiving, i..."
4,amish tomato ketchup for canning,44061,"['weeknight', 'time-to-make', 'course', 'main-...","[weeknight, time-to-make, course, main-ingredi...","[canning, heirloom-historical, amish-mennonite..."


In [100]:
df_full = pd.merge(df_filtered, df_recipe, left_on='recipe_id', right_on='id', how='left')
del df_full[tags_str]
del df_full[tags]
del df_full['id']

In [101]:
#remove entries with empty tags:
df_full = df_full[df_full[tags_final].map(lambda d: len(d)) > 0]
df_full.head()

Unnamed: 0,user_id,recipe_id,date,rating,review,name,tags_final
1,124416,120345,2011-08-06,0,"Just an observation, so I will not rate. I fo...",sugared raspberries,"[jams-and-preserves, gifts, northeastern-unite..."
2,76535,134728,2005-09-02,4,Very good!,kfc honey bbq strips,[high-protein]
3,255338,134728,2008-04-11,5,First time using liquid smoke in a recipe. Mad...,kfc honey bbq strips,[high-protein]
4,136726,197160,2006-11-25,5,I used this mix to make meat balls.Very simple...,mexican hots,"[mexican, high-protein]"
5,900992,225241,2009-02-19,3,"The ""cornbread"" on top is bland. I'd us a real...",chicken tamale pie for 2 ww core,"[casseroles, southwestern-united-states, tex-m..."


# Suggest recipe with same tag

In [108]:
#input_userID = 124416
input_userID = 76535
rating_threshold = 5
df_user = df_full[df_full['user_id'] == input_userID]
df_user = df_user[df_user['rating'] >= rating_threshold]
df_user.head()

Unnamed: 0,user_id,recipe_id,date,rating,review,name,tags_final
960,76535,79378,2006-05-29,5,Frosty and delicious! I added more sugar to mi...,slushy lemonade,"[punch, food-processor-blender, independence-d..."
1481,76535,133567,2005-08-22,5,This is delicious.,beef stew with cinnamon,[stews]
14965,76535,134672,2005-10-31,5,This took a lot of time and concentration on m...,mushroom pie,"[scandinavian, finnish, mushrooms]"
18098,76535,325569,2008-09-22,5,This is a must-try. I did a small-batch just ...,buffet style fluffy oven scrambled eggs for a ...,"[very-low-carbs, omelets-and-frittatas, casser..."
18212,76535,183221,2006-09-11,5,I needed a pick me up after a particularly dis...,good every dang time seafood stuffed shells,[pasta-shells]


In [109]:
tags_user_stats = {}
for index, value in df_user.iterrows():
    for tags in value[tags_final]:
        if tags in tags_user_stats:
            tags_user_stats[tags]+=1
        else:
            tags_user_stats[tags]=1
top_tags_user = {}
#keep only the top 10% most used tags from this user:
for i in range(0, (int(0.1 * len(tags_user_stats)))):
    max_key = max(tags_user_stats, key=tags_user_stats.get)
    top_tags_user[max_key] = tags_user_stats[max_key]
    tags_user_stats.pop(max_key)
print(top_tags_user)

{'novelty': 13, 'southern-united-states': 10, 'christmas': 9, 'copycat': 8, 'sauces': 7, 'fish': 6, 'very-low-carbs': 5, 'turkey': 5, 'microwave': 5, 'served-cold': 5, 'mexican': 5, 'ground-beef': 5, 'picnic': 5}


In [110]:
# selects one tag based on roulette wheel selection
def selectOne(tags_dict):
    sum_values = sum(tags_dict.values())
    random_val = randint(1, sum_values)
    keep_track_sum = 0
    for i in range(0, len(tags_dict)):
        keep_track_sum+=list(top_tags_user.values())[i]
        if random_val < keep_track_sum:
            return  list(top_tags_user.keys())[i]
    tag_selected = ''
    return tag_selected

In [111]:
def findRecipeWithTag(given_tag):
    recipe_id = ''
    recipe_name = ''
    recipe = {}
    for index, value in df_recipe.iterrows():
        if given_tag in value[tags_final]:
            recipe[value['name']] = value['id']
    return recipe

In [114]:
tag_sugg = selectOne(top_tags_user)
recipe, id_recipe = random.choice(list(findRecipeWithTag(tag_sugg).items()))
print('Because you enjoyed', top_tags_user[tag_sugg], 'recipes with the tag', tag_sugg, 'we believe you will also like', recipe, 'which also has the tag', tag_sugg)

Because you enjoyed 7 recipes with the tag sauces we believe you will also like anything brandy cream sauce which also has the tag sauces


# Suggest recipe with tags similarity (Word2Vec)

In [116]:
model1 = gensim.models.Word2Vec(df_recipe.tags_final, min_count = 1)

In [125]:
model1.wv.most_similar(tag_sugg)  #sauces is a good example

[('garnishes', 0.5152801275253296),
 ('salad-dressings', 0.4261131286621094),
 ('dips', 0.4152565002441406),
 ('marinades-and-rubs', 0.3975967466831207),
 ('canning', 0.390713632106781),
 ('jams-and-preserves', 0.3815482556819916),
 ('spreads', 0.37519463896751404),
 ('spicy', 0.35800784826278687),
 ('herb-and-spice-mixes', 0.35016003251075745),
 ('jellies', 0.3397112488746643)]

In [131]:
tag_sugg = selectOne(top_tags_user)
#tag_sugg = "sauces"
tag_most_similar = model1.wv.most_similar(tag_sugg)[0][0] 
recipe, id_recipe = random.choice(list(findRecipeWithTag(tag_most_similar).items()))
print('Because you enjoyed', top_tags_user[tag_sugg], 'recipes with the tag \"', tag_sugg, '\" we believe you will also like \"', recipe, '\" which has a similar tag: \"', tag_most_similar, '\"')

Because you enjoyed 10 recipes with the tag " southern-united-states " we believe you will also like " red beans   rice  pressure cooker " which has a similar tag: " creole "


# Two tags suggestion

In [136]:
def findRecipeWithTwoTag(given_tag1, given_tag2):
    recipe_id = ''
    recipe_name = ''
    recipe = {}
    for index, value in df_recipe.iterrows():
        if (given_tag1 in value[tags_final]) and (given_tag2 in value[tags_final]):
            recipe[value['name']] = value['id']
    return recipe

In [138]:
tag_sugg = selectOne(top_tags_user)
tag_most_similar = model1.wv.most_similar(tag_sugg)[0][0] 
recipe, id_recipe = random.choice(list(findRecipeWithTwoTag(tag_sugg, tag_most_similar).items()))
print('Because you enjoyed', top_tags_user[tag_sugg], 'recipes with the tag \"', tag_sugg, '\" we believe you will also like \"', recipe, '\" which has the tag: \"', tag_sugg,'\" and the similar tag: \"', tag_most_similar, '\"')

Because you enjoyed 6 recipes with the tag " fish " we believe you will also like " smoked trout pate " which has the tag: " fish " and the similar tag: " trout "
