In [1]:
import numpy as np
import pandas as pd
import re
import random
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

# import and run file
from importnb import imports
with imports("ipynb"):
    import data_preparation
%run data_preparation.ipynb

# Jaccard Distance

In [2]:
# add cuisine to the description
restaurants['Augmented Description'] = restaurants['Brief Description'] + ' ' + restaurants['Cuisine'] + '.'

# calculate jaccard distance between each restaurant
restaurants_jaccard = np.zeros((len(restaurants),len(restaurants)))

words_per_restaurant = [set(filter(None, re.split('[.!?,; ]', x.lower()))) for x in restaurants['Augmented Description']]
for i in range(0,len(restaurants)):
    for j in range(0,len(restaurants)):
        restaurants_jaccard[i,j] = len(words_per_restaurant[i].intersection(words_per_restaurant[j])) / len(words_per_restaurant[i].union(words_per_restaurant[j]))
        
restaurants_jaccard = pd.DataFrame(
    restaurants_jaccard,
    columns=restaurants['Restaurant Name'],
    index=restaurants['Restaurant Name']
)

In [3]:
# Recommendation using jaccard distance
def contentfilter_recommendation_jaccard(name, max_suggest):
    reviewer_restaurants = reviews[reviews['Reviewer Name'] == name]
    fav_restaurants = list(reviewer_restaurants[reviewer_restaurants['Rating'] == reviewer_restaurants['Rating'].max()]['Restaurant Name'])
    restaurant_to_input = random.sample(fav_restaurants, 1)[0]
    data_ret = restaurants_jaccard.loc[:,restaurant_to_input].sort_values(ascending=False)
    
    return( data_ret[~data_ret.index.isin(reviewer_restaurants['Restaurant Name'])].head(max_suggest) )

In [4]:
print( contentfilter_recommendation_jaccard('Calvin Smith', 3) )

Restaurant Name
Alcove         0.307692
Oceanique      0.206897
Kabul House    0.200000
Name: Barn Steakhouse, dtype: float64


# TF-IDF

In [5]:
def TD_IDF(word):
    word_count_by_restaurant = restaurants['Augmented Description'].apply(lambda x: list(filter(None, re.split('[.!?,; ]', x.lower()))).count(word.lower()))
    total_words_by_restaurant = restaurants['Augmented Description'].apply(lambda x: len(list(filter(None, re.split('[.!?,; ]', x.lower())))))
    total_doc = len(restaurants['Augmented Description'])
    docs_with_word = sum(word_count_by_restaurant >= 1)
    
    restaurants_tdidf = pd.DataFrame(
        list(word_count_by_restaurant / total_words_by_restaurant * np.log(total_doc / docs_with_word)),
        columns=[word.lower()], index=restaurants['Restaurant Name']
    )
    
    return( restaurants_tdidf )

In [6]:
# top 100 most common words
descriptions_all_words = re.split('[.!?,; ]', " ".join(list(restaurants['Augmented Description'])).lower())
most_pop_words = list(pd.Series(filter(None, descriptions_all_words)).value_counts().sort_values(ascending=False).head(100).index)

# calculate tf-idf for each word
restaurants_td_idf = np.zeros((len(restaurants),len(most_pop_words)))

for j in range(0,len(most_pop_words)):
    cur_word_td_idf = TD_IDF(most_pop_words[j])
    for i in range(0,len(restaurants)): 
        restaurants_td_idf[i,j] = cur_word_td_idf.iloc[i]
        
restaurants_td_idf = pd.DataFrame(
    restaurants_td_idf,
    columns=most_pop_words,
    index=restaurants['Restaurant Name']
)

  restaurants_td_idf[i,j] = cur_word_td_idf.iloc[i]


In [7]:
restaurants_td_idf_euclidean = pd.DataFrame(
    euclidean_distances(restaurants_td_idf, restaurants_td_idf),
    columns=restaurants['Restaurant Name'], index=restaurants['Restaurant Name']
)

In [8]:
def contentfilter_recommendation_td_idf(name, max_suggest):
    reviewer_restaurants = reviews[reviews['Reviewer Name'] == name]
    fav_restaurants = list(reviewer_restaurants[reviewer_restaurants['Rating'] == reviewer_restaurants['Rating'].max()]['Restaurant Name'])
    restaurant_to_input = random.sample(fav_restaurants, 1)[0]
    data_ret = restaurants_td_idf_euclidean.loc[:,restaurant_to_input].sort_values(ascending=True)
    
    return( data_ret[data_ret.index != restaurant_to_input].head(max_suggest) )

In [9]:
print( contentfilter_recommendation_td_idf('Calvin Smith', 3) )

Restaurant Name
Alcove             0.236856
Claire's Korner    0.329290
Oceanique          0.414726
Name: Barn Steakhouse, dtype: float64
