### Yelp Content-Based Recommender Engine

In [1]:
import pandas as pd, numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle

#### Text Preprocessing

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
from nltk.stem import PorterStemmer 

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bentleyou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def preprocessing(texts):
    
    lemmatizer = WordNetLemmatizer()
    ps = PorterStemmer() 

    return [''.join([ps.stem(lemmatizer.lemmatize(w)) for w in texts])]

#### Using LSA transformer

In [18]:
with open("Tfidf.pkl", "rb") as fp:   # Unpickling
    Tfidf_vectorizer = pickle.load(fp)
       
with open("lsa_model.pkl", "rb") as fp:   # Unpickling
    lsa = pickle.load(fp)

def text_transformer_lsa(preprocessed_texts):
    
    vectorized_text = Tfidf_vectorizer.transform(preprocessed_texts)
    
    compressed_vector = lsa.transform(vectorized_text)
    
    return compressed_vector

#### Using NMF transformer

In [5]:
with open("Tfidf_vectorizer.txt", "rb") as fp:   # Unpickling
    Tfidf_vectorizer = pickle.load(fp)
with open("nmf_model.pkl", "rb") as fp:   # Unpickling
    nmf = pickle.load(fp)

def text_transformer_nmf(preprocessed_texts):
    
    vectorized_text = Tfidf_vectorizer.transform(preprocessed_texts)
    
    compressed_vector = nmf.transform(vectorized_text)
    
    return compressed_vector

In [6]:
doc_topic_lsa = pd.read_pickle('doc_topic_lsa.pkl')

def find_similarity_LSA(search, top_search, location):
    
    preprocessed_search = preprocessing(search)
    
    search_vector = text_transformer_lsa(preprocessed_search)
    
    location_res_topic = doc_topic_lsa[doc_topic_lsa['city'] == location].iloc[:,0:10]
    restaurant_topic_array = location_res_topic.values
    restaurant_index = location_res_topic.index
    
    print(restaurant_topic_array.shape)
    cosine_list = []
    
    
    for restaurant in restaurant_topic_array:
        
        cosine_list.append(cosine_similarity([restaurant],[search_vector])[1][0])
    
    cosine_array = np.array(cosine_list)
    
    restaurant_sim = pd.DataFrame(cosine_array, 
                                  index = restaurant_index,
                                 columns = ['Similarity']).sort_values(by = 'Similarity', 
                                                                       ascending = False)
    return restaurant_sim[:top_search]

In [7]:
doc_topic_nmf = pd.read_pickle('doc_topic_nmf.pkl')

def find_similarity_NMF(search, top_search, location):
    
    preprocessed_search = preprocessing(search)

    
    search_vector = text_transformer_nmf(preprocessed_search)
    
    location_res_topic = doc_topic_nmf[doc_topic_nmf['city'] == location].iloc[:,0:5]
    restaurant_topic_array = location_res_topic.values
    restaurant_index = location_res_topic.index
    
    print(restaurant_topic_array.shape)
    cosine_list = []
    
    
    for restaurant in restaurant_topic_array:
        
        cosine_list.append(cosine_similarity([restaurant,search_vector.reshape(-1)])[1][0])
    
    cosine_array = np.array(cosine_list)
    
    restaurant_sim = pd.DataFrame(cosine_array, 
                                  index = restaurant_index,
                                 columns = ['Similarity']).sort_values(by = 'Similarity', 
                                                                       ascending = False)
    return restaurant_sim[:top_search]

In [19]:
preprocessing('the best burger in town')

['the best burger in town']

In [20]:
Tfidf_vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=3, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [21]:
Tfidf_vectorizer.transform(preprocessing('the best burger in town')).shape

(1, 119269)

In [22]:
text_transformer_lsa(preprocessing('the best burger in town'))

array([[ 0.07510358, -0.02398096, -0.12689207,  0.02622772, -0.19288896,
         0.14638499,  0.29520272,  0.44437212,  0.11443655,  0.02469081]])

In [23]:
find_similarity_LSA('Burgers in the latenight snack', 5, 'Las Vegas')

(6455, 6)


ValueError: setting an array element with a sequence.

In [82]:
find_similarity_LSA('Burgers in the latenight snack', 5, 'Las Vegas')

(6455, 5)


Unnamed: 0_level_0,Similarity
name,Unnamed: 1_level_1
Kilroy's Restaurant & Bar,0.834243
Burger Stop,0.817671
The Habit Burger Grill,0.802033
Burger Bar,0.794919
Baby's Badass Burgers,0.772852


In [83]:
find_similarity_LSA('Give me the worst burger possible', 5, 'Las Vegas')

(6455, 5)


Unnamed: 0_level_0,Similarity
name,Unnamed: 1_level_1
Kilroy's Restaurant & Bar,0.798284
Burger Stop,0.779579
The Habit Burger Grill,0.761498
Burger Bar,0.750133
Baby's Badass Burgers,0.73082


In [64]:
find_similarity_LSA('Best Burger', 5, 'Las Vegas')

(6455, 5)


Unnamed: 0_level_0,Similarity
name,Unnamed: 1_level_1
Red Plate,0.553427
American Grill,0.528648
Antonio's Italian Ristorante,0.52025
Philippine Gardens Karaoke Restaurant and Lounge,0.515089
MB Steak,0.501737


In [67]:
find_similarity_NMF('thai food', 5, 'Las Vegas')

(6455, 5)


Unnamed: 0_level_0,Similarity
name,Unnamed: 1_level_1
Pho Pasteur & Grill,0.883456
Jasmine Thai Gourmet,0.87549
Mr. Bangkok,0.866505
Essence of Thai,0.863637
Corner Thai Kitchen,0.847785


In [68]:
find_similarity_NMF('Best thai food in Las Vegas', 5, 'Las Vegas')

(6455, 5)


Unnamed: 0_level_0,Similarity
name,Unnamed: 1_level_1
Pho Pasteur & Grill,0.905577
Jasmine Thai Gourmet,0.899147
Mr. Bangkok,0.891424
Essence of Thai,0.887788
Corner Thai Kitchen,0.874384


In [56]:
find_similarity_NMF('I want the worst thai and american food in the city', 5, 'Las Vegas')

(6455, 5)


Unnamed: 0_level_0,Similarity
name,Unnamed: 1_level_1
Pho Pasteur & Grill,0.929226
Jasmine Thai Gourmet,0.926814
Mr. Bangkok,0.921865
Essence of Thai,0.916986
Corner Thai Kitchen,0.905324
