In [13]:
%run config.ipynb
from collections import defaultdict
import math

from surprise import BaselineOnly, SVD, NormalPredictor, KNNBasic, KNNBaseline, SVDpp
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
df = pd.read_csv("data/yelp_academic_dataset_sample005_filter.csv")
df.head(1)

Unnamed: 0,review_id,user_id,business_id,review_stars,review_text,review_text_after_cleaning,user_review_count,user_elite,user_friends,user_fans,user_average_stars,user_total_compliments,business_name,business_categories,business_stars,to_recommend,num_user_friends
0,yNB39szX3M8mTEzTtsgoCw,Y1iCYGvLf4ifPoXlKLGq-w,o2Qh4SiGYJ7BK4hP7dfkrw,5,This is an amazing indian Bistro!!I If I do sa...,amaz bistroi say myself never cuisin glad expe...,1,0,,0,5.0,0,Saffron Indian Bistro,"Restaurants, Indian",4.5,True,0


### Load DataFrame to Dataset

In [4]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["user_id", "business_id", "review_stars"]], reader)

### Split Train and Test Dataset

In [5]:
trainset, testset = train_test_split(data, test_size=.33, random_state=42)

# Random Predictor

In [14]:
rand_pred = NormalPredictor().fit(trainset)

In [15]:
evaluate(rand_pred, testset)


Computing recommendations...

Evaluating accuracy of model...
RMSE:  1.5231998935550275
MAE:  1.9156310646532066


Prediction(uid='d4CFNGvU_JR7PlcJyhcplQ', iid='kJS1-c00AhTFF18OjJYeTA', r_ui=1.0, est=2.3452469544391645, details={'was_impossible': False})

# SGD BaseLine

In [9]:
bsl_options = {'method': 'sgd',
               'learning_rate': .0001,
               }

sgd = BaselineOnly(bsl_options=bsl_options)
sgd.fit(trainset)

Estimating biases using sgd...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1b5800067f0>

In [11]:
evaluate(sgd, testset)


Computing recommendations...

Evaluating accuracy of model...
RMSE:  1.2773557638310875
MAE:  1.484429132020409


# SVD

In [12]:
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b5916750d0>

In [13]:
evaluate(svd, testset)


Computing recommendations...

Evaluating accuracy of model...
RMSE:  1.195018315682902
MAE:  1.4228417869818788


# SVDpp

In [17]:
svdpp = SVDpp()
svdpp.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x1b5959e4700>

In [27]:
predictions = evaluate(svdpp, testset)


Computing recommendations...

Evaluating accuracy of model...
RMSE:  1.1940237520686514
MAE:  1.4216907242617678


In [35]:
get_top_n(predictions)

defaultdict(list,
            {'_laOfCHYOXa9CuUJRabhFg': [('t-o_Sraneime4DDhWrQRBA',
               4.570893894832528)],
             'KQFvxL1Rd0J4puTHw34OCg': [('Dfx64-JvyQvwzupG8YkKjA',
               4.098005228224837)],
             'wNLZnNNLV8rOGiPjqMPVdQ': [('XxnQKE9jTD0bTE3LZ5NOtQ',
               4.175927904426026),
              ('Iv5uLuPULugH4YG3nyNBcw', 4.122846130605087)],
             '1IUiBso0g60bDNB1Zwe2AQ': [('2sx52lDoiEtef7xgPCaoBw',
               4.041953608390335)],
             'Rnwsdjuac1Grtp14GVPWxg': [('9a3DrZvpYxVs3k_qwlCNSw',
               4.4521656287283236)],
             '0NSSrtxfpVoJlpJq5417Aw': [('TZmMsbw5TbRqv8rmKj_aGg',
               4.09971245973925)],
             'yd4BQE9YmZ4-mLPlp8jlnA': [('L_ZLtfHvfzfoNVQ0-okTXg',
               4.00396066975957)],
             'BcZY0Hh2_3lzc95BktPtkQ': [('tnvZOd2vNCLPQS1KEZxqog',
               4.12547264937456)],
             'YvkB1cQDSWydONFrDBp6uw': [('KOEBBp2PhiV_4PkW9eo9Sg',
               4.173363536451875

# NLP based Recommender System

In [3]:
df.head(1)

Unnamed: 0,review_id,user_id,business_id,review_stars,review_text,review_text_after_cleaning,user_review_count,user_elite,user_friends,user_fans,user_average_stars,user_total_compliments,business_name,business_categories,business_stars,to_recommend,num_user_friends
0,yNB39szX3M8mTEzTtsgoCw,Y1iCYGvLf4ifPoXlKLGq-w,o2Qh4SiGYJ7BK4hP7dfkrw,5,This is an amazing indian Bistro!!I If I do sa...,amaz bistroi say myself never cuisin glad expe...,1,0,,0,5.0,0,Saffron Indian Bistro,"Restaurants, Indian",4.5,True,0


In [5]:
genres = df["business_categories"].apply(lambda x : x.split(","))

In [11]:
business_name = df["business_name"]

In [29]:
bag_words = []
for i,j in zip(genres, df["review_text_after_cleaning"]):
    words = ''
    for word in i:
        words += ''.join(word) + ' '
    for word in j:
        words += word
    bag_words.append(words)

bag_words

['Restaurants  Indian amaz bistroi say myself never cuisin glad experi eat doctor took us lunch brought us great locat quit friendli fast servic great new place enjoy extraordinari food order chicken tikka masala regret bit came fresh hot readi eat enough season spice without overbear like restaur be definit recommend place famili friend think make hungri again enjoy guy',
 'Restaurants  Indian great great food super friendli staff well went lunch buffet disappoint all look new place sinc old favorit close down saffron bistro go fit bill nice',
 'Restaurants  Indian order chicken biryani hot spice level burnt goodi go againeven chicken piec tasti all expens too price could make better',
 'Restaurants  Indian love food absolut favorit cuisin saffron bistro hand best restaur valley excel choic meat eater vegetarian dish pack flavor tongu know manag enough heat digest tract thank later chines fusion dish worth tri well someth everi pro fresh real food abund amaz favor w choic heat choic n

In [None]:
count = CountVectorizer()
count_matrix = count.fit_transform(bag_words)
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

# Not Reach Yet

In [62]:
trainset.all_users()

range(0, 215109)

In [21]:
genres = get_genres(df)

In [23]:
def genres_sim(business1, business2, genres):
    genres1 = genres[business1]
    genres2 = genres[business2]
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(genres1)):
        x = genres1[i]
        y = genres2[i]
        sumxx += x * x
        sumyy += y * y
        sumxy += x * y

    return sumxy/math.sqrt(sumxx*sumyy)

genres_sim("xkVMIk_Vqh17f48ZQ_6b0w", "ZRLmQ3oAuspAkk5R6BOfyg", genres)

0.3086066999241838

In [19]:
df.iloc[45535]["business_id"]

'xkVMIk_Vqh17f48ZQ_6b0w'

In [26]:
df.iloc[20000]["business_id"]

'tIX6o1jkLrrmsJIdGUjA5A'

In [37]:
df.iloc[45535]["business_categories"]

'Cheesesteaks, American (New), Steakhouses, Desserts, Food, Restaurants'

In [29]:
df.iloc[20000]["business_categories"]

'Shopping, Home Services, Car Window Tinting, Auto Customization, Vehicle Wraps, Auto Glass Services, Home & Garden, Home Window Tinting, Home Decor, Automotive, Shades & Blinds'

In [40]:
df.iloc[4535]["business_id"]

'ZRLmQ3oAuspAkk5R6BOfyg'

In [42]:
df.iloc[4535]["business_categories"]

'Nightlife, American (New), Bars, Caterers, Event Planning & Services, Asian Fusion, Restaurants'