In [1]:
%run config.ipynb
from collections import defaultdict
import math

from surprise import BaselineOnly, SVD, NormalPredictor, KNNBasic, KNNBaseline, SVDpp, KNNWithMeans
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("data/yelp_academic_dataset_sample005_filter.csv")
df.head(1)

Unnamed: 0,review_id,user_id,business_id,review_stars,review_text,review_text_after_cleaning,user_review_count,user_elite,user_friends,user_fans,user_average_stars,user_total_compliments,business_name,business_categories,business_stars,to_recommend,num_user_friends
0,yNB39szX3M8mTEzTtsgoCw,Y1iCYGvLf4ifPoXlKLGq-w,o2Qh4SiGYJ7BK4hP7dfkrw,5,This is an amazing indian Bistro!!I If I do sa...,amaz bistroi say myself never cuisin glad expe...,1,0,,0,5.0,0,Saffron Indian Bistro,"Restaurants, Indian",4.5,True,0


### Load DataFrame to Dataset

In [3]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["user_id", "business_id", "review_stars"]], reader)

### Split Train and Test Dataset

In [4]:
trainset, testset = train_test_split(data, test_size=.33, random_state=42)

# Random Predictor

In [5]:
rand_pred = NormalPredictor().fit(trainset)

In [6]:
evaluate(rand_pred, testset)


Computing recommendations...

Evaluating accuracy of model...
RMSE:  1.5275199913466273
MAE:  1.9208157848820995


Prediction(uid='d4CFNGvU_JR7PlcJyhcplQ', iid='kJS1-c00AhTFF18OjJYeTA', r_ui=1.0, est=3.059690032876307, details={'was_impossible': False})

# SGD BaseLine

In [7]:
bsl_options = {'method': 'sgd',
               'learning_rate': .0001,
               }

sgd = BaselineOnly(bsl_options=bsl_options)
sgd.fit(trainset)

Estimating biases using sgd...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x2000000d760>

In [8]:
predictions = evaluate(sgd, testset)


Computing recommendations...

Evaluating accuracy of model...
RMSE:  1.2773557638310875
MAE:  1.484429132020409


Prediction(uid='d4CFNGvU_JR7PlcJyhcplQ', iid='kJS1-c00AhTFF18OjJYeTA', r_ui=1.0, est=3.6994749385566394, details={'was_impossible': False})

# SVD

In [9]:
svd = SVD()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2004e219790>

In [10]:
predictions = evaluate(svd, testset)


Computing recommendations...

Evaluating accuracy of model...
RMSE:  1.1947718807666743
MAE:  1.4228714856224651


Prediction(uid='d4CFNGvU_JR7PlcJyhcplQ', iid='kJS1-c00AhTFF18OjJYeTA', r_ui=1.0, est=3.6994749385566394, details={'was_impossible': False})

# SVDpp

In [11]:
svdpp = SVDpp()
svdpp.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x2000000dac0>

In [15]:
predictions = evaluate(svdpp, testset)


Computing recommendations...

Evaluating accuracy of model...
RMSE:  1.1943365349114525
MAE:  1.4221097507809177


In [16]:
get_top_n(predictions)

defaultdict(list,
            {'_laOfCHYOXa9CuUJRabhFg': [('t-o_Sraneime4DDhWrQRBA',
               4.2394887720245915)],
             'KQFvxL1Rd0J4puTHw34OCg': [('Dfx64-JvyQvwzupG8YkKjA',
               4.112512895008402)],
             'wNLZnNNLV8rOGiPjqMPVdQ': [('Iv5uLuPULugH4YG3nyNBcw',
               4.220419580629123),
              ('XxnQKE9jTD0bTE3LZ5NOtQ', 4.1653894153159365)],
             'Rnwsdjuac1Grtp14GVPWxg': [('9a3DrZvpYxVs3k_qwlCNSw',
               4.360912390234025)],
             '0NSSrtxfpVoJlpJq5417Aw': [('TZmMsbw5TbRqv8rmKj_aGg',
               4.165524849050835)],
             'BcZY0Hh2_3lzc95BktPtkQ': [('tnvZOd2vNCLPQS1KEZxqog',
               4.074942112247084)],
             'YvkB1cQDSWydONFrDBp6uw': [('KOEBBp2PhiV_4PkW9eo9Sg',
               4.0794068907528045)],
             'f3MnZfw1QKIDFLpO_HfLUg': [('SVGApDPNdpFlEjwRQThCxA',
               4.176055393491081)],
             'tK3NRpwMB87McFG5ioQRyg': [('UMqfeCItzQ2glr4d9apGlA',
               4.1666592196

# KNN-Sim Item-Based

In [6]:
sample = df.sample(frac=0.15)
reader = Reader(rating_scale=(1, 5))
sample_data = Dataset.load_from_df(sample[["user_id", "business_id", "review_stars"]], reader)

In [7]:
sample_trainset, sample_testset = train_test_split(sample_data, test_size=.33, random_state=42)

In [11]:
sim_options = {'name': 'cosine',
               'user_based': False
               }
knn = KNNBasic(sim_options=sim_options)
knn.fit(sample_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1cc158aaa30>

In [13]:
predictions = evaluate(knn, sample_testset)


Computing recommendations...

Evaluating accuracy of model...
RMSE:  1.2831857340380446
MAE:  1.4911913762536504


# KNN-Sim User-Based

In [8]:
sim_options = {'name': 'cosine',
               'user_based': True
               }
knn = KNNBasic(sim_options=sim_options)
knn.fit(sample_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x16cece5bd90>

In [9]:
predictions = evaluate(knn, sample_testset)


Computing recommendations...

Evaluating accuracy of model...
RMSE:  1.2856943213591854
MAE:  1.4915194847616067


# Genres Similarity

In [62]:
trainset.all_users()

range(0, 215109)

In [21]:
genres = get_genres(df)

In [23]:
def genres_sim(business1, business2, genres):
    genres1 = genres[business1]
    genres2 = genres[business2]
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(genres1)):
        x = genres1[i]
        y = genres2[i]
        sumxx += x * x
        sumyy += y * y
        sumxy += x * y

    return sumxy/math.sqrt(sumxx*sumyy)

genres_sim("xkVMIk_Vqh17f48ZQ_6b0w", "ZRLmQ3oAuspAkk5R6BOfyg", genres)

0.3086066999241838

In [19]:
df.iloc[45535]["business_id"]

'xkVMIk_Vqh17f48ZQ_6b0w'

In [26]:
df.iloc[20000]["business_id"]

'tIX6o1jkLrrmsJIdGUjA5A'

In [37]:
df.iloc[45535]["business_categories"]

'Cheesesteaks, American (New), Steakhouses, Desserts, Food, Restaurants'

In [29]:
df.iloc[20000]["business_categories"]

'Shopping, Home Services, Car Window Tinting, Auto Customization, Vehicle Wraps, Auto Glass Services, Home & Garden, Home Window Tinting, Home Decor, Automotive, Shades & Blinds'

In [40]:
df.iloc[4535]["business_id"]

'ZRLmQ3oAuspAkk5R6BOfyg'

In [42]:
df.iloc[4535]["business_categories"]

'Nightlife, American (New), Bars, Caterers, Event Planning & Services, Asian Fusion, Restaurants'