# Collaborative Filtering 
This section uses collaborative filtering techniques to find out top business a user might like based on his/her similarity to other users or based on similarity between the business we want to check and businesses he/she has liked

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import nltk
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from collections import defaultdict

from surprise import SVD
from surprise import Dataset

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Reading the reviews as chunks since they all don't fit in memory.

In [3]:
reviews = pd.read_json("/data/yelp_academic_dataset_review.json", orient="records", lines=True, chunksize=20000)

In [4]:
for reviews in reviews:
    reviews = reviews[['review_id', 'user_id', 'business_id', 'stars']]
    break

In [5]:
reviews.describe()

Unnamed: 0,stars
count,20000.0
mean,3.73585
std,1.443703
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [6]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_id    20000 non-null  object
 1   user_id      20000 non-null  object
 2   business_id  20000 non-null  object
 3   stars        20000 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 625.1+ KB


In [7]:
users_all = pd.read_json("/data/yelp_academic_dataset_user.json", chunksize=10000,orient="records", lines=True)

In [8]:
users = None
for e, chunk in enumerate(users_all):
    chunk = pd.merge(chunk, reviews, on='user_id', how='inner')
    if users is None:
        users = chunk
    else:
        users = pd.concat([users, chunk])
    if users.shape[0] >= 10000:
        break
    if e==1000:
        break

In [9]:
users.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos,review_id,business_id,stars
0,q-v8elVPvKz0KvK69QSj1Q,Lisa Marie,666,2009-05-19 01:42:25,2993,1281,1832,20112012201320142015201620172018,"rt1KveqwFMnkN6dXKg5Qyg, NfnKx3z7zFottS3yHabw1g...",197,...,7,120,150,135,135,42,72,PaYbct-XdNPb3QoT0VzmmA,fgskuH5aQq0ROHm9zst_0g,4
1,Fta-vmTJYKGeVyyhm9D6Vw,Emily,3,2010-03-25 20:12:56,14,0,14,,"NMmu4oC-YRqdi0WY9pV78g, n5wQ7qm2QZY9iB8c25pzxA...",0,...,0,0,0,0,0,0,0,vcBDU6pbS08Xeba6W6HOcg,yNPh5SO-7wr8HPpVCDPbXQ,5
2,dIIKEfOgo0KqUfGQvGikPg,Gabi,2061,2007-08-10 19:01:51,20024,9684,16904,"2007,2008,2009,2010,2011,2012,2013,2014,2015,2...","6Y-l3x4LpUNhTBVMTFmTmA, HYNhRw_-8g660mpnwY2VJA...",971,...,96,1171,3272,2169,2169,463,281,BRTYuOLnAreHuyjtfU4rRQ,tDYcVluqZwieulc1iqxGXg,4
3,dIIKEfOgo0KqUfGQvGikPg,Gabi,2061,2007-08-10 19:01:51,20024,9684,16904,"2007,2008,2009,2010,2011,2012,2013,2014,2015,2...","6Y-l3x4LpUNhTBVMTFmTmA, HYNhRw_-8g660mpnwY2VJA...",971,...,96,1171,3272,2169,2169,463,281,qZjUgf2evwvTmKQYaF9EcA,5nZVVPO_cYH9aQeClTP9zA,3
4,dIIKEfOgo0KqUfGQvGikPg,Gabi,2061,2007-08-10 19:01:51,20024,9684,16904,"2007,2008,2009,2010,2011,2012,2013,2014,2015,2...","6Y-l3x4LpUNhTBVMTFmTmA, HYNhRw_-8g660mpnwY2VJA...",971,...,96,1171,3272,2169,2169,463,281,R6MVByeIwtW86OE4bQe25Q,zJGtD3y-pAIGNId4codEEg,4


In [10]:
users = users.drop(['name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'], axis=1)

In [11]:
users.describe()

Unnamed: 0,stars
count,10826.0
mean,3.784223
std,1.320996
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [12]:
businesses_all = pd.read_json("/data/yelp_academic_dataset_business.json", chunksize=10000,orient="records", lines=True)

In [13]:
businesses = None
for e, chunk in enumerate(businesses_all):
    chunk = pd.merge(chunk, reviews, on='business_id', how='inner')
    if businesses is None:
        businesses = chunk
    else:
        businesses = pd.concat([businesses, chunk])
    if businesses.shape[0] >= 10000:
        break
    if e==1000:
        break
    if e%10==0:
        print(e)

0


In [14]:
# businesses.head()

In [15]:
ind = businesses.business_id.isin(reviews.business_id) & reviews.user_id.isin(users.user_id)

In [16]:
ind.value_counts()

False    18945
True     10411
dtype: int64

In [17]:
ind = businesses.business_id.isin(users.business_id) & users.user_id.isin(businesses.user_id)
ind

0        True
0        True
0        True
0        True
0        True
        ...  
9661    False
9662    False
9663    False
9664    False
9665    False
Length: 37254, dtype: bool

In [18]:
ind.value_counts()

False    19160
True     18094
dtype: int64

In [19]:
businesses.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars_x,review_count,is_open,attributes,categories,hours,review_id,user_id,stars_y
0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'...",YkpjR5N5TLkwHi20IZ15zg,xDtS2iKsJuVUVzB2YhfPsg,4
1,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Home Services, Plumbing, Electricians, Handyma...","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '...",EY8t3ndAZo2vWY7eeOnVLw,nKBtfZ93gPYybGEz2QOvTQ,5
2,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Home Services, Plumbing, Electricians, Handyma...","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '...",uoujAVvyx-GLyQnByuon0w,UjVtviHTm2mgZnXCfl33CQ,5
3,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726648,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Home Services, Plumbing, Electricians, Handyma...","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '...",pECK3p9w7m-_xEp--lGxHg,L498DJb5YDAtoqgv9thWCg,5
4,cKyLV5oWZJ2NudWgqs8VZw,Oasis Auto Center - Gilbert,"1720 W Elliot Rd, Ste 105",Gilbert,AZ,85233,33.350399,-111.827142,4.5,38,1,{'BusinessAcceptsCreditCards': 'True'},"Auto Repair, Automotive, Oil Change Stations, ...","{'Monday': '7:0-18:0', 'Tuesday': '7:0-18:0', ...",edUqROoq7qAVzR2b27j2dA,4oE6BjeHyTVRqd_82LcrhQ,2


In [20]:
users.shape

(10826, 4)

In [21]:
businesses.shape

(19022, 17)

In [22]:
users[users['user_id'].isin(businesses.user_id)].shape

(10392, 4)

In [23]:
businesses[businesses['business_id'].isin(users.business_id)].shape

(16512, 17)

In [24]:
businesses.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars_x', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'review_id', 'user_id', 'stars_y'],
      dtype='object')

In [25]:
businesses=businesses[['user_id', 'business_id', 'stars_y']]
businesses.head()

Unnamed: 0,user_id,business_id,stars_y
0,xDtS2iKsJuVUVzB2YhfPsg,f9NumwFMBDn751xgFiRbNA,4
1,nKBtfZ93gPYybGEz2QOvTQ,51M2Kk903DFYI6gnB5I6SQ,5
2,UjVtviHTm2mgZnXCfl33CQ,51M2Kk903DFYI6gnB5I6SQ,5
3,L498DJb5YDAtoqgv9thWCg,51M2Kk903DFYI6gnB5I6SQ,5
4,4oE6BjeHyTVRqd_82LcrhQ,cKyLV5oWZJ2NudWgqs8VZw,2


In [26]:
businesses.columns = ['user_id', 'business_id', 'stars']
businesses.head()

Unnamed: 0,user_id,business_id,stars
0,xDtS2iKsJuVUVzB2YhfPsg,f9NumwFMBDn751xgFiRbNA,4
1,nKBtfZ93gPYybGEz2QOvTQ,51M2Kk903DFYI6gnB5I6SQ,5
2,UjVtviHTm2mgZnXCfl33CQ,51M2Kk903DFYI6gnB5I6SQ,5
3,L498DJb5YDAtoqgv9thWCg,51M2Kk903DFYI6gnB5I6SQ,5
4,4oE6BjeHyTVRqd_82LcrhQ,cKyLV5oWZJ2NudWgqs8VZw,2


In [27]:
users = users[['user_id', 'business_id', 'stars']]
users.head()

Unnamed: 0,user_id,business_id,stars
0,q-v8elVPvKz0KvK69QSj1Q,fgskuH5aQq0ROHm9zst_0g,4
1,Fta-vmTJYKGeVyyhm9D6Vw,yNPh5SO-7wr8HPpVCDPbXQ,5
2,dIIKEfOgo0KqUfGQvGikPg,tDYcVluqZwieulc1iqxGXg,4
3,dIIKEfOgo0KqUfGQvGikPg,5nZVVPO_cYH9aQeClTP9zA,3
4,dIIKEfOgo0KqUfGQvGikPg,zJGtD3y-pAIGNId4codEEg,4


In [28]:
users.shape

(10826, 3)

In [29]:
businesses.shape

(19022, 3)

In [30]:
intersection = users.merge(businesses, how = 'inner' ,on = ['user_id', 'business_id'])
intersection.head()

Unnamed: 0,user_id,business_id,stars_x,stars_y
0,q-v8elVPvKz0KvK69QSj1Q,fgskuH5aQq0ROHm9zst_0g,4,4
1,Fta-vmTJYKGeVyyhm9D6Vw,yNPh5SO-7wr8HPpVCDPbXQ,5,5
2,dIIKEfOgo0KqUfGQvGikPg,tDYcVluqZwieulc1iqxGXg,4,4
3,dIIKEfOgo0KqUfGQvGikPg,5nZVVPO_cYH9aQeClTP9zA,3,3
4,dIIKEfOgo0KqUfGQvGikPg,zJGtD3y-pAIGNId4codEEg,4,4


In [31]:
intersection.shape

(10327, 4)

In [32]:
union = users.merge(businesses, how = 'outer')
union.head()

Unnamed: 0,user_id,business_id,stars
0,q-v8elVPvKz0KvK69QSj1Q,fgskuH5aQq0ROHm9zst_0g,4
1,Fta-vmTJYKGeVyyhm9D6Vw,yNPh5SO-7wr8HPpVCDPbXQ,5
2,dIIKEfOgo0KqUfGQvGikPg,tDYcVluqZwieulc1iqxGXg,4
3,dIIKEfOgo0KqUfGQvGikPg,5nZVVPO_cYH9aQeClTP9zA,3
4,dIIKEfOgo0KqUfGQvGikPg,zJGtD3y-pAIGNId4codEEg,4


In [33]:
union.shape

(19569, 3)

In [34]:
union = union.drop_duplicates()
union.shape

(19536, 3)

## Preparing dataset
We use the surprise library for recommendations, so we prepare the data using the dataframe we created.

In [35]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans


reader = Reader(rating_scale=(1, 5))

# Loads Pandas dataframe
data = Dataset.load_from_df(union[["user_id", "business_id", "stars"]], reader)
# create training/test set
trainingSet = data.build_full_trainset()
# users that are not included in the training set
testset = trainingSet.build_anti_testset()

In [36]:
# users that ARE included in the training set (for validation)
testset2 = trainingSet.build_testset()

In [37]:
print(trainingSet.n_items, trainingSet.n_users, trainingSet.n_ratings)

6354 17955 19536


In [38]:
print(len(testset), len(testset2))

114066546 19536


## Model based Recommender systems using SVD
This is a model based approach as opposed memory based using KNN


In [39]:
from surprise import SVD

restaurant_recommender = SVD()
restaurant_recommender.fit(trainingSet)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f3a27d6c8d0>

In [40]:
from surprise import accuracy

predictions = restaurant_recommender.test(testset2)

print('MSE by recommender: {}'.format(accuracy.mse(predictions)))
print('RMSE by recommender: {}'.format(accuracy.rmse(predictions)))
print('MAE by recommender: {}'.format(accuracy.mae(predictions)))


MSE: 0.8653
MSE by recommender: 0.8653383473374032
RMSE: 0.9302
RMSE by recommender: 0.9302356407585141
MAE:  0.7706
MAE by recommender: 0.7705707849044107


In [41]:
# taken from: https://surprise.readthedocs.io/en/stable/FAQ.html
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [42]:
unique = list(set(testset[:10000]))

In [45]:
predictions = restaurant_recommender.test(unique[:40])

top_n = get_top_n(predictions, n=10)



# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, ':')
    for i, (iid, score) in enumerate(user_ratings):
        print('\t{} {}'.format(iid, score))
#         print('\t', iid, top_n_ib[uid][i][0])


q-v8elVPvKz0KvK69QSj1Q :
	wTRnf52iB-vAjwVZhRlGEQ 4.067160876423644
	RjFoZYD2VTkgzyZ2AhJ9SQ 4.058454810250066
	SZEFE5hL7aN5nM-A44iPwQ 3.947028688165758
	4YjueXz8-y75OVaWIW_3cw 3.942590404888891
	dvbcUnKv2awsIxog7dO4vw 3.938090793770335
	O0UhNI9KHdY1wOdhEtT2vg 3.907973717996816
	QpSrAzil6SnTK9-80BrUUA 3.865337978659793
	2lQfKKIMxHsAG0TbSkhh0w 3.8549402136292867
	kSp9lWgQJrE9nsX7OS-x8w 3.7861360513243523
	oGYXSVs0ympLy0ur8Qnp7g 3.778287430908999
Fta-vmTJYKGeVyyhm9D6Vw :
	0wW9PasC8pw8SY7rlY3ZKw 4.196373457103049
	aBHpHQCgpPvBRQvK7ySLIA 4.080954230462713
	f_dgbsb9t7xrun9dpamlFw 4.001754024372908
	uE6hI5_i4QVq12xU99xtqA 3.9725065365891137
	KPm5Q_edkqfD6TNSX87rVg 3.9092842190310817
	5d0V1EDAIiSm60RMQmeW4g 3.8687200186062447
	tqVpJ8DaqWf4CG8CIskHfg 3.862290253624744
	wEisic-u9Yj543iYzfpAhw 3.8516151420893205
	59teSMeYXeDySTPHkCAfJg 3.807703862142603
	uQdk6xm0TE4qji48yysRDA 3.791636950213179


In [46]:
from surprise import dump
dump.dump('/files/svd_restaurant_recommender', algo=restaurant_recommender)