# Collaborative filtering

We created a baseline for our method, that is, building a recommendation system without using text reviews. The method I use is collaborative filtering. 
In the newer, narrower sense, collaborative filtering is a method of making automatic predictions (filtering) about the interests of a user by collecting preferences or taste information from many users (collaborating).
The underlying assumption of the collaborative filtering approach is that if a person A has the same opinion as a person B on an issue, A is more likely to have B's opinion on a different issue than that of a randomly chosen person. 

In [5]:
%matplotlib inline
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import operator
import seaborn as sns
import json
import matplotlib.pyplot as plt

In [65]:

# Import libraries
from sklearn.metrics.pairwise import pairwise_distances
from __future__ import division
import time

In [6]:
def load_json_to_df(datapass):
    data = [] 
    with open(datapass) as data_file: 
        for f in data_file:
            data.append(json.loads(f))
    df = pd.DataFrame(data)
    return df

In [22]:
business = load_json_to_df("business.json")

In [43]:
review = load_json_to_df("review.json")

In [None]:
user = load_json_to_df("user.json")

In [7]:
n_users_review = review.user_id.unique().shape[0]
n_items_review = review.business_id.unique().shape[0]

In [8]:
#Group res by city
city = business.groupby('city')['city'].count()
city


In [38]:
# subset restaurant to category restaurant
#restaurant = business[business['categories'].str.contains("Restaurants",na=False)]
is_rest = []
for i in business['categories']:
    
    if 'Restaurants' in i or 'Food' in i:
        is_rest.append(True)
    else:
        is_rest.append(False)
is_burger = []
for i in business['categories']:
    
    if 'Burgers' in i:
        is_burger.append(True)
    else:
        is_burger.append(False)

is_m = []
for i in business['categories']:
    
    if 'Michelin' in i:
        is_m.append(True)
    else:
        is_m.append(False)

In [40]:
burger_res = business.loc[is_burger]
burger_res.head(2)


Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
14,9616 E Independence Blvd,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",SDMRxmcKPNt1AHPBKqO64Q,"[Burgers, Bars, Restaurants, Sports Bars, Nigh...",Matthews,"{'Monday': '11:00-0:00', 'Tuesday': '11:00-0:0...",1,35.135196,-80.714683,Applebee's,,28105,21,2.0,NC
32,1794 Liverpool Road,"{'RestaurantsTableService': True, 'GoodForMeal...",KW4y7uDGjVfU3ClkEjIGhg,"[Burgers, Restaurants]",Pickering,{},1,43.834351,-79.090135,The Works,,L1V 1V9,41,3.0,ON


In [None]:
import pickle
with open("restaurants_pd_df.txt", "wb") as f:
    pickle.dump(restaurant, f)

In [17]:
city2 = restaurant.groupby('city')['city'].count()
city.head(2)

city
                 3
110 Las Vegas    1
Name: city, dtype: int64

In [18]:
with open("restaurants_pd_df.txt", "rb") as f:
    restaurant = pickle.load(f)


In [41]:
#res_vegas = restaurant.loc[restaurant['city']=='Las Vegas']
b_vegas = burger_res.loc[restaurant['city']=='Las Vegas']
# with open("res_vegas_pd_df.txt", "wb") as f:
#     pickle.dump(res_vegas, f)
b_vegas.head(2)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
302,3501 S Rainbow,"{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",W-3Sy3fy85mQdd0ZNFKIiw,"[Sports Bars, Nightlife, Burgers, Bars, Americ...",Las Vegas,"{'Monday': '11:00-0:00', 'Tuesday': '11:00-0:0...",1,36.12527,-115.243588,Applebee's,Chinatown,89103,74,2.5,NV
449,1021 S Buffalo Dr,"{'GoodForMeal': {'dessert': False, 'latenight'...",q7OKOkEK-pgAQNjDiVd4bA,"[American (Traditional), Restaurants, Burgers]",Las Vegas,{},0,36.159785,-115.261803,Kilroy's,Westside,89162,12,3.0,NV


In [82]:
b_vegas.shape

(505, 15)

In [7]:
import pickle
with open("res_vegas_pd_df.txt", "rb") as f:
    res_vegas = pickle.load(f)

with open("restaurants_pd_df.txt", "rb") as f:
    restaurants = pickle.load(f)

In [49]:
# subset review and b_vegas to u,i,r
review_ur = review[['user_id', 'stars','business_id']]

In [50]:
review_ur.head(2)

Unnamed: 0,user_id,stars,business_id
0,cjpdDjZyprfyDG3RlkVG3w,5,uYHaNptLzDLoV_JZ_MuzUA
1,bjTcT8Ty4cJZhEOEo01FGA,3,uYHaNptLzDLoV_JZ_MuzUA


In [51]:
# Left join restaruants in vegas table with review table
review_rest_tor = pd.merge(b_vegas, review_ur, on='business_id', how='left')
# review_rest_tor.shape
# review_rest_tor.columns

# Subset to user, item, rating columns
uir = review_rest_tor[['user_id','business_id','stars_y']]

with open('uir.txt', 'wb') as f:
    pickle.dump(uir, f)
# len(uir)



In [52]:
import pickle
with open("uir.txt", "rb") as f:
    uir =pickle.load(f)

In [53]:
# Assign index for user and item
user_index = uir.user_id.unique()
item_index = uir.business_id.unique()

# Count number of unique users and items
n_users = uir.user_id.unique().shape[0]
n_items = uir.business_id.unique().shape[0]

In [54]:
# Split User, Item, Rating dataset to train and test sets of 70% & 30%
from sklearn.model_selection import train_test_split
train, test = train_test_split(uir, test_size=0.30, random_state=42)
# len(train)
# len(test)

# Create table for train data with list of users as index & items as columns
train_matrix = pd.DataFrame(index=user_index, columns=item_index)
# train_matrix.shape



In [55]:
# Fill in train_matrix table with ratings
for row in train.itertuples():
    user = row[1]
    item = row[2]
    train_matrix.loc[user][item] = row[3]  

In [56]:
# Create table for test data with list of users as index & items as columns    
test_matrix = pd.DataFrame(index=user_index, columns=item_index)
# test_matrix.shape




In [57]:
# Fill in test_matrix table with ratings
for row in test.itertuples():
    user = row[1]
    item = row[2]
    test_matrix.loc[user][item] = row[3]


In [66]:
# Begin filtering process to create 5 Core Subset

# Count number of rated items for each user
item_1 = train_matrix.apply(lambda x: x > 0, raw=True).sum(axis=1)
# item_1.value_counts()

# Filter down to the users with greater than or equal to 5 ratings
train1 = train_matrix
train1['item_1'] = item_1
train2 = train1.loc[train1['item_1'] >= 5]
# train2.shape

# Count number of rated users for each item
train2 = train2.drop('item_1',axis=1)
train3 = train2.transpose()
user_1 = train3.apply(lambda x: x > 0, raw=True).sum(axis=1)
# user_1.value_counts()

# Filter down to the items with greater than or equal to 5 ratings
train3['user_1'] = user_1
train4 = train3.loc[train3['user_1'] >= 5]
train4 = train4.drop('user_1',axis=1)
train5 = train4.transpose()
# train5.shape

# Repeat the process for both user and item
item_2 = train5.apply(lambda x: x > 0, raw=True).sum(axis=1)
# item_2.value_counts()
# item_2.shape
train5['item_2'] = item_2
train6 = train5.loc[train5['item_2'] >= 5]
train6 = train6.drop('item_2',axis=1)
train7 = train6.transpose()
user_2 = train7.apply(lambda x: x > 0, raw=True).sum(axis=1)
# user_2.value_counts()
train7['user_2'] = user_2
train8 = train7.loc[train7['user_2'] >= 5]
train8 = train8.drop('user_2',axis=1)
train9 = train8.transpose()
# train9.shape

# Check every user and item has at least 5 ratings
item_3 = train9.apply(lambda x: x > 0, raw=True).sum(axis=1)
# item_3.value_counts()
user_3 = train9.apply(lambda x: x > 0, raw=True).sum(axis=1)
# user_3.value_counts()

# Filter down the test matrix to filtered user and item in train matrix
test9 = test_matrix.loc[train9.index,train9.columns]
# test9.shape

In [92]:
with open("train9.txt", "wb") as f:
    pickle.dump(train9, f)
with open("test9.txt", "wb") as f:
    pickle.dump(test9, f)

In [85]:
item_1.head(5)

k_jI8TeypNwvXQDHM7Z8eA    2
4fXZeX6b23YaAlhkkTldww    2
en8pmJboMdEBRkXIIMsP4Q    2
9YUrhoRIfXL4mVyPRNMgxQ    2
qRYwodYPMMkl7QKnpmc39Q    6
dtype: int64

In [67]:
def collaborative_filtering(train, test, sim='cosine', type='user', knn=5):

    # Fill NAN values in train & test data as 0's
    train_0 = train.fillna(0)
    test_0 = test.fillna(0)

    # Create a similarity matrix of either users or items based on cosine or 
    # pearson correlation measure
    if sim == 'cosine':
        user_dist = pairwise_distances(train_0, metric='cosine')
        item_dist = pairwise_distances(train_0.T, metric='cosine')

    elif sim == 'pearson':
        user_dist = pairwise_distances(train_0, metric='correlation')
        item_dist = pairwise_distances(train_0.T, metric='correlation')
        
    user_sim = 1 - user_dist
    item_sim = 1 - item_dist
 
    # Create a dataframe with mean user ratings
    mean_rating = train.mean(axis=1)
    mean_user_rating = pd.concat([mean_rating] * len(train.columns), axis=1)
    mean_user_rating.columns = train.columns
    
    # Modify a dataframe so that mean user ratings are present only in matrix 
    # positions of rated items and 0's in matrix positions of non-rated items
    mean_user_rating_0 = mean_user_rating
    mean_user_rating_0[train_0 == 0] = 0
    
    # Normalize every user's ratings to mean of zero
    ratings_diff = train_0 - mean_user_rating_0
    
    # Create a dataframe with user's mean user ratings present in all items
    mean_user_rating_f = pd.concat([mean_rating] * len(train.columns), axis=1)
    mean_user_rating_f.columns = train.columns
    
    # Create a placeholder dataframe for predictions of rated items in test data
    pred = pd.DataFrame(index=train.index, columns=train.columns)
    
    # User-Based Collaborative Filtering
    if type == 'user':    
        # Index user similarity matrix with user ids for both rows and columns
        user_sim = pd.DataFrame(user_sim, index=train.index, columns=train.index)
        # When the number of k neareast neighbors is specified
        if knn != 'all':
            user_sim_mat = user_sim.as_matrix()
        
            # Item id's of rated items in test data for each user
            cols = test.columns
            test_rated = test.apply(lambda x: x > 0, raw=True).apply(lambda x: list(cols[x.values]), axis=1)
            
            # Iterate over each user, m
            for m in range(len(user_sim)):
                # Retrieve column of user m's similarities to all other users
                temp = user_sim_mat[m]
                temp = pd.DataFrame(temp,index=train.index, columns=['similarity'])
                # Rank user m's similarities
                temp['rank'] = temp['similarity'].rank(ascending=0)
                
                # Iterate over user m's rated items
                for n in range(len(test_rated[m])):
                    # For user m's nth rated item, extract column of ratings of 
                    # all users corresponding to nth item
                    temp2 = ratings_diff[[test_rated[m][n]]]
                    temp2.columns = ['rating']
                    # Contatenate similarity, rank, rating as one dataframe
                    result = pd.concat([temp, temp2], axis=1)    
                    # Filter down to the users who rated the items
                    result2 = result[result['rating'] != 0]
                    # Filter down to knn number of users with the knn 
                    # highest similairites
                    result3 = result2.nsmallest(int(knn), 'rank')
                
                    # Divide weighted sum of user's knn nearest neighbors' ratings by 
                    # sum of their similarities
                    score = result3['similarity'].dot(result3['rating'])/result3['similarity'].sum()
                    mean = mean_user_rating_f.loc[train.index[m]][test_rated[m][n]]
                
                    # Make a prediction by adding user's mean rating to weighted sum
                    pred.loc[train.index[m]][test_rated[m][n]] = mean + score
                    
        # When the number of k neareast neighbors is not specified and all 
        # available neighbors are used for prediction     
        elif knn == 'all':  
            # Compute user similarity weighted sum of available ratings of 
            # user's every neighbor 
            num_user = user_sim.dot(ratings_diff)
            
            # Sum user similarities    
            sum_sim_user = user_sim.sum(axis=1)
            sum_sim_mat_user = pd.concat([sum_sim_user] * len(train.columns), axis=1)
            sum_sim_mat_user.columns = train.columns

            # Create a dataframe of predictions computed by adding mean user 
            # rating to user similairty weighted sum of user's ratings 
            # divided by sum of user similarities
            pred = mean_user_rating_f + num_user / sum_sim_mat_user
                
    # Item-Based Collaborative Filtering  
    elif type == 'item':
        # Index item similarity matrix with item ids for both rows and columns
        item_sim = pd.DataFrame(item_sim, index=train.columns, columns=train.columns)
        
        # When the number of k neareast neighbors is specified
        if knn != 'all':
            item_sim_mat = item_sim.as_matrix()

            # User id's of rated items in test data for each item
            cols2 = test.T.columns
            test_rated2 = test.T.apply(lambda x: x > 0, raw=True).apply(lambda x: list(cols2[x.values]), axis=1)
        
            # Iterate over each item, m
            for m in range(len(item_sim)):
                # Retrieve column of item m's similarities to all other items
                temp = item_sim_mat[m]
                temp = pd.DataFrame(temp,index=train.columns, columns=['similarity'])
                # Rank item m's similarities
                temp['rank'] = temp['similarity'].rank(ascending=0)

                # Iterate over item m's rated users
                for n in range(len(test_rated2[m])):
                    # For item m's nth rated user, extract column of ratings of 
                    # all items corresponding to nth user
                    temp2 = ratings_diff.T[[test_rated2[m][n]]]
                    temp2.columns = ['rating']
                    # Contatenate similarity, rank, rating as one dataframe
                    result = pd.concat([temp, temp2], axis=1)    
                    # Filter down to the items that are rated by the user
                    result2 = result[result['rating'] != 0]
                    # Filter down to knn number of items with the knn 
                    # highest similairites
                    result3 = result2.nsmallest(int(knn), 'rank')
                
                    # Divide weighted sum of item's knn nearest neighbors' 
                    # ratings by sum of their similarities
                    score = result3['similarity'].dot(result3['rating'])/result3['similarity'].sum()
                    mean = mean_user_rating_f.loc[test_rated2[m][n]][train.columns[m]]
                
                    # Make a prediction by adding user's mean rating to weighted sum
                    pred.loc[test_rated2[m][n]][train.columns[m]] = mean + score
        
        # When the number of k neareast neighbors is not specified and 
        # all available neighbors are used for prediction
        elif knn == 'all':
            # Compute item similarity weighted sum of available ratings of 
            # item's every neighbor
            num_item = ratings_diff.dot(item_sim)

            # Sum item similarities 
            sum_sim_item = item_sim.sum(axis=1)
            sum_sim_item = pd.DataFrame(sum_sim_item, index=train.columns)
            sum_sim_mat_item = pd.concat([sum_sim_item.T] * len(train), axis=0)

            sum_sim_mat_item.index = train.index

            # Create a dataframe of predictions computed by adding mean user 
            # rating to item similairty weighted sum of items' ratings 
            # divided by sum of item similarities
            pred = mean_user_rating_f + num_item / sum_sim_mat_item
            
    pred_0 = pred.fillna(0)
    pred_0_mat = pred_0.as_matrix()            
    return pred_0_mat

In [60]:
test9_0 = test9.fillna(0)
test9_0_mat = test9_0.as_matrix()

In [79]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

def mae(prediction, actual):
    prediction = prediction[actual.nonzero()].flatten() 
    actual = actual[actual.nonzero()].flatten()
    return mean_absolute_error(prediction, actual)
def mse(prediction, actual):
    prediction = prediction[actual.nonzero()].flatten() 
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(prediction, actual)

## run American restaurants in Las Vegas

In [71]:
start_time = time.time()
pred_user_5 = collaborative_filtering(train9, test9, sim='cosine', type='user', knn='5')
pred_user_10 = collaborative_filtering(train9, test9, sim='cosine', type='user', knn='10')
pred_user_20 = collaborative_filtering(train9, test9, sim='cosine', type='user', knn='20')
pred_user_all = collaborative_filtering(train9, test9, sim='cosine', type='user', knn='all')
pred_item_5 = collaborative_filtering(train9, test9, sim='cosine', type='item', knn='5')
pred_item_10 = collaborative_filtering(train9, test9, sim='cosine', type='item', knn='10')
pred_item_20 = collaborative_filtering(train9, test9, sim='cosine', type='item', knn='20')
pred_item_all = collaborative_filtering(train9, test9, sim='cosine', type='item', knn='all')
print ('8 Runs of Collaborative Filtering for Restaurant in Toronto took %s seconds' % (time.time() - start_time))



8 Runs of Collaborative Filtering for Restaurant in Toronto took 57.63646197319031 seconds


In [72]:
start_time = time.time()
pred_user_cosine = collaborative_filtering(train9, test9, sim='cosine', type='user', knn='all')
pred_user_pearson = collaborative_filtering(train9, test9, sim='pearson', type='user', knn='all')
pred_item_cosine = collaborative_filtering(train9, test9, sim='cosine', type='item', knn='all')
pred_item_pearson = collaborative_filtering(train9, test9, sim='pearson', type='item', knn='all')
print ('4 Runs of Collaborative Filtering for Restaurant in Toronto took %s seconds' % (time.time() - start_time))

4 Runs of Collaborative Filtering for Restaurant in Toronto took 1.3039658069610596 seconds


In [93]:
pred_user_5

array([[ 4.2160378,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       ..., 
       [ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ],
       [ 0.       ,  0.       ,  0.       , ...,  0.       ,  0.       ,
         0.       ]])

In [74]:
print ('American Restaurnats in Las Vegas with Varying Number of Nearest Neighbors')
print ('User-based CF using 5 nearest neighbors MAE: ' + str(round(mae(pred_user_5, test9_0_mat),4)))
print ('User-based CF using 10 nearest neighbors MAE: ' + str(round(mae(pred_user_10, test9_0_mat),4)))
print ('User-based CF using 20 nearest neighbors MAE: ' + str(round(mae(pred_user_20, test9_0_mat),4)))
print ('User-based CF using All available neighbors MAE: ' + str(round(mae(pred_user_all, test9_0_mat),4)))
print ('Item-based CF using 5 nearest neighbors MAE: ' + str(round(mae(pred_item_5, test9_0_mat),4)))
print ('Item-based CF using 10 nearest neighbors MAE: ' + str(round(mae(pred_item_10, test9_0_mat),4)))
print ('Item-based CF using 20 nearest neighbors MAE: ' + str(round(mae(pred_item_20, test9_0_mat),4)))
print ('Item-based CF using All available neighbors MAE: ' + str(round(mae(pred_item_all, test9_0_mat),4)))

American Restaurnats in Las Vegas with Varying Number of Nearest Neighbors
User-based CF using 5 nearest neighbors MAE: 3.3643
User-based CF using 10 nearest neighbors MAE: 3.3639
User-based CF using 20 nearest neighbors MAE: 3.362
User-based CF using All available neighbors MAE: 0.9308
Item-based CF using 5 nearest neighbors MAE: 3.3712
Item-based CF using 10 nearest neighbors MAE: 3.369
Item-based CF using 20 nearest neighbors MAE: 3.3686
Item-based CF using All available neighbors MAE: 0.9361


In [81]:
print ('American Restaurnats in Las Vegas with Varying Number of Nearest Neighbors')
print ('User-based CF using 5 nearest neighbors MSE: ' + str(round(mse(pred_user_5, test9_0_mat),4)))
print ('User-based CF using 10 nearest neighbors MSE: ' + str(round(mse(pred_user_10, test9_0_mat),4)))
print ('User-based CF using 20 nearest neighbors MSE: ' + str(round(mse(pred_user_20, test9_0_mat),4)))
print ('User-based CF using All available neighbors MSE: ' + str(round(mse(pred_user_all, test9_0_mat),4)))
print ('Item-based CF using 5 nearest neighbors MSE: ' + str(round(mse(pred_item_5, test9_0_mat),4)))
print ('Item-based CF using 10 nearest neighbors MSE: ' + str(round(mse(pred_item_10, test9_0_mat),4)))
print ('Item-based CF using 20 nearest neighbors MSE: ' + str(round(mse(pred_item_20, test9_0_mat),4)))
print ('Item-based CF using All available neighbors MSE: ' + str(round(mse(pred_item_all, test9_0_mat),4)))

American Restaurnats in Las Vegas with Varying Number of Nearest Neighbors
User-based CF using 5 nearest neighbors MSE: 13.0021
User-based CF using 10 nearest neighbors MSE: 13.0036
User-based CF using 20 nearest neighbors MSE: 13.0001
User-based CF using All available neighbors MSE: 1.3475
Item-based CF using 5 nearest neighbors MSE: 13.0236
Item-based CF using 10 nearest neighbors MSE: 13.0189
Item-based CF using 20 nearest neighbors MSE: 13.0179
Item-based CF using All available neighbors MSE: 1.3597


In [100]:
# fig, ax = plt.subplots()
# y = test9_0_mat
# prediction = pred_item_all
# ax.scatter(y, prediction, edgecolors=(0, 0, 0))
# ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
# ax.set_xlabel('Expected')
# ax.set_ylabel('Predicted')
# plt.show()

In [99]:
prediction

array([[ 4.59091244,  4.59731854,  4.6       , ...,  4.60238919,
         4.59806005,  4.59527225],
       [ 2.90190875,  3.00679237,  3.        , ...,  2.99719351,
         3.01004906,  2.99553396],
       [ 4.34950552,  4.41355194,  4.42269869, ...,  4.40816916,
         4.40131907,  4.37710422],
       ..., 
       [ 3.49583794,  3.47771986,  3.49330436, ...,  3.51585984,
         3.48989928,  3.5       ],
       [ 3.80431016,  3.7990724 ,  3.80340129, ...,  3.80195253,
         3.7997083 ,  3.80931724],
       [ 3.79861863,  3.7938602 ,  3.79761898, ...,  3.7987674 ,
         3.80881777,  3.80112589]])