# Collaborative filtering model

This model exploits the similarities between the preferences of the users and the past ratings given by a user to generate recommendations. 

In [12]:
# Importing important libraries
import pandas as pd
import numpy as np

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold

pd.set_option('max_columns', 3000)
pd.set_option('max_rows', 3000)

In [4]:
# Importing the dataset

df = pd.read_csv('final_dataset.csv')

In [5]:
# Checking the number of rows and columns

df.shape

(313223, 10)

In [6]:
# Peeking into the dataset

df.head()

Unnamed: 0,business_id,categories,latitude,longitude,restaurant_name,review_count,avg_rating,user_rating,review,user_id
0,C9oCPomVP0mtKa8z99E3gg,"Bakeries, Food",43.754093,-79.349548,Bakery Gateau,8,4.5,3.0,Oh? Another patbingsu review? This one was bet...,orh0HRUNCWuQMt9Iia_osg
1,C9oCPomVP0mtKa8z99E3gg,"Bakeries, Food",43.754093,-79.349548,Bakery Gateau,8,4.5,5.0,What really earns them their 5 stars is the un...,G5hDXvDMNuQ3JQnGCKqsKA
2,C9oCPomVP0mtKa8z99E3gg,"Bakeries, Food",43.754093,-79.349548,Bakery Gateau,8,4.5,4.0,Located inside the Galleria Supermarket.\nStop...,0Suzo_S25mTGJfrlcl1CfA
3,C9oCPomVP0mtKa8z99E3gg,"Bakeries, Food",43.754093,-79.349548,Bakery Gateau,8,4.5,5.0,Yummy cakes! U should try their sweet potato c...,cc7Pav2IUvAkVeqylvAsYg
4,C9oCPomVP0mtKa8z99E3gg,"Bakeries, Food",43.754093,-79.349548,Bakery Gateau,8,4.5,5.0,One of my favorite bakeries! This bakery is in...,keLUgL_4y60BkppiAsIk8Q


In [87]:
# Helper dataframe to get the name of the restaurants later for generating recommendations

df_restaurant = df[['business_id','restaurant_name']]

df_restaurant_name = df_restaurant.groupby('business_id').nth([0]).reset_index() 

For creating this recommendation system, I used a Scikit-learn library called Surprise. This library helps to build and analyze recommender systems that deal with rating data. 

In case of recommendations, it is hard to evaluate accuracy because we don’t have the actual ratings against which we can compare our predictions. However, Surprise has built-in features that help to calculate the accuracy of various machine learning models, and thus helps to optimize the hyperparameters of a model and build a more efficient system. 

In [8]:
# For building this model, I will need only the user_id, business_id and the user_rating 

# Creating the required dataframe

model_df = pd.DataFrame({'restaurant': df['business_id'], 'user_id':df['user_id'], 'rating':df['user_rating']})

In [9]:
# Checking the dataframe

model_df.head()

Unnamed: 0,restaurant,user_id,rating
0,C9oCPomVP0mtKa8z99E3gg,orh0HRUNCWuQMt9Iia_osg,3.0
1,C9oCPomVP0mtKa8z99E3gg,G5hDXvDMNuQ3JQnGCKqsKA,5.0
2,C9oCPomVP0mtKa8z99E3gg,0Suzo_S25mTGJfrlcl1CfA,4.0
3,C9oCPomVP0mtKa8z99E3gg,cc7Pav2IUvAkVeqylvAsYg,5.0
4,C9oCPomVP0mtKa8z99E3gg,keLUgL_4y60BkppiAsIk8Q,5.0


In [10]:
# Checking the shape of the dataframe

model_df.shape

(313223, 3)

In [13]:
# A reader which is the rating scale is needed to load the dataset

# Defining the reader
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
# Loading the dataframe and reader in the required format
data = Dataset.load_from_df(model_df[['user_id', 'restaurant', 'rating']], reader)

In [14]:
# Checking the data

data

<surprise.dataset.DatasetAutoFolds at 0x1172ca5f8>

It is a surprise dataset. First I'll fit a baseline model which predicts the ratings assuming they are normally distributed. Then I'll fit a Singular Value Decomposition model (SVD). This model breaks down the sparse user-item rating matrix into two smaller matrices containing user and item features. It then predicts the ratings based on the dot product of these two matrices. 

As the dataset is sparse and splitting into test and train sets will require that the dataset be split so that we have similar users in both the sets. Since, it will be difficult with this dataset, I am using the built in five-fold cross validation function to optimize the hyperparameters.

In [55]:
# Using Baseline model to predict the ratings

cross_validate(NormalPredictor(), data, cv=5)

{'test_rmse': array([1.66529485, 1.66328004, 1.6652706 , 1.6797694 , 1.67259327]),
 'test_mae': array([1.32784114, 1.32675096, 1.33037977, 1.34094351, 1.33645301]),
 'fit_time': (0.4247579574584961,
  0.5753319263458252,
  0.6054520606994629,
  0.5893511772155762,
  0.5911149978637695),
 'test_time': (0.7982769012451172,
  0.7326300144195557,
  0.7411220073699951,
  0.7073042392730713,
  0.8087201118469238)}

The RMSE error is coming out to be around 1.66.  I will nw try to ft the SVD parameter and minimize this error.

In [53]:
# Fitting a vanilla SVD model 

# defining a cross-validation iterator
kf = KFold(n_splits=3)

# Instantiating the model
algo = SVD()

for trainset, testset in kf.split(data):

    # Fitting the model
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.1579
RMSE: 1.1555
RMSE: 1.1608


Now, I will be using grid search to finetune the hyperparameters and minimize the error term using stochastic gradient descent algorithm:

- n_epochs: number of epochs
- lr_all : learning rate for all the features
- reg_all: regularization factor for all the features
- n_factors: Number of user and restaurant features

In [54]:
from surprise.model_selection import GridSearchCV


param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.1599020945200833
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [55]:
param_grid2 = {'n_epochs': [10, 20], 'lr_all': [0.005, 0.007],
              'reg_all': [0.2, 0.4]}
gs2 = GridSearchCV(SVD, param_grid2, measures=['rmse', 'mae'], cv=5)

gs2.fit(data)

# best RMSE score
print(gs2.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs2.best_params['rmse'])

1.1435678764444355
{'n_epochs': 20, 'lr_all': 0.007, 'reg_all': 0.2}


In [56]:
param_grid3 = {'n_epochs': [30, 40], 'lr_all': [0.005, 0.007],
              'reg_all': [0.1, 0.2]}
gs3 = GridSearchCV(SVD, param_grid3, measures=['rmse', 'mae'], cv=5)

gs3.fit(data)

# best RMSE score
print(gs3.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs3.best_params['rmse'])

1.1433409049700305
{'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.2}


In [58]:
param_grid4 = {'n_factors': [50,100,150], 'n_epochs': [50, 70], 'lr_all': [0.005, 0.007],
              'reg_all': [0.1, 0.2, 0.3]}

gs4 = GridSearchCV(SVD, param_grid4, measures=['rmse', 'mae'], cv=5)

gs4.fit(data)

# best RMSE score
print(gs4.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs4.best_params['rmse'])

1.1439156558651953
{'n_factors': 150, 'n_epochs': 50, 'lr_all': 0.005, 'reg_all': 0.2}


In [60]:
param_grid5 = {'n_factors': [150, 200], 'n_epochs': [50, 70], 'lr_all': [0.005, 0.007],
              'reg_all': [0.2, 0.3]}

gs5 = GridSearchCV(SVD, param_grid5, measures=['rmse', 'mae'], cv=5)

gs5.fit(data)

# best RMSE score
print(gs5.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs5.best_params['rmse'])

1.143351304118016
{'n_factors': 200, 'n_epochs': 50, 'lr_all': 0.005, 'reg_all': 0.2}


Based on the above I will be using model 3 to build the model beacuse we culd get the lowest RMSE that too with 100 features.

In [17]:
# Loading the full dataset now
trainset = data.build_full_trainset()

# Instantiating the model
model = SVD(n_epochs = 30, lr_all = 0.005, reg_all = 0.2)

# Fitting the model on the complete dataset
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x105ae4b70>

#### Creating the recommendation function 

In [103]:
def collaborative_fitering(user_id):
    
    '''
    The function provides a list of five recommendations for the restauranst
    
    input: user_id
    
    output: a list of five recommended restaurants that the user has not rated
    
    '''

    # Get a list of all the restaurant ids
    restaurants = model_df['restaurant'].unique()

    # Get a list of restaurants that user has not rated
    not_rated = model_df.loc[model_df['user_id']== user_id, 'restaurant']

    #Remove the list of restaurants that user has rated
    iids_to_predict = np.setdiff1d(restaurants, not_rated)
    
    # Creating a test_set to put in the model to make predictions
    test_set = [[user_id, iid, 4] for iid in iids_to_predict]
    
    # Fitting the model to make predictions for the ratings
    predictions = model.test(test_set)
    
    # Creating an array of the predictions
    pred_ratings = np.array([pred.est for pred in predictions])
    
    # Getting the restaurant_ids with highest predicted ratings for the user
    i_max = pred_ratings.argsort()[-5:][::-1]
    iid = iids_to_predict[i_max]
    
    # Getting the names of the recommended restaurants from the helper dataframe
    recommended_restaurants = [(df_restaurant_name[df_restaurant_name['business_id'] == restaurant_id]['restaurant_name']) for restaurant_id in iid]
    
    print('Recommended restaurants:')
    print(recommended_restaurants)

In [111]:
collaborative_fitering('C9oCPomVP0mtKa8z99E3gg')

Recommended restaurants:
[1725    Zeal Burgers
Name: restaurant_name, dtype: object, 3300    Mallo
Name: restaurant_name, dtype: object, 3783    Baretto Caffe
Name: restaurant_name, dtype: object, 2120    Viva Shawarma
Name: restaurant_name, dtype: object, 4004    Kiyo
Name: restaurant_name, dtype: object]


The collaborative filtering model while taking into account user preferences, suffers from the cold-start problem (it needs user ratings to generate recommendations).