In [508]:
import pandas
import numpy as np
from sklearn.neighbors import KDTree,BallTree,NearestNeighbors
from scipy import spatial
import math
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

Retrieving business, training and testing reviews dataframe from pickle

In [509]:
business_df = pandas.read_pickle('business.pkl')
review_df = pandas.read_pickle('reviews_filtered.pkl')

Setting the data for 4-fold cross validation

In [513]:
from sklearn.model_selection import KFold 
kf = KFold(n_splits=4) 
kf.get_n_splits(review_df)  
print(kf)   
KFold(n_splits=4, random_state=None, shuffle=False)
results = []
for train_index, test_index in kf.split(review_df): 
    results.append([train_index,test_index])
train_index, test_index = results[2]
review_train_df, review_test_df = review_df.iloc[train_index], review_df.iloc[test_index]

KFold(n_splits=4, random_state=None, shuffle=False)


Calculating overall rating average and average of ratings by a particular user and for a particular business

In [514]:
overall_avg_rating = review_train_df["stars"].mean() #average of all the ratings in the dataset
avg_user_rating_df = review_train_df.groupby(["user_id"]).mean() #average of all the ratings by a particular user
avg_business_rating_df = review_train_df.groupby(["business_id"]).mean() #average of all the ratings for a particular business

Calculating rating by a particular user for a particular business by baseline approach

In [515]:
def calc_rating_baseline(user_id, business_id):
    user_bias = 0
    business_bias = 0
    user_rating = avg_user_rating_df[avg_user_rating_df.index == user_id]["stars"]
    if not user_rating.empty:
        user_bias = user_rating.item() - overall_avg_rating
    business_rating = avg_business_rating_df[avg_business_rating_df.index == business_id]["stars"]
    if not business_rating.empty:
        business_bias = business_rating.item() - overall_avg_rating
    return overall_avg_rating + user_bias + business_bias

Hash user ids to matrix indices

In [516]:
user_id_hash = {}
user_ids = review_train_df["user_id"].unique()
for i,v in enumerate(user_ids):
    user_id_hash[v] = i

Hash business ids to matrix indices

In [517]:
business_id_hash = {}
business_ids = review_train_df["business_id"].unique()
for i,v in enumerate(business_ids):
    business_id_hash[v] = i

Create utility matrix

In [518]:
R = len(review_train_df["business_id"].unique())
C = len(review_train_df["user_id"].unique())
U = np.empty((R,C))
U[:] = np.nan

In [519]:
def populate_U(business_id,user_id,rating):
    U[business_id_hash[business_id],user_id_hash[user_id]] = rating

Populate utility matrix

In [520]:
review_train_df.apply(lambda x: populate_U(x.business_id,x.user_id,x.stars),axis = 1)

197        None
198        None
205        None
206        None
211        None
214        None
221        None
222        None
226        None
232        None
239        None
257        None
2376       None
2379       None
2382       None
2384       None
2387       None
2388       None
2390       None
2393       None
2395       None
2396       None
2397       None
8832       None
8836       None
8837       None
8838       None
8840       None
8847       None
8848       None
           ... 
5996070    None
5996087    None
5996092    None
5996093    None
5996094    None
5996095    None
5996097    None
5996099    None
5996105    None
5996112    None
5996121    None
5996123    None
5996128    None
5996139    None
5996143    None
5996149    None
5996151    None
5996153    None
5996652    None
5996659    None
5996663    None
5996668    None
5996669    None
5996670    None
5996672    None
5996676    None
5996683    None
5996689    None
5996695    None
5996699    None
Length: 13976, dtype: ob

Calculating row mean for every row (ratings for every item) excluding nan values which represent no rating

In [521]:
row_means = np.nanmean(U, axis=1)

Centering rating for every business around its mean rating

In [522]:
row_means_col_vec = row_means.reshape((len(U), 1))
centered_U = U - row_means_col_vec
centered_U = np.nan_to_num(centered_U)

In [523]:
def cosine_dist(v1,v2):
    return spatial.distance.correlation(v1,v2)

Computing the k nearest neighbors of each restaurant along with their distances

In [524]:
tree = NearestNeighbors(algorithm = 'ball_tree', metric = cosine_dist,n_neighbors=len(centered_U)).fit(centered_U)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [525]:
dist,indices = tree.kneighbors(centered_U)
dist,indices

  dist = 1.0 - uv / np.sqrt(uu * vv)


(array([[0.        , 0.74831137, 0.78823669, ...,        inf,        inf,
                inf],
        [0.        , 0.65828714, 0.68418319, ...,        inf,        inf,
                inf],
        [0.        , 0.701519  , 0.74338032, ...,        inf,        inf,
                inf],
        ...,
        [       inf,        inf,        inf, ...,        inf,        inf,
                inf],
        [       inf,        inf,        inf, ...,        inf,        inf,
                inf],
        [       inf,        inf,        inf, ...,        inf,        inf,
                inf]]), array([[  0, 945, 453, ...,   0,   0,   0],
        [  1, 464, 466, ...,   0,   0,   0],
        [  2, 259, 375, ...,   0,   0,   0],
        ...,
        [  0,   0,   0, ...,   0,   0,   0],
        [  0,   0,   0, ...,   0,   0,   0],
        [  0,   0,   0, ...,   0,   0,   0]], dtype=int64))

Calculating rating by a particular user for a particular business by just item item collaborative filtering approach

In [526]:
def calc_rating_cf(user_id,business_id,k):
    if user_id not in user_id_hash or business_id not in business_id_hash:
         return calc_rating_baseline(user_id,business_id)
    result = 0
    sim_sum = 0
    u = user_id_hash[user_id]
    b = business_id_hash[business_id]
    distances = dist[b]
    sim_items = indices[b]
    i = 0
    j = 1
    while j < len(distances):
        if not math.isnan(U[sim_items[j],u]):
            temp = business_ids[sim_items[j]]
            result += (1 - distances[j]) * (U[sim_items[j],u])
            sim_sum += (1 - distances[j])
            i += 1
        if i == k:
            break
        j += 1
        
    if result == 0 or sim_sum == 0:
        return calc_rating_baseline(user_id,business_id)
    return result/sim_sum

Collaborative filtering in CF + baseline model

In [527]:
def calc_rating_cf_common(user_id,business_id,k):
    if user_id not in user_id_hash or business_id not in business_id_hash:
         return 0
    result = 0
    sim_sum = 0
    u = user_id_hash[user_id]
    b = business_id_hash[business_id]
    distances = dist[b]
    sim_items = indices[b]
    i = 0
    j = 1
    while j < len(distances):
        if not math.isnan(U[sim_items[j],u]):
            temp = business_ids[sim_items[j]]
            result += (1 - distances[j]) * (U[sim_items[j],u] - calc_rating_baseline(user_id,temp))
            sim_sum += (1 - distances[j])
            i += 1
        if i == k:
            break
        j += 1
        
    if result == 0 or sim_sum == 0:
        return 0
    return result/sim_sum

Calculating rating by a particular user for a particular business by baseline plus item item collaborative filtering approach

In [528]:
def calc_rating_baseline_cf(user_id,business_id,k):
    return calc_rating_baseline(user_id,business_id) + calc_rating_cf_common(user_id,business_id,k)

Calculating user to business ratings for the held out test dataset

In [550]:
review_test_result_df = review_test_df.copy(deep = True)
review_test_result_df["stars"] = 0

In [555]:
review_test_result_df['stars'] = review_test_result_df.apply(lambda x: calc_rating_baseline_cf(x[0], x[1],13),axis = 1)



Calculating RMSE

In [556]:
def calc_RMSE():
    return ((review_test_result_df.stars - review_test_df.stars) ** 2).mean() ** 0.5
result = calc_RMSE()
result

4.870348568185751

Calculating average RMSE over 4 folds

In [487]:
import statistics
baseline = [1.096,1.084,1.024,1.085]
cf = [1.094,1.088,1.021,1.090]
baseline_cf = [1.08,1.018,1.086,1.093]
baseline_RMSE = statistics.mean(baseline)
cf_RMSE = statistics.mean(cf)
baseline_cf_RMSE = statistics.mean(baseline_cf)
baseline_RMSE,cf_RMSE,baseline_cf_RMSE

(1.07225, 1.07325, 1.06925)