In [2]:
from collections import defaultdict
import numpy as np
import json
import math
import matplotlib.pyplot as plt
import pandas as pd
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [3]:
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below; consider narrowing the filter if results look off.
warnings.filterwarnings("ignore")

In [6]:
# Read the pre-split review dump into memory; `data` is the parsed JSON document.
with open("data/filter_all_t.json") as reviewFile:
    data = json.load(reviewFile)

In [7]:
# Show the top-level keys of the loaded JSON document.
data.keys()

dict_keys(['train', 'val', 'test'])

In [8]:
# Flatten every split into (user_id, business_id, rating) triples, appended in
# train -> val -> test order.
allReviews = []
for split in ('train', 'val', 'test'):
    allReviews.extend(
        (entry['user_id'], entry['business_id'], entry['rating'])
        for entry in data[split]
    )

In [9]:
# Recover the three splits from allReviews, which stores the train, val and
# test records back to back. Explicit offsets keep the slices contiguous and
# non-overlapping.
nTrain = len(data['train'])
nValid = len(data['val'])

reviewsTrain = allReviews[:nTrain]
# Validation starts right after the last training record. Starting at
# `nTrain - 1` would pull in the final training review and drop the final
# validation review.
reviewsValid = allReviews[nTrain:nTrain + nValid]
# Slicing from an explicit offset stays correct even when the test split is
# empty (a negative-length slice `allReviews[-0:]` would return everything).
reviewsTest = allReviews[nTrain + nValid:]

## Surprise SVD

In [10]:
# The rating scale handed to Surprise spans every observed rating (all splits).
ratings = [review[2] for review in allReviews]
reader = Reader(rating_scale=(min(ratings), max(ratings)))

# Parallel user / item / rating columns for the training split ...
trainU = [review[0] for review in reviewsTrain]
trainI = [review[1] for review in reviewsTrain]
trainR = [review[2] for review in reviewsTrain]

# ... and for the test split (testI / testR are reused by the metric cells).
testU = [review[0] for review in reviewsTest]
testI = [review[1] for review in reviewsTest]
testR = [review[2] for review in reviewsTest]

# Surprise loads ratings from a DataFrame with user, item, rating columns.
trainData = pd.DataFrame(dict(u=trainU, i=trainI, r=trainR))

# Dataset wrapper: used as-is by the grid search (which makes its own folds).
trainLoader = Dataset.load_from_df(trainData, reader)

# Trainset built from every training rating, used to fit the final model.
trainset = trainLoader.build_full_trainset()

In [14]:
# Alternative, wider grid (disabled):
# param_grid = {"n_factors": [1,2,3,4,5], "lr_all": [0.001,.002,.003,.004,.005], "reg_bu": [.2,.3,.4], "reg_bi": [.2,.3,.4], "reg_pu": [.2,.3,.4], "reg_qi": [.2,.3,.4]}

# Grid handed to GridSearchCV: one learning rate shared by all parameters plus
# separate regularisation strengths (reg_bu, reg_bi, reg_pu, reg_qi).
param_grid = dict(
    lr_all=[0.002, 0.003, 0.004],
    reg_bu=[0.01, 0.05, 0.1],
    reg_bi=[0.2, 0.3, 0.4],
    reg_pu=[0.01, 0.05, 0.1],
    reg_qi=[0.01, 0.05, 0.1],
)

In [15]:
# 3-fold cross-validated grid search over param_grid, scored by MSE.
# n_jobs=-1 lets the search run folds/combinations in parallel.
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=["mse"],
    cv=3,
    n_jobs=-1,
)

In [16]:
# Runs the whole grid search, which takes a long time. Comment this line out
# to skip it -- but note that the next cell reads gs.best_params / gs.best_score
# and therefore needs the search to have been run.
gs.fit(trainLoader)

In [17]:
# First line: best hyper-parameters by MSE; second line: best score per measure.
print(gs.best_params["mse"], gs.best_score, sep="\n")

{'lr_all': 0.004, 'reg_bu': 0.01, 'reg_bi': 0.4, 'reg_pu': 0.1, 'reg_qi': 0.1}
{'mse': 0.6672666133836253}


In [18]:
# Final SVD configuration, seeded for reproducibility.
# NOTE(review): these values are hand-set and differ from the grid-search
# optimum printed above (e.g. reg_bi=2.5 lies outside the searched grid).
model = SVD(
    n_factors=1,
    n_epochs=30,
    lr_all=0.003,
    reg_bu=0.01,
    reg_bi=2.5,
    reg_pu=0.01,
    reg_qi=0.06,
    random_state=0,
)

In [19]:
# Fit on the full training set, then predict every (user, item, rating)
# triple of the test split; `predictions` feeds all metric cells below.
model.fit(trainset)
predictions = model.test(reviewsTest)

In [20]:
# Mean squared error between true ratings (r_ui) and estimates (est)
# over the test-split predictions.
sse = 0
for pred in predictions:
    residual = pred.r_ui - pred.est
    sse += residual**2

mse = sse / len(predictions)
mse

0.6769716283799662

In [21]:
# Baseline to compare the model against: predict the mean training rating
# for every test review and report the resulting MSE.
avg = sum(review[2] for review in reviewsTrain) / len(reviewsTrain)
alwaysPredictMean = [avg] * len(reviewsTest)

sse = 0
for guess, actual in zip(alwaysPredictMean, testR):
    sse += (guess - actual)**2

print(sse / len(alwaysPredictMean))

0.6822627089464424


In [22]:
def Precision(predictions, k=3, testItems=None):
    """Average precision@k over all users that appear in `predictions`.

    For each user the predictions are ranked by estimated rating (highest
    first) and the top-k entries whose item id belongs to the test catalogue
    are counted as hits; the user's score is hits / k.

    Args:
        predictions: iterable of objects exposing `.uid`, `.iid` and `.est`.
        k: size of the recommendation list; must be >= 1.
        testItems: iterable of item ids forming the test catalogue. When
            omitted, the notebook-level `testI` list is used.

    Returns:
        Mean per-user precision@k, or 0 when there are no predictions.

    Raises:
        ValueError: if `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # A set gives O(1) membership checks instead of scanning a list per item.
    knownItems = set(testI if testItems is None else testItems)

    userRanks = defaultdict(list)
    for p in predictions:
        userRanks[p.uid].append((p.est, p.iid))

    if not userRanks:
        return 0

    # NOTE(review): a hit only requires the item to be in the top-k and in the
    # test catalogue; the true rating (p.r_ui) is never consulted. Consider
    # thresholding on the true rating, as NDCG does.
    totalPreKU = 0
    for ranked in userRanks.values():
        # Tuples compare element-wise: highest estimate first, item id breaks ties.
        topK = sorted(ranked, reverse=True)[:k]
        # Count over the top-k entries only, so a repeated item id cannot push
        # the score above 1.
        relevance = sum(1 for _, iid in topK if iid in knownItems)
        totalPreKU += relevance / k

    return totalPreKU / len(userRanks)

In [23]:
# Score the test-split predictions with the default k=3 and display the value.
precision = Precision(predictions)
precision

0.7966666666666425

In [24]:
def Recall(predictions, k=3, testItems=None):
    """Average recall@k over all users that appear in `predictions`.

    For each user the predictions are ranked by estimated rating (highest
    first) and the top-k entries whose item id belongs to the test catalogue
    are counted as hits; the user's score is hits divided by the number of
    predictions available for that user.

    Args:
        predictions: iterable of objects exposing `.uid`, `.iid` and `.est`.
        k: size of the recommendation list; must be >= 1.
        testItems: iterable of item ids forming the test catalogue. When
            omitted, the notebook-level `testI` list is used.

    Returns:
        Mean per-user recall@k, or 0 when there are no predictions.

    Raises:
        ValueError: if `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # A set gives O(1) membership checks instead of scanning a list per item.
    knownItems = set(testI if testItems is None else testItems)

    userRanks = defaultdict(list)
    for p in predictions:
        userRanks[p.uid].append((p.est, p.iid))

    if not userRanks:
        return 0

    # NOTE(review): every prediction of a user counts as "relevant" in the
    # denominator and the true rating (p.r_ui) is never consulted. Consider
    # thresholding on the true rating, as NDCG does.
    totalRKU = 0
    for ranked in userRanks.values():
        # Tuples compare element-wise: highest estimate first, item id breaks ties.
        topK = sorted(ranked, reverse=True)[:k]
        # Count over the top-k entries only, so a repeated item id is not
        # counted once per duplicate.
        relevance = sum(1 for _, iid in topK if iid in knownItems)
        totalRKU += relevance / len(ranked)

    return totalRKU / len(userRanks)

In [25]:
# Score the test-split predictions with the default k=3 and display the value.
recall = Recall(predictions)
recall

0.9159855047592548

In [26]:
# Mean reciprocal rank of the top-k restaurant recommendations (k=3 by default).
def MRR(predictions, k=3):
    """Mean reciprocal rank of the first "good" item in each user's top-k.

    Per user, the predictions are ranked twice: by estimated rating and by
    true rating. The reciprocal rank is 1 / position of the first item in the
    estimated top-k that also appears in the true top-k, or 0 when none does.

    The true ratings are taken from the predictions themselves (`.r_ui`)
    rather than from notebook-level variables, so the score always matches
    the prediction set that was passed in.

    Args:
        predictions: iterable of objects exposing `.uid`, `.iid`, `.est` and
            `.r_ui` (the true rating; it must be comparable, i.e. not None).
        k: number of items kept from each ranking.

    Returns:
        Mean reciprocal rank over all users, or 0 when there are no predictions.
    """
    userRanks = defaultdict(list)
    for p in predictions:
        userRanks[p.uid].append((p.iid, p.est, p.r_ui))

    if not userRanks:
        return 0

    totalMRR = 0
    for entries in userRanks.values():
        # Both sorts are stable, so ties keep the order of `predictions`.
        byEstimate = sorted(entries, key=lambda e: e[1], reverse=True)[:k]
        byTruth = sorted(entries, key=lambda e: e[2], reverse=True)[:k]
        bestItems = {iid for iid, _, _ in byTruth}

        for position, (iid, _, _) in enumerate(byEstimate, start=1):
            if iid in bestItems:
                totalMRR += 1.0 / position
                break

    return totalMRR / len(userRanks)

In [27]:
# Score the test-split predictions with the default k=3 and display the value.
mrr = MRR(predictions)
mrr

0.9544144144144145

In [28]:
def NDCG(predictions, k=3, threshold=4, testItems=None):
    """Total DCG@k and the ratio of total DCG@k to total ideal DCG@k.

    Per user, the top-k items by estimated rating contribute
    `true_rating / log2(position + 1)` to the DCG when their true rating is at
    least `threshold` and the item belongs to the test catalogue. The ideal
    DCG applies the same rule to the top-k items by true rating.

    NOTE(review): the returned ratio is sum(DCG) / sum(IDCG) across users,
    not the mean of per-user NDCG values.

    Args:
        predictions: iterable of objects exposing `.uid`, `.iid`, `.est` and
            `.r_ui` (the true rating).
        k: number of ranked items considered per user.
        threshold: minimum true rating for an item to contribute gain.
        testItems: iterable of item ids forming the test catalogue. When
            omitted, the notebook-level `testI` list is used.

    Returns:
        Tuple `(totalDCG, totalDCG / totalIDCG)`; the ratio is 0.0 when the
        ideal DCG is zero (no predictions, or no rating reaches `threshold`).
    """
    # A set gives O(1) membership checks instead of scanning a list per item.
    knownItems = set(testI if testItems is None else testItems)

    userRanks = defaultdict(list)
    for p in predictions:
        userRanks[p.uid].append((p.iid, p.est, p.r_ui))

    totalDCG = 0.0
    totalIDCG = 0.0
    for preds in userRanks.values():
        # Achieved gain: items ordered by the model's estimate.
        predictedRanks = sorted(preds, key=lambda x: x[1], reverse=True)
        for idx, (iid, _, actualRating) in enumerate(predictedRanks[:k]):
            if iid in knownItems and actualRating >= threshold:
                # idx is 0-based, so rank + 1 == idx + 2.
                totalDCG += actualRating / np.log2(idx + 2)

        # Ideal gain: the same items ordered by their true rating.
        actualRanks = sorted(preds, key=lambda x: x[2], reverse=True)
        for idx, (iid, _, actualRating) in enumerate(actualRanks[:k]):
            if iid in knownItems and actualRating >= threshold:
                totalIDCG += actualRating / np.log2(idx + 2)

    if totalIDCG == 0:
        # Nothing could have scored; avoid dividing by zero.
        return totalDCG, 0.0

    return totalDCG, totalDCG / totalIDCG

In [29]:
# Score the test-split predictions with the default k=3; NDCG returns the
# total DCG and the DCG/IDCG ratio, both displayed below.
dcg,ndcg = NDCG(predictions)
dcg,ndcg

(28714.45766803598, 0.9507802138805759)

In [30]:
# Gather the evaluation numbers computed above under their report labels.
metrics = {
    'MRR': mrr,
    'average MSE': mse,
    "NDCG": ndcg,
    "precision": precision,
    "recall": recall,
}

In [31]:
# Display the collected metrics.
metrics

{'MRR': 0.9544144144144145,
 'average MSE': 0.6769716283799662,
 'NDCG': 0.9507802138805759,
 'precision': 0.7966666666666425,
 'recall': 0.9159855047592548}

In [None]:
# Persist the metrics dict as its string representation, one line.
# The context manager closes the file even if the write raises.
with open("SVD Metrics.txt", 'w') as f:
    f.write(str(metrics) + '\n')

In [34]:
def recommendTopNRestaurants(predictions, n=3):
    """Return each user's `n` highest-estimated items.

    Args:
        predictions: iterable of objects exposing `.uid`, `.iid` and `.est`.
        n: maximum number of recommendations kept per user.

    Returns:
        defaultdict mapping user id -> list of (item id, estimate) pairs,
        ordered by estimate from highest to lowest.
    """
    byUser = defaultdict(list)
    for pred in predictions:
        byUser[pred.uid].append((pred.iid, pred.est))

    # Only existing keys are reassigned, so iterating over the dict is safe.
    for user in byUser:
        ranked = sorted(byUser[user], key=lambda pair: pair[1], reverse=True)
        byUser[user] = ranked[:n]

    return byUser

In [35]:
# Build each user's top-3 recommendations from the test-split predictions.
top3 = recommendTopNRestaurants(predictions)

# Prints one line per user, so the output grows with the number of test users.
for u, recs in top3.items():
    print(f"User {u}: {recs}")

User 108919790647235091207: [('604bf6a75041fa50c4bce594', 4.465252318619057), ('6055eda33019cb0a47838b25', 4.459080575310336), ('604ba46f20f26f37fb9d7d69', 4.383683957815948)]
User 108111397722253060630: [('604ee8b388c7af3f893e613b', 4.535075435926409), ('604bca76d40e4bc9b841777c', 4.520910933326895), ('6041e2e8475f3961ca526f49', 4.506861273539687)]
User 114961509003938009497: [('6055a6683019cb0a478389d5', 4.534760880471248), ('605121e75b4ccec8d5caea76', 4.524699884086147), ('60519f27a740b9d848c0aad7', 4.5061475368389505)]
User 108970514621946615447: [('6047d70ab1a0aaee3eef9334', 4.584019576196448), ('6050d2cf9c93e55e75b72260', 4.529205319833562), ('6055d1a197d555cc6fb0cf17', 4.524837473795552)]
User 113922271624881467869: [('60413085c6fcf1fddba11e1d', 4.554699048318719), ('604bd29877e81aaed3cc9905', 4.530175929105738), ('6046315010ec061e056b3f81', 4.500252708370489)]
User 106054094716897428670: [('604ced1677e81aaed3cca3f7', 4.441592877179192), ('6040ac0a9d953d1f97fa1903', 4.4361204715