In [3]:
%pip install scikit-surprise

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml): started
  Building wheel for scikit-surprise (pyproject.toml): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp312-cp312-win_amd64.whl size=1291215 sha256=fb466e03ae7645ec1eaf0088e2cd68dd6d5f89ae232df7795bf6470854c722ea
  Stored in directory: c:\users\aliz0\appdata\local\pip\cache\wheels\75\fa\bc\739b


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from collections import defaultdict
import numpy as np
import json
import math
import matplotlib.pyplot as plt
import pandas as pd
from surprise import SVD, Reader, Dataset, SVDpp
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
with open("data/filter_all_t.json", 'r') as file:
    data = json.load(file)

In [9]:
data.keys()

dict_keys(['train', 'val', 'test'])

In [10]:
allReviews = []
for d in data['train']:
    allReviews.append((d['user_id'],d['business_id'],d['rating']))

for d in data['val']:
    allReviews.append((d['user_id'],d['business_id'],d['rating']))

for d in data['test']:
    allReviews.append((d['user_id'],d['business_id'],d['rating']))

In [11]:
reviewsTrain = allReviews[:len(data['train'])]
reviewsValid = allReviews[len(data['train']) - 1:len(data['train']) - 1 + len(data['val'])]
reviewsTest = allReviews[-len(data['test']):]

## Surprise SVDpp

In [12]:
ratings = [r for _,_,r in allReviews]
reader = Reader(rating_scale=(min(ratings), max(ratings)))

trainU = [u for u,_,_ in reviewsTrain]
trainI = [i for _,i,_ in reviewsTrain]
trainR = [r for _,_,r in reviewsTrain]

testU = [u for u,_,_ in reviewsTest]
testI = [i for _,i,_ in reviewsTest]
testR = [r for _,_,r in reviewsTest]

# Convert into pandas DataFrame object
trainData = pd.DataFrame({'u': trainU, 'i': trainI, 'r': trainR})

# Load into Dataset object
trainLoader = Dataset.load_from_df(trainData,reader)

# Create Trainset object to train Surprise model
trainset = trainLoader.build_full_trainset()

In [13]:
param_grid = {"n_factors": [1,2,3,4,5], "lr_all": [0.001,.002,.003,.004,.005], "reg_bu": [.2,.3,.4], "reg_bi": [.2,.3,.4], "reg_pu": [.2,.3,.4], "reg_qi": [.2,.3,.4]}

In [14]:
gs = GridSearchCV(SVDpp, param_grid, measures=["mse"], cv=3)

In [None]:
# Comment this out cause it takes a long time to run
gs.fit(trainLoader)

In [None]:
print(gs.best_params["mse"])
print(gs.best_score)

In [17]:
model = SVDpp(n_factors=1, n_epochs=30, lr_all=.003, reg_bu=0.01, reg_bi=2.5, reg_pu=0.01, reg_qi=0.06, random_state=0)

In [18]:
model.fit(trainset)
predictions = model.test(reviewsTest)

In [19]:
# Evaluate
sse = 0
for p in predictions:
    sse += (p.r_ui - p.est)**2

mse = sse / len(predictions)
mse

0.677002364782439

In [20]:
# Base MSE to compare with model
avg = sum([r for _,_,r in reviewsTrain]) / len(reviewsTrain)
alwaysPredictMean = [avg for _ in reviewsTest]

sse = 0
for i in range(len(reviewsTest)):
    sse += (alwaysPredictMean[i] - testR[i])**2

print(sse / len(alwaysPredictMean))

0.6822627089464424


In [21]:
def Precision(predictions, k=3):
    userRanks = defaultdict(list)
    
    for p in predictions:
        userRanks[p.uid].append((p.est,p.iid))
        
    for u in userRanks:
        userRanks[u].sort(reverse=True)
        
    totalPreKU = 0
    for u, preds in userRanks.items():
        relevance = 0
        iIds = [p[1] for p in userRanks[u][:k]]
        for _, (_,b) in enumerate(preds):
            if b in testI and b in iIds:
                relevance += 1
        totalPreKU += relevance / k

    return totalPreKU / len(userRanks)

In [22]:
precision = Precision(predictions)
precision

0.7966666666666425

In [23]:
def Recall(predictions, k=3):
    userRanks = defaultdict(list)
    
    for p in predictions:
        userRanks[p.uid].append((p.est,p.iid))
        
    for u in userRanks:
        userRanks[u].sort(reverse=True)
        
    totalRKU = 0
    for u, preds in userRanks.items():
        relevance = 0
        iIds = [p[1] for p in userRanks[u][:k]]
        for _, (_,b) in enumerate(preds):
            if b in testI and b in iIds:
                relevance += 1
        totalRKU += relevance / len(preds)

    return totalRKU / len(userRanks)

In [24]:
recall = Recall(predictions)
recall

0.9159855047592548

In [25]:
# Checking MRR for top 3 restaurant recommendations 
def MRR(predictions, k=3):
    userRanks = defaultdict(list)
    labelRanks = defaultdict(list)

    for p in predictions:
        userRanks[p.uid].append((p.iid,p.est))

    for (u,i,p) in reviewsTest:
        labelRanks[u].append((p,i))

    for u in userRanks:
        userRanks[u].sort(key=lambda x: x[1], reverse=True)

    for u in userRanks:
        labelRanks[u].sort(key=lambda x: x[0], reverse=True)

    totalMRR = 0
    for u,preds in userRanks.items():
        rank = 0
        ps = [p[1] for p in labelRanks[u][:k]]
        for idx, (iid,_) in enumerate(preds[:k]):
            if iid in testI and iid in ps:
                rank = idx + 1
                break
        if rank > 0:
            totalMRR += 1.0 / rank
    
    return totalMRR / len(userRanks) if len(userRanks) > 0 else 0

In [26]:
mrr = MRR(predictions)
mrr

0.9534234234234235

In [27]:
def NDCG(predictions, k=3):
    userRanks = defaultdict(list)
    
    for p in predictions:
        userRanks[p.uid].append((p.iid,p.est,p.r_ui))
        
    totalDCG = 0
    for _,preds in userRanks.items():
        predictedRanks = sorted(preds, key=lambda x: x[1], reverse=True)
        rank = 0
        for idx, (iid,_,actualRating) in enumerate(predictedRanks[:k]):
            if iid in testI and actualRating >= 4:
                rank = idx + 1
                totalDCG += actualRating/np.log2(rank+1)
    
    totalIDCG = 0
    for _,preds in userRanks.items():
        actualRanks = sorted(preds, key=lambda x: x[2], reverse=True)
        rank = 0
        for idx, (iid,_,actualRating) in enumerate(actualRanks[:k]):
            if iid in testI and actualRating >= 4:
                rank = idx + 1
                totalIDCG += actualRating/np.log2(rank+1)

    return totalDCG, totalDCG/totalIDCG

In [28]:
dcg,ndcg = NDCG(predictions)
dcg,ndcg

(28708.755581239555, 0.9505914089459138)

In [29]:
metrics = {}
metrics['MRR'] = mrr
metrics['average MSE'] = mse
metrics["NDCG"] = ndcg
metrics["precision"] = precision
metrics["recall"] = recall

In [30]:
metrics

{'MRR': 0.9534234234234235,
 'average MSE': 0.677002364782439,
 'NDCG': 0.9505914089459138,
 'precision': 0.7966666666666425,
 'recall': 0.9159855047592548}

In [31]:
f = open("metrics.txt", 'w')
f.write(str(metrics) + '\n')
f.close()

In [32]:
def recommendTopNRestaurants(predictions, n=3):
    topN = defaultdict(list)
    for p in predictions:
        topN[p.uid].append((p.iid, p.est))
    for uid, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[uid] = ratings[:n]
    return topN

In [33]:
top3 = recommendTopNRestaurants(predictions)

for u, recs in top3.items():
    print(f"User {u}: {recs}")

User 108919790647235091207: [('604bf6a75041fa50c4bce594', 4.465252318619057), ('6055eda33019cb0a47838b25', 4.460500808999887), ('604ba46f20f26f37fb9d7d69', 4.383608330050271)]
User 108111397722253060630: [('604ee8b388c7af3f893e613b', 4.535555799969662), ('604bca76d40e4bc9b841777c', 4.520905380050426), ('6041e2e8475f3961ca526f49', 4.506453921022673)]
User 114961509003938009497: [('6055a6683019cb0a478389d5', 4.53424277844989), ('605121e75b4ccec8d5caea76', 4.524364842964519), ('60519f27a740b9d848c0aad7', 4.505609149780677)]
User 108970514621946615447: [('6047d70ab1a0aaee3eef9334', 4.584401567222078), ('6050d2cf9c93e55e75b72260', 4.530830060958892), ('6055d1a197d555cc6fb0cf17', 4.522676712560245)]
User 113922271624881467869: [('60413085c6fcf1fddba11e1d', 4.55337925690176), ('604bd29877e81aaed3cc9905', 4.530625225211462), ('6046315010ec061e056b3f81', 4.49899667749714)]
User 106054094716897428670: [('604ced1677e81aaed3cca3f7', 4.44169595517049), ('6040ac0a9d953d1f97fa1903', 4.433743177944866