# In this workshop we explore user-based and item-based collaborative filtering (CF) using the Surprise library

In [None]:
pip install scikit-surprise

In [3]:
#from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive (Colab only) at /content/drive; the data-file paths used
# by the loading cells below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
# Load the Toby dataset as before.
# NOTE(review): read_csv is called with its default header handling, so the first
# line of the file is consumed as a header row - confirm the file really has one.
file = "/content/drive/My Drive/recsys/simplemovies-transactions.csv"
ratings_df = pd.read_csv(file)
# Relabel the columns positionally; the (user, item, rating) order is what the
# conversion to Surprise format in the next cell relies on.
ratings_df.columns = ['user_id','item_id','rating']

In [5]:
# Convert the ratings events into the Surprise data format.
# The rating scale must be supplied explicitly through the Reader.
reader = Reader(rating_scale=(1,5))
# load_from_df assumes the frame contains: user, item, rating (in this column order)
data = Dataset.load_from_df(ratings_df, reader)

In [None]:
# Build a model using user-based or item-based CF.
trainset = data.build_full_trainset()  # use all data (i.e. no train/test split)

# Select the model type. Below are some examples; you can adjust the parameters if you wish.
# NOTE: the BaselineOnly import in the imports cell is commented out - uncomment it before using that option.
#algo = BaselineOnly() # computes baselines for all users and items
#algo = KNNBasic() # default method = user-based CF, default similarity is MSD (euclidean), default k=40
algo = KNNBasic(k=40,sim_options={'name': 'pearson'}) # user-based CF using Pearson similarity
#algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': False}) # item-based CF using cosine
#algo = KNNWithMeans(k=40,sim_options={'name': 'pearson'})

algo.fit(trainset) # build the model

In [None]:
# Predict the rating for a given target user and target item.

# select a target user
rawuid = 'Toby'

# select an item (e.g. pick any one of the below)
rawiid = 'SnakesOnPlane' # was rated by Toby
#rawiid = 'NightListener' # was not rated by Toby
#rawiid = 'LadyinWater' # was not rated by Toby
#rawiid = 'JustMyLuck' # was not rated by Toby

# Convert the user and item names (raw ids) into indexes (inner ids).
# Surprise jargon: raw ids are the user & item names as given in the datafile (integers or strings);
# inner ids are the integer indexes the trainset uses internally.
uid = trainset.to_inner_uid(rawuid)
iid = trainset.to_inner_iid(rawiid)
print("inner ids:","user=",uid,"item=",iid)

# If the actual rating is known it can be passed to predict() (but this is not necessary).
# trainset.ur[uid] holds the user's (inner item id, rating) pairs; .get() gives None
# for an item the user has not rated, so this cell also runs for the
# "was not rated by Toby" choices above instead of failing with a KeyError.
realrating = dict(trainset.ur[uid]).get(iid)
pred = algo.predict(rawuid, rawiid, r_ui=realrating, verbose=True)
pred

In [None]:
# If the actual rating is unknown, the r_ui argument can simply be omitted.
pred = algo.predict(rawuid, rawiid)
pred 

In [None]:
# Make rating predictions for ALL of the users in the dataset.

# First get the unseen items for each user.
unseen = trainset.build_anti_testset() # all (user, item) pairs that have no rating in the trainset
print("num unseen=",len(unseen))
# View the unseen items. Note that the rating shown is the global mean rating; the actual rating is unknown.
# For a large dataset, display only a sample instead (uncomment the line below and remove the bare `unseen`).
#print("sample of unseen:",unseen[0:9])
unseen

In [None]:
# Now make rating predictions for every user on all of their unseen items.
# This may be slow for big datasets.
predictions = algo.test(unseen)
print("num preds made=",len(predictions))
predictions

In [None]:
# Predict only for the target user (rawuid, chosen earlier) on their unseen items:
# keep just that user's (user, item, fill rating) entries from `unseen`.
# How do the predictions for Toby compare with the results in the earlier workshops?

targetonly = [(ruid, riid, r) for ruid, riid, r in unseen if ruid == rawuid]
print("targetdata=",targetonly)

# note: this replaces the all-users `predictions` from the previous cell
predictions = algo.test(targetonly)
predictions

In [15]:
# function to get the topN recommendations for each user
# by ranking the unseen items by their predicted rating 
# input is the rating predictions
# output is a dictionary where keys are (raw) userids and 
# values are lists of tuples: [(raw item id, pred.rating),...] 
# see https://surprise.readthedocs.io/en/stable/FAQ.html

from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the n best-rated items for every user found in `predictions`.

    Args:
        predictions: iterable of 5-field records
            (user id, item id, true rating, estimated rating, details).
        n: maximum number of items to keep per user.

    Returns:
        A defaultdict mapping each user id to a list of
        (item id, estimated rating) tuples, highest estimate first.
    """
    per_user = defaultdict(list)

    # Group the (item, estimate) pairs by user.
    for record in predictions:
        user, item, _true_r, estimate, _details = record
        per_user[user].append((item, estimate))

    # Rank each user's items by estimated rating and keep the first n.
    for user in per_user:
        ranked = sorted(per_user[user], key=lambda pair: pair[1], reverse=True)
        per_user[user] = ranked[:n]

    return per_user

In [None]:
# Now get the actual recommended items - the top-N unseen items ranked by predicted rating.
# `predictions` was last set to the target user's (Toby's) predictions, so only that user appears.
get_top_n(predictions) 

# To demonstrate testing using a train/test split, we load the MovieLens data
FYI: this dataset is also bundled with the Surprise package,
so we could use the line below instead of reading it from file:

data = Dataset.load_builtin('ml-100k') 

In [17]:
# Load the MovieLens 100K ratings and keep only the three columns Surprise needs.
file = "/content/drive/My Drive/recsys/u_data.csv"
ratings_df = pd.read_csv(file)
ratings_df.columns = ['user_id', 'item_id', 'rating', 'datetime']
ratings_df = ratings_df.drop(columns='datetime')  # the timestamp is not used

# Convert to Surprise format.
# load_from_df assumes the frame contains: user, item, rating (in this column order)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df, reader)

In [None]:
# Split the data into training and test sets using the Surprise function train_test_split().
# Note: a float test_size is a proportion of the data; an integer is an absolute number of ratings.
# random_state fixes the shuffle so the split - and therefore the accuracy figures
# computed below - are the same on every run, which makes comparing values of k meaningful.
trainset, testset = train_test_split(data, test_size=0.1, random_state=42)  # hold out 10% of rating events (10% of 100K ~ 10K)
print("testset size=",len(testset))
print("trainset type=",type(trainset),"testsettype=",type(testset))

print('users,items in trainset=',trainset.n_users, trainset.n_items)

# The testset is a plain list of (user, item, rating) tuples; a DataFrame makes counting easy.
testdf = pd.DataFrame(testset)
print('users,items in testset=',testdf.iloc[:,0].nunique(),testdf.iloc[:,1].nunique())
print("testset sample=",testset[0:3])

In [None]:
# Rebuild the model using the new train set and KNNBasic, then score it on the held-out test set.
# Note: the argument k is the number of neighbours to take into consideration (default is 40).
# Try experimenting with different values of k: is there an optimal value for k as a % of the users?
# What accuracy do we get if we set k = all of the users (917? - check trainset.n_users printed by the split cell)?
algo = KNNBasic(k=50,sim_options={'name': 'cosine', 'user_based': True }) # user-based CF
algo.fit(trainset)
preds = algo.test(testset)
accuracy.rmse(preds)
accuracy.mae(preds)

In [None]:
# Run 5-fold cross-validation of `algo` on the full dataset, measuring RMSE and MAE.
# verbose=True prints the results; they are also kept in `res`.
res = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [20]:
# we can compute PRECISION@K and RECALL@K for a given K and rating threshold

# first, copy the code for fn: precision_recall_at_k() from: https://surprise.readthedocs.io/en/stable/FAQ.html

# then paste the code into the window here and then execute....

from collections import defaultdict

from surprise import Dataset
from surprise import SVD
from surprise.model_selection import KFold


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute per-user precision@k and recall@k.

    An item counts as *relevant* when its true rating is >= threshold and as
    *recommended* when its estimated rating is >= threshold and it is among
    the user's k highest-estimated items.

    Args:
        predictions: iterable of 5-field records
            (user id, item id, true rating, estimated rating, details).
        k: number of top-estimated items considered per user.
        threshold: rating cut-off for both relevance and recommendation.

    Returns:
        (precisions, recalls): two dicts keyed by user id. A metric that is
        undefined (zero denominator) is reported as 0.
    """

    def passes(rating):
        return rating >= threshold

    # Collect the (estimate, true rating) pairs of every user.
    by_user = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        by_user[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, pairs in by_user.items():
        # Highest estimates first; the first k entries are the candidates.
        pairs.sort(key=lambda pair: pair[0], reverse=True)
        top_k = pairs[:k]

        # Relevant items over the user's whole list.
        n_rel = sum(passes(actual) for _, actual in pairs)

        # Recommended items within the top k.
        n_rec_k = sum(passes(predicted) for predicted, _ in top_k)

        # Items within the top k that are both relevant and recommended.
        n_rel_and_rec_k = sum(
            (passes(actual) and passes(predicted)) for predicted, actual in top_k
        )

        # Precision@K: share of recommended items that are relevant (0 if none recommended).
        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 0

        # Recall@K: share of relevant items that are recommended (0 if none relevant).
        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 0

    return precisions, recalls



In [None]:
# Score the top-5 lists, using a rating of 4 as the relevance/recommendation threshold.
precisions, recalls = precision_recall_at_k(preds, k=5, threshold=4)

# Average the per-user values over all users in the test predictions.
print("avg P@K=", sum(precisions.values()) / len(precisions))
print("avg recall=", sum(recalls.values()) / len(recalls))

# Workshop 3A: 

Use the above code to load and test the Jester dataset using **item-based CF** and compare with workshop2 results

*   use similarity = Euclidean (MSD)
*   use MAE as the evaluation metric
*   try altering the number of neighbours (k) used in the Surprise model

Also try to test using user-based CF (any similarity measure will do).
For the Jester dataset: why does user-based fail but item-based succeed?

Remember: you must reload the libraries (scroll back to top) after any crash


In [2]:
# Load the Jester ratings: a whitespace-separated file read without a header row.
file = "/content/drive/My Drive/recsys/jester_ratings.dat"

# The separator is a regular expression, so it is written as a raw string:
# '\s' in a normal string literal is an invalid escape sequence.
ratings_df = pd.read_csv(file, sep=r'\s+', header=None)
ratings_df.columns = ['user_id','item_id','rating']

# Jester ratings run from -10 to +10, so the scale differs from the movie datasets.
reader = Reader(rating_scale=(-10,+10))
data = Dataset.load_from_df(ratings_df, reader)

In [None]:
# Hold out 10% of the rating events as the test set.
# random_state fixes the shuffle so the user-based and item-based runs below
# (and runs with different k) are scored on exactly the same split.
trainset, testset = train_test_split(data, test_size=0.1, random_state=42)
print("testset size=", len(testset))
print('users,items in trainset=',trainset.n_users, trainset.n_items)
testdf = pd.DataFrame(testset)

In [None]:
# If you wish, you can try user-based CF using KNNBasic,
# but this will probably crash due to insufficient memory unless you pay for more Colab memory :-)
# (presumably because the user-user similarity matrix grows with the square of the number of users)
algo = KNNBasic(k=50,sim_options={'name': 'cosine', 'user_based': True}) 
algo.fit(trainset)
preds = algo.test(testset)
accuracy.rmse(preds)
accuracy.mae(preds)

In [None]:
# Now try item-based CF (as demonstrated in workshop2 - this is less memory intensive).
# Paste or type appropriate code in here......
# What value of k (number of neighbours) gives the best result?
# NOTE(review): the Workshop 3A text asks for Euclidean (MSD) similarity, but this cell
# uses cosine - confirm which is intended ('name': 'msd' would select MSD).
algo = KNNBasic(k=50,sim_options={'name': 'cosine', 'user_based': False }) 
algo.fit(trainset)
preds = algo.test(testset)
accuracy.rmse(preds)
accuracy.mae(preds)

Now load and test the Book Crossing (BX) dataset (explicit ratings only) using **item-based CF** and compare with workshop 2 results

In [5]:
def loadBX(file="/content/drive/My Drive/recsys/BX-Book-Ratings.csv"):
    """Load the explicit ratings of the Book-Crossing (BX) dataset.

    Args:
        file: path of the ';'-separated, latin-1 encoded ratings file whose
            first line is a header. Defaults to the workshop's Drive location.

    Returns:
        A DataFrame with columns user_id, item_id, rating that contains only
        the rows whose rating is non-zero (the explicit ratings).
    """
    # on_bad_lines replaces error_bad_lines=False, which was deprecated in
    # pandas 1.3 and removed in pandas 2.0. 'warn' keeps the old behaviour of
    # dropping malformed lines while still reporting them.
    ratings_df = pd.read_csv(file, sep=';', on_bad_lines='warn', encoding="latin-1")
    ratings_df.columns = ['user_id', 'item_id', 'rating']
    # A rating of 0 marks an implicit rating; remove those rows.
    return ratings_df[ratings_df.rating != 0]

In [None]:
# Load the explicit BX ratings, report the frame's (rows, columns) and preview the first five rows.
ratings_df = loadBX()
print(ratings_df.shape)
ratings_df.head()

In [7]:
# Convert to Surprise format. The scale starts at 1 because loadBX() has
# already removed the 0 (implicit) ratings.
reader = Reader(rating_scale=(1,10)) 
data = Dataset.load_from_df(ratings_df, reader)

Now proceed as above to train and test using item-based CF (cosine similarity).


In [None]:
# paste/type your code here.....

