# Matrix Factorisation Workshop using the Surprise Library

In [None]:
# Install the Surprise recommender library (distributed under the package name scikit-surprise).
pip install scikit-surprise

In [2]:
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

import os
import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive so the data files under "My Drive" can be read.
# This only works when the notebook is running in Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [30]:
# Load the MovieLens 100K rating events from Drive.
# NOTE(review): read_csv treats the first row of the file as a header, and that
# header is then overwritten with the names below -- confirm u_data.csv really
# starts with a header row, otherwise the first rating is silently lost.
file = "/content/drive/My Drive/recsys/u_data.csv"
ratings_df = pd.read_csv(file)
ratings_df.columns = ['user_id', 'item_id', 'rating', 'datetime']

# The timestamp is not used; keep just the three columns Surprise needs.
ratings_df = ratings_df[['user_id', 'item_id', 'rating']]

# Convert to Surprise format. Ratings are on a 1-5 scale, and load_from_df
# expects the columns in the order: user, item, rating.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df, reader)

In [None]:
# split data into training and test sets
# NOTE: no random_state is passed here, so the split is presumably different on every run
trainset, testset  = train_test_split(data, test_size=0.1)  # hold out 10% of rating events (10% of 100K ~ 10K)
len(testset)  # number of held-out ratings, shown as the cell output

In [None]:
# select one of the Surprise matrix factorisation algorithms and fit a model
# (leave exactly one of the three `algo = ...` lines uncommented)

algo = SVD(n_factors = 50) # Simon Funk's algorithm; the library default is 100 factors
#algo = SVDpp(n_factors = 50) # an extension of SVD that also takes implicit ratings into account
#algo = NMF(n_factors = 50) # non-negative matrix factorisation

algo.fit(trainset) # build (train) the model on the training set

In [None]:
# pick a target user to make recommendations for
rawuid = 3

# Build the anti-testset for the target user only, i.e. one
# (raw user id, raw item id, placeholder rating) tuple for every item in the
# training set that this user has not rated.
# trainset.build_anti_testset() produces the same tuples but for *every* user
# (well over a million for MovieLens 100K), nearly all of which would be
# discarded straight away.
try:
    target_inner_uid = trainset.to_inner_uid(rawuid)
except ValueError:
    # the target user has no ratings in the training set
    targetonly = []
else:
    # trainset.ur maps an inner user id to that user's (inner item id, rating) pairs
    seen_items = {inner_iid for inner_iid, _ in trainset.ur[target_inner_uid]}
    targetonly = [
        (rawuid, trainset.to_raw_iid(inner_iid), trainset.global_mean)
        for inner_iid in trainset.all_items()
        if inner_iid not in seen_items
    ]

print("number of unseen items=", len(targetonly)) # the number of unseen items for the target user (if this is zero then go back and pick another target user)
targetonly[0:4] # show the first 4 of the target user's unseen items; the rating shown is only a placeholder (the global mean of the training ratings)

In [9]:
# function to get the topN recommendations for each user
# by ranking the unseen items by their predicted rating 
# input is the rating predictions
# output is a dictionary where keys are (raw) userids and 
# values are lists of tuples: [(raw item id, pred.rating),...] 
# see https://surprise.readthedocs.io/en/stable/FAQ.html

from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the n highest-rated predictions for every user.

    Args:
        predictions: iterable of 5-element records laid out as
            (user id, item id, true rating, estimated rating, details).
        n: maximum number of recommendations to keep per user.

    Returns:
        A defaultdict(list) mapping each user id to a list of
        (item id, estimated rating) tuples, ordered from the highest
        estimate to the lowest. Items with equal estimates keep the
        order in which they appeared in `predictions`.
    """
    # Group the (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for prediction in predictions:
        user, item, _actual, estimate, _details = prediction
        per_user[user].append((item, estimate))

    # Rank each user's pairs by estimate and keep only the first n.
    for user in per_user:
        ranked = sorted(per_user[user], key=lambda pair: pair[1], reverse=True)
        per_user[user] = ranked[:n]
    return per_user

In [None]:
# make rating predictions for the target user's unseen items and keep the 10 highest
predictions = algo.test(targetonly)
recs = get_top_n(predictions, n=10)
recs  # a dict: {raw user id: [(raw item id, predicted rating), ...]}

In [None]:
# to show the recommendations along with the movie names
# we first load the movie titles and create a dict to map movie id to title
file = "/content/drive/My Drive/recsys/u_item.csv"
titles = pd.read_csv(file, dtype=str)  # every column is read as text, so the ids used as dict keys are strings
titlemap = dict(zip(titles['movie id'], titles['movie name']))

# now show the recommendations using the movie names
for rlist in recs.values():
    for rawiid, rat in rlist:
        # item ids in recs are not strings, hence str(); fall back to a
        # placeholder instead of aborting the listing if an id has no title
        print(rat, rawiid, titlemap.get(str(rawiid), "<unknown title>"))

In [None]:
# compute the MAE (mean absolute error) of the rating predictions on the held-out test set
# how does this compare with the best MAEs found in workshop2 and workshop3A?
preds = algo.test(testset)
accuracy.mae(preds)

In [None]:
# to help understand how predictions are made when using matrix factorisation we can
# compute the rating prediction ourselves from the factorised matrices and the biases: pu,qi,bu,bi

# first we examine a sample (first 10 rows, first 10 columns) of the
# user preference matrix (pu) and the item preference matrix (qi)
print(algo.pu[0:10,0:10])
print(algo.qi[0:10,0:10])

In [None]:
# now examine the learned biases (these are useful for cold-start users)
print("userbias:",algo.bu[0:4]) # sample of user biases (first 4 entries)
print("itembias:",algo.bi[0:4]) # sample of item biases (first 4 entries)
print("global bias:",algo.default_prediction())# the global mean rating

In [None]:
# now examine the data for the target user & target item
# you can pick any item, but pick one of the items recommended above for easy comparison

rawuid = 3  # pick the same user as above
rawiid = 408 # pick one of the items that was recommended above for this user

# convert the raw ids to the inner ids that index the rows of pu/qi and the entries of bu/bi
# NOTE(review): these lookups presumably fail if the user or item has no rating in the training set -- confirm
uid = trainset.to_inner_uid(rawuid)
iid = trainset.to_inner_iid(rawiid)
print("target user preferences:\n", algo.pu[uid,])  # the target user's row of pu
print("target item preferences:\n", algo.qi[iid,])  # the target item's row of qi
print("target user bias:", algo.bu[uid])
print("target item bias:", algo.bi[iid])

In [None]:
# manually compute the prediction:
#   global mean + user bias + item bias + dot product of the user and item factor vectors
raw_pred = algo.default_prediction() + algo.bu[uid] + algo.bi[iid] + np.dot(algo.pu[uid], algo.qi[iid])

# algo.predict() and algo.test() clip their estimate to the rating scale, so
# the same clipping is needed here for the two values to agree. Without it the
# manual value differs whenever the raw estimate falls outside the scale,
# which is quite likely for a top recommendation.
lowest_rating, highest_rating = trainset.rating_scale
pred = min(highest_rating, max(lowest_rating, raw_pred))

# scroll back up to compare with the prediction made using algo.test()
print("unclipped estimate:", raw_pred)
pred

# Workshop 3B: 

Using the Book Crossings (BX) dataset, try to find the best value for n_factors (the number of latent features) when using the SVD algorithm.

Use all explicit book ratings (no need to subsample). Make sure you set the correct rating range (in Reader) when loading the data.

In [None]:
# paste/type in code here to load the data and create the training and test sets......





In [None]:
# NOTE: run this only after the data has been loaded and the training and
# test sets have been created. It fits one SVD model per candidate number of
# latent factors and prints that number followed by the model's test-set MAE
# (edit the candidate list as required).
factor_grid = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500]
for n_latent in factor_grid:
    algo = SVD(n_factors=n_latent)
    algo.fit(trainset)
    preds = algo.test(testset)
    # no newline here, so the MAE printed by accuracy.mae lands on the same line
    print(n_latent, " ", end="")
    accuracy.mae(preds)