In [1]:
"""


This script demonstrates how to implement a matrix factorization recommender
(trained with stochastic gradient descent) and validate its accuracy with the
help of 5-fold cross-validation.

"""
import warnings
import os
import math
from sklearn.linear_model import LinearRegression
import numpy as np
import numpy_indexed as npi
from scipy  import sparse
from scipy.sparse import lil_matrix
import pickle
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [2]:
#load data
# Every non-blank line of ratings.dat is a '::'-separated list of integers.
# Later cells use column 0 as the user id, column 1 as the item id and
# column 2 as the rating.
# The context manager guarantees the file is closed even if a line fails to parse.
with open("ratings.dat", 'r') as f:
    ratings = np.array([[int(z) for z in line.split('::')]
                        for line in f
                        if line.strip()])  # tolerate blank/trailing empty lines

# Assign every rating to one of the cross-validation folds.
nfolds = 5
np.random.seed(17)

# Round-robin fold labels (0 .. nfolds-1) keep the folds balanced;
# the in-place shuffle then randomizes which rating lands in which fold.
seqs = []
for position in range(len(ratings)):
    seqs.append(position % nfolds)
np.random.shuffle(seqs)

In [3]:
#create sparse matrix from given ratings data
#Use Item and User Ids as pointers to the respective row (Item-Id) and column (User-Id) where to store the rating
def sparseMatrix(ratings_data):
    """Build an item-by-user sparse lookup matrix from rating rows.

    Parameters
    ----------
    ratings_data : 2-D integer array
        Column 0 holds the 1-based user id, column 1 the 1-based item id
        and column 2 the rating. Further columns are ignored.

    Returns
    -------
    scipy.sparse.lil_matrix
        Matrix of shape (max item id, max user id) with
        ``M[item_id - 1, user_id - 1] == rating`` for every input row.

    Raises
    ------
    ValueError
        If ``ratings_data`` has no rows (``np.max`` of an empty array).
    """
    user_ids = ratings_data[:, 0]
    item_ids = ratings_data[:, 1]
    # Ids start at 1, so the largest id of each kind is the matrix dimension.
    sparseM = lil_matrix((np.max(item_ids), np.max(user_ids)))
    # Single fancy-index assignment instead of a per-row Python loop; ids are
    # shifted by one to become 0-based positions. Any indexing problem now
    # surfaces as an exception instead of being printed and ignored.
    sparseM[item_ids - 1, user_ids - 1] = ratings_data[:, 2]
    return sparseM


In [4]:
def MF(ratings_data, num_factor, learning, regularization, num_iter):
    """Factorize the ratings into user and item factors with SGD.

    Parameters
    ----------
    ratings_data : 2-D integer array
        Column 0 holds the 1-based user id, column 1 the 1-based item id
        and column 2 the rating.
    num_factor : int
        Number of latent factors per user / item.
    learning : float
        Step size of each stochastic gradient update.
    regularization : float
        Weight of the shrinkage term applied to the factors in each update.
    num_iter : int
        Number of full passes over ``ratings_data``. With 0 passes the
        errors of the random initialization are reported.

    Returns
    -------
    (rmse, mae, Prediction) : (float, float, numpy.ndarray)
        Training RMSE and MAE after the last pass, computed on predictions
        clipped to [1, 5], and the unclipped prediction matrix of shape
        (max item id, max user id), i.e. ``Prediction[item_id - 1, user_id - 1]``.

    Raises
    ------
    ValueError
        If ``ratings_data`` has no rows (``np.max`` of an empty array).
    """
    # Ids are 1-based; shift once to 0-based row/column positions.
    user_idx = ratings_data[:, 0] - 1
    item_idx = ratings_data[:, 1] - 1
    # The true ratings are read straight from the input rather than looked up
    # per element in a sparse matrix, which is far slower for the same values.
    observed = ratings_data[:, 2]

    #get dimensions of User and Item Matrix
    num_users = np.max(ratings_data[:, 0])
    num_item = np.max(ratings_data[:, 1])

    # Initialize user and item matrices with random values
    UserMatrix = np.random.normal(scale=1/num_factor, size=(num_users, num_factor))
    ItemMatrix = np.random.normal(scale=1/num_factor, size=(num_factor, num_item))

    for _ in range(num_iter):
        # One SGD step per observed rating, in input order.
        for u, i, rate in zip(user_idx, item_idx, observed):
            user_vec = UserMatrix[u, :]
            item_vec = ItemMatrix[:, i]
            # difference between true rating and current prediction
            err = rate - np.dot(user_vec, item_vec)
            # Both new vectors are computed from the OLD factors before either
            # is written back, so the two updates do not influence each other.
            new_user = user_vec + learning * (2.0 * err * item_vec - regularization * user_vec)
            new_item = item_vec + learning * (2.0 * err * user_vec - regularization * item_vec)
            UserMatrix[u, :] = new_user
            ItemMatrix[:, i] = new_item

    # Training error of the final factors. Only the values after the last pass
    # are returned, so they are computed once here instead of after every pass.
    fitted = np.einsum('nf,fn->n', UserMatrix[user_idx], ItemMatrix[:, item_idx])
    # predictions outside the rating scale are moved to the nearest bound
    fitted = np.clip(fitted, 1, 5)
    residual = observed - fitted
    rmse = float(np.sqrt(np.mean(residual ** 2)))
    mae = float(np.mean(np.abs(residual)))

    # Full prediction matrix, transposed to items x users
    Prediction = np.transpose(np.dot(UserMatrix, ItemMatrix))
    return (rmse, mae, Prediction)

In [5]:
# Set up the bookkeeping for the 5-fold cross-validation:
# one slot per fold for each train/test metric.
rmseTrain = np.zeros(nfolds)
rmseTest = np.zeros(nfolds)
maeTrain = np.zeros(nfolds)
maeTest = np.zeros(nfolds)
# Running sums of the squared and absolute test errors.
rmsetemp = maetemp = 0

In [None]:
# Fold labels as an array so the train/test masks are plain vectorized comparisons.
fold_labels = np.asarray(seqs)
for fold in tqdm(range(nfolds)):
    #Select Training and Test data
    test_sel = fold_labels == fold
    train = ratings[~test_sel]
    test = ratings[test_sel]
    #Perform MF for current train data set
    rmseTrain[fold], maeTrain[fold], Prediction = MF(train, 10, 0.005, 0.05, 75)

    # Prediction is indexed [item_id - 1, user_id - 1] and only spans the ids
    # seen in the training fold. Test rows whose item or user lies outside
    # that range cannot be scored and are skipped (and counted).
    items = test[:, 1] - 1
    users = test[:, 0] - 1
    known = (items < Prediction.shape[0]) & (users < Prediction.shape[1])
    skip = int(np.count_nonzero(~known))

    # Errors are computed from scratch for every fold, so no error sum is
    # carried over from earlier folds or from a previous run of this cell.
    # NOTE: test predictions are used unclipped here, whereas MF clips to
    # [1, 5] before computing the training metrics.
    errors = test[known, 2] - Prediction[items[known], users[known]]
    #save RMSE and MAE for current fold
    if errors.size:
        rmseTest[fold] = math.sqrt(np.mean(errors ** 2))
        maeTest[fold] = np.mean(np.abs(errors))
    else:
        # nothing could be scored in this fold; avoid dividing by zero
        rmseTest[fold] = maeTest[fold] = np.nan
    #Print results
    print("Set No." + str(fold) + ", rmse on train: " + str(rmseTrain[fold]) + ", rmse on test: " + str(rmseTest[fold]), "skipped entries: "+ str(skip))
    print("Set No." + str(fold) + ", mae on train: " + str(maeTrain[fold]) + ", mae on test: " + str(maeTest[fold]), "skipped entries: "+ str(skip))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))