In [2]:
import os
import math
from sklearn.linear_model import LinearRegression
import numpy as np
import numpy_indexed as npi
from scipy  import sparse
from scipy.sparse import lil_matrix
import pickle
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [3]:
#load data
ratings=[]
f = open("dataset/ratings.dat", 'r')
for line in f:
    data = line.split('::')
    ratings.append([int(z) for z in data])
f.close()
ratings = np.array(ratings)
allUserId = set(range(1,np.max(ratings[:,0])+1))
allItemId = set(range(1,np.max(ratings[:,1])+1))

In [4]:
#split data into 5 train and test folds
nfolds=5

#to make sure you are able to repeat results, set the random seed to something:
np.random.seed(17)

In [5]:
#allocate memory for results:
rmseTrainRatings=np.zeros(nfolds)
rmseTestRatings=np.zeros(nfolds)
rmseTrainUsers=np.zeros(nfolds)
rmseTestUsers=np.zeros(nfolds)
rmseTrainItems=np.zeros(nfolds)
rmseTestItems=np.zeros(nfolds)

In [6]:
maeTrainRatings=np.zeros(nfolds)
maeTestRatings=np.zeros(nfolds)
maeTrainUsers=np.zeros(nfolds)
maeTestUsers=np.zeros(nfolds)
maeTrainItems=np.zeros(nfolds)
maeTestItems=np.zeros(nfolds)

rmseLinear=np.zeros(nfolds)
maeLinear=np.zeros(nfolds)

In [7]:
#Cross validation
seqs=[x%nfolds for x in range(len(ratings))]
np.random.shuffle(seqs)
#for each fold:
for fold in tqdm(range(nfolds)):
    train_sel=np.array([x!=fold for x in seqs])
    test_sel=np.array([x==fold for x in seqs])
    train=ratings[train_sel]
    test=ratings[test_sel]
    
    #calculate model parameters: mean rating over the training set:
    gmr=np.mean(train[:,2])

    #apply the model to the train set:
    rmseTrainRatings[fold]=np.sqrt(np.mean((train[:,2]-gmr)**2))
    maeTrainRatings[fold]=np.mean(np.fabs(train[:,2]-gmr))

    #apply the model to the test set:
    rmseTestRatings[fold]=np.sqrt(np.mean((test[:,2]-gmr)**2))
    maeTestRatings[fold]=np.mean(np.fabs(test[:,2]-gmr))
    #doing the same for user and item average
    meanUser = npi.group_by(train[:,0]).mean(train[:,2])
    meanUser = (meanUser[0].tolist(),meanUser[1].tolist())

    meanItem = npi.group_by(train[:,1]).mean(train[:,2])
    meanItem = (meanItem[0].tolist(),meanItem[1].tolist())

    #finding the missing element in the training set and apply fall-back
    for i in list(set(meanUser[0]) ^ allUserId):
        meanUser[1].insert(i-1,gmr)
    meanUser = meanUser[1]
    for i in list(set(meanItem[0]) ^ allItemId):
        meanItem[1].insert(i-1,gmr)
    meanItem = meanItem[1]

    #Create a list of means of user/item for each user/item in the data set
    #These can be used to calculate the rmse, mae and apply as R_user and R_item in linear regression
    replicatedUserRatings = [meanUser[e-1] for e in train[:,0]]
    replicatedItemRatings = [meanItem[e-1] for e in train[:,1]]

    #calculate mae and rmse
    maeTrainUsers[fold] = np.mean(np.fabs(train[:,2]-replicatedUserRatings))
    maeTrainItems[fold] = np.mean(np.fabs(train[:,2]-replicatedItemRatings))
    rmseTrainUsers[fold] = np.sqrt(np.mean((train[:,2]-replicatedUserRatings)**2))
    rmseTrainItems[fold] = np.sqrt(np.mean((train[:,2]-replicatedItemRatings)**2))

    #use vstack to concanate R_user,R_item and 1:s vector to create input for the linear regression
    inputLstsq = np.vstack((replicatedUserRatings,replicatedItemRatings))
    inputLstsq = np.vstack((inputLstsq,np.ones(len(replicatedUserRatings)))).T
    #applu to np.linalg.lstsq to get the A,B and C constant
    a,b,c = np.linalg.lstsq(inputLstsq,train[:,2])[0]

    #Create a list of means of user/item for each user/item in the data set
    replicatedUserRatings = [meanUser[e-1] for e in test[:,0]]
    replicatedItemRatings = [meanItem[e-1] for e in test[:,1]]
    #calculate mae and rmse
    maeTestUsers[fold] = np.mean(np.fabs(test[:,2]-replicatedUserRatings))
    maeTestItems[fold] = np.mean(np.fabs(test[:,2]-replicatedItemRatings))
    rmseTestUsers[fold] = np.sqrt(np.mean((test[:,2]-replicatedUserRatings)**2))
    rmseTestItems[fold] = np.sqrt(np.mean((test[:,2]-replicatedItemRatings)**2))

    #use vstack to concanate R_user, R_item to test the accuracy of linear regression
    inputLstsq = np.vstack((replicatedUserRatings,replicatedItemRatings)).T
    #print([math.fabs(row[0]*a + row[1]*b + c - row[2]) for row in test[:,[0,1,2]]])
    predictions = [(inputLstsq[i][0]*a + inputLstsq[i][1]*b + c) for i in range(len(inputLstsq))]
    for prediction in predictions:
        if prediction > 5:
            prediction = 5
        elif prediction < 1:
            prediction = 1
    #calculate the rmse and mae on the test set
    rmseLinear[fold] = np.sqrt(np.mean((predictions - test[:,2])**2))
    maeLinear[fold] = np.mean(np.fabs(predictions - test[:,2]))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))






In [8]:
#print the final conclusion:
print("\n")
print("rmse global on TRAIN: ", rmseTrainRatings)
print("rmse global on TEST: ", rmseTestRatings)
print("rmse users on TRAIN: ", rmseTrainUsers)
print("rmse users on TEST: ", rmseTestUsers)
print("rmse items on TRAIN: ", rmseTrainItems)
print("rmse items on TEST: ", rmseTestItems)
print("rmse LINEAR on TEST: ", rmseLinear)

print("mea global on TRAIN: ", maeTrainRatings)
print("mea global on TEST: ", maeTestRatings)
print("mea users on TRAIN: ", maeTrainUsers)
print("mea users on TEST: ", maeTestUsers)
print("mea items on TRAIN: ", maeTrainItems)
print("mea items on TEST: ", maeTestItems)
print("mea LINEAR on TEST: ", maeLinear)



rmse global on TRAIN:  [1.11731523 1.11701016 1.11699529 1.11745511 1.11672984]
rmse global on TEST:  [1.11624744 1.11746575 1.11752522 1.11568491 1.11858662]
rmse users on TRAIN:  [1.02785404 1.02778599 1.02773065 1.02781842 1.02717027]
rmse users on TEST:  [1.03473763 1.03500099 1.03527038 1.03492534 1.03752907]
rmse items on TRAIN:  [0.9744298  0.97409031 0.97433361 0.97460831 0.97362513]
rmse items on TEST:  [0.97849937 0.98006458 0.97906966 0.97780916 0.98185046]
rmse LINEAR on TEST:  [0.92285057 0.92513506 0.92361311 0.92349392 0.92694415]
mea global on TRAIN:  [0.93396319 0.93394206 0.9337667  0.93419246 0.93343936]
mea global on TEST:  [0.93422056 0.93343997 0.93411512 0.93242426 0.93510661]
mea users on TRAIN:  [0.82238788 0.82291956 0.82296344 0.82302252 0.82239199]
mea users on TEST:  [0.8293142  0.82831617 0.82887167 0.82776207 0.83057375]
mea items on TRAIN:  [0.77824597 0.77836482 0.77824482 0.77893227 0.77790328]
mea items on TEST:  [0.78209687 0.78219076 0.78270683 0.