# 4 - Calculating the RMSE and the MAE

In this part we will calculate the RMSE and the MAE, and we will also record the prediction for each row of the dataset. Afterwards, we will use these predictions to compute further statistics.

### Importing library

In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, evaluate, Reader, accuracy, Trainset
from pathlib import Path

In [2]:
from surprise import SVD
from surprise import SVDpp
from surprise import BaselineOnly
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import KNNWithZScore

The next block will be used for the predictions: we need the test dataset of each fold, and to make this easier we store them in a dictionary keyed by fold number.

In [3]:
# Test DataFrame of each of the 10 folds, keyed by fold number.
dicoDfTest = {
    fold: pd.read_parquet('dataCV/dfTest_CV'+str(fold)+'.gzip')
    for fold in range(10)
}

This function returns, for the requested fold, the list of test sets in the format expected by surprise.

In [4]:
def listTestCV(fold) :
    """Build the list of surprise test sets for one cross-validation fold.

    The returned list holds, in this order: the test set built from
    ``dfTest``, the ones built from ``dfInf20`` and ``dfSup20``, then one
    test set per interval name given by ``inteName(j)`` for ``j`` in
    ``range(10)`` followed by ``range(10, 51, 5)``.

    Parameters
    ----------
    fold
        Fold identifier; ``str(fold)`` is inserted in every file name read
        from the ``dataCV/`` directory.

    Returns
    -------
    list
        One surprise test set per parquet file, in the order given above.
    """
    reader = Reader()

    def _loadTestset(stem):
        # presumably each file stores the user, item and rating columns in
        # that order, as Dataset.load_from_df requires -- confirm.
        df = pd.read_parquet('dataCV/df'+stem+'_CV'+str(fold)+'.gzip')
        data = Dataset.load_from_df(df, reader)
        # Going through a full trainset turns every row of the file into a
        # test-set entry.
        return data.build_full_trainset().build_testset()

    stems = ['Test', 'Inf20', 'Sup20']
    stems += [inteName(j) for j in range(10)]
    stems += [inteName(j) for j in range(10, 51, 5)]
    return [_loadTestset(stem) for stem in stems]

In [5]:
def inteName(i):
    """Return the label of the interval whose index is ``i``.

    ``0`` gives ``"1-9"``, any other index below 10 gives a ten-wide label
    such as ``"10-19"``, indexes from 10 up to (but excluding) 50 give a
    fifty-wide label such as ``"100-149"``, and indexes of 50 or more give
    ``'sup500'``.
    """
    if i == 0:
        return "1-9"
    if i >= 50:
        return 'sup500'
    # Offset of the upper bound: ten-wide intervals below index 10,
    # fifty-wide intervals from index 10 onwards.
    span = 9 if i < 10 else 49
    return "{}-{}".format(int(i*10), int(i*10+span))

This function returns, for the requested fold, the training dataset in the format expected by surprise.

In [6]:
def trainsetCV(fold):
    """Return the surprise training set of the requested fold.

    The ratings are read from the parquet file
    ``dataCV3/dfTrain_fold_<fold>.gzip`` and all of them are put in the
    returned trainset.
    """
    # NOTE(review): this reads from 'dataCV3/' while the test files are read
    # from 'dataCV/' elsewhere in this file -- confirm this is intended.
    ratings = pd.read_parquet('dataCV3/dfTrain_fold_'+str(fold)+'.gzip')
    return Dataset.load_from_df(ratings, Reader()).build_full_trainset()

The three following blocks make it easier to call the algorithms by name; they are the equivalent of a switch-case in C++.

In [7]:
def algoSVD() :
    """Return a new ``SVD`` instance built with its default parameters."""
    return SVD()
    
def algoSVDpp() :
    """Return a new ``SVDpp`` instance built with its default parameters."""
    return SVDpp()

def algoNMF() :
    """Return a new ``NMF`` instance built with its default parameters."""
    return NMF()

def algoBaselineOnly() :
    """Return a new ``BaselineOnly`` instance built with its default parameters."""
    return BaselineOnly()

def algoCoClustering() :
    """Return a new ``CoClustering`` instance built with its default parameters."""
    return CoClustering()

def algoNormalPredictor() :
    """Return a new ``NormalPredictor`` instance built with its default parameters."""
    return NormalPredictor()

def algoSlopeOne() :
    """Return a new ``SlopeOne`` instance built with its default parameters."""
    return SlopeOne()

def algoKNNBasic() :
    """Return a new ``KNNBasic`` instance built with its default parameters."""
    return KNNBasic()

def algoKNNWithMeans() :
    """Return a new ``KNNWithMeans`` instance built with its default parameters."""
    return KNNWithMeans()

def algoKNNBaseline() :
    """Return a new ``KNNBaseline`` instance built with its default parameters."""
    return KNNBaseline()

def algoKNNWithZScore() :
    """Return a new ``KNNWithZScore`` instance built with its default parameters."""
    return KNNWithZScore()

In [8]:
# Dispatch table: algorithm name -> zero-argument factory returning a new
# instance of that algorithm.
algoChoice = dict(
    SVD=algoSVD,
    SVDpp=algoSVDpp,
    NMF=algoNMF,
    BaselineOnly=algoBaselineOnly,
    CoClustering=algoCoClustering,
    NormalPredictor=algoNormalPredictor,
    SlopeOne=algoSlopeOne,
    KNNBasic=algoKNNBasic,
    KNNWithMeans=algoKNNWithMeans,
    KNNBaseline=algoKNNBaseline,
    KNNWithZScore=algoKNNWithZScore,
)

In [9]:
# Names of the algorithms, in the order in which they are processed.
algoListe = [
    "BaselineOnly",
    "SVD",
    "NMF",
    "SlopeOne",
    "CoClustering",
    "NormalPredictor",
    "SVDpp",
    "KNNBasic",
    "KNNWithMeans",
    "KNNBaseline",
    "KNNWithZScore",
]

This function adds to the results the RMSE and the MAE of each test dataset in the list listTest, computed with the algorithm passed as a parameter.

In [10]:
def addResult(algo,res,listTest,algoName):
    """Store in ``res[algoName]`` the RMSE and MAE of ``algo`` on each test set.

    Every element of ``listTest`` is passed to ``algo.test``. The stored
    value is a single list: all the RMSE values, in the order of
    ``listTest``, followed by all the MAE values in the same order.
    ``res`` is modified in place and nothing is returned.
    """
    scores = []
    for testset in listTest:
        preds = algo.test(testset)
        scores.append((accuracy.rmse(preds, verbose=False),
                       accuracy.mae(preds, verbose=False)))
    res[algoName] = [rmse for rmse, _ in scores] + [mae for _, mae in scores]

This function returns a list of predictions made by the algorithm passed as a parameter.

In [11]:
def listPredictions(ar,algo):
    """Return one prediction per row of ``ar``.

    Parameters
    ----------
    ar
        Rows to predict. Positions 0 and 1 of each row are passed to
        ``algo.predict`` (presumably the user id and the item id -- confirm);
        any further position is ignored. A 2-D numpy array works as before,
        and any other iterable of indexable rows is accepted as well.
    algo
        Object exposing ``predict(uid, iid, verbose=False)`` whose result can
        be indexed.

    Returns
    -------
    list
        Element 3 of each ``algo.predict`` result, in row order; an empty
        list when ``ar`` has no rows.
    """
    # Index 3 is presumably the ``est`` field (estimated rating) of
    # surprise's Prediction namedtuple -- confirm against the library docs.
    return [algo.predict(row[0], row[1], verbose=False)[3] for row in ar]

In [12]:
# Dictionary holding all the results; it is filled in by addResult.
results = dict()

In [None]:
# Train and evaluate the first six algorithms of algoListe on each of the 10
# folds. For every (algorithm, fold) pair the RMSE/MAE values are stored in
# `results` under the key name+str(fold), and the predictions for the rows of
# the fold's test DataFrame are added to it as a new column.
# NOTE(review): only algoListe[:6] is processed here; the remaining algorithms
# are presumably handled in a separate run -- confirm.
for name in algoListe[:6] :
    print("--==  ",name,"  ==--")
    for fold in range(10): 
        print("Fold : ",fold)
        algo=algoChoice[name]() # Fresh instance of the algorithm called `name`
        print("-- Train --")
        train=trainsetCV(fold) 
        algo.fit(train) # Train the algorithm on this fold's training set
        print("-- Test --")
        # The fold's test sets are rebuilt from disk by listTestCV on every iteration.
        addResult(algo,results,listTestCV(fold),name+str(fold))
        # to_numpy() also exports the prediction columns added by earlier
        # algorithms; listPredictions only reads positions 0 and 1 of each row.
        dicoDfTest[fold]["Predicted ratings "+name]=listPredictions(dicoDfTest[fold].to_numpy(),algo)

--==   BaselineOnly   ==--
Fold :  0
-- Train --
Estimating biases using als...
-- Test --
Fold :  1
-- Train --
Estimating biases using als...
-- Test --
Fold :  2
-- Train --
Estimating biases using als...
-- Test --
Fold :  3
-- Train --
Estimating biases using als...
-- Test --
Fold :  4
-- Train --
Estimating biases using als...
-- Test --
Fold :  5
-- Train --
Estimating biases using als...
-- Test --
Fold :  6
-- Train --
Estimating biases using als...
-- Test --
Fold :  7
-- Train --
Estimating biases using als...
-- Test --
Fold :  8
-- Train --
Estimating biases using als...
-- Test --
Fold :  9
-- Train --
Estimating biases using als...
-- Test --
--==   SVD   ==--
Fold :  0
-- Train --
-- Test --
Fold :  1
-- Train --
-- Test --
Fold :  2
-- Train --
-- Test --
Fold :  3
-- Train --
-- Test --
Fold :  4
-- Train --
-- Test --
Fold :  5
-- Train --
-- Test --
Fold :  6
-- Train --
-- Test --
Fold :  7
-- Train --
-- Test --
Fold :  8
-- Train --
-- Test --
Fold :  9
-- Train

We save our results.

In [None]:
# Interval indexes in the order used for the per-interval columns:
# 0..9, then 10..50 in steps of 5.
_intervalIndexes = list(range(10)) + list(range(10, 51, 5))
lRMSEname = ["RMSE_"+inteName(i) for i in _intervalIndexes]
lMAEname = ["MAE_"+inteName(i) for i in _intervalIndexes]

# Column layout of the result table: the algorithm name, then every RMSE
# column, then every MAE column.
nameColumms = (
    ["algoName", "RMSE_All", "RMSE_<20", "RMSE_20+"]
    + lRMSEname
    + ["MAE_All", "MAE_<20", "MAE_20+"]
    + lMAEname
)

In [None]:
# Build the result table in a single constructor call (one row per key of
# `results`: the key followed by its list of values) instead of growing the
# DataFrame row by row, then write it to CSV without the index.
dfRes = pd.DataFrame(
    [[key] + values for key, values in results.items()],
    columns=nameColumms,
)
dfRes.to_csv('data/resultRMSEbyInterval_1.csv',index=False)

In [None]:
# Write the test DataFrame of each fold to its own CSV file, without the index.
# The original line lacked the `+` between str(i) and '.csv', which is a
# SyntaxError.
for i in dicoDfTest :
    dicoDfTest[i].to_csv('data/prediction_fold_'+str(i)+'.csv',index=False)

In [None]:
# Stack the test DataFrames of all folds and write them to a single CSV file.
# `concat` is a pandas module-level function and its result must be assigned:
# the original `df.concat(...)` relied on an undefined `df`.
df = pd.concat(list(dicoDfTest.values()))
df.to_csv('data/prediction3.csv',index=False)