# User-Based CF Algorithm for Rating Prediction (Evaluation)

## Import libraries

In [1]:
#import libraries
import pandas as pd 
import numpy as np
from sklearn.utils import shuffle
import collections

## Load the rating data

In [2]:
# Column names are supplied explicitly through `names`.
ratingsHeader = ["UserID", "MovieID", "Rating", "Timestamp"]
# The fields are separated by the two-character token "::"; the python
# parsing engine is selected explicitly for this separator.
ratingsDF = pd.read_table(
    'Raw Data/ratings.dat',
    sep='::',
    names=ratingsHeader,
    engine="python",
)
# Last expression of the cell: displays the loaded dataframe in the notebook.
ratingsDF

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


## K - Fold Cross Validation

* shuffle the dataframe
* split the data into K folds
* build the trainData (K-1 folds) and the testData (1 fold) K times, once per held-out fold
* return the trainDataList and the testDataList

In [3]:
def splitData(df, folds = 5, shu = True, randomState = None):
    """Split a dataframe into K folds for cross validation.

    Parameters:
        df: dataframe to split; rows are sliced positionally.
        folds: number of folds K, must be at least 2.
        shu: when True the rows are shuffled before splitting; when False
            the incoming row order is kept.
        randomState: seed forwarded to `shuffle` (only used when shu is True).

    Returns:
        (trainDFList, testDFList): two lists of length `folds`. Entry i of
        testDFList is fold i; entry i of trainDFList is the concatenation of
        all the other folds, in fold order.

    Raises:
        ValueError: if folds is smaller than 2 (no training data would remain).
    """
    if folds < 2:
        raise ValueError("folds must be at least 2, got %r" % (folds,))
    #shuffle the dataframe (only when requested)
    if shu:
        df = shuffle(df, random_state = randomState)
    #split the data into K folds; the last fold also takes the remainder rows
    length = df.shape[0] // folds
    dfList = [df[i * length: (i + 1) * length] for i in range(folds - 1)]
    dfList.append(df[(folds - 1) * length:])
    #split the trainData and testData
    trainDFList = []
    testDFList = []
    for i in range(folds):
        # a single concat over the K-1 remaining folds instead of K-2 pairwise ones
        trainDFList.append(pd.concat([dfList[j] for j in range(folds) if j != i]))
        testDFList.append(dfList[i])
    #return the trainDataList and the testDataList
    return trainDFList, testDFList

## Load the train rating data

In [4]:
# store one training rating in both lookup dictionaries
def loadTrainRatingData(row, dataSet, dataSet2):
    """Record the rating held in `row` in two nested dictionaries.

    The rating is written to dataSet[UserID][MovieID] and to
    dataSet2[MovieID][UserID]; both dictionaries are modified in place
    and nothing is returned.
    """
    user, movie, score = row['UserID'], row['MovieID'], row['Rating']
    dataSet[user][movie] = score
    dataSet2[movie][user] = score

## Load the test rating data

In [5]:
def loadTestRatingData(row, dataSet):
    """Record the rating held in `row` as dataSet[UserID][MovieID].

    dataSet is modified in place and nothing is returned.
    """
    user, movie = row['UserID'], row['MovieID']
    dataSet[user][movie] = row['Rating']

## Calculate similarities among users

* get the index
* find the common items
* calculate the similarity
* save the similarity

In [6]:
# Calculate similarities among users
# save the similarities into the dictionary
def userSimilarity(trainDataSet):
    """Compute the pairwise similarity between all users.

    similarity(A, B) = |items(A) & items(B)| / sqrt(|items(A)| * |items(B)|)

    Parameters:
        trainDataSet: mapping userID -> {movieID: rating}; only the keys of
            the inner dictionaries are used.

    Returns:
        A defaultdict(dict) in which userSimilarityDict[A][B] and
        userSimilarityDict[B][A] hold the same value for every pair of
        users, including A == B. A pair in which either user has no items
        gets similarity 0.0 instead of a NaN from 0/0.
    """
    #build every user's item set once instead of once per user pair
    itemSets = [(userID, set(items)) for userID, items in trainDataSet.items()]
    num = len(itemSets)
    userSimilarityDict = collections.defaultdict(dict)
    #calculate similarities among all the users
    for i in range(num):
        IDA, setA = itemSets[i]
        for j in range(i, num):
            IDB, setB = itemSets[j]
            #count the commonItems
            commonItems = len(setA.intersection(setB))
            #calculate the similarity
            sizeProduct = len(setA) * len(setB)
            if sizeProduct == 0:
                similarity = 0.0
            else:
                similarity = commonItems/np.sqrt(sizeProduct)
            #save the similarity (the measure is symmetric)
            userSimilarityDict[IDA][IDB] = similarity
            userSimilarityDict[IDB][IDA] = similarity
    #return the similarity dictionary
    return userSimilarityDict

## Predict the rating value

* select the users who rated the movie
* select the K most similar users
* predict the rating value

In [7]:
#predict the rating value
def predictRating(trainDataSet, userSimilarityDict, userID, movieID, K):
    """Predict the rating of `userID` for `movieID` from the K most similar raters.

    Parameters:
        trainDataSet: training ratings indexed by movie first,
            i.e. trainDataSet[movieID][userID] -> rating.
        userSimilarityDict: similarities, read as
            userSimilarityDict[otherUser][userID].
        userID: the user to predict for; a rating by this user for the
            movie in trainDataSet is ignored.
        movieID: the movie to predict.
        K: maximum number of neighbours to use.

    Returns:
        The similarity-weighted mean of the neighbours' ratings, or 0 when
        the similarities sum to zero (which includes the case where nobody
        else rated the movie).

    Raises:
        KeyError: if a rater of the movie has no similarity entry for userID.
    """
    #select users rated the movie
    # .get() instead of indexing: indexing a defaultdict with an unseen movie
    # would insert an empty entry and silently grow the training data
    raters = trainDataSet.get(movieID, {})
    candidates = []
    for similarUser in raters:
        if similarUser == userID:
            continue
        candidates.append((similarUser, userSimilarityDict[similarUser][userID]))
    #select the K most similar users
    candidates.sort(key = lambda d: d[1], reverse = True)
    similaritySum = 0.0
    ratingSimSum = 0.0
    for similarUser, similarity in candidates[:K]:
        similaritySum += similarity
        ratingSimSum += similarity * raters[similarUser]
    #predict the rating value
    if similaritySum == 0:
        return 0
    return 1.0 * ratingSimSum / similaritySum

## Calculate the mean absolute error 

In [8]:
#calculate the mean absolute error on the user
def evaluateUserMAE(trainDataSet, testDataSet, userSimilarityDict, userID, K):
    """Accumulate the absolute prediction errors over one user's test ratings.

    Parameters:
        trainDataSet: training ratings, passed through to predictRating.
        testDataSet: test ratings, read as testDataSet[userID][movieID].
        userSimilarityDict: user similarities, passed through to predictRating.
        userID: the user whose test ratings are evaluated.
        K: neighbourhood size, passed through to predictRating.

    Returns:
        [AESum, n]: the sum of absolute errors and the number of test
        ratings that could be predicted. Ratings for which predictRating
        raises KeyError are left out of both.
    """
    userRatings = testDataSet[userID]
    AESum = 0.0
    n = len(userRatings)
    for movieID, actual in userRatings.items():
        try:
            predicted = predictRating(trainDataSet, userSimilarityDict, userID, movieID, K)
        except KeyError:
            # Only a missing lookup key is an expected failure. The previous
            # bare `except:` also swallowed KeyboardInterrupt and real bugs.
            n -= 1
            continue
        AESum += np.fabs(predicted - actual)
    return [AESum, n]

In [9]:
#calculate the mean absolute error on the whole dataset
def evaluateMAE(trainDataSet, testDataSet, userSimilarityDict, K):
    """Compute the mean absolute error over every user in the test set.

    Parameters:
        trainDataSet: training ratings, passed through to evaluateUserMAE.
        testDataSet: test ratings keyed by userID.
        userSimilarityDict: user similarities, passed through to evaluateUserMAE.
        K: neighbourhood size, passed through to evaluateUserMAE.

    Returns:
        The total absolute error divided by the number of evaluated
        ratings, or float('nan') when no rating could be evaluated
        (for example an empty test set) instead of raising ZeroDivisionError.
    """
    AESum = 0.0
    n = 0
    for userID in list(testDataSet.keys()):
        curAE, curN = evaluateUserMAE(trainDataSet, testDataSet, userSimilarityDict, userID, K)
        AESum += curAE
        n += curN
    if n == 0:
        # the mean of zero errors is undefined
        return float('nan')
    return AESum / n

## Evaluate the performance of the algorithm

In [10]:
#Evaluate the performance of Algorithm
def userBasedCFAlgorithms(ratingsDF, folds = 5, randomState = None, KList = None):
    """Run K-fold cross validation of the user-based CF predictor and print the MAE.

    For every fold the training ratings are loaded into a user-indexed and a
    movie-indexed dictionary, the user similarities are computed from the
    user-indexed one, and the MAE on the held-out fold is printed for each
    neighbourhood size.

    Parameters:
        ratingsDF: dataframe with the columns 'UserID', 'MovieID' and 'Rating'.
        folds: number of cross-validation folds.
        randomState: seed forwarded to splitData.
        KList: neighbourhood sizes to evaluate; defaults to
            [10, 20, 30, 40, 50, 60, 70, 80].

    Returns:
        None; one line "<fold> <K> <MAE>" is printed per fold and K, after a
        header line.
    """
    if KList is None:
        KList = [10, 20, 30, 40, 50, 60, 70, 80]
    trainRatingDFList, testRatingDFList = splitData(ratingsDF, folds = folds, randomState = randomState)
    print("dataset K MAE")
    for i in range(folds):
        #trainRatingData: trainDataSet is [user][movie], trainDataSet2 is [movie][user]
        trainDataSet = collections.defaultdict(dict)
        trainDataSet2 = collections.defaultdict(dict)
        trainRatingDFList[i].apply(loadTrainRatingData, axis = 1, dataSet = trainDataSet, dataSet2 = trainDataSet2)
        #testRatingData
        testDataSet = collections.defaultdict(dict)
        testRatingDFList[i].apply(loadTestRatingData, axis = 1, dataSet = testDataSet)
        #calculate the similarity among users
        userSimilarityDict = userSimilarity(trainDataSet)
        #evaluate the algorithm; prediction needs the movie-indexed ratings
        for K in KList:
            MAE = evaluateMAE(trainDataSet2, testDataSet, userSimilarityDict, K)
            print(i + 1,K, MAE)

In [11]:
# Run the 5-fold evaluation; randomState is fixed at 1.
userBasedCFAlgorithms(
    ratingsDF,
    folds=5,
    randomState=1,
)

dataset K MAE


KeyboardInterrupt: 