# Item-Based CF Algorithm for Rating Prediction (Demonstration)

## Import libraries

In [1]:
#import libraries
import pandas as pd 
import numpy as np
from sklearn.utils import shuffle
import collections
import time

## Load the rating data

In [2]:
#load the rating data
#column names are supplied explicitly through names= when reading the file
ratingsHeader = ["UserID", "MovieID", "Rating", "Timestamp"]
#fields are separated by the two-character token '::'
#NOTE(review): the multi-character separator is presumably why engine = "python" is requested - confirm against the pandas docs
ratingsDF = pd.read_table('Raw Data/ratings.dat',sep = '::', names = ratingsHeader, engine = "python")
#display the loaded dataframe
ratingsDF

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


## K - Fold Cross Validation

* shuffle the dataframe
* split the data into K folds
* for each of the K folds, use that fold as the test data and the remaining K-1 folds as the training data
* return the list of training sets (trainDataList) and the list of test sets (testDataList)

In [3]:
#split the data
def splitData(df, folds = 5, shu = True, randomState = None):
    """Split ``df`` into K folds and build the K train/test pairs for cross validation.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to partition.
    folds : int
        Number of folds K. Must be at least 2, because every training set is
        made of the K-1 folds that are not held out.
    shu : bool
        When True (default) the rows are shuffled before splitting. When False
        the rows are split in their existing order.
    randomState : int, RandomState instance or None
        Forwarded to ``shuffle`` as ``random_state``; only used when ``shu`` is True.

    Returns
    -------
    (trainDFList, testDFList)
        Two lists of length ``folds``. ``testDFList[i]`` is fold ``i`` and
        ``trainDFList[i]`` is the concatenation of every other fold.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 2.
    """
    if folds < 2:
        raise ValueError("folds must be at least 2, got {}".format(folds))
    #shuffle the dataframe only when the caller asked for it
    if shu:
        df = shuffle(df, random_state = randomState)
    #split the data into K folds; each of the first K-1 folds has floor(n / K) rows
    length = df.shape[0] // folds
    dfList = [df[i * length: (i + 1) * length] for i in range(folds - 1)]
    #the last fold also takes the remainder rows
    dfList.append(df[(folds - 1) * length:])
    #split the trainData and test Data
    trainDFList = []
    testDFList = []
    for i in range(folds):
        #concatenate all the other folds in one call instead of growing the frame step by step
        trainDFList.append(pd.concat([fold for j, fold in enumerate(dfList) if j != i]))
        testDFList.append(dfList[i])
    #return the trainDataList and the testDataList
    return trainDFList, testDFList

In [4]:
#split the data into 5 folds; randomState = 1 is passed through to the shuffle step
trainRatingDFList, testRatingDFList = splitData(ratingsDF, folds = 5, shu = True, randomState = 1)

## Load the train Rating Data

In [5]:
# load the train rating data
# save the Rating Data into the dictionary
def loadTrainRatingData(row, dataSet, dataSet2):
    """Record one rating row in both lookup tables.

    ``row`` must provide the keys 'UserID', 'MovieID' and 'Rating'.
    ``dataSet`` is filled as user -> movie -> rating and ``dataSet2`` as
    movie -> user -> rating. Both are mutated in place; nothing is returned.
    """
    # read all three fields first, then write them into the two tables
    user, movie, score = row['UserID'], row['MovieID'], row['Rating']
    dataSet[user][movie] = score
    dataSet2[movie][user] = score

In [6]:
# load the train rating data (the training set paired with fold 0)
trainRatingDF = trainRatingDFList[0]
# filled as user -> {movie: rating}
trainDataSet = collections.defaultdict(dict)
# filled as movie -> {user: rating}
trainDataSet2 = collections.defaultdict(dict)
# apply is used for its side effect of filling both dictionaries; the callback returns None for every row
trainRatingDF.apply(loadTrainRatingData, axis = 1, dataSet = trainDataSet, dataSet2 = trainDataSet2)

120098    None
529184    None
341591    None
470922    None
630004    None
          ... 
491263    None
791624    None
470924    None
491755    None
128037    None
Length: 800168, dtype: object

trainDataSet stores, for each user, the movies that user rated and the rating values.

In [7]:
# User 1 rated Movie 2918 with a 4 (lookup order: user, then movie)
trainDataSet[1][2918]

4

trainDataSet2 stores, for each movie, the users who rated it and the rating values.

In [8]:
# User 1 rated Movie 2918 with a 4 (lookup order: movie, then user)
trainDataSet2[2918][1]

4

## Load the test rating data

In [9]:
# load the test rating data
# save the Rating Data into the dictionary
def loadTestRatingData(row, dataSet):
    """Record one rating row in the user -> movie -> rating table.

    ``row`` must provide the keys 'UserID', 'MovieID' and 'Rating'.
    ``dataSet`` is mutated in place; nothing is returned.
    """
    # read all three fields first, then store the rating under the user
    user, movie, score = row['UserID'], row['MovieID'], row['Rating']
    dataSet[user][movie] = score

In [10]:
# load the test rating data (fold 0, the fold left out of trainRatingDFList[0])
testRatingDF = testRatingDFList[0]
# filled as user -> {movie: rating}
testDataSet = collections.defaultdict(dict)
# apply is used for its side effect of filling the dictionary; the callback returns None for every row
testRatingDF.apply(loadTestRatingData, axis = 1, dataSet = testDataSet)

630120    None
229398    None
758377    None
159240    None
254252    None
          ... 
659611    None
875199    None
743921    None
527163    None
623363    None
Length: 200041, dtype: object

In [11]:
# In the test data, User 1 rated Movie 594 with a 4
testDataSet[1][594]

4

## Calculate similarities among movies

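* get the IDs of the two movies being compared
* count the users who rated both movies (the common raters)
* calculate the similarity: common raters / sqrt(raters of movie A * raters of movie B)
* save the similarity in both directions (A to B and B to A)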

In [12]:
# Calculate similarities among movies
# save the similarities into the dictionary
def movieSimilarity(trainDataSet):
    """Compute the pairwise similarity between all movies.

    The similarity of movies A and B is the number of users who rated both,
    divided by sqrt(number of raters of A * number of raters of B).

    Parameters
    ----------
    trainDataSet : dict
        Mapping movieID -> {userID: rating}. Only the user IDs are used.

    Returns
    -------
    collections.defaultdict(dict)
        ``result[IDA][IDB]`` is the similarity of the two movies. The table is
        symmetric and includes every movie paired with itself. A pair in which
        either movie has no raters gets similarity 0.0.
    """
    #build every movie's rater set once instead of rebuilding it for each pair
    raterSets = {movieID: set(raters) for movieID, raters in trainDataSet.items()}
    trainMovieList = list(raterSets)
    movieSimilarityDict = collections.defaultdict(dict)
    #calculate similarities among all the movies (each unordered pair once)
    for i, IDA in enumerate(trainMovieList):
        setA = raterSets[IDA]
        for IDB in trainMovieList[i:]:
            setB = raterSets[IDB]
            if setA and setB:
                #count the common raters and normalise by the geometric mean of the set sizes
                similarity = len(setA & setB) / np.sqrt(len(setA) * len(setB))
            else:
                #a movie without raters would give 0/0 = nan; treat it as not similar
                similarity = 0.0
            #save the similarity in both directions
            movieSimilarityDict[IDA][IDB] = similarity
            movieSimilarityDict[IDB][IDA] = similarity
    #return the similarity dictionary
    return movieSimilarityDict

In [13]:
#calculate similarities among movies from the movie -> {user: rating} table
movieSimilarityDict = movieSimilarity(trainDataSet2)

In [14]:
#the similarity between Movie 1 and Movie 2 is about 0.335
movieSimilarityDict[1][2]

0.3350466112843514

## Predict the rating value

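* select the movies the user has already rated
* among them, select the K movies most similar to the target movie
* predict the rating as the similarity-weighted average of the user's ratings on those K movies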

In [15]:
# predict the rating value
def predictRating(trainDataSet, movieSimilarityDict, userID, movieID, K):
    """Predict the rating ``userID`` would give ``movieID``.

    The prediction is the similarity-weighted average of the user's ratings on
    the K rated movies most similar to ``movieID``; the target movie itself is
    left out. Returns 0 when the similarities sum to zero, which includes the
    case of a user with no other rated movies.

    Raises KeyError when ``movieSimilarityDict`` has no entry linking one of
    the user's rated movies to ``movieID``.
    """
    #select the movies user rated
    userRatings = trainDataSet[userID]
    #pair every other rated movie with its similarity to the target movie
    candidates = [
        (otherID, movieSimilarityDict[otherID][movieID])
        for otherID in userRatings
        if otherID != movieID
    ]
    #select the K most similar movies (stable sort, highest similarity first)
    candidates.sort(key = lambda pair: pair[1], reverse = True)
    weightTotal = 0.0
    weightedRatingTotal = 0.0
    for otherID, weight in candidates[:K]:
        weightTotal += weight
        weightedRatingTotal += weight * userRatings[otherID]
    #predict the rating value
    if weightTotal == 0:
        return 0
    return weightedRatingTotal / weightTotal

In [16]:
#predict the rating of User 1 on Movie 594 using the 10 most similar movies the user rated
predictRating(trainDataSet, movieSimilarityDict, 1, 594, 10)

4.455166940096121

In [17]:
#the actual rating User 1 gave Movie 594 in the test data is 4
testDataSet[1][594]

4

## Calculate the mean absolute error 

In [18]:
#calculate the mean absolute error on the user
def evaluateUserMAE(trainDataSet, testDataSet, movieSimilarityDict, userID, K):
    """Sum the absolute prediction errors over one user's test ratings.

    Parameters
    ----------
    trainDataSet : dict
        user -> {movie: rating} training ratings, passed to ``predictRating``.
    testDataSet : dict
        user -> {movie: rating} held-out ratings to score against.
    movieSimilarityDict : dict
        movie -> {movie: similarity}, passed to ``predictRating``.
    userID
        The user to evaluate; must be a key of ``testDataSet``.
    K : int
        Number of most similar movies used for each prediction.

    Returns
    -------
    list
        ``[AESum, n]``: the summed absolute error and the number of test
        ratings that could be scored. Test movies for which the prediction
        raises KeyError (no similarity entry) are skipped and not counted.
    """
    AESum = 0.0
    userTestRatings = testDataSet[userID]
    n = len(userTestRatings)
    for movieID, actualRating in userTestRatings.items():
        try:
            predicted = predictRating(trainDataSet, movieSimilarityDict, userID, movieID, K)
        except KeyError:
            #the movie has no similarity entry, so it cannot be scored; any other error propagates
            n -= 1
            continue
        AESum += np.fabs(predicted - actualRating)
    return [AESum, n]

In [19]:
#calculate the mean absolute error on User 1 (K = 10)
AESum, n = evaluateUserMAE(trainDataSet, testDataSet, movieSimilarityDict, 1, 10)
#mean = summed absolute error / number of scored test ratings
AESum/n

0.5608424425381069

In [20]:
#calculate the mean absolute error on the whole dataset
def evaluateMAE(trainDataSet, testDataSet, movieSimilarityDict, K):
    """Return the mean absolute error over every user in ``testDataSet``.

    The per-user error sums and counts from ``evaluateUserMAE`` are added up
    and divided once at the end. Raises ZeroDivisionError when no test rating
    was counted (for example, an empty ``testDataSet``).
    """
    totalError = 0.0
    totalCount = 0
    #take a snapshot of the user IDs before looping
    for userID in tuple(testDataSet):
        userError, userCount = evaluateUserMAE(trainDataSet, testDataSet, movieSimilarityDict, userID, K)
        totalError += userError
        totalCount += userCount
    return totalError / totalCount

In [21]:
#calculate the mean absolute error over all users in the test data (K = 10)
evaluateMAE(trainDataSet, testDataSet, movieSimilarityDict, 10)

0.7250678732702602