# Content-based Algorithm-KNN(Evaluation)

## Import libraries

In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import collections
from sklearn.utils import shuffle
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import mean_absolute_error
import warnings
warnings.simplefilter('ignore')

## Load the metadata

In [2]:
# load the metadata
moviesMetadataDF = pd.read_csv("The Movies DataSet/processedMoviesMetadata.csv", index_col = 0)
#data processing
moviesMetadataDF['overview'] = moviesMetadataDF['overview'].fillna('')
moviesMetadataDF['tagline'] = moviesMetadataDF['tagline'].fillna('')
moviesMetadataDF['genres'] = moviesMetadataDF['genres'].fillna('')
moviesMetadataDF['description'] = moviesMetadataDF['overview'] + moviesMetadataDF['tagline'] + moviesMetadataDF['genres'] #+ moviesMetadataDF['keywords']
moviesMetadataDF['description'] = moviesMetadataDF['description'].fillna('')
moviesMetadataDF

Unnamed: 0,genres,tmdbId,overview,tagline,title,keywords,cast,movieId,director,description
0,"['Animation', 'Comedy', 'Family']",862,"Led by Woody, Andy's toys live happily in his ...",,Toy Story,"['jealousy', 'toy', 'boy', 'friendship', 'frie...","['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",1,John Lasseter,"Led by Woody, Andy's toys live happily in his ..."
1,"['Adventure', 'Fantasy', 'Family']",8844,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,"['board game', 'disappearance', ""based on chil...","['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",2,Joe Johnston,When siblings Judy and Peter discover an encha...
2,"['Romance', 'Comedy']",15602,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"['fishing', 'best friend', 'duringcreditssting...","['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...",3,Howard Deutch,A family wedding reignites the ancient feud be...
3,"['Comedy', 'Drama', 'Romance']",31357,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,"['based on novel', 'interracial relationship',...","['Whitney Houston', 'Angela Bassett', 'Loretta...",4,Forest Whitaker,"Cheated on, mistreated and stepped on, the wom..."
4,['Comedy'],11862,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,"['baby', 'midlife crisis', 'confidence', 'agin...","['Steve Martin', 'Diane Keaton', 'Martin Short...",5,Charles Shyer,Just when George Banks has recovered from his ...
...,...,...,...,...,...,...,...,...,...,...
3817,"['Comedy', 'Romance']",1597,"Greg Focker is ready to marry his girlfriend, ...",First comes love. Then comes the interrogation.,Meet the Parents,"['cia', 'airport', 'cat', 'jew', 'orderly', 'a...","['Ben Stiller', 'Robert De Niro', 'Teri Polo',...",3948,Jay Roach,"Greg Focker is ready to marry his girlfriend, ..."
3818,"['Crime', 'Drama']",641,The hopes and dreams of four ambitious people ...,,Requiem for a Dream,"['drug addiction', 'junkie', 'heroin', 'speed'...","['Ellen Burstyn', 'Jared Leto', 'Jennifer Conn...",3949,Darren Aronofsky,The hopes and dreams of four ambitious people ...
3819,"['Drama', 'War']",10687,A group of recruits go through Advanced Infant...,The system wanted them to become soldiers. One...,Tigerland,"['independent film', 'awol', 'fort polk louisi...","['Colin Farrell', 'Matthew Davis', 'Clifton Co...",3950,Joel Schumacher,A group of recruits go through Advanced Infant...
3820,"['Drama', 'Romance']",63956,Buddy Visalo (Michael Rispoli) is a factory wo...,The only way to find out what you love is to r...,Two Family House,[],"['Michael Rispoli', 'Kelly Macdonald', 'Kathri...",3951,Raymond De Felitta,Buddy Visalo (Michael Rispoli) is a factory wo...


## Load the rating data

In [3]:
# load the rating data
ratingsHeader = ["UserID", "MovieID", "Rating", "Timestamp"]
ratingsDF = pd.read_table('Raw Data/ratings.dat',sep = '::', names = ratingsHeader, engine = "python")
ratingsDF

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


## K - Fold Cross Validation

* shuffle the dataframe
* split the data into K folds
* split the trainData(K-1) and testData(1) for K times
* return the trainDataList and the testDataList

In [4]:
#split the data
def splitData(df, folds = 5, shu = True, randomState = None):
    trainDFList = []
    testDFList = []
    #shuffle the dataframe
    df = shuffle(df, random_state = randomState)
    #split the data into K folds
    nums = df.shape[0]
    length = int(nums / folds)
    dfList = []
    dfList.append(df[0:length])
    for i in range(1,folds - 1):
        dfList.append(df[i*length: (i + 1) * length])
    dfList.append(df[(folds - 1) * length :])
    #split the trainData and test Data
    for i in range(folds):
        testDF = dfList[i]
        tempList = []
        for j in range(folds):
            if(j != i):
                tempList.append(j)
        trainDF = dfList[tempList[0]]
        for j in range(1,folds - 1):
            trainDF = pd.concat([trainDF, dfList[tempList[j]]])
        trainDFList.append(trainDF)
        testDFList.append(testDF)
    #return the trainDataList and the testDataList
    return trainDFList, testDFList

## Load the rating data

In [5]:
# load the rating data
def loadRatingData(row, dataSet):
    userID = row['UserID']
    movieID = row['MovieID']
    rating = row['Rating']
    dataSet[userID][movieID] = rating

## Calculate similarities among movies

* represent movies using TF-IDF Vector
* calculate the cosine similarity
* get the index
* save the similarity

In [6]:
# calculate similarities among movies
# save the similarities into the dictionary
def movieSimilarity(moviesMetadataDF):
    #represent movies using TF-IDF Vector
    tfidfVector = TfidfVectorizer(stop_words = 'english')
    tfidf_matrix = tfidfVector.fit_transform(moviesMetadataDF['description'])
    movieIDList = list(moviesMetadataDF['movieId'].values)
    #calculate similarities among all the movies
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    movieSimilarityDict = collections.defaultdict(dict)
    num = cosine_sim.shape[0]
    for i in range(num):
        for j in range(i, num):
            #get the index
            IDA = movieIDList[i]
            IDB = movieIDList[j]
            #get the similarity
            similarity = cosine_sim[i][j]
            #save the similarity
            movieSimilarityDict[IDA][IDB] = similarity
            movieSimilarityDict[IDB][IDA] = similarity
    #return the similarity dictionary
    return movieSimilarityDict

## Predict the rating value

* select movies user rated
* select the K most similar movies
* predict the rating value

In [7]:
# predict the rating value
def predictRating(trainDataSet, movieSimilarityDict, userID, movieID, K):
    similaritySum = 0.0
    ratingSimSum = 0.0
    movieIDList = list(movieSimilarityDict.keys())
    if(movieID in movieIDList):
        #select the movies user rated
        ratedMovies = trainDataSet[userID]
        similarMovieDict = collections.defaultdict(float)
        for similarMovieID in ratedMovies:
            if(similarMovieID == movieID):
                continue
            try:
                similarMovieDict[similarMovieID] = movieSimilarityDict[similarMovieID][movieID]
            except:
                pass
        #select the K most similar movies
        for similarMovieID, similarity in sorted(similarMovieDict.items(), key = lambda d:d[1], reverse = True)[:K]:
            similaritySum += similarity
            ratingSimSum += similarity * ratedMovies[similarMovieID]
    #predict the rating value
    if(similaritySum == 0):
        result = 0
    else:
        result = 1.0 * ratingSimSum / similaritySum
    return result

## Calculate the mean absolute error

In [8]:
#calculate the mean absolute error on the user
def evaluateUserMAE(trainDataSet, testDataSet, movieSimilarityDict, userID, K):
    AESum = 0.0
    n = len(testDataSet[userID].keys())
    for movieID in testDataSet[userID].keys():
        try:
            AESum += np.fabs(predictRating(trainDataSet, movieSimilarityDict, userID, movieID, K) - testDataSet[userID][movieID])
        except:
            n -= 1
    return [AESum, n]

In [9]:
#calculate the mean absolute error on the whole dataset
def evaluateMAE(trainDataSet, testDataSet, movieSimilarityDict, K):
    AESum = 0.0
    n = 0
    count = 0
    for userID in list(testDataSet.keys()):
        curAE, curN = evaluateUserMAE(trainDataSet, testDataSet, movieSimilarityDict, userID, K)
        AESum += curAE
        n += curN
    return AESum / n

## Evalutate the performance of Algorithm

In [11]:
def contentBasedAlgorithmKNN(ratingsDF,moviesMetadataDF, folds = 5, randomState = None):
    print("dataset K MAE")
    trainRatingDFList, testRatingDFList = splitData(ratingsDF, folds = folds, randomState = randomState)
    #calculate the similarity among users
    movieSimilarityDict = movieSimilarity(moviesMetadataDF)
    for i in range(folds):
        #trainRatingData
        trainRatingDF = trainRatingDFList[i]
        trainDataSet = collections.defaultdict(dict)
        trainRatingDF.apply(loadRatingData, axis = 1, dataSet = trainDataSet)
        #testRatingData
        testRatingDF = testRatingDFList[i]
        testDataSet = collections.defaultdict(dict)
        testRatingDF.apply(loadRatingData, axis = 1, dataSet = testDataSet)
        #evaluate the algorithm
        KList = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
        for K in KList:
            mae = evaluateMAE(trainDataSet, testDataSet, movieSimilarityDict, K)
            print(i + 1, K, mae)

In [19]:
contentBasedAlgorithmKNN(ratingsDF,moviesMetadataDF, folds = 5, randomState = 1)

dataset K MAE
1 10 0.8362095421769488
1 20 0.8249277748735614
1 30 0.8218653519018874
1 40 0.8210127809762965
1 50 0.8207833334975302
1 60 0.820731921891451
1 70 0.8208628546756441
1 80 0.8211030262637065
1 90 0.8212573043903727
1 100 0.8214385338394318
2 10 0.8409391952597556
2 20 0.8289932154085637
2 30 0.8261785532871957
2 40 0.8254163995744862
2 50 0.8252218607266112
2 60 0.8251968870523418
2 70 0.8253358872347698
2 80 0.825472966997484
2 90 0.825670492537624
2 100 0.8257840098475085
3 10 0.8378956713527487
3 20 0.8259786062887198
3 30 0.8229652212449453
3 40 0.8221379503134457
3 50 0.8218980061501523
3 60 0.821915983933492
3 70 0.822013530003816
3 80 0.8221110101335025
3 90 0.8221852053515344
3 100 0.8222922664790769
4 10 0.8357687981025184
4 20 0.8239375136175311
4 30 0.8211437934006288
4 40 0.8204093113827626
4 50 0.820102726223129
4 60 0.8201281636010559
4 70 0.8200886640783845
4 80 0.8202033744916168
4 90 0.8203169348901405
4 100 0.8204561853058723
5 10 0.8379262804896614
5 20

In [None]:
contentBasedAlgorithmLR(ratingsDF,moviesMetadataDF, folds = 10, randomState = 1)