# Content-based Algorithm-Logistics Regression(Evaluation)

## Import libraries

In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import collections
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
import warnings
warnings.simplefilter('ignore')

## load the metadata

In [2]:
# load the metadata
moviesMetadataDF = pd.read_csv("The Movies DataSet/processedMoviesMetadata.csv", index_col = 0)
#data processing
moviesMetadataDF['overview'] = moviesMetadataDF['overview'].fillna('')
moviesMetadataDF['tagline'] = moviesMetadataDF['tagline'].fillna('')
moviesMetadataDF['genres'] = moviesMetadataDF['genres'].fillna('')
moviesMetadataDF['description'] = moviesMetadataDF['overview'] + moviesMetadataDF['tagline'] + moviesMetadataDF['genres'] #+ moviesMetadataDF['keywords']
moviesMetadataDF['description'] = moviesMetadataDF['description'].fillna('')
moviesMetadataDF

Unnamed: 0,genres,tmdbId,overview,tagline,title,keywords,cast,movieId,director,description
0,"['Animation', 'Comedy', 'Family']",862,"Led by Woody, Andy's toys live happily in his ...",,Toy Story,"['jealousy', 'toy', 'boy', 'friendship', 'frie...","['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",1,John Lasseter,"Led by Woody, Andy's toys live happily in his ..."
1,"['Adventure', 'Fantasy', 'Family']",8844,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,"['board game', 'disappearance', ""based on chil...","['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",2,Joe Johnston,When siblings Judy and Peter discover an encha...
2,"['Romance', 'Comedy']",15602,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"['fishing', 'best friend', 'duringcreditssting...","['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...",3,Howard Deutch,A family wedding reignites the ancient feud be...
3,"['Comedy', 'Drama', 'Romance']",31357,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,"['based on novel', 'interracial relationship',...","['Whitney Houston', 'Angela Bassett', 'Loretta...",4,Forest Whitaker,"Cheated on, mistreated and stepped on, the wom..."
4,['Comedy'],11862,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,"['baby', 'midlife crisis', 'confidence', 'agin...","['Steve Martin', 'Diane Keaton', 'Martin Short...",5,Charles Shyer,Just when George Banks has recovered from his ...
...,...,...,...,...,...,...,...,...,...,...
3817,"['Comedy', 'Romance']",1597,"Greg Focker is ready to marry his girlfriend, ...",First comes love. Then comes the interrogation.,Meet the Parents,"['cia', 'airport', 'cat', 'jew', 'orderly', 'a...","['Ben Stiller', 'Robert De Niro', 'Teri Polo',...",3948,Jay Roach,"Greg Focker is ready to marry his girlfriend, ..."
3818,"['Crime', 'Drama']",641,The hopes and dreams of four ambitious people ...,,Requiem for a Dream,"['drug addiction', 'junkie', 'heroin', 'speed'...","['Ellen Burstyn', 'Jared Leto', 'Jennifer Conn...",3949,Darren Aronofsky,The hopes and dreams of four ambitious people ...
3819,"['Drama', 'War']",10687,A group of recruits go through Advanced Infant...,The system wanted them to become soldiers. One...,Tigerland,"['independent film', 'awol', 'fort polk louisi...","['Colin Farrell', 'Matthew Davis', 'Clifton Co...",3950,Joel Schumacher,A group of recruits go through Advanced Infant...
3820,"['Drama', 'Romance']",63956,Buddy Visalo (Michael Rispoli) is a factory wo...,The only way to find out what you love is to r...,Two Family House,[],"['Michael Rispoli', 'Kelly Macdonald', 'Kathri...",3951,Raymond De Felitta,Buddy Visalo (Michael Rispoli) is a factory wo...


## Load the rating data

In [3]:
# load the rating data
ratingsHeader = ["UserID", "MovieID", "Rating", "Timestamp"]
ratingsDF = pd.read_table('Raw Data/ratings.dat',sep = '::', names = ratingsHeader, engine = "python")
ratingsDF

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


## K - Fold Cross Validation

* shuffle the dataframe
* split the data into K folds
* split the trainData(K-1) and testData(1) for K times
* return the trainDataList and the testDataList

In [4]:
#split the data
def splitData(df, folds = 5, shu = True, randomState = None):
    trainDFList = []
    testDFList = []
    #shuffle the dataframe
    df = shuffle(df, random_state = randomState)
    #split the data into K folds
    nums = df.shape[0]
    length = int(nums / folds)
    dfList = []
    dfList.append(df[0:length])
    for i in range(1,folds - 1):
        dfList.append(df[i*length: (i + 1) * length])
    dfList.append(df[(folds - 1) * length :])
    #split the trainData and test Data
    for i in range(folds):
        testDF = dfList[i]
        tempList = []
        for j in range(folds):
            if(j != i):
                tempList.append(j)
        trainDF = dfList[tempList[0]]
        for j in range(1,folds - 1):
            trainDF = pd.concat([trainDF, dfList[tempList[j]]])
        trainDFList.append(trainDF)
        testDFList.append(testDF)
    #return the trainDataList and the testDataList
    return trainDFList, testDFList

## Load the rating data

In [5]:
# load the rating data
# save the rating data into the dictionary
def loadRatingData(row, dataSet):
    userID = row['UserID']
    movieID = row['MovieID']
    rating = row['Rating']
    dataSet[userID][movieID] = rating

## Represent the movie

* Count Vector
* TF-IDF Vector
* Row - Movie, Column - word

In [6]:
# use count vector to represent movies
def movieRepresentationCount(moviesMetadataDF):
    countVector = CountVectorizer(stop_words = 'english')
    count_matrix = countVector.fit_transform(moviesMetadataDF['description'])
    #save vectors into dictionary
    movieIDSE = moviesMetadataDF['movieId']
    moviesVector = collections.defaultdict(np.array)
    for i in range(moviesMetadataDF.shape[0]):
        moviesVector[movieIDSE[i]] = np.squeeze(count_matrix[i].toarray())
    return moviesVector

In [7]:
# use tf-idf vector to represent movies
def movieRepresentationTFIDF(moviesMetadataDF):
    tfidfVector = TfidfVectorizer(stop_words = 'english')
    tfidf_matrix = tfidfVector.fit_transform(moviesMetadataDF['description'])
    #save vectors into dictionary
    movieIDSE = moviesMetadataDF['movieId']
    moviesVector = collections.defaultdict(np.array)
    for i in range(moviesMetadataDF.shape[0]):
        moviesVector[movieIDSE[i]] = np.squeeze(tfidf_matrix[i].toarray())
    return moviesVector

## Calculate the mean absolute error

In [8]:
#calculate the mean absolute error
def evaluateMAE(trainDataSet, testDataSet, moviesVector):
    AESum = 0
    num = 0
    for userID in trainDataSet.keys():
        #generate train data
        train_X = []
        train_y = []
        for movieID, rating in trainDataSet[userID].items():
            if(movieID in moviesVector.keys()):
                train_X.append(moviesVector[movieID])
                train_y.append(rating) 
        train_X = np.array(train_X)
        train_y = np.array(train_y)
        #generate test data
        test_X = []
        test_y = []
        for movieID, rating in testDataSet[userID].items():
            if(movieID in moviesVector.keys()):
                test_X.append(moviesVector[movieID])
                test_y.append(rating)
        test_X = np.array(test_X)
        test_y = np.array(test_y)
        if(len(set(train_y)) == 1):
            continue
        #train the model for user and predict the rating value
        if(train_X.shape[0] > 0 and test_X.shape[0] > 0):
            LRModel = LogisticRegression()
            LRModel.fit(train_X, train_y)
            test_pred_y = LRModel.predict(test_X)
            for i in range(test_y.shape[0]):
                AESum += abs(test_y[i] - test_pred_y[i])
        num += test_X.shape[0]
    return 1.0 * AESum / num

## Evaluate the performance of the model

In [11]:
# evaluate the performance of the model
def contentBasedAlgorithmLR(ratingsDF,moviesMetadataDF, folds = 5, randomState = None):
    print("Dataset MAE")
    trainRatingDFList, testRatingDFList = splitData(ratingsDF, folds = folds, randomState = randomState)
    for i in range(folds):
        #trainRatingData
        trainRatingDF = trainRatingDFList[i]
        trainDataSet = collections.defaultdict(dict)
        trainRatingDF.apply(loadRatingData, axis = 1, dataSet = trainDataSet)
        #testRatingData
        testRatingDF = testRatingDFList[i]
        testDataSet = collections.defaultdict(dict)
        testRatingDF.apply(loadRatingData, axis = 1, dataSet = testDataSet)
        moviesVector = movieRepresentationTFIDF(moviesMetadataDF)
        MAE = evaluateMAE(trainDataSet, testDataSet, moviesVector)
        print(i + 1, MAE)

In [12]:
contentBasedAlgorithmLR(ratingsDF,moviesMetadataDF, folds = 5, randomState = 1)

Dataset MAE
1 0.8277993323107462


KeyboardInterrupt: 