# Cascade Hybrid Algorithm(Demonstration)

## Import libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import collections
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import linear_kernel
import warnings
warnings.simplefilter('ignore')

## Load the metadata

In [2]:
# load the metadata
moviesMetadataDF = pd.read_csv("The Movies DataSet/processedMoviesMetadata.csv", index_col = 0)
#data processing
moviesMetadataDF['overview'] = moviesMetadataDF['overview'].fillna('')
moviesMetadataDF['tagline'] = moviesMetadataDF['tagline'].fillna('')
moviesMetadataDF['genres'] = moviesMetadataDF['genres'].fillna('')
moviesMetadataDF['description'] = moviesMetadataDF['overview'] + moviesMetadataDF['tagline'] + moviesMetadataDF['genres'] #+ moviesMetadataDF['keywords']
moviesMetadataDF['description'] = moviesMetadataDF['description'].fillna('')
moviesMetadataDF

Unnamed: 0,genres,tmdbId,overview,tagline,title,keywords,cast,movieId,director,description
0,"['Animation', 'Comedy', 'Family']",862,"Led by Woody, Andy's toys live happily in his ...",,Toy Story,"['jealousy', 'toy', 'boy', 'friendship', 'frie...","['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",1,John Lasseter,"Led by Woody, Andy's toys live happily in his ..."
1,"['Adventure', 'Fantasy', 'Family']",8844,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,"['board game', 'disappearance', ""based on chil...","['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",2,Joe Johnston,When siblings Judy and Peter discover an encha...
2,"['Romance', 'Comedy']",15602,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"['fishing', 'best friend', 'duringcreditssting...","['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...",3,Howard Deutch,A family wedding reignites the ancient feud be...
3,"['Comedy', 'Drama', 'Romance']",31357,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale,"['based on novel', 'interracial relationship',...","['Whitney Houston', 'Angela Bassett', 'Loretta...",4,Forest Whitaker,"Cheated on, mistreated and stepped on, the wom..."
4,['Comedy'],11862,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,"['baby', 'midlife crisis', 'confidence', 'agin...","['Steve Martin', 'Diane Keaton', 'Martin Short...",5,Charles Shyer,Just when George Banks has recovered from his ...
...,...,...,...,...,...,...,...,...,...,...
3817,"['Comedy', 'Romance']",1597,"Greg Focker is ready to marry his girlfriend, ...",First comes love. Then comes the interrogation.,Meet the Parents,"['cia', 'airport', 'cat', 'jew', 'orderly', 'a...","['Ben Stiller', 'Robert De Niro', 'Teri Polo',...",3948,Jay Roach,"Greg Focker is ready to marry his girlfriend, ..."
3818,"['Crime', 'Drama']",641,The hopes and dreams of four ambitious people ...,,Requiem for a Dream,"['drug addiction', 'junkie', 'heroin', 'speed'...","['Ellen Burstyn', 'Jared Leto', 'Jennifer Conn...",3949,Darren Aronofsky,The hopes and dreams of four ambitious people ...
3819,"['Drama', 'War']",10687,A group of recruits go through Advanced Infant...,The system wanted them to become soldiers. One...,Tigerland,"['independent film', 'awol', 'fort polk louisi...","['Colin Farrell', 'Matthew Davis', 'Clifton Co...",3950,Joel Schumacher,A group of recruits go through Advanced Infant...
3820,"['Drama', 'Romance']",63956,Buddy Visalo (Michael Rispoli) is a factory wo...,The only way to find out what you love is to r...,Two Family House,[],"['Michael Rispoli', 'Kelly Macdonald', 'Kathri...",3951,Raymond De Felitta,Buddy Visalo (Michael Rispoli) is a factory wo...


## Load the rating data

In [3]:
# load the rating data
ratingsHeader = ["UserID", "MovieID", "Rating", "Timestamp"]
ratingsDF = pd.read_table('Raw Data/ratings.dat',sep = '::', names = ratingsHeader, engine = "python")
ratingsDF

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


## K - Fold Cross Validation

* shuffle the dataframe
* split the data into K folds
* split the trainData(K-1) and testData(1) for K times
* return the trainDataList and the testDataList

In [4]:
#split the data
def splitData(df, folds = 5, shu = True, randomState = None):
    trainDFList = []
    testDFList = []
    #shuffle the dataframe
    df = shuffle(df, random_state = randomState)
    #split the data into K folds
    nums = df.shape[0]
    length = int(nums / folds)
    dfList = []
    dfList.append(df[0:length])
    for i in range(1,folds - 1):
        dfList.append(df[i*length: (i + 1) * length])
    dfList.append(df[(folds - 1) * length :])
    #split the trainData and test Data
    for i in range(folds):
        testDF = dfList[i]
        tempList = []
        for j in range(folds):
            if(j != i):
                tempList.append(j)
        trainDF = dfList[tempList[0]]
        for j in range(1,folds - 1):
            trainDF = pd.concat([trainDF, dfList[tempList[j]]])
        trainDFList.append(trainDF)
        testDFList.append(testDF)
    #return the trainDataList and the testDataList
    return trainDFList, testDFList

In [5]:
#split the data
trainRatingDFList, testRatingDFList = splitData(ratingsDF, folds = 5, shu = True, randomState = 1)

## Load the train rating data

In [6]:
# load the train rating data
# save the Rating Data into the dictionary
def loadTrainRatingData(row, dataSet, dataSet2):
    userID = row['UserID']
    movieID = row['MovieID']
    rating = row['Rating']
    dataSet[userID][movieID] = rating
    dataSet2[movieID][userID] = rating

In [7]:
# load the train rating data
trainRatingDF = trainRatingDFList[0]
trainDataSet = collections.defaultdict(dict)
trainDataSet2 = collections.defaultdict(dict)
trainRatingDF.apply(loadTrainRatingData, axis = 1, dataSet = trainDataSet, dataSet2 = trainDataSet2)

120098    None
529184    None
341591    None
470922    None
630004    None
          ... 
491263    None
791624    None
470924    None
491755    None
128037    None
Length: 800168, dtype: object

## Load the test rating data

In [8]:
# load the test rating data
# save the Rating Data into the dictionary
def loadTestRatingData(row, dataSet):
    userID = row['UserID']
    movieID = row['MovieID']
    rating = row['Rating']
    dataSet[userID][movieID] = rating

In [9]:
# load the test rating data
testRatingDF = testRatingDFList[0]
testDataSet = collections.defaultdict(dict)
testRatingDF.apply(loadTestRatingData, axis = 1, dataSet = testDataSet)

630120    None
229398    None
758377    None
159240    None
254252    None
          ... 
659611    None
875199    None
743921    None
527163    None
623363    None
Length: 200041, dtype: object

## Calculate similarities among movies(Content-Based)

* represent movies using TF-IDF Vector
* calculate the cosine similarity
* get the index
* save the similarity

In [10]:
# calculate similarities among movies(content-based)
# save the similarities into the dictionary
def contentBasedSimilarityDict(moviesMetadataDF):
    #represent movies using TF-IDF Vector
    tfidfVector = TfidfVectorizer(stop_words = 'english')
    tfidf_matrix = tfidfVector.fit_transform(moviesMetadataDF['description'])
    movieIDList = list(moviesMetadataDF['movieId'].values)
    #calculate similarities among all the movies
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    movieSimilarityDict = collections.defaultdict(dict)
    num = cosine_sim.shape[0]
    for i in range(num):
        for j in range(i, num):
            #get the index
            IDA = movieIDList[i]
            IDB = movieIDList[j]
            #get the similarity
            similarity = cosine_sim[i][j]
            #save the similarity
            movieSimilarityDict[IDA][IDB] = similarity
            movieSimilarityDict[IDB][IDA] = similarity
    #return the similarity dictionary
    return movieSimilarityDict

In [11]:
# calculate similarities among movies(content-based)
CBSimilarityDict = contentBasedSimilarityDict(moviesMetadataDF)

## Calculate similarities among movies(Collaborative Filtering)

* get the index
* find the commen Items
* calculate the similarity
* save the similarity

In [12]:
# calculate similarities among movies(collaborative filtering)
# save the similarities into the dictionary
def collaborativeFilteringSimilarityDict(trainDataSet):
    trainMovieList = list(trainDataSet.keys())
    num = len(trainMovieList)
    movieSimilarityDict = collections.defaultdict(dict)
    #calculate similarities among all the movies
    for i in range(num):
        for j in range(i, num):
            #get the index
            IDA = trainMovieList[i]
            IDB = trainMovieList[j]
            setA = set(trainDataSet[IDA].keys())
            setB = set(trainDataSet[IDB].keys())
            #count the commonItems
            commonItems = len(setA.intersection(setB))
            #calculate the similarity
            similarity = commonItems/np.sqrt(len(setA) * len(setB))
            #save the similarity
            movieSimilarityDict[IDA][IDB] = similarity
            movieSimilarityDict[IDB][IDA] = similarity
    #return the similarity dictionary
    return movieSimilarityDict

In [13]:
#calculate similarities among movies
CFSimilarityDict = collaborativeFilteringSimilarityDict(trainDataSet2)

## Predict the rating value

* select movies user rated
* select similar movies according to the similarity matrix calculated by metadata of movies
* select similar movies according to the similarity matrix calculated by metadata of movies
* predict the rating value

In [14]:
# predict the rating value
def predictRating(trainDataSet, CBSimilarityDict, CFSimilarityDict, userID, movieID, K1, K2):
    similaritySum = 0.0
    ratingSimSum = 0.0
    #select movies user rated
    ratedMovies = trainDataSet[userID]
    CFSimilarMovieDict = collections.defaultdict(float)
    CBMovieIDList = list(CBSimilarityDict.keys())
    if(movieID in CBMovieIDList):
        CBSimilarMovieDict = collections.defaultdict(float)
        for similarMovieID in ratedMovies:
            if(similarMovieID == movieID):
                continue
            try:
                CBSimilarMovieDict[similarMovieID] = CBSimilarityDict[similarMovieID][movieID]
            except:
                pass
        #select similar movies according to the similarity matrix calculated by metadata of movies
        for similarMovieID, similarity in sorted(CBSimilarMovieDict.items(), key = lambda d:d[1], reverse = True)[:K1]:
            try:
                CFSimilarMovieDict[similarMovieID] = CFSimilarityDict[similarMovieID][movieID]
            except:
                pass
    else:
        for similarMovieID in ratedMovies:
            if(similarMovieID == movieID):
                continue
            CFSimilarMovieDict[similarMovieID] = CFSimilarityDict[similarMovieID][movieID]
    #select similar movies according to the similarity matrix calculated by metadata of movies
    for similarMovieID, similarity in sorted(CFSimilarMovieDict.items(), key = lambda d:d[1], reverse = True)[:K2]:
        similaritySum += similarity
        ratingSimSum += similarity * ratedMovies[similarMovieID]
    #predict the rating value
    if(similaritySum == 0):
        result = 0
    else:
        result = 1.0 * ratingSimSum / similaritySum
    return result

In [16]:
#predict the rating value(User1 on Movie594)
predictRating(trainDataSet,  CBSimilarityDict,CFSimilarityDict,  1, 594, 100, 10)

4.455166940096121

In [17]:
#The rating value that User1 rated on Movie594 is 4 
testDataSet[1][594]

4

## Calculate the mean absolute error 

In [18]:
# calculate the mean absolute error on the user
def evaluateUserMAE(trainDataSet, testDataSet, CBSimilarityDict, CFSimilarityDict, userID, K1, K2):
    AESum = 0.0
    n = len(testDataSet[userID].keys())
    for movieID in testDataSet[userID].keys():
        try:
            AESum += np.fabs(predictRating(trainDataSet, CBSimilarityDict, CFSimilarityDict, userID, movieID, K1, K2) - testDataSet[userID][movieID])
        except:
            n -= 1
    return [AESum, n]

In [20]:
#calculate the mean absolute error on User 1
AESum, n = evaluateUserMAE(trainDataSet, testDataSet, CBSimilarityDict, CFSimilarityDict, 1, 200, 10)
AESum/n

0.5608424425381069

In [22]:
#calculate the mean absolute error on the whole dataset
def evaluateMAE(trainDataSet, testDataSet, CBSimilarityDF, CFSimilarityDict, K1, K2):
    AESum = 0.0
    n = 0
    for userID in list(testDataSet.keys()):
        curAE, curN = evaluateUserMAE(trainDataSet, testDataSet, CBSimilarityDF, CFSimilarityDict, userID, K1, K2)
        if(np.isnan(curAE) or np.isnan(curN)):
            continue
        AESum += curAE
        n += curN
    return AESum / n

In [23]:
#calculate the mean absolute error on the whole dataset
evaluateMAE(trainDataSet, testDataSet, CBSimilarityDict, CFSimilarityDict, 200, 10)

0.7366821146298921