In [1]:
import csv

In [2]:
import pandas as pd
import numpy as np

# Data Ingestion

In [5]:
credit = pd.read_csv("tmdb_5000_credits.csv")

In [6]:
movies = pd.read_csv("tmdb_5000_movies.csv")

In [7]:
credit.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [8]:
movies.shape

(4803, 20)

# Data Processing Part 1

In [9]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
from ast import literal_eval

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
#get the whole dataset
movies = movies.merge(credit,left_on="id",right_on="movie_id")

In [13]:
#fill na on overview data
movies['overview'] = movies['overview'].fillna('')

In [14]:
movies['cast'] = movies['cast'].apply(literal_eval)
movies['crew'] = movies['crew'].apply(literal_eval)
movies['keywords'] = movies['keywords'].apply(literal_eval)
movies['genres'] = movies['genres'].apply(literal_eval)

In [15]:
# feature engineering

In [16]:
# get first three characters 
def flatten_char(mylist):
    for i in mylist:
        if isinstance(i,dict):
            charlist = [i['character'] for i in mylist]
            return ",".join(str(i) for i in charlist[0:4])

movies['major_cast'] = movies['cast'].apply(flatten_char)

In [17]:
# get director director
def get_director(x):
    for i in x:
        if i['job'].lower() == 'director':
            return i['name']

movies['director'] = movies['crew'].apply(get_director)

In [18]:
# get keywords
def flatten_key(mylist):
    for i in mylist:
        if isinstance(i,dict):
            keylist = [i['name'] for i in mylist]
            return ",".join(str(i) for i in keylist)

movies['keys_words'] = movies['keywords'].apply(flatten_key)

In [19]:
# get genres
def flatten_genres(mylist):
    for i in mylist:
        if isinstance(i,dict):
            genreslist = [i['name'] for i in mylist]
            return ",".join(str(i) for i in genreslist)
        
movies['genres_list'] = movies['genres'].apply(flatten_genres)

In [20]:
features = ['keys_words','genres_list','overview','popularity','major_cast','director','vote_average']

# Recommendation System #1 - Content Based Filtering

In [21]:
content_movie = pd.DataFrame()
for i in features:
    content_movie[i] = movies[i]

In [22]:
content_movie['title'] = movies['original_title']

In [23]:
# Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
# tfidf = TfidfVectorizer(stop_words='english')
# tfidf_matrix[i] = tfidf.fit_transform(content_movie)

In [24]:
# create a column with all the words
def word_list(x):
    return ''.join(x['keys_words']) + ' '.join(x['major_cast']) + ' ' + x['director'] + ' '.join(x['genres_list'])

In [25]:
#fillna with " "
content_movie = content_movie.fillna(' ')

In [26]:
content_movie['words'] = content_movie.apply(word_list,axis=1)

In [27]:
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(content_movie['words'])

In [28]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [29]:
def content_based_filter(title,cosine_sim = cosine_sim):
    # movie index in the database
    i = int(content_movie[content_movie['title']==title].index.values)
    sim_scores = sorted(list(enumerate(cosine_sim[i])),key=lambda x: x[1],reverse = True)
    # get the first 10 scores
    sim_scores = sim_scores[1:11]
    recommend_i = [i[0] for i in sim_scores]
    return content_movie['title'].iloc[recommend_i]
    

In [30]:
content_movie.head()

Unnamed: 0,keys_words,genres_list,overview,popularity,major_cast,director,vote_average,title,words
0,"culture clash,future,space war,space colony,so...","Action,Adventure,Fantasy,Science Fiction","In the 22nd century, a paraplegic Marine is di...",150.437577,"Jake Sully,Neytiri,Dr. Grace Augustine,Col. Qu...",James Cameron,7.2,Avatar,"culture clash,future,space war,space colony,so..."
1,"ocean,drug abuse,exotic island,east india trad...","Adventure,Fantasy,Action","Captain Barbossa, long believed to be dead, ha...",139.082615,"Captain Jack Sparrow,Will Turner,Elizabeth Swa...",Gore Verbinski,6.9,Pirates of the Caribbean: At World's End,"ocean,drug abuse,exotic island,east india trad..."
2,"spy,based on novel,secret agent,sequel,mi6,bri...","Action,Adventure,Crime",A cryptic message from Bond’s past sends him o...,107.376788,"James Bond,Blofeld,Madeleine,M",Sam Mendes,6.3,Spectre,"spy,based on novel,secret agent,sequel,mi6,bri..."
3,"dc comics,crime fighter,terrorist,secret ident...","Action,Crime,Drama,Thriller",Following the death of District Attorney Harve...,112.31295,"Bruce Wayne / Batman,Alfred Pennyworth,James G...",Christopher Nolan,7.6,The Dark Knight Rises,"dc comics,crime fighter,terrorist,secret ident..."
4,"based on novel,mars,medallion,space travel,pri...","Action,Adventure,Science Fiction","John Carter is a war-weary, former military ca...",43.926995,"John Carter,Dejah Thoris,Sola,Tars Tarkas",Andrew Stanton,6.1,John Carter,"based on novel,mars,medallion,space travel,pri..."


In [31]:
content_based_filter('The Godfather')

2674                                   Stomp the Yard
448                                     Cold Mountain
1408                                           Closer
2731                           The Godfather: Part II
784     In the Name of the King: A Dungeon Siege Tale
1874                                      August Rush
2476                                   White Oleander
1900                      The Adventures of Pinocchio
2989                                    Happy Gilmore
119                                     Batman Begins
Name: title, dtype: object

# Recommendation Sytem - Popularity (Demographic) Filtering

In this popularity part, we will provide the top ten most popular movies to the users who are new to the website. Since we don't have any information about those users' interests, we will provide them the most popular movies alternatively.
    
The following script is educated by: https://www.kaggle.com/ibtesama/getting-started-with-a-movie-recommendation-system/data

In Deomographic-filtering recommendation system we will use the following forumula to calculate the weighted rating:
    
    wr = (v/(v+m) * R) + (m/(m+v) * C)
where:
* v is the number of votes for the movie;
* m is the minimum votes required to be listed in the chart;
* R is the average rating of the movie; And
* C is the mean vote across the whole report


In [32]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'movie_id', 'title_y', 'cast', 'crew', 'major_cast',
       'director', 'keys_words', 'genres_list'],
      dtype='object')

In [33]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,vote_average,vote_count,movie_id,title_y,cast,crew,major_cast,director,keys_words,genres_list
0,237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.avatarmovie.com/,19995,"[{'id': 1463, 'name': 'culture clash'}, {'id':...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,7.2,11800,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...","Jake Sully,Neytiri,Dr. Grace Augustine,Col. Qu...",James Cameron,"culture clash,future,space war,space colony,so...","Action,Adventure,Fantasy,Science Fiction"
1,300000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://disney.go.com/disneypictures/pirates/,285,"[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,6.9,4500,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...","Captain Jack Sparrow,Will Turner,Elizabeth Swa...",Gore Verbinski,"ocean,drug abuse,exotic island,east india trad...","Adventure,Fantasy,Action"
2,245000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{'id': 470, 'name': 'spy'}, {'id': 818, 'name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,6.3,4466,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...","James Bond,Blofeld,Madeleine,M",Sam Mendes,"spy,based on novel,secret agent,sequel,mi6,bri...","Action,Adventure,Crime"
3,250000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",http://www.thedarkknightrises.com/,49026,"[{'id': 849, 'name': 'dc comics'}, {'id': 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,7.6,9106,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...","Bruce Wayne / Batman,Alfred Pennyworth,James G...",Christopher Nolan,"dc comics,crime fighter,terrorist,secret ident...","Action,Crime,Drama,Thriller"
4,260000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://movies.disney.com/john-carter,49529,"[{'id': 818, 'name': 'based on novel'}, {'id':...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,6.1,2124,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...","John Carter,Dejah Thoris,Sola,Tars Tarkas",Andrew Stanton,"based on novel,mars,medallion,space travel,pri...","Action,Adventure,Science Fiction"


In [34]:
demo_movie = movies[['movie_id','title_y','major_cast','crew','vote_average','vote_count']]

In [35]:
C = demo_movie['vote_average'].mean()

In [36]:
C

6.092171559442011

Since m is the minimum votes required to be listed in the chart, we require that movies that can at least earn 95% vote counts to be in the chart.

In [37]:
m = demo_movie['vote_count'].quantile(0.95)

In [38]:
m

3040.8999999999996

In [39]:
demo_movie_filtered = demo_movie.copy().loc[demo_movie['vote_count'] >= m]

In [40]:
def weighted_rating(df = demo_movie_filtered, m=m, C=C):
    v = df['vote_count']
    R = df['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [41]:
demo_movie_filtered['weighted_rating'] = demo_movie_filtered.apply(weighted_rating,axis=1)

In [42]:
demo_movie_filtered.sort_values('weighted_rating', ascending=False).head()

Unnamed: 0,movie_id,title_y,major_cast,crew,vote_average,vote_count,weighted_rating
1881,278,The Shawshank Redemption,"Andy Dufresne,Ellis Boyd 'Red' Redding,Warden ...","[{'credit_id': '52fe4231c3a36847f800b153', 'de...",8.5,8205,7.848921
65,155,The Dark Knight,"Bruce Wayne,Joker,Harvey Dent,Alfred","[{'credit_id': '55a0eb4a925141296b0010f8', 'de...",8.2,12002,7.773906
662,550,Fight Club,"The Narrator,Tyler Durden,Robert 'Bob' Paulson...","[{'credit_id': '55731b8192514111610027d7', 'de...",8.3,9413,7.760909
96,27205,Inception,"Dom Cobb,Arthur,Ariadne,Eames","[{'credit_id': '56e8462cc3a368408400354c', 'de...",8.1,13752,7.736417
3232,680,Pulp Fiction,"Vincent Vega,Jules Winfield,Mia Wallace,Butch ...","[{'credit_id': '52fe4269c3a36847f801caa3', 'de...",8.3,8428,7.714609


In [43]:
demo_movie_filtered.sort_values('weighted_rating', ascending=False)['title_y'].head()

1881    The Shawshank Redemption
65               The Dark Knight
662                   Fight Club
96                     Inception
3232                Pulp Fiction
Name: title_y, dtype: object

# Recommendation System #3- Collaborative Filtering 

In [44]:
from surprise import Reader, Dataset, SVD, evaluate
from surprise import SVDpp
from surprise import model_selection
from surprise.model_selection import cross_validate, GridSearchCV

In [45]:
meta_data_movie = pd.read_csv('movies_metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [46]:
meta_data_movie.loc[0]

adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                         21.9469
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
production_countries     

In [47]:
ratings = pd.read_csv("ratings_small.csv")

In [48]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [49]:
ratings = ratings.dropna()

In [50]:
#movies = movies.drop(['movie_id', 'title_x','title_y'], axis=1)

In [51]:
reader=Reader()
data_cf = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader)
data_cf.split(n_folds=5)
svd = SVD()
evaluate(svd, data_cf, measures=['RMSE', 'MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8950
MAE:  0.6897
------------
Fold 2
RMSE: 0.8962
MAE:  0.6904
------------
Fold 3
RMSE: 0.9017
MAE:  0.6929
------------
Fold 4
RMSE: 0.9013
MAE:  0.6932
------------
Fold 5
RMSE: 0.8981
MAE:  0.6927
------------
------------
Mean RMSE: 0.8984
Mean MAE : 0.6918
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8949552732452307,
                             0.8961707532417714,
                             0.9016665751207277,
                             0.9012953251882037,
                             0.8981081357947365],
                            'mae': [0.6897381847177811,
                             0.6903791099552992,
                             0.6928633097107469,
                             0.6931997188641745,
                             0.6927304184543744]})

In [52]:
trainset = data_cf.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1d545713588>

# Data Processing Part 2: Construct a dataframe to convert movieId, tmdbId, and movie titles 

In [53]:
link_small = pd.read_csv("links_small.csv")

link_small = link_small.dropna()

link_small['tmdbId'] = link_small['tmdbId'].apply(lambda x: int(x)) 

link_small.head()

ratings['movieId'] = ratings['movieId'].apply(lambda x:int(x))
link_small['movieId'] = link_small['movieId'].apply(lambda x:int(x))

ratings = ratings[ratings['movieId'].isin(link_small['movieId'])]

movie_id = pd.DataFrame(ratings['movieId']).drop_duplicates()

tmdb_list = movie_id['movieId'].apply(lambda x: link_small.loc[link_small['movieId']==x,'tmdbId'].values[0])

movie_id['tmdbId']=tmdb_list

movie_id.head()

movie_temp = meta_data_movie.drop_duplicates()
movie_temp.head()

movie_id = movie_id[movie_id['tmdbId'].apply(lambda x:str(x)).isin(movie_temp['id'])]

movie_id['name'] = movie_id['tmdbId'].apply(lambda x: movie_temp.loc[movie_temp['id']==str(x),'original_title'].values[0])

movie_id.head()

ratings = ratings[ratings['movieId'].isin(movie_id['movieId'])]

In [54]:
def cf(userid,num):
    #movieid = movies.loc[movies['original_title']==item,"id"]
    if num>10: return print("Sorry, no more than movies")
    else:
        score_df = movie_id.copy()
        predict_score = movie_id['movieId'].apply(lambda x:svd.predict(userid,x).est)
        score_df['score']=predict_score
        score_df = score_df.sort_values(by='score',ascending = False)
        return print("Your recommended movies are: ", score_df.head(num)['name'].values)

In [55]:
cf('tt0113041',10)

Your recommended movies are:  ['The Shawshank Redemption' 'The Godfather' 'The African Queen'
 'All About Eve' 'The Godfather: Part II' 'It Happened One Night'
 'Cidade de Deus' 'The Usual Suspects' 'Modern Times' 'Roger & Me']


# Recommendation System #4 - Hybrid Recommendation System

In [56]:
def cf_rec(userid,title,num):
    if num >10: return print("Sorry, no more than 10 movies")
    else:
        i = int(movie_id[movie_id['name']==title].index.values)
        sim_scores = sorted(list(enumerate(cosine_sim[i])),key=lambda x: x[1],reverse = True)
            # get the first 10 scores
        sim_scores = sim_scores[1:30]
        recommend_i = [i[0] for i in sim_scores]
        new_movie_set = movie_id.iloc[recommend_i]
        score_df = movie_id.copy()
        predict_score = new_movie_set['movieId'].apply(lambda x:svd.predict(userid,x).est)
        score_df['score']=predict_score
        score_df = score_df.sort_values(by='score',ascending = False)
        return print("Your recommended movies are: ", score_df.head(num)['name'].values)


In [57]:
cf_rec('tt0113041','Avatar',7)

Your recommended movies are:  ['Moon' '卧虎藏龙' 'Drugstore Cowboy' 'Out of Africa' '22 Jump Street'
 'Viridiana' 'Dancer in the Dark']


# Conclusion

Even content-based filtering recommendation system is a good solution to address the cold-start problem, it has a limitation in predicting movies across the genres. In addition, it cannot capture users’ personal taste and favors. It will always return the same recommendation result no matter who the users are, as long as they enter the same favorite movie as the input for the engine. 

Collaborative-filtering recommendation system based on Singular Value Decomposition algorithm can resolve this problem by capturing the similarities between both users and movies. 


Hybrid recommendation system combines ideas from both content-based and collaborative filtering algorithms to predict expected scores of the similar movies that a particular user likes. 
