In [26]:
import pandas as pd
import numpy as np

# #Import data from the clean file 
df = pd.read_csv('../data/metadata_clean.csv')

# #Print the head of the cleaned DataFrame
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995


In [27]:
# 오류(SettingWithCopyError 발생)
#pd.set_option('mode.chained_assignment', 'raise') # SettingWithCopyError

# 경고(SettingWithCopyWarning 발생, 기본 값입니다)
#pd.set_option('mode.chained_assignment', 'warn') # SettingWithCopyWarning

# 무시
pd.set_option('mode.chained_assignment',  None) # <==== 경고를 끈다

In [28]:
#Import the original file
orig_df = pd.read_csv('../data/movies_metadata.csv', low_memory=False)

#Add the useful features into the cleaned dataframe
#df['overview'], df['id'] = orig_df['overview'], orig_df['id']

#df.head()

orig_df['tagline'] = orig_df['tagline'].fillna('')
df['description'] = orig_df['overview'] + orig_df['tagline']
df['description'] = df['description'].fillna('')
df['id'] = orig_df['id']

In [29]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,description,id
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


In [30]:
df.shape

(45466, 8)

In [31]:
md = df.drop([19730, 29503, 35587])

In [32]:
#Check EDA Notebook for how and why I got these indices.
md['id'] = md['id'].astype('int')

In [34]:
md.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,description,id
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


In [35]:
links_small = pd.read_csv('../data/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

md['description'] = md['description'].fillna('')

In [36]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 8)

In [37]:
smd.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,description,id
0,Toy Story,"['animation', 'comedy', 'family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['adventure', 'fantasy', 'family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['romance', 'comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['comedy', 'drama', 'romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


In [38]:
 print(smd.shape)

(9099, 8)


In [39]:
#Import TfIdfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(smd['description'].values.astype('U'))

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(9099, 268124)

In [40]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [41]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

# Metadata Based Recommender

In [42]:
# Load the keywords and credits files
cred_df = pd.read_csv('../data/credits.csv')
key_df = pd.read_csv('../data/keywords.csv')

In [43]:
# Function to convert all non-integer IDs to NaN
def clean_ids(x):
    try:
        return int(x)
    except:
        return np.nan

In [44]:
#Clean the ids of df
md['id'] = md['id'].apply(clean_ids)

#Filter all rows that have a null ID
# nan 값을 삭제한다. 
md = md[md['id'].notnull()]

In [45]:
# Convert IDs into integer
md['id'] = md['id'].astype('int')
key_df['id'] = key_df['id'].astype('int')
cred_df['id'] = cred_df['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
md = md.merge(cred_df, on='id')
md = md.merge(key_df, on='id')

#Display the head of df
#df.head()
df.shape

(45466, 8)

In [47]:
# Convert the stringified objects into the native python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    md[feature] = md[feature].apply(literal_eval)

In [48]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 11)

In [49]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [50]:
smd['director'] = smd['crew'].apply(get_director)

In [51]:
# Returns the list top 3 elements or entire list; whichever is more.
def generate_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [52]:
#Apply the generate_list function to cast and keywords
smd['cast'] = smd['cast'].apply(generate_list)
smd['keywords'] = smd['keywords'].apply(generate_list)

In [53]:
#Only consider a maximum of 3 genres
smd['genres'] = smd['genres'].apply(lambda x: x[:3])

In [54]:
# Function to sanitize data to prevent ambiguity. It removes spaces and converts to lowercase
def sanitize(x):
    if isinstance(x, list):
        #Strip spaces and convert to lowercase
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [55]:
#Apply the generate_list function to cast, keywords, director and genres
for feature in ['cast', 'director', 'genres', 'keywords']:
    smd[feature] = smd[feature].apply(sanitize)

In [56]:
#Function that creates a soup out of the desired metadata
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [57]:
# Create the new soup feature
smd['soup'] = smd.apply(create_soup, axis=1)

In [58]:
smd['soup']

0        jealousy toy boy tomhanks timallen donrickles ...
1        boardgame disappearance basedonchildren'sbook ...
2        fishing bestfriend duringcreditsstinger walter...
3        basedonnovel interracialrelationship singlemot...
4        baby midlifecrisis confidence stevemartin dian...
                               ...                        
40952    friendship brickmaking sidneypoitier wendycrew...
41172    bollywood akshaykumar ileanad'cruz eshagupta t...
41225    bollywood hrithikroshan poojahegde kabirbedi a...
41391    monster godzilla giantmonster hirokihasegawa y...
41669    music documentary paulmccartney ringostarr joh...
Name: soup, Length: 9219, dtype: object

In [59]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [60]:
#Import cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [61]:
cosine_sim.shape

(9219, 9219)

In [62]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [63]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [65]:
#Build the SVD based Collaborative filter
from surprise import SVD, Reader, Dataset
from surprise.model_selection import KFold

reader = Reader()
ratings = pd.read_csv('../data/ratings_small.csv')
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

#data.split(n_folds=5)
kf = KFold(n_splits=5)
kf.split(data)

svd = SVD()
trainset = data.build_full_trainset()
svd.fit(trainset)

In [67]:
ratings[ratings['userId'] == 1][:5]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [68]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.661298406845673, details={'was_impossible': False})

In [69]:
id_map = pd.read_csv('../data/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
#id_map = id_map.set_index('tmdbId')

indices_map = id_map.set_index('id')

In [70]:
indices_map

Unnamed: 0_level_0,movieId
id,Unnamed: 1_level_1
862.0,1
8844.0,2
15602.0,3
31357.0,4
11862.0,5
...,...
159550.0,161944
392572.0,162542
402672.0,162672
315011.0,163056


In [71]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [72]:
hybrid(1, 'Avatar')

Unnamed: 0,title,vote_count,vote_average,year,id,est
8867,Warcraft,2325.0,6.3,2016,68735,2.958789
8166,The Amazing Spider-Man,6734.0,6.5,2012,1930,2.827746
8658,X-Men: Days of Future Past,6155.0,7.5,2014,127585,2.803375
5538,Spider-Man 2,4432.0,6.7,2004,558,2.752684
6084,Beastmaster 2: Through the Portal of Time,17.0,4.6,1991,27549,2.720521
8636,The Amazing Spider-Man 2,4274.0,6.5,2014,102382,2.70378
5310,Frank Herbert's Dune,114.0,6.7,2000,876,2.701473
6576,The Covenant,295.0,5.2,2006,9954,2.670215
7000,The Forbidden Kingdom,476.0,6.3,2008,1729,2.664175
8521,Thor: The Dark World,4873.0,6.8,2013,76338,2.645765


In [None]:
def hybrid(userId, title):
    #Extract the cosine_sim index of the movie
    idx = cosine_sim_map[title]
    
    #Extract the TMDB ID of the movie
    tmdbId = title_to_id.loc[title]['id']
    
    #Extract the movie ID internally assigned by the dataset
    movie_id = title_to_id.loc[title]['movieId']
    
    #Extract the similarity scores and their corresponding index for every movie from the cosine_sim matrix
    sim_scores = list(enumerate(cosine_sim[str(int(idx))]))
    
    #Sort the (index, score) tuples in decreasing order of similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    #Select the top 25 tuples, excluding the first 
    #(as it is the similarity score of the movie with itself)
    sim_scores = sim_scores[1:26]
    
    #Store the cosine_sim indices of the top 25 movies in a list
    movie_indices = [i[0] for i in sim_scores]

    #Extract the metadata of the aforementioned movies
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    
    #Compute the predicted ratings using the SVD filter
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, id_to_title.loc[x]['movieId']).est)
    
    #Sort the movies in decreasing order of predicted rating
    movies = movies.sort_values('est', ascending=False)
    
    #Return the top 10 movies as recommendations
    return movies.head(10)