# Content-based Recommendation

Analizing the database of The Movies Dataset: [The Movies Dataset](https://www.kaggle.com/rounakbanik/the-movies-dataset) 


inspired by [Movie Recommender Systems](https://www.kaggle.com/rounakbanik/movie-recommender-systems/notebook)

In [66]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

#You have to install surprise
from surprise import Reader, Dataset, SVD, Dataset, accuracy
from surprise.model_selection import cross_validate, train_test_split


import warnings; warnings.simplefilter('ignore')

In [3]:
md = pd.read_csv('2-Data/md_simple_recommender.csv')

links_small  = pd.read_csv('2-Data/links_small.csv')
print(md.shape)
links_small [:5]

(44048, 24)


Unnamed: 0.1,Unnamed: 0,movieId,imdbId,tmdbId
0,0,1,114709,862.0
1,1,2,113497,8844.0
2,2,3,113228,15602.0
3,3,4,114885,31357.0
4,4,5,113041,11862.0


In [4]:
links_small.dtypes

Unnamed: 0      int64
movieId         int64
imdbId          int64
tmdbId        float64
dtype: object

In [5]:
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [6]:
links_small.dtypes

dtype('int32')

In [7]:
md.dtypes

Unnamed: 0                int64
Unnamed: 0.1              int64
adult                      bool
budget                    int64
genres                   object
id                        int64
imdb_id                  object
original_language        object
original_title           object
overview                 object
popularity              float64
poster_path              object
production_companies     object
production_countries     object
release_date             object
revenue                 float64
runtime                 float64
spoken_languages         object
status                   object
title                    object
video                      bool
vote_average            float64
vote_count              float64
year                      int64
dtype: object

In [8]:
smd = md[md['id'].isin(links_small)]
print(smd.shape)
smd[:3]

(9082, 24)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,adult,budget,genres,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,title,video,vote_average,vote_count,year
0,0,0,False,30000000,"['Animation', 'Comedy', 'Family']",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Toy Story,False,7.7,5415.0,1995
1,1,1,False,65000000,"['Adventure', 'Fantasy', 'Family']",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Jumanji,False,6.9,2413.0,1995
2,2,2,False,0,"['Romance', 'Comedy']",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Grumpier Old Men,False,6.5,92.0,1995


#### Movie Overview Based Recommender

In [9]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['overview'])

In [10]:
tfidf_matrix.shape

(9082, 244086)

#### Cosine Similarity

cosine(x,y)=(x.y⊺)/(||x||.||y||)

In [11]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [12]:
cosine_sim[0]

array([1.        , 0.00742877, 0.        , ..., 0.        , 0.00345808,
       0.        ])

In [13]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [14]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

Be careful, make the movie name as complete as possible.

In [15]:
get_recommendations('Inception')[:10]

5230                                Cypher
6388                           Renaissance
1696                                 House
2820      What Ever Happened to Baby Jane?
318                                   Cobb
8556    Mission: Impossible - Rogue Nation
975            Once Upon a Time in America
141                                  Crumb
8854                       Pitch Perfect 2
6034           The Seven-Per-Cent Solution
Name: title, dtype: object

### Metadata Based Recommender

In [16]:
credits = pd.read_csv('2-Data/credits.csv')
keywords = pd.read_csv('2-Data/keywords.csv')

In [17]:
print(credits.dtypes)
print(keywords.dtypes)

Unnamed: 0     int64
cast          object
crew          object
id             int64
dtype: object
Unnamed: 0     int64
id             int64
keywords      object
dtype: object


In [18]:
md['id'] = md['id'].astype('int')
md.shape

(44048, 24)

In [19]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [20]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9202, 29)

In [21]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))

In [22]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [23]:
smd['director'] = smd['crew'].apply(get_director)

In [24]:
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [25]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [26]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [27]:
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x,x, x])

In [28]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [29]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [30]:
s = s[s > 1]

In [31]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [32]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [33]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [34]:
smd['genres'] = smd['genres'].apply(literal_eval)

In [35]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']

smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [36]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [37]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [38]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [39]:
get_recommendations('Inception')[:10]

6613                             The Prestige
3373                                  Memento
4137                                 Insomnia
2077                                Following
8020                    The Dark Knight Rises
8601                             Interstellar
6971                          The Dark Knight
6208                            Batman Begins
5628    Sky Captain and the World of Tomorrow
8488                                  Don Jon
Name: title, dtype: object

In [40]:
m = 452.65
C = 5.29

In [41]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [42]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [43]:
improved_recommendations('Inception')

Unnamed: 0,title,vote_count,vote_average,year,wr
6971,The Dark Knight,12269,8,2008,7.903575
8601,Interstellar,11187,8,2014,7.894612
6613,The Prestige,4510,8,2006,7.752817
3373,Memento,4168,8,2000,7.734522
8020,The Dark Knight Rises,9263,7,2012,6.920331
6208,Batman Begins,7511,7,2005,6.902804
4165,Minority Report,2663,7,2002,6.751567
8195,Looper,4777,6,2012,5.938546
7276,X-Men Origins: Wolverine,4086,6,2009,5.92919
7892,Green Lantern,2551,5,2011,5.043703


## Collaborative Filtering

In [80]:
# instantiate a reader and read in our rating data
ratings_f = pd.read_csv('2-Data/ratings_small.csv')
ratings_f.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp
0,0,1,31,2.5,1260759144
1,1,1,1029,3.0,1260759179
2,2,1,1061,3.0,1260759182
3,3,1,1129,2.0,1260759185
4,4,1,1172,4.0,1260759205


In [81]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_f[['userId','movieId','rating']], reader)

In [82]:
# train SVD on 75% of known rates
trainset, testset = train_test_split(data, test_size=.25)
algorithm = SVD()
algorithm.fit(trainset)
predictions = algorithm.test(testset)

In [83]:
# check the accuracy using Root Mean Square Error
accuracy.rmse(predictions)

RMSE: 0.9005


0.900526769788059

## Hybrid Recommender


In [85]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [87]:
id_map = pd.read_csv('2-Data/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
#id_map = id_map.set_index('tmdbId')

In [88]:
indices_map = id_map.set_index('id')

In [89]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [91]:
hybrid(1, 'Inception')

Unnamed: 0,title,vote_count,vote_average,year,id,est
3373,Memento,4168.0,8.1,2000,77,3.483963
6971,The Dark Knight,12269.0,8.3,2008,155,3.41145
6613,The Prestige,4510.0,8.0,2006,1124,3.310891
6208,Batman Begins,7511.0,7.5,2005,272,3.1912
8601,Interstellar,11187.0,8.1,2014,157336,3.080559
4165,Minority Report,2663.0,7.1,2002,180,2.948731
6957,Doomsday,374.0,5.8,2008,13460,2.864911
476,The Shadow,140.0,5.4,1994,8850,2.862267
8195,Looper,4777.0,6.6,2012,59967,2.815374
7817,I Am Number Four,1606.0,5.9,2011,46529,2.809198


In [92]:
hybrid(500, 'Inception')

Unnamed: 0,title,vote_count,vote_average,year,id,est
6613,The Prestige,4510.0,8.0,2006,1124,3.55883
3373,Memento,4168.0,8.1,2000,77,3.557489
6971,The Dark Knight,12269.0,8.3,2008,155,3.348588
6630,Déjà Vu,1519.0,6.6,2006,7551,3.146931
5570,The Three Lives of Thomasina,12.0,6.8,1963,15081,3.13534
8601,Interstellar,11187.0,8.1,2014,157336,3.120681
8195,Looper,4777.0,6.6,2012,59967,3.097244
4588,The Core,531.0,5.5,2003,9341,3.055094
476,The Shadow,140.0,5.4,1994,8850,3.048557
2077,Following,363.0,7.2,1998,11660,2.992638
