# Content-based Recommendation

Analyzing [The Movies Dataset](https://www.kaggle.com/rounakbanik/the-movies-dataset) from Kaggle.


Inspired by the [Movie Recommender Systems](https://www.kaggle.com/rounakbanik/movie-recommender-systems/notebook) notebook.

In [12]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

# Requires the third-party "surprise" package (published on PyPI as scikit-surprise)
from surprise import Reader, Dataset, SVD#, evaluate
from surprise.model_selection import cross_validate


import warnings; warnings.simplefilter('ignore')

In [13]:
# Movie metadata prepared for the simple recommender.
md = pd.read_csv('2-Data/md_simple_recommender.csv')
# Link table holding the movieId / imdbId / tmdbId columns for the small subset.
links_small = pd.read_csv('2-Data/links_small.csv')

print(md.shape)
links_small.head()

(44048, 24)


Unnamed: 0.1,Unnamed: 0,movieId,imdbId,tmdbId
0,0,1,114709,862.0
1,1,2,113497,8844.0
2,2,3,113228,15602.0
3,3,4,114885,31357.0
4,4,5,113041,11862.0


In [14]:
# Check column dtypes: tmdbId is float64 here (presumably because of missing
# values), so it is converted to integers in the next cell.
links_small.dtypes

Unnamed: 0      int64
movieId         int64
imdbId          int64
tmdbId        float64
dtype: object

In [15]:
# Keep only the TMDB ids: discard missing entries, then cast the remaining
# float values to integers. From here on `links_small` is a Series, not a DataFrame.
links_small = links_small['tmdbId'].dropna().astype('int')

In [16]:
# `links_small` is now a Series, so this reports a single (integer) dtype.
links_small.dtypes

dtype('int32')

In [17]:
# Column dtypes of the metadata frame; `id` is the integer key that is matched
# against the TMDB ids in `links_small`.
md.dtypes

Unnamed: 0                int64
Unnamed: 0.1              int64
adult                      bool
budget                    int64
genres                   object
id                        int64
imdb_id                  object
original_language        object
original_title           object
overview                 object
popularity              float64
poster_path              object
production_companies     object
production_countries     object
release_date             object
revenue                 float64
runtime                 float64
spoken_languages         object
status                   object
title                    object
video                      bool
vote_average            float64
vote_count              float64
year                      int64
dtype: object

In [22]:
# Restrict the metadata to movies whose id appears among the small-subset TMDB ids.
smd = md.loc[md['id'].isin(links_small)]

print(smd.shape)
smd.head(3)

(9082, 24)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,adult,budget,genres,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,title,video,vote_average,vote_count,year
0,0,0,False,30000000,"['Animation', 'Comedy', 'Family']",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Toy Story,False,7.7,5415.0,1995
1,1,1,False,65000000,"['Adventure', 'Fantasy', 'Family']",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Jumanji,False,6.9,2413.0,1995
2,2,2,False,0,"['Romance', 'Comedy']",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Grumpier Old Men,False,6.5,92.0,1995


#### Movie Overview Based Recommender

In [24]:
# Build TF-IDF features from the overviews using unigrams and bigrams, with
# English stop words removed.
# min_df=1 keeps every term that occurs in at least one document. That is the
# same vocabulary the previous min_df=0 produced, but an integer 0 is rejected
# by the parameter validation in recent scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
# Overviews that are empty in the CSV load as NaN, which the vectorizer cannot
# process; treat them as empty documents instead.
tfidf_matrix = tf.fit_transform(smd['overview'].fillna(''))

In [25]:
# (number of movies, number of distinct unigram/bigram terms)
tfidf_matrix.shape

(9082, 244086)

#### Cosine Similarity

$$\text{cosine}(x, y) = \frac{x \cdot y^{\top}}{\lVert x \rVert \, \lVert y \rVert}$$

In [27]:
# Pairwise dot products between all TF-IDF rows. TfidfVectorizer's `norm`
# option is left at its default here (documented as 'l2'), in which case every
# row has unit length and the dot product equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [28]:
# Similarity of the first movie to every movie; position 0 is its similarity to itself.
cosine_sim[0]

array([1.        , 0.00742877, 0.        , ..., 0.        , 0.00345808,
       0.        ])

In [29]:
# Renumber rows 0..n-1 so that row positions line up with the rows/columns of
# `cosine_sim` (the previous index is kept as a regular column).
smd = smd.reset_index()

# Position -> title.
titles = smd['title']
# Title -> position (reverse lookup used by the recommender).
indices = pd.Series(smd.index, index=titles)

In [30]:
def get_recommendations(title, top_n=30):
    """Return the titles of the movies most similar to ``title``.

    Similarity scores are read from the module-level ``cosine_sim`` matrix.
    ``indices`` maps a title to its row position and ``titles`` maps row
    positions back to titles.

    Parameters
    ----------
    title : str
        Movie title exactly as it appears in ``indices``.
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles ordered from most to least similar. The queried
        movie itself is never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # Several movies can share one title; the lookup then yields a Series of
    # positions instead of a scalar. Use the first matching movie.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()
    # A stable sort on the negated scores gives descending order while keeping
    # tied entries in positional order (same tie-breaking as
    # sorted(..., reverse=True)).
    order = np.argsort(-scores, kind='stable')
    # Remove the queried movie by position rather than assuming it ranks first:
    # another movie with a tied top score can precede it.
    order = order[order != idx][:top_n]
    return titles.iloc[order]

Note: the lookup is an exact match on the title, so enter the movie's full title exactly as it appears in the dataset.

In [36]:
# Show the ten closest matches for one example title.
get_recommendations('Inception').head(10)

5230                                Cypher
6388                           Renaissance
1696                                 House
2820      What Ever Happened to Baby Jane?
318                                   Cobb
8556    Mission: Impossible - Rogue Nation
975            Once Upon a Time in America
141                                  Crumb
8854                       Pitch Perfect 2
6034           The Seven-Per-Cent Solution
Name: title, dtype: object

### Metadata Based Recommender