In [15]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [None]:
# Load the TMDB 5000 credits table (dataframe_1) and movies table (dataframe_2).
dataframe_1, dataframe_2 = [
    pd.read_csv(csv_path)
    for csv_path in ("../Data/tmdb_5000_credits.csv", "../Data/tmdb_5000_movies.csv")
]

In [None]:
# Rename the credits columns positionally so its key column is called 'id'.
# This assumes the credits table has exactly these four columns in this order.
# NOTE(review): 'tittle' is presumably misspelled on purpose so it does not clash
# with the movies table's 'title' column after the merge — confirm.
dataframe_1.columns = ['id','tittle','cast','crew']

# Attach cast and crew to every movie through the shared 'id' key (inner join).
dataframe_2 = dataframe_2.merge(right=dataframe_1, how='inner', on='id')

In [None]:
# Quick check to see if the merge worked: preview the first five merged rows.
dataframe_2.head(5)

# **Demographic Filtering**

## Weighted Average Rating

We can use the Weighted Rating (WR) as a metric to rank our movies. The WR is computed as follows:

\begin{equation} \text{Weighted Rating } (\mathbf{WR}) = \left(\frac{v}{v + m} \cdot R\right) + \left(\frac{m}{v + m} \cdot C\right) \end{equation}

where,
* v is the number of votes for the movie;
* m is the minimum votes required to be listed in the chart;
* R is the average rating of the movie; and
* C is the mean vote across the whole report

We already have v (**vote_count**) and R (**vote_average**); C can be calculated as the mean of **vote_average** over all movies:

In [None]:
# C: the mean of vote_average over every movie in the merged table.
C = dataframe_2['vote_average'].mean()
print(C)

In [None]:
# m: the 0.9 quantile of vote_count, used as the minimum number of votes
# a movie needs in order to be listed in the chart.
m = dataframe_2['vote_count'].quantile(0.9)
print(m)

In [None]:
# Keep only the movies with at least m votes, as an independent copy so that
# columns can be added to it without touching dataframe_2.
q_movies = dataframe_2[dataframe_2['vote_count'] >= m].copy()
q_movies.shape

In [None]:
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating of a movie record.

    x: row-like object exposing 'vote_count' and 'vote_average'.
    m: number of votes given to the overall mean C (defaults to the
       notebook-level m at the time this function is defined).
    C: overall mean rating (defaults to the notebook-level C at the time
       this function is defined).
    """
    votes, rating = x['vote_count'], x['vote_average']
    # The movie's own rating is trusted in proportion to its share of the
    # (votes + m) total; the remaining share falls back to the overall mean.
    own_weight = votes/(votes+m)
    mean_weight = m/(m+votes)
    return (own_weight * rating) + (mean_weight * C)

In [None]:
# Define a new feature 'score' holding each movie's weighted rating.
# weighted_rating() only performs column lookups and arithmetic, so it can be
# called once on the whole frame (vectorised) instead of once per row through
# DataFrame.apply(axis=1); the resulting values are the same.
q_movies['score'] = weighted_rating(q_movies)

In [None]:
# Rank the qualifying movies from the highest to the lowest weighted score.
q_movies = q_movies.sort_values(by='score', ascending=False)

# Show the title, vote count, vote average and weighted rating (score)
# of the 10 best-ranked movies.
q_movies.loc[:, ['title', 'vote_count', 'vote_average', 'score']].head(10)

In [None]:
# Bar colour used by the popularity chart, given as a 4-tuple of numbers
# (presumably RGBA components in the 0-1 range, as matplotlib accepts — confirm).
pink = (1, 0.078, 0.574, 1)

In [None]:
# Order every movie from most to least popular.
pop = dataframe_2.sort_values(by='popularity', ascending=False)

# Horizontal bar chart of the six most popular titles.
plt.figure(figsize=(12,4))
plt.barh(
    pop['title'].iloc[:6],
    pop['popularity'].iloc[:6],
    align='center',
    color=pink,
)

# barh draws the first bar at the bottom; flip the axis so the most popular is on top.
plt.gca().invert_yaxis()

plt.xlabel("Popularity")
plt.title("Popular Movies")

## Content Based Filtering

In [None]:
# Preview the first five plot overviews; this column feeds the TF-IDF vectorizer below.
dataframe_2['overview'].head(5)

In [None]:
# Replace missing overviews with empty strings before vectorizing.
dataframe_2['overview'] = dataframe_2['overview'].fillna(value='')

# TF-IDF vectorizer that ignores English stop words such as 'the' and 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the TF-IDF matrix (one row per movie) in one step.
overview_text = dataframe_2['overview']
tfidf_matrix = tfidf.fit_transform(overview_text)
del overview_text  # temporary alias only; keep the notebook namespace unchanged

# Output the shape of tfidf_matrix: (number of movies, vocabulary size).
tfidf_matrix.shape

We see that over 20,000 different words were used to describe the 4800 movies in our dataset.

With this matrix in hand, we can now compute a similarity score. There are several candidates for this; such as the euclidean, the Pearson and the [cosine similarity scores](https://en.wikipedia.org/wiki/Cosine_similarity). There is no right answer to which score is the best. Different scores work well in different scenarios and it is often a good idea to experiment with different metrics.

We will be using the cosine similarity to calculate a numeric quantity that denotes the similarity between two movies. We use the cosine similarity score since it is independent of magnitude and is relatively easy and fast to calculate.

### Cosine Similarity

Mathematically, it is defined as follows:

\begin{equation} \cos(x, y) = \frac{x \cdot y^{T}}{\|x\| \, \|y\|} = \frac{\sum_{i=1}^{n} x_{i} \, y_{i}}{\sqrt{\sum_{i=1}^{n} x_{i}^{2}} \; \sqrt{\sum_{i=1}^{n} y_{i}^{2}}} \end{equation}

In [None]:
# Compute the cosine similarity matrix.
# linear_kernel returns the plain dot product between every pair of TF-IDF rows.
# That equals cosine similarity only if the rows are unit length
# (TfidfVectorizer presumably L2-normalises rows by default — confirm against
# the installed scikit-learn version).
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Construct a reverse map from movie title to its row label in dataframe_2.
indices = pd.Series(dataframe_2.index, index=dataframe_2['title'])

# Series.drop_duplicates() compares the *values* (row labels, which are already
# unique), so it never removed repeated titles. De-duplicate on the title labels
# instead, keeping the first row of each title, so that indices[title] always
# yields a single value rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

#### Recommendation System Function

*

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Steps:
      1. Look up the row position of the movie from its title.
      2. Pair every movie's position with its similarity score to that movie,
         leaving out the movie itself.
      3. Sort the pairs by score, highest first.
      4. Return the titles at the positions of the ``top_n`` best pairs.

    Parameters:
        title: movie title to look up in the notebook-level ``indices`` map.
        cosine_sim: square similarity matrix whose rows/columns follow the row
            order of ``dataframe_2`` (defaults to the matrix that was bound to
            ``cosine_sim`` when this function was defined).
        top_n: number of recommendations to return (default 10).

    Returns:
        A pandas Series of titles taken from ``dataframe_2['title']``.

    Raises:
        KeyError: if ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title.
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # use the first matching movie instead of failing later on a 2-D slice.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pairwise similarity of every other movie with this one. The movie is
    # excluded by position rather than by assuming it sorts first, which does
    # not hold when its own score is tied (e.g. an all-zero feature vector).
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies by similarity score, most similar first (stable sort).
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Positions of the top_n most similar movies.
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the corresponding titles.
    return dataframe_2['title'].iloc[movie_indices]

In [None]:
# Recommendations for one title using the function's default similarity matrix.
get_recommendations('The Dark Knight Rises')

In [None]:
# Same lookup for a second title, again with the default similarity matrix.
get_recommendations('The Avengers')

### Result
While our system has done a decent job of finding movies with similar plot descriptions, the quality of recommendations is not that great. "The Dark Knight Rises" returns all Batman movies while it is more likely that the people who liked that movie are more inclined to enjoy other Christopher Nolan movies. This is something that cannot be captured by the present system.

## **Credits, Genres and Keywords Based Recommender**

In [None]:
# These columns are passed through literal_eval, i.e. they are presumably stored
# in the CSV as strings containing Python literals; turn them into real objects.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    # Only parse values that are still strings: literal_eval raises on anything
    # else, which used to break this cell when it was re-run (values already
    # parsed) or when a value was missing (NaN).
    dataframe_2[feature] = dataframe_2[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [None]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    x: list of crew dicts, each expected to carry 'job' and 'name' keys.

    Returns np.nan when no director is listed, when an entry has no usable
    'job'/'name', or when x is not a list at all (missing/malformed data).
    """
    # Missing or malformed crew data: treat it as "no director listed"
    # instead of failing while iterating over a non-list value.
    if not isinstance(x, list):
        return np.nan
    for member in x:
        # .get() tolerates entries that lack a 'job' key.
        if member.get('job') == 'Director':
            return member.get('name', np.nan)
    return np.nan

In [None]:
def get_list(x, max_items=3):
    """Return the 'name' of the first ``max_items`` entries of ``x``.

    x: list of dicts that each carry a 'name' key.
    max_items: maximum number of names to return (default 3); None returns all.

    Returns every name when x has ``max_items`` entries or fewer, and an empty
    list when x is not a list (missing/malformed data).
    """
    # Return an empty list in case of missing/malformed data.
    if not isinstance(x, list):
        return []
    # Slicing copes with short lists on its own, so no length check is needed,
    # and names are only extracted for the entries that are actually kept.
    return [entry['name'] for entry in x[:max_items]]

In [None]:
# Define new director, cast, genres and keywords features that are in a suitable form.
# 'director' holds whatever get_director returns for each movie's crew value.
dataframe_2['director'] = dataframe_2['crew'].apply(get_director)

# Overwrite each of these columns with the list that get_list extracts from it.
# NOTE(review): the columns are replaced in place, so this cell is meant to run
# once, after the literal_eval parsing cell.
features = ['cast', 'keywords', 'genres']
for feature in features:
    dataframe_2[feature] = dataframe_2[feature].apply(get_list)

In [None]:
# Show the new features of the first 3 films.
dataframe_2[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

The next step would be to convert the names and keyword instances into lowercase and strip all the spaces between them. This is done so that our vectorizer doesn't count the Johnny of "Johnny Depp" and "Johnny Galecki" as the same.

In [None]:
def clean_data(x):
    """Lower-case text and remove every space from it.

    A list is cleaned element by element, a string is cleaned directly, and
    any other value (e.g. a missing director) becomes an empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither a list nor a string: nothing usable to clean.
    return ''

We are now in a position to create our "metadata soup", which is a string that contains all the metadata that we want to feed to our vectorizer (namely keywords, actors, director and genres).

In [None]:
def create_soup(x):
    """Build the space-separated metadata string ("soup") for one movie row.

    x: row exposing 'keywords', 'cast' and 'genres' (lists of strings) and
       'director' (a string, or a non-string such as NaN when missing).

    Each value is passed through clean_data() first, so every name becomes one
    lower-case token without spaces and a missing director contributes an
    empty string rather than a literal "nan" token.
    """
    keywords = clean_data(x['keywords'])
    cast = clean_data(x['cast'])
    director = clean_data(x['director'])
    genres = clean_data(x['genres'])
    # Join the list items themselves. The previous ' '.join(str(the_list))
    # joined the individual characters of the list's repr, so the vectorizer
    # received single letters for keywords, cast and genres.
    return ' '.join(keywords) + ' ' + ' '.join(cast) + ' ' + director + ' ' + ' '.join(genres)

dataframe_2['soup'] = dataframe_2.apply(create_soup, axis=1)

The next steps are the same as what we did with our plot description based recommender. One important difference is that we use the **CountVectorizer()** instead of TF-IDF. This is because we do not want to down-weight the presence of an actor/director if he or she has acted or directed in relatively more movies. It doesn't make much intuitive sense.

In [None]:
# Count-based vectorizer over the metadata soup, ignoring English stop words.
count = CountVectorizer(stop_words='english')
# Learn the vocabulary from the 'soup' column and build the count matrix from it.
count_matrix = count.fit_transform(dataframe_2['soup'])

In [None]:
# Cosine similarity between every pair of rows of the soup count matrix.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Reset the row labels of the main DataFrame to 0..n-1 so they match the row
# positions used by the similarity matrices. drop=True stops the old labels from
# being inserted as an extra 'index' column, which also makes this cell safe to
# re-run.
dataframe_2 = dataframe_2.reset_index(drop=True)

# Rebuild the title -> row position lookup. Keep only the first row of any
# repeated title so that indices[title] is always a single position rather
# than a Series.
indices = pd.Series(dataframe_2.index, index=dataframe_2['title'])
indices = indices[~indices.index.duplicated(keep='first')]

We can now reuse our **get_recommendations()** function by passing in the new **cosine_sim2** matrix as your second argument.

In [None]:
# Same title as before, but ranked with the metadata-based cosine_sim2 matrix.
get_recommendations('The Dark Knight Rises', cosine_sim2)

In [None]:
# Metadata-based recommendations for a second title.
get_recommendations('The Godfather', cosine_sim2)

# **Collaborative Filtering**

## Singular Value Decomposition

In [26]:
# Reader created with Surprise's default settings; it is handed to
# Dataset.load_from_df when the ratings are wrapped for Surprise.
# NOTE(review): confirm the default rating scale covers the range of values in
# the 'rating' column of this file.
reader = Reader()
# Load the user ratings table and preview its first rows.
ratings = pd.read_csv('../Data/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [30]:
# Wrap the userId, movieId and rating columns (in that order) as a Surprise Dataset.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader=reader)

We get a mean Root Mean Square Error (RMSE) of approximately 0.89, which is more than good enough for our case. Let us now train on our dataset and arrive at predictions.

In [20]:
# Instantiate the SVD model. `svd` was never defined anywhere in the notebook,
# so on a fresh kernel the fit below failed with a NameError.
svd = SVD()

# 5-fold cross-validation; verbose=True prints the per-fold and mean RMSE/MAE
# that the surrounding text refers to.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Train on every available rating before making predictions.
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x149458bdf00>

Let us pick the user with user ID 1 and check the ratings they have given.

In [21]:
# All rows of the ratings table that belong to user 1.
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [22]:
# Estimate the rating of user 1 (uid) for movie 302 (iid); the third argument (3)
# is passed as r_ui and is echoed back in the returned Prediction.
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=2.710262364390174, details={'was_impossible': False})

For the movie with ID 302, we get an estimated rating of approximately **2.71**. One startling feature of this recommender system is that it doesn't care what the movie is (or what it contains). It works purely on the basis of an assigned movie ID and tries to predict ratings based on how the other users have rated the movie.