### Content Based Recommendation System

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.simplefilter('ignore')

### Read the Dataset `movies_metadata.csv`

In [2]:
mvs = pd.read_csv('movies_metadata.csv')
mvs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

## Inferences from the Information of the DF
* Rows - 45466
* Columns - 24
* Missing values are present in almost all the columns with the lease amount of information present in the homepage column
* Except revenue, runtime, vote_average, vote_count all the other columns are non numerical data types

In [3]:
mvs.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


## Inferences from the Describe function of DF

### Create a new column with name 'description' combining `'overview' and 'tagline'` columns in the given dataset

from the info it is visible that the Tagline column has almost half of the values as null values
Overview column has close to 1000 values as null values..

In [4]:
# mvs['tagline'].fillna('')
mvs['description'] = mvs['overview'] + mvs['tagline']

In [5]:
mvs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 25 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

### Lets drop the null values in `description` column

In [6]:
mvs.isna().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
description              25062
dtype: int64

In [7]:
mvs.dropna(subset=['description'], inplace=True)

In [8]:
mvs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20404 entries, 1 to 45463
Data columns (total 25 columns):
adult                    20404 non-null object
belongs_to_collection    2572 non-null object
budget                   20404 non-null object
genres                   20404 non-null object
homepage                 4410 non-null object
id                       20404 non-null object
imdb_id                  20400 non-null object
original_language        20404 non-null object
original_title           20404 non-null object
overview                 20404 non-null object
popularity               20404 non-null object
poster_path              20389 non-null object
production_companies     20404 non-null object
production_countries     20404 non-null object
release_date             20390 non-null object
revenue                  20404 non-null float64
runtime                  20404 non-null float64
spoken_languages         20404 non-null object
status                   20390 non-null objec

### Keep the first occurance and drop duplicates of each title in column `title`

In [9]:
mvs['title'].drop_duplicates(inplace=True)

In [10]:
mvs.shape

(20404, 25)

### As we might have dropped a few rows with duplicate `title` in above step, just reset the index [make sure you are not adding any new column to the dataframe while doing reset index]

In [11]:
mvs.reset_index(inplace=True)

### Generate tf-idf matrix using the column `description`. Consider till 3-grams, with minimum document frequency as 0.

In [12]:
tfidf_vect = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
tfidf_matrix = tfidf_vect.fit_transform(mvs['description'])

In [13]:
tfidf_matrix.shape

(20404, 1198573)

### create cosine similarity matrix

In [14]:
cos_sim_matrix = cosine_similarity(tfidf_matrix)

In [15]:
cos_sim_matrix

array([[1.        , 0.00842055, 0.        , ..., 0.00183679, 0.        ,
        0.01214359],
       [0.00842055, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.00405881,
        0.        ],
       ...,
       [0.00183679, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00405881, ..., 0.        , 1.        ,
        0.        ],
       [0.01214359, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [16]:
cos_sim_matrix.shape

(20404, 20404)

### Write a function with name `recommend` which takes `title` as argument and returns a list of 10 recommended title names in the output based on the above cosine similarities

In [17]:
mvs.index

RangeIndex(start=0, stop=20404, step=1)

In [18]:
mvs.title

0                                              Jumanji
1                                     Grumpier Old Men
2                                    Waiting to Exhale
3                          Father of the Bride Part II
4                                                 Heat
5                                              Sabrina
6                                         Tom and Huck
7                                         Sudden Death
8                                            GoldenEye
9                               The American President
10                                               Balto
11                                               Nixon
12                                    Cutthroat Island
13                                              Casino
14                               Sense and Sensibility
15                                          Four Rooms
16                      Ace Ventura: When Nature Calls
17                                         Money Train
18        

In [22]:
def recommend(title, top_n = 10):
    title_id = (mvs.loc[mvs['title']==title].index)[0]
    features = tfidf_vect.get_feature_names()

    top_n_idx = np.flip(np.argsort(cos_sim_matrix[title_id,]),axis=0)[1:(top_n + 1)]
    top_n_sim_values = cos_sim_matrix[title_id, top_n_idx]
    
    # find top n with values > 0
    top_n_idx = top_n_idx[top_n_sim_values > 0]
    scores = top_n_sim_values[top_n_sim_values > 0]
    
    
    # find features from the vectorized matrix
    sim_movies_idx = mvs['title'].iloc[top_n_idx].index
    words = []
    for movie_idx in sim_movies_idx:
        try:
            idx = np.where(np.squeeze(tfidf_matrix[movie_idx,].toarray()) > 0)
        except:
            idx = np.where(np.squeeze(tfidf_matrix[movie_idx,]) > 0)
        
        words.append([" , ".join([features[i] for i in idx[0]])])
        
    # collate results
    res = pd.DataFrame({"movie_title" : mvs['title'].iloc[title_id],
           "sim_movies": mvs['title'].iloc[top_n_idx].values,"words":words,
           "scores":scores}, columns = ["movie_title","sim_movies","scores","words"])
    
    return res

### Give the recommendations from above functions for movies `The Godfather` and `The Dark Knight Rises`

In [23]:
recommend(title='The Godfather', top_n=10)

Unnamed: 0,movie_title,sim_movies,scores,words
0,The Godfather,The Godfather: Part II,0.149801,"[1910s , 1910s new , 1910s new york , 1950s , ..."
1,The Godfather,Honor Thy Father,0.0963,"[1964 , 1964 compelled , 1964 compelled empire..."
2,The Godfather,The Family,0.071295,"[challenging , challenging old , challenging o..."
3,The Godfather,Blood Ties,0.054169,"[1970s , 1970s crime , 1970s crime runs , broo..."
4,The Godfather,Made,0.036224,"[aspiring , aspiring boxers , aspiring boxers ..."
5,The Godfather,Johnny Dangerously,0.035239,"[1930s , 1930s honest , 1930s honest goodheart..."
6,The Godfather,Fury,0.033934,"[attack , attack presumed , attack presumed de..."
7,The Godfather,Live by Night,0.033905,"[american , american dream , centered , center..."
8,The Godfather,Shanghai Triad,0.033472,"[1930 , 1930 shanghai , 1930 shanghai violence..."
9,The Godfather,In Memory of My Father,0.031764,"[accepts , accepts bribe , accepts bribe fathe..."


In [24]:
recommend(title='The Dark Knight Rises', top_n=10)

Unnamed: 0,movie_title,sim_movies,scores,words
0,The Dark Knight Rises,The Dark Knight,0.12828,"[attorney , attorney harvey , attorney harvey ..."
1,The Dark Knight Rises,Batman Forever,0.118706,"[accident , accident left , accident left disf..."
2,The Dark Knight Rises,Batman Returns,0.07432,"[accepted , accepted gotham , accepted gotham ..."
3,The Dark Knight Rises,Batman: Mask of the Phantasm,0.071435,"[andrea , andrea beaumont , andrea beaumont da..."
4,The Dark Knight Rises,Batman,0.069976,"[begins , begins war , begins war crime , city..."
5,The Dark Knight Rises,Batman: Mystery of the Batwoman,0.065782,"[batwoman , batwoman wreaking , batwoman wreak..."
6,The Dark Knight Rises,Batman: Under the Red Hood,0.064691,"[batman , batman faces , batman faces ultimate..."
7,The Dark Knight Rises,Batman Beyond: Return of the Joker,0.055104,"[answers , answers stands , answers stands fac..."
8,The Dark Knight Rises,Batman vs Dracula,0.053249,"[bat , bat gotham , batman , batman stop , bat..."
9,The Dark Knight Rises,Batman: Year One,0.050468,"[abroad , abroad feeding , abroad feeding life..."
