# Movie Recommender System

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from ast import literal_eval

In [38]:
# Load the TMDB 5000 movie metadata and summarise its columns and null counts.
mov = pd.read_csv('tmdb_5000_movies.csv')
mov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [39]:
# A few overviews are missing (see the non-null counts above); replace them with
# empty strings so every movie has text to vectorise.
mov['overview'] = mov['overview'].fillna('')

In [40]:
# Load the credits table. Its `title` column is dropped so that only `mov`
# contributes a title when the two tables are merged later.
cre = pd.read_csv('tmdb_5000_credits.csv').drop(columns=['title'])
cre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   cast      4803 non-null   object
 2   crew      4803 non-null   object
dtypes: int64(1), object(2)
memory usage: 112.7+ KB


## Content-Based Filtering
Here, the system suggests movies that are similar to a particular movie, using its metadata (like genre, actors, director, etc.).

First, let us look at content-based filtering (CBF) using each movie's title and overview. Then we will try it with genre, keywords, cast, available languages, etc.

In [41]:
# Word-level TF-IDF over the overviews, ignoring English stop words.
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

# One row per movie in `mov`, one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(mov['overview'])
tfidf_matrix.shape

(4803, 20978)

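Now that we have the TF-IDF matrix, we can get the cosine similarity with a plain dot product: `TfidfVectorizer` L2-normalises every row by default, so the dot product of two rows equals their cosine similarity, and it is easy and fast to compute.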

In [42]:
# Pairwise dot products between all overview vectors (movies x movies).
cos_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [43]:
titles = mov['title']
# Map each title to its row position in `mov`. Titles are not guaranteed to be
# unique, so keep only the first occurrence of each one; a lookup then always
# yields a single integer. (Series.drop_duplicates() compares the *values* --
# here the already unique row labels -- so it never removed repeated titles.)
indices = pd.Series(mov.index, index=mov['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [44]:
def get_recommendations(title, cos_sim=cos_sim, top_n=20):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level `indices` mapping.
    cos_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row/column positions match `titles`.
        Defaults to the overview-based matrix that exists when this cell runs.
    top_n : int, optional
        Number of recommendations to return (20 by default).

    Returns
    -------
    pandas.Series
        Up to `top_n` titles, most similar first, labelled by row position.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    if title not in indices.index:
        raise KeyError(f"Movie title not found: {title!r}")

    index = indices[title]
    # A title shared by several movies makes the lookup return a Series;
    # use the first matching movie instead of failing further down.
    if isinstance(index, pd.Series):
        index = index.iloc[0]
    index = int(index)

    # Rank every movie by its similarity to the query, highest first.
    ranked = sorted(enumerate(cos_sim[index]), key=lambda pair: pair[1], reverse=True)

    # Drop the query movie explicitly: it is not always ranked first (ties with
    # an identical vector, or an all-zero row for an empty description).
    movies_indices = [i for i, _ in ranked if i != index][:top_n]
    return titles.iloc[movies_indices]


In [45]:
# Overview-based recommendations: uses the default TF-IDF similarity matrix `cos_sim`.
get_recommendations('Spider-Man')

5                    Spider-Man 3
38       The Amazing Spider-Man 2
20         The Amazing Spider-Man
30                   Spider-Man 2
1534                Arachnophobia
953     Gremlins 2: The New Batch
1720                     Kick-Ass
2740                  The New Guy
3216                     Election
1572    Forgetting Sarah Marshall
2789           The Emperor's Club
1383                        Radio
4643                   Like Crazy
572                          Hook
3428                         Bats
1119               21 Jump Street
3163                    Detention
3383                    Losin' It
641                      Due Date
1677               Goodbye Bafana
Name: title, dtype: object

Recommendation using metadata

In [46]:
# The metadata similarity matrix built from this frame is later looked up through
# `indices`/`titles`, which come from `mov`, so the merged frame must keep exactly
# one row per movie, in the same order as `mov`.
mov_modified = mov.merge(cre, left_on='id', right_on='movie_id', validate='one_to_one')
mov_modified = mov_modified.drop(['movie_id'], axis=1)
assert mov_modified['id'].tolist() == mov['id'].tolist(), \
    'merge changed the row count or order of `mov`'
mov_modified.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4803 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [47]:
# These columns hold Python-literal text (presumably lists of dicts -- the helpers
# below iterate them and read 'name'/'job' keys); parse them into real objects.
# NOTE: literal_eval only accepts strings, so this cell cannot be re-run on the
# already-parsed frame without re-creating `mov_modified` first.
cols = ['genres', 'keywords', 'cast', 'crew']

for col in cols:
	mov_modified[col] = mov_modified[col].apply(literal_eval)


# to get director from job
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    `x` is an iterable of dict-like crew entries with 'job' and 'name' keys.
    When no entry is a director, NaN is returned.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)


# get top 3 elements of list
def get_list(x):
    """Return the 'name' of at most the first three entries of a list.

    Anything that is not a list yields an empty list.
    """
    if not isinstance(x, list):
        return []
    # Collect every name first, then keep the leading three.
    return [entry['name'] for entry in x][:3]


#apply all functions
# Pull the director's name out of the crew list into its own column.
mov_modified['director'] = mov_modified['crew'].apply(get_director)

# Reduce each of these list columns to at most three names per movie.
cols2 = ['genres', 'keywords', 'cast']
for col in cols2:
	mov_modified[col] = mov_modified[col].apply(get_list)

To use the metadata as the basis for our recommendations, we need to preprocess the dataset so that we can apply a count vectorizer and cosine similarity to it.

In [48]:
# strip spaces and lowercase
def clean_data(x):
    """Lower-case `x` and remove its spaces.

    A list is cleaned element by element, a string is cleaned directly, and
    any other value becomes an empty string.
    """
    def squash(text):
        return text.replace(' ', '').lower()

    if isinstance(x, list):
        return list(map(squash, x))
    if isinstance(x, str):
        return squash(x)
    return ''


# Normalise every metadata column so that multi-word names become single tokens
# (spaces removed, lower-cased) before they are joined into one string.
cols3 = ['cast', 'keywords', 'director', 'genres']
for col in cols3:
    mov_modified[col] = mov_modified[col].apply(clean_data)

#creating a SOUP
def create_soup(x):
    """Join a row's keywords, cast, director and genres into one space-separated string."""
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)
# Build the combined metadata string for every movie (row-wise apply).
mov_modified['soup'] = mov_modified.apply(create_soup, axis=1)

# Count unigrams and bigrams in the metadata soup. min_df=1 keeps every term that
# occurs in at least one movie -- the same vocabulary an absolute threshold of 0
# would give -- while staying inside the range scikit-learn accepts for integer
# min_df values (an integer 0 is rejected by its parameter validation).
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(mov_modified['soup'])

# Cosine similarity between the metadata count vectors of every pair of movies.
cos_sim2 = cosine_similarity(count_matrix, count_matrix)

In [49]:
# Same query, but ranked with the metadata-based similarity matrix `cos_sim2`.
get_recommendations('Spider-Man', cos_sim2)

5                                            Spider-Man 3
30                                           Spider-Man 2
1266             Cirque du Freak: The Vampire's Assistant
381                      The Nutcracker: The Untold Story
131                                               G-Force
215                Fantastic 4: Rise of the Silver Surfer
1359                                               Batman
1                Pirates of the Caribbean: At World's End
10                                       Superman Returns
12             Pirates of the Caribbean: Dead Man's Chest
37                             Oz: The Great and Powerful
50                    Prince of Persia: The Sands of Time
61                                      Jupiter Ascending
98                      The Hobbit: An Unexpected Journey
115                                               Hancock
121        Night at the Museum: Battle of the Smithsonian
129                                                  Thor
160           

In [50]:
# Overview-based recommendations for 'Cars' (default `cos_sim`).
get_recommendations('Cars')

40                      Cars 2
1876           Bride of Chucky
2419            McFarland, USA
1795                 Silverado
801       The Devil's Advocate
4492                     After
4048               The Calling
4557       Fight to the Finish
3238      Little Miss Sunshine
1678           United Passions
1514    The Quick and the Dead
559               The Majestic
4297    The Trouble with Harry
3187                  Hamlet 2
1595                Glory Road
2606          Raise Your Voice
3108                 Severance
2779                 The Claim
4746              Tiger Orange
3997           Blazing Saddles
Name: title, dtype: object

In [51]:
# Metadata-based recommendations for 'Cars', for comparison with the overview-based list.
get_recommendations('Cars', cos_sim2)

40                                        Cars 2
55                                         Brave
202                                        Rio 2
374                                          Rio
569     The SpongeBob Movie: Sponge Out of Water
1142               Why I Did (Not) Eat My Father
343                                  Toy Story 2
596                                        I Spy
933                             Shanghai Knights
1541                                   Toy Story
2097                           Are We There Yet?
503         The Adventures of Rocky & Bullwinkle
1086                         Aliens in the Attic
1152                  Back to the Future Part II
1721                          30 Minutes or Less
1932                                      Sheena
2270                                    Zambezia
2285                          Back to the Future
1072                                Last Holiday
2122                                  Epic Movie
Name: title, dtype: 

## Collaborative Filtering