# Content-Based Filtering

## Load data

In [None]:
import pandas as pd

# Read the movie metadata table from disk and preview its first rows.
MOVIES_CSV = 'movies.csv'
movies_df = pd.read_csv(MOVIES_CSV)
movies_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


## TF-IDF matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Columns whose text is concatenated into one "document" per movie.
# NOTE(review): genres/keywords/production_* hold raw JSON-like strings (see the
# head() output above), so JSON field names and numeric ids may be tokenised along
# with the actual names -- consider extracting only the "name" values.
text_columns = ['overview', 'genres', 'keywords', 'production_companies', 'production_countries']

tfidf = TfidfVectorizer(stop_words='english')

# Kept from the original cell: the overview column itself is cleaned in place.
movies_df['overview'] = movies_df['overview'].fillna('')

# Fill missing values in *every* text column, not just overview: string
# concatenation with NaN yields NaN, and a NaN document cannot be vectorised.
filled_text = movies_df[text_columns].fillna('')
movie_documents = (filled_text['overview'] + ' ' + filled_text['genres'] + ' ' +
                   filled_text['keywords'] + ' ' + filled_text['production_companies'] + ' ' +
                   filled_text['production_countries'])

# One row per movie, one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(movie_documents)

tfidf_matrix.shape

(4803, 39607)

## Similarity matrix

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all movie TF-IDF vectors. With the second
# argument omitted, linear_kernel compares the matrix against itself.
# TfidfVectorizer L2-normalises rows by default, so these dot products are
# cosine similarities.
similarity_matrix = linear_kernel(tfidf_matrix)

## Find most similar movies

### Step by step

In [None]:
# Example query: id 285 is "Pirates of the Caribbean: At World's End".
movie_id = 285

# Locate the row of movies_df that carries this id and take its index label.
is_selected_movie = movies_df['id'] == movie_id
movie_index = movies_df.loc[is_selected_movie].index[0]
movie_index

1

In [None]:
# Pair every movie's row position with its similarity to the selected movie.
scores = list(enumerate(similarity_matrix[movie_index]))

# Rank from most to least similar, leaving out the selected movie itself.
# It is excluded by position rather than by dropping the top-ranked entry:
# with a tie at the top (e.g. an exact duplicate) or an all-zero row, the
# selected movie is not guaranteed to be ranked first.
sorted_scores = sorted(
    (pair for pair in scores if pair[0] != movie_index),
    key=lambda pair: pair[1],
    reverse=True,
)

# Keep the ten best matches.
sorted_scores = sorted_scores[:10]
sorted_scores

[(12, 0.4810334169695928),
 (199, 0.37037824361980876),
 (17, 0.3542999477834066),
 (1331, 0.2907517728182367),
 (2428, 0.268718284744253),
 (106, 0.2681052946543351),
 (2442, 0.2615244833942162),
 (2217, 0.2560244193552387),
 (104, 0.2559567887233112),
 (139, 0.24779105757148945)]

In [None]:
# Row positions of the most similar movies, in ranking order.
movies_indices = [index for index, _score in sorted_scores]

# Look the positions up in movies_df to show the recommended titles
# (only this last expression is displayed by the cell).
movies_df['title'].iloc[movies_indices]

12             Pirates of the Caribbean: Dead Man's Chest
199     Pirates of the Caribbean: The Curse of the Bla...
17            Pirates of the Caribbean: On Stranger Tides
1331                                         Nim's Island
2428                                    Brooklyn's Finest
106                                       Shrek the Third
2442                                      Southland Tales
2217                             Everyone Says I Love You
104                                              Poseidon
139                               Mission: Impossible III
Name: title, dtype: object

### Define function to find most similar movies

In [None]:
def find_similar_movies(movie_id, nr_of_movies=10):
    """Return the movies whose content is most similar to the given movie.

    Parameters
    ----------
    movie_id : int
        Value of the ``id`` column of ``movies_df`` identifying the query movie.
    nr_of_movies : int, default 10
        Maximum number of similar movies to return.

    Returns
    -------
    pandas.DataFrame
        The ``id`` and ``title`` columns of the most similar movies, ordered
        from most to least similar. The query movie itself is never included.

    Raises
    ------
    IndexError
        If no row of ``movies_df`` has the given ``movie_id``.
    """
    # Row *position* of the query movie: similarity_matrix and .iloc below are
    # both positional, so an index label would only work with a default index.
    positions = (movies_df['id'] == movie_id).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(
            "movie_id {!r} not found in movies_df['id']".format(movie_id)
        )
    movie_position = int(positions[0])

    similarities = similarity_matrix[movie_position]

    # Exclude the query movie by position instead of dropping the top-ranked
    # entry: with tied scores (e.g. an exact duplicate) or an all-zero row the
    # query movie is not guaranteed to be ranked first.
    candidates = (pos for pos in range(len(similarities)) if pos != movie_position)
    ranked = sorted(candidates, key=lambda pos: similarities[pos], reverse=True)

    top_positions = ranked[:max(nr_of_movies, 0)]
    return movies_df[['id', 'title']].iloc[top_positions]

In [None]:
# Top 20 recommendations for the movie with id 80274.
find_similar_movies(movie_id=80274, nr_of_movies=20)

Unnamed: 0,id,title
0,19995,Avatar
4401,43630,The Helix... Loaded
1326,299687,The 5th Wave
256,262504,Allegiant
1068,7453,The Hitchhiker's Guide to the Galaxy
249,262500,Insurgent
400,157350,Divergent
2442,4723,Southland Tales
292,2486,Eragon
322,18,The Fifth Element


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a1547541-bc41-4190-913e-d8bf46bc6b4a' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>