In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer



In [2]:
df = pd.read_csv("tmdb_5000_credits.csv")
df.reset_index(inplace = True)
df.head()

Unnamed: 0,index,movie_id,title,cast,crew
0,0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [3]:
df.shape

(4803, 5)

In [4]:
features = ['movie_id' , 'title' , 'cast' , 'crew']
df[features].head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
df[features].isnull().values.any()

np.False_

In [6]:
def combine_features(row):
    return row['movie_id']+' '+row['title']+' '+row['cast']+' '+row['crew']

In [7]:
count_matrix = CountVectorizer().fit_transform(df['crew'])
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         0.95317455 0.98678375 ... 0.90460132 0.72243161 0.83699884]
 [0.95317455 1.         0.95960533 ... 0.94511902 0.74185023 0.87578137]
 [0.98678375 0.95960533 1.         ... 0.92009212 0.7326647  0.84834869]
 ...
 [0.90460132 0.94511902 0.92009212 ... 1.         0.72228544 0.87785039]
 [0.72243161 0.74185023 0.7326647  ... 0.72228544 1.         0.72011904]
 [0.83699884 0.87578137 0.84834869 ... 0.87785039 0.72011904 1.        ]]


In [8]:
cosine_sim.shape

(4803, 4803)

In [9]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]
def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]

In [10]:
movie_user_likes = "John Carter" 
movie_index = get_index_from_title(movie_user_likes)

similar_movies = list(enumerate(cosine_sim[movie_index]))

In [11]:
print(similar_movies)

[(0, np.float64(0.9647425605454233)), (1, np.float64(0.9629587243241523)), (2, np.float64(0.9727817325070918)), (3, np.float64(0.9781056809480556)), (4, np.float64(0.9999999999999956)), (5, np.float64(0.9540739150408032)), (6, np.float64(0.9415708914633901)), (7, np.float64(0.9666979420133932)), (8, np.float64(0.9619786381104299)), (9, np.float64(0.9824770139174409)), (10, np.float64(0.9194717312700168)), (11, np.float64(0.9357589269542596)), (12, np.float64(0.9451378780235988)), (13, np.float64(0.9458074383894907)), (14, np.float64(0.9706896871165868)), (15, np.float64(0.9751435724476075)), (16, np.float64(0.9600359333610909)), (17, np.float64(0.9403260707185492)), (18, np.float64(0.9578524134262479)), (19, np.float64(0.974499070519391)), (20, np.float64(0.9702923111479097)), (21, np.float64(0.9547652652485679)), (22, np.float64(0.978949503924657)), (23, np.float64(0.9607066606461158)), (24, np.float64(0.9731675797235897)), (25, np.float64(0.9739068515650474)), (26, np.float64(0.97809

In [12]:
sorted_similar_movies = sorted(similar_movies , key = lambda x:x[1] , reverse=(True))


In [13]:
i = 0
print("Top 7 similar movies to "+movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>=7:
        break

Top 7 similar movies to John Carter are:

John Carter
The Hunger Games: Mockingjay - Part 2
Taken 3
Frequency
The Big Short
Catch Me If You Can
The Conjuring 2
