In [1]:
import pandas as pd
import json
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load the two TMDB CSV files from the local datasets/ directory:
# one with the movie records and one with the credits records.
movies_df = pd.read_csv('datasets/tmdb_5000_movies.csv')
credits_df = pd.read_csv('datasets/tmdb_5000_credits.csv')

In [3]:
# Attach the credits columns to each movie row by joining the two tables on 'title'.
# NOTE(review): this assumes 'title' is unique in both tables; any repeated title
# would be cross-joined and produce extra rows. If the movies table also carries
# the TMDB id, joining on that id would be the safer key -- TODO confirm.
movies_df = movies_df.merge(credits_df, on = 'title')
print("Shape of the movies dataset now =", movies_df.shape)

Shape of the movies dataset now = (4809, 23)


In [4]:
# Keep only the columns used to build the recommender's text features.
# .copy() makes the result an independent frame: later cells modify it
# (dropping rows, overwriting columns), and doing that on a bare column
# subset is what triggers pandas' SettingWithCopyWarning.
movies_df = movies_df[['movie_id', 'title', 'genres', 'overview', 'keywords', 'cast', 'crew']].copy()
print("Shape of the movies dataset now =", movies_df.shape)
movies_df.head()

Shape of the movies dataset now = (4809, 7)


Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","Captain Barbossa, long believed to be dead, ha...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",A cryptic message from Bond’s past sends him o...,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",Following the death of District Attorney Harve...,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","John Carter is a war-weary, former military ca...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
# Count the missing values in each column.
movies_df.isnull().sum()

movie_id    0
title       0
genres      0
overview    3
keywords    0
cast        0
crew        0
dtype: int64

In [6]:
# Drop every row that has a missing value, then renumber the index 0..n-1.
# The renumbering matters: the similarity matrix computed later is addressed
# by row *position*, while title lookups return index *labels*. Without
# reset_index the dropped rows leave gaps in the labels and the two no
# longer agree (wrong recommendations, or an IndexError near the end).
movies_df = movies_df.dropna().reset_index(drop=True)
print("Shape of the movies dataset now =", movies_df.shape)
movies_df.isnull().sum()

Shape of the movies dataset now = (4806, 7)


movie_id    0
title       0
genres      0
overview    0
keywords    0
cast        0
crew        0
dtype: int64

In [7]:
def listify(s):
    """Parse a JSON array of objects and return each object's 'name' value.

    Args:
        s: JSON text encoding a list of objects, each with a 'name' key.

    Returns:
        A list of the 'name' values, in the order they appear in the array.
    """
    names = []
    for entry in json.loads(s):
        names.append(entry['name'])
    return names

def directorify(s):
    """Return the names of all crew entries whose job is exactly 'Director'.

    Args:
        s: JSON text encoding a list of objects, each with 'job' and 'name' keys.

    Returns:
        A list of matching names in their original order (empty if none match).
    """
    crew_members = json.loads(s)
    return [member['name'] for member in crew_members if member['job'] == 'Director']

def languagify(s):
    """Parse a JSON array of objects and return each object's 'iso_639_1' value.

    Args:
        s: JSON text encoding a list of objects, each with an 'iso_639_1' key.

    Returns:
        A list of the 'iso_639_1' values, in the order they appear in the array.
    """
    codes = []
    for entry in json.loads(s):
        codes.append(entry['iso_639_1'])
    return codes

# Replace the JSON-text columns with plain Python lists of names.
# 'crew' is reduced to the director name(s) only.
movies_df = movies_df.assign(
    genres=movies_df['genres'].apply(listify),
    keywords=movies_df['keywords'].apply(listify),
    cast=movies_df['cast'].apply(listify),
    crew=movies_df['crew'].apply(directorify),
)
movies_df.head()

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,"[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,"[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [8]:
# Build one free-text "tags" string per movie: the genre, keyword, cast and
# director names (each list joined with spaces) followed by the overview,
# with a single space between the five parts.
movies_df['tags'] = movies_df.apply(
    lambda row: ' '.join(
        [' '.join(row[col]) for col in ('genres', 'keywords', 'cast', 'crew')]
        + [str(row['overview'])]
    ),
    axis = 1,
)
# Only the id, the title and the combined text are needed from here on.
# .copy() detaches the subset so the next cell can overwrite 'tags'
# without pandas raising SettingWithCopyWarning.
movies_df = movies_df[['movie_id', 'title', 'tags']].copy()
print("Shape of the movies dataset now =", movies_df.shape)
movies_df.head()

Shape of the movies dataset now = (4806, 3)


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,Action Adventure Fantasy Science Fiction cultu...
1,285,Pirates of the Caribbean: At World's End,Adventure Fantasy Action ocean drug abuse exot...
2,206647,Spectre,Action Adventure Crime spy based on novel secr...
3,49026,The Dark Knight Rises,Action Crime Drama Thriller dc comics crime fi...
4,49529,John Carter,Action Adventure Science Fiction based on nove...


In [9]:
# One shared stemmer instance, reused for every call to stem().
ps = PorterStemmer()

def stem(s):
    """Stem each whitespace-separated token of ``s`` and rejoin with single spaces.

    Args:
        s: Text to process; it is split on whitespace.

    Returns:
        The stemmed tokens joined by one space each.
    """
    return " ".join(map(ps.stem, s.split()))
# Replace every tags string with its stemmed form.
movies_df = movies_df.assign(tags=movies_df['tags'].apply(stem))
movies_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,action adventur fantasi scienc fiction cultur ...
1,285,Pirates of the Caribbean: At World's End,adventur fantasi action ocean drug abus exot i...
2,206647,Spectre,action adventur crime spi base on novel secret...
3,49026,The Dark Knight Rises,action crime drama thriller dc comic crime fig...
4,49529,John Carter,action adventur scienc fiction base on novel m...


In [10]:
# Bag-of-words features: at most 10000 vocabulary terms, with English
# stop words removed.
cv = CountVectorizer(stop_words = 'english', max_features = 10000)
# fit_transform's result is converted with .toarray() into the dense
# array that the next cell consumes.
v = cv.fit_transform(movies_df['tags'])
v = v.toarray()
print("Shape of the vector =", v.shape)

Shape of the vector = (4806, 10000)


In [11]:
# Pairwise cosine similarity between all rows of v: s[i][j] is the similarity
# between the i-th and j-th rows (addressed by position, not by index label).
s = cosine_similarity(v)
print("Shape of the similarity matrix =", s.shape)

Shape of the similarity matrix = (4806, 4806)


In [12]:
def recommendation(x, movies=None, similarity=None, top_n=5):
    """Print the titles of the movies most similar to the movie titled ``x``.

    Args:
        x: Exact title to look up. If several rows share the title, the
            first one is used.
        movies: Frame with a 'title' column whose row order matches the rows
            of ``similarity``. Defaults to the notebook-level ``movies_df``.
        similarity: Square matrix where ``similarity[i][j]`` scores row i
            against row j. Defaults to the notebook-level ``s``.
        top_n: How many titles to print (default 5).

    Returns:
        None; the titles are printed, best match first.

    Raises:
        IndexError: If no row has the title ``x``.
    """
    frame = movies_df if movies is None else movies
    scores = s if similarity is None else similarity

    # Look the title up by row *position*, not index label: the similarity
    # matrix is positional, and the two differ whenever the frame's index
    # is not exactly 0..n-1 (e.g. after rows were dropped).
    positions = (frame['title'] == x).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"no movie titled {x!r} in the movies table")
    query = int(positions[0])

    # Exclude the query row explicitly instead of assuming it sorts first;
    # another row with an equally high score could otherwise push the query
    # movie into its own recommendations.
    candidates = [(pos, score) for pos, score in enumerate(scores[query]) if pos != query]
    ranked = sorted(candidates, reverse = True, key = lambda pair: pair[1])
    for pos, _ in ranked[:top_n]:
        print(frame['title'].iloc[pos])

In [13]:
# Quick check of the recommender with one example title.
recommendation('Spider-Man 2')

Spider-Man 3
Spider-Man
The Amazing Spider-Man 2
The Amazing Spider-Man
The Dark Knight Rises


In [14]:
# Persist the movie table and the similarity matrix to the results/ directory.
# The `with` blocks guarantee each file is flushed and closed even if
# pickling fails part-way through.
with open('results/movies.pkl', 'wb') as movies_file:
    pickle.dump(movies_df, movies_file)
with open('results/cosine_similarity.pkl', 'wb') as similarity_file:
    pickle.dump(s, similarity_file)