In [1]:
import numpy as np 
import pandas as pd 

movies = pd.read_csv('DataSets/tmdb_5000_movies.csv')
credits = pd.read_csv('DataSets/tmdb_5000_credits.csv')

# <b>Data Preprocessing</b>

In [2]:
movies_new = movies.merge(credits,on='title')

In [3]:
# movies_new.info()

In [4]:
movies_new = movies_new[['genres','keywords','overview','title','movie_id','cast','crew']]

In [5]:
movies_new.isnull().sum()

genres      0
keywords    0
overview    3
title       0
movie_id    0
cast        0
crew        0
dtype: int64

In [6]:
movies_new.dropna(inplace=True)

In [7]:
movies_new.duplicated().sum()

np.int64(0)

In [8]:
import ast

def fetch_genre_names(str):
    genres_name_list = []
    for obj in ast.literal_eval(str):
        genres_name_list.append(obj['name'])
    return genres_name_list

def fetch_characters(str):
    charaters_name_list = []
    cnt = 0
    for obj in ast.literal_eval(str):
        if(cnt!=3):
            charaters_name_list.append(obj['name'])
            cnt+=1
        else:
            break
    return charaters_name_list

def fetch_director(str):
    directors_name_list = []
    for obj in ast.literal_eval(str):
        if obj['job'] == 'Director':
            directors_name_list.append(obj['name'])
            break
    return directors_name_list

In [9]:
# convert string to list
movies_new['genres'] = movies_new['genres'].apply(fetch_genre_names)
movies_new['keywords'] = movies_new['keywords'].apply(fetch_genre_names)
movies_new['cast'] = movies_new['cast'].apply(fetch_characters)
movies_new['crew'] = movies_new['crew'].apply(fetch_director)

In [10]:
movies_new['overview'] = movies_new['overview'].apply(lambda x: x.split())

In [11]:
movies_new['genres'] = movies_new['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies_new['keywords'] = movies_new['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies_new['cast'] = movies_new['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies_new['crew'] = movies_new['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [12]:
movies_new['tags'] = movies_new['genres'] + movies_new['keywords'] + movies_new['overview'] + movies_new['cast'] + movies_new['crew']

In [13]:
data = movies_new[['movie_id','title','tags']]

In [14]:
# convert list to string
data.loc[:,'tags'] = data['tags'].apply(lambda x: " ".join(x))
# lowercase
data.loc[:,'tags'] = data['tags'].apply(lambda x: x.lower())

In [15]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    stem_words = []
    for i in text.split():
        stem_words.append(ps.stem(i))
    return " ".join(stem_words)

In [16]:
data.loc[:,'tags'] = data['tags'].apply(stem)

In [17]:
data

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,action adventur fantasi sciencefict culturecla...
1,285,Pirates of the Caribbean: At World's End,adventur fantasi action ocean drugabus exotici...
2,206647,Spectre,action adventur crime spi basedonnovel secreta...
3,49026,The Dark Knight Rises,action crime drama thriller dccomic crimefight...
4,49529,John Carter,action adventur sciencefict basedonnovel mar m...
...,...,...,...
4804,9367,El Mariachi,action crime thriller unitedstates–mexicobarri...
4805,72766,Newlyweds,comedi romanc a newlyw couple' honeymoon is up...
4806,231617,"Signed, Sealed, Delivered",comedi drama romanc tvmovi date loveatfirstsig...
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


# <b>Vectorization

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english',max_features=5000)

In [19]:
vectors = cv.fit_transform(data['tags']).toarray()

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
cs = cosine_similarity(vectors)

In [21]:
def recommend(movie):
    idx = data[data['title'] == movie].index[0]
    distances = cs[idx]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(data.iloc[i[0]].title)

In [24]:
recommend('Up')

Raising Helen
Memoirs of an Invisible Man
Marley & Me
The Truman Show
Synecdoche, New York


In [None]:
import pickle
pickle.dump(data.to_dict(),open('Output_file/movies.pkl','wb'))
pickle.dump(cs,open('Output_file/similarity.pkl','wb'))