Importing Libraries

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

*******************************************************************************

Data Obtaining and Preprocessing

In [35]:
# reading csv files
movies = pd.read_csv("input/tmdb_5000_movies.csv")

In [36]:
credits_ds = pd.read_csv("input/tmdb_5000_credits.csv")
credits_ds.columns

Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')

In [37]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [38]:
movies.rename(columns={'id': 'movie_id'}, inplace=True)
movies.columns

Index(['budget', 'genres', 'homepage', 'movie_id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count'],
      dtype='object')

In [39]:
# Join movie metadata with credits on movie_id. Columns present in both frames
# keep their plain name for the `movies` side; the `credits_ds` side gets '_y'.
movies_df = movies.merge(credits_ds, on='movie_id', suffixes=('', '_y'))
# Discard every column whose name ends in '_y', i.e. the duplicated credits copies.
movies_df.drop(movies_df.filter(regex='_y$').columns, axis=1, inplace=True)
movies_df.columns

Index(['budget', 'genres', 'homepage', 'movie_id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew'],
      dtype='object')

In [40]:
movies_df.iloc[0]

budget                                                          237000000
genres                  [{"id": 28, "name": "Action"}, {"id": 12, "nam...
homepage                                      http://www.avatarmovie.com/
movie_id                                                            19995
keywords                [{"id": 1463, "name": "culture clash"}, {"id":...
original_language                                                      en
original_title                                                     Avatar
overview                In the 22nd century, a paraplegic Marine is di...
popularity                                                     150.437577
production_companies    [{"name": "Ingenious Film Partners", "id": 289...
production_countries    [{"iso_3166_1": "US", "name": "United States o...
release_date                                                   2009-12-10
revenue                                                        2787965087
runtime                               

In [41]:
# Keep only the columns needed to build the tags. The explicit copy makes
# movies_df an independent frame, so later cells that assign to its columns are
# not writing into a slice of the merged frame (avoids SettingWithCopyWarning).
movies_df = movies_df[['genres', 'movie_id', 'title','keywords', 'cast', 'crew']].copy()
movies_df.columns

Index(['genres', 'movie_id', 'title', 'keywords', 'cast', 'crew'], dtype='object')

In [42]:
# drop rows that contain null values, then renumber the rows so that index
# labels equal row positions (the similarity matrix built from this frame is
# addressed by position)
movies_df = movies_df.dropna().reset_index(drop=True)
movies_df.shape

(4803, 6)

In [43]:
# duplicate values check
movies_df.duplicated().sum()

0

In [44]:
import ast 
# helper function 
def convert(text, nm):
    """Parse a stringified list of dicts and collect one field from each entry.

    Parameters
    ----------
    text : str
        Python-literal text that evaluates to an iterable of mappings.
    nm : str
        Key to read from every mapping.

    Returns
    -------
    list
        The ``nm`` value of each entry, in the original order.
    """
    return [entry[nm] for entry in ast.literal_eval(text)]

movies_df['genres'] = movies_df['genres'].apply(convert, nm='name')
movies_df.head(2)


Unnamed: 0,genres,movie_id,title,keywords,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,Avatar,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[Adventure, Fantasy, Action]",285,Pirates of the Caribbean: At World's End,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [45]:
movies_df['keywords'] = movies_df['keywords'].apply(convert, nm='name')
movies_df.head(2)

Unnamed: 0,genres,movie_id,title,keywords,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,Avatar,"[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[Adventure, Fantasy, Action]",285,Pirates of the Caribbean: At World's End,"[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [46]:
movies_df.iloc[0]['cast']

def convertCast(text, limit=3):
    """Return the names of the first ``limit`` cast entries.

    Parameters
    ----------
    text : str
        Python-literal text that evaluates to an iterable of mappings, each
        with a ``'name'`` key.
    limit : int, optional
        Maximum number of names to return (non-negative). Defaults to 3.

    Returns
    -------
    list
        Up to ``limit`` names, in the order they appear in ``text``.
    """
    # Slice instead of counting: entries past the limit are never visited,
    # whereas a manual counter keeps looping over the rest of the cast.
    leading_members = list(ast.literal_eval(text))[:limit]
    return [member['name'] for member in leading_members]

movies_df['cast'] = movies_df['cast'].apply(convertCast)
movies_df.head(2)

Unnamed: 0,genres,movie_id,title,keywords,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,Avatar,"[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[Adventure, Fantasy, Action]",285,Pirates of the Caribbean: At World's End,"[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [47]:
def findDirector(text):
    """Return a one-element list holding the first director's name.

    ``text`` is Python-literal text that evaluates to an iterable of mappings
    with ``'job'`` and ``'name'`` keys. Entries are scanned in order and the
    scan stops at the first one whose job is exactly ``'Director'``. An empty
    list is returned when no such entry exists.
    """
    for member in ast.literal_eval(text):
        if member['job'] != 'Director':
            continue
        # First match wins; later directors are ignored.
        return [member['name']]
    return []

movies_df['crew'] = movies_df['crew'].apply(findDirector)
movies_df.head(2)

Unnamed: 0,genres,movie_id,title,keywords,cast,crew
0,"[Action, Adventure, Fantasy, Science Fiction]",19995,Avatar,"[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,"[Adventure, Fantasy, Action]",285,Pirates of the Caribbean: At World's End,"[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [48]:
# removing space in all names
def remove_space(word):
    """Return a new list with every space character stripped from each item.

    ``word`` is an iterable of strings; the input is not modified.
    """
    return [item.replace(' ', '') for item in word]

# Strip spaces from every list-valued column, in the same order as before.
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies_df[column] = movies_df[column].apply(remove_space)

movies_df.head(2)

Unnamed: 0,genres,movie_id,title,keywords,cast,crew
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,Avatar,"[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,"[Adventure, Fantasy, Action]",285,Pirates of the Caribbean: At World's End,"[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]


In [49]:
movies_df['tags'] = movies_df['genres'] + movies_df['keywords'] + movies_df['crew']+ movies_df['cast']
movies_df.head(2)

Unnamed: 0,genres,movie_id,title,keywords,cast,crew,tags
0,"[Action, Adventure, Fantasy, ScienceFiction]",19995,Avatar,"[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,"[Adventure, Fantasy, Action]",285,Pirates of the Caribbean: At World's End,"[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Adventure, Fantasy, Action, ocean, drugabuse,..."


In [50]:
movies_df = movies_df.drop(['cast', 'crew', 'keywords', 'genres'], axis =1)
movies_df.head(2)


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action, ocean, drugabuse,..."


In [51]:
movies_df['tags'] = movies_df['tags'].apply(lambda x: " ".join(x))
movies_df.iloc[0]['tags']

'Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d JamesCameron SamWorthington ZoeSaldana SigourneyWeaver'

In [52]:
# converting the string to lowercase
movies_df['tags'] = movies_df['tags'].apply(lambda x: x.lower())
movies_df.iloc[0]['tags']

'action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d jamescameron samworthington zoesaldana sigourneyweaver'

PREPROCESSING DONE

*******************************************************************************

In [53]:
# preprocessing for words
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [54]:
# preprocessing for words
ps = PorterStemmer()

def stemmed(text):
    """Drop English stop words from ``text`` and stem the remaining tokens.

    ``text`` is split on whitespace. Every token that is not in NLTK's English
    stop-word list is passed through the module-level ``ps`` stemmer, and the
    stemmed tokens are joined back together with single spaces.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by ``" "``.
    """
    # Evaluate the stop-word list once per call and keep it in a set, rather
    # than calling stopwords.words('english') and scanning a list per token.
    stop_words = set(stopwords.words('english'))
    return " ".join(ps.stem(token) for token in text.split() if token not in stop_words)

movies_df['tags'] = movies_df['tags'].apply(stemmed)
movies_df.head(2)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,action adventur fantasi sciencefict culturecla...
1,285,Pirates of the Caribbean: At World's End,adventur fantasi action ocean drugabus exotici...


In [55]:
from sklearn.feature_extraction.text import CountVectorizer


In [56]:
cv = CountVectorizer(max_features=5000)

vector = cv.fit_transform(movies_df['tags']).toarray()
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [57]:
from sklearn.metrics.pairwise import cosine_similarity

In [58]:
similarity = cosine_similarity(vector)
similarity

array([[1.        , 0.12309149, 0.11111111, ..., 0.06085806, 0.        ,
        0.        ],
       [0.12309149, 1.        , 0.12309149, ..., 0.        , 0.        ,
        0.        ],
       [0.11111111, 0.12309149, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.06085806, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

*******************************************************************************

GIVING RECOMMENDATIONS

In [59]:
def recommend(movie, top_n=10):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies_df['title']``. If several rows share
        the title, the first one is used.
    top_n : int, optional
        Number of recommendations to print. Defaults to 10.

    Raises
    ------
    IndexError
        If no row of ``movies_df`` has the given title.
    """
    # Work with row positions rather than index labels: `similarity` is a plain
    # array addressed by position, and titles are read back with `.iloc`.
    matches = np.flatnonzero((movies_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError("movie {!r} not found in movies_df['title']".format(movie))
    position = int(matches[0])

    scores = similarity[position]
    # Exclude the queried movie explicitly instead of assuming it sorts first;
    # another movie with an equal score could otherwise push it into the results.
    # sorted() is stable, so ties keep their original row order.
    ranked = sorted(
        (i for i in range(len(scores)) if i != position),
        key=lambda i: scores[i],
        reverse=True,
    )
    for i in ranked[:top_n]:
        print(movies_df.iloc[i]['title'])


In [60]:
recommend('Up')

Alpha and Omega: The Legend of the Saw Tooth Cave
Bolt
Animals United
Return to Never Land
Valiant
Hey Arnold! The Movie
Inside Out
The Lion of Judah
Rio
The Smurfs


In [61]:
import pickle

# Make sure the output directory exists, and use context managers so each file
# is flushed and closed as soon as its dump finishes.
os.makedirs('artifacts', exist_ok=True)
with open('artifacts/movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(movies_df, movie_list_file)
with open('artifacts/similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)