In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 

In [18]:
Credits=pd.read_csv('tmdb_5000_credits.csv')
Movies=pd.read_csv('tmdb_5000_movies.csv')

In [19]:
print(Movies.head(4))
print(Movies.shape)

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "name": "spy"}, {"id": 818, "name...                en   
3  [{"id": 849, "name": "dc comics"}, {"id": 853,...                en  

In [20]:
movies=Movies.merge(Credits, on='title')
print(movies.shape)
print(movies.info())

(4809, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status    

In [21]:
movies=movies[['movie_id','title','genres','overview','keywords','cast','crew']]

In [22]:
#to remove the missing data if something is not present as well as removing duplicate data if present
print(movies.isnull().sum())
print(movies.duplicated().sum())

movie_id    0
title       0
genres      0
overview    3
keywords    0
cast        0
crew        0
dtype: int64
0


In [23]:
#no duplicate data so just removing those rows whose overfiew is not there
movies.dropna(inplace=True)

In [24]:
#coverting string into a list
def extract(z):
    l=[]
    for j in ast.literal_eval(z):
        l.append(j["name"])
    return l

#for taking only top 4 cast
def extcast(z):
    l=[]
    c=0
    for i in ast.literal_eval(z):
        l.append(i['name'])
        c+=1
        if c==4:
            break
    return l  

#only taking Director
def extcrew(z):
    l=[]
    for i in ast.literal_eval(z):
        if i['job']=='Director':
            l.append(i['name'])
            break
    return l

In [25]:
movies['genres']=movies['genres'].apply(extract)
movies['keywords']=movies['keywords'].apply(extract)
movies['cast']=movies['cast'].apply(extcast)
movies['crew']=movies['crew'].apply(extcrew)
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [26]:
movies['genres']=movies['genres'].apply(lambda x:[j.replace(" ","") for j in x])
movies['keywords']=movies['keywords'].apply(lambda x:[j.replace(" ","") for j in x])
movies['cast']=movies['cast'].apply(lambda x:[j.replace(" ","") for j in x])
movies['crew']=movies['crew'].apply(lambda x:[j.replace(" ","") for j in x])

In [27]:
movies.head(3)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bondâ€™s, past, send...","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LÃ©aSeydoux, Ralp...",[SamMendes]


In [28]:
movies['total']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']

In [29]:
new_movies=movies[['movie_id','title','total']]

In [31]:
new_movies.head(3)

Unnamed: 0,movie_id,title,total
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bondâ€™s, past, send..."


In [32]:
#FOR MAKING SLIGHTLY SAME WORDS TO A SAME WORD
ps=PorterStemmer()
def st(x):
    y=[]
    for i in x.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [None]:
new_movies['total']=new_movies['total'].apply(lambda x:" ".join(x))
new_movies['total']=new_movies['total'].apply(lambda x:x.lower())
new_movies['total']=new_movies['total'].apply(st)

In [34]:
Cv=CountVectorizer(max_features=5200,stop_words='english')
v=Cv.fit_transform(new_movies['total']).toarray()
m,n=v.shape

In [35]:
similarity=np.zeros((m,m))
n1=np.linalg.norm(v,axis=1,keepdims=True)
d=np.dot(v,v.T)
d1=np.dot(n1,n1.T)
similarity=(d/d1)
print(similarity.shape)

(4806, 4806)


In [36]:
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:8]

[(1214, 0.28676966733820225),
 (2405, 0.26310068027921696),
 (507, 0.25560859370538297),
 (3728, 0.2539166875385041),
 (539, 0.24678382369818683),
 (582, 0.24511108480187255),
 (1202, 0.23918243661746996)]

In [38]:
def recommend(movie):
    movie_i=new_movies[new_movies['title']==movie].index[0]
    m_list=sorted(list(enumerate(similarity[:,movie_i])),reverse=True,key=lambda x:x[1])[1:8]
    print("YOU SHOULD GO FOR THIS MOVIES NEXT -----> ðŸŽ¬")
    for i in m_list:
        print(new_movies.iloc[i[0]].title)

In [39]:
recommend('Superman')

YOU SHOULD GO FOR THIS MOVIES NEXT -----> ðŸŽ¬
Superman II
Superman Returns
Superman IV: The Quest for Peace
Iron Man 2
Superman III
Ant-Man
Batman


In [42]:
import pickle

In [44]:
pickle.dump(new_movies,open('movies.pkl','wb'))

In [45]:
pickle.dump(similarity,open('similarity.pkl','wb'))