In [28]:
import pandas as pd
import numpy as np
import re
from ast import literal_eval
import matplotlib as m
df1 = pd.read_csv("tmdb_5000_credits.csv")
df2 = pd.read_csv("tmdb_5000_movies.csv")

In [29]:
# df1.head()      #before merging - large dataset(df1 - 19212 ,df2 - 96060) then inner join/merge performed so only 4809 remaining
df1.size

19212

In [30]:
df = df1.merge(df2)

In [101]:
df.head()

Unnamed: 0,movie_id,title,tag
0,19995,Avatar,samworthington zoesaldana sigourneyweav stephe...
1,285,Pirates of the Caribbean: At World's End,johnnydepp orlandobloom keiraknightley stellan...
2,206647,Spectre,danielcraig christophwaltz léaseydoux ralphfie...
3,49026,The Dark Knight Rises,christianbal michaelcain garyoldman annehathaw...
4,49529,John Carter,taylorkitsch lynncollin samanthamorton willemd...


In [32]:
#check for null values in df
df.isnull().sum()

movie_id                   0
title                      0
cast                       0
crew                       0
budget                     0
genres                     0
homepage                3096
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
vote_average               0
vote_count                 0
dtype: int64

In [33]:
# df.size     #this row*column
#len(df)      #this gives no. of rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   movie_id              4809 non-null   int64  
 1   title                 4809 non-null   object 
 2   cast                  4809 non-null   object 
 3   crew                  4809 non-null   object 
 4   budget                4809 non-null   int64  
 5   genres                4809 non-null   object 
 6   homepage              1713 non-null   object 
 7   id                    4809 non-null   int64  
 8   keywords              4809 non-null   object 
 9   original_language     4809 non-null   object 
 10  original_title        4809 non-null   object 
 11  overview              4806 non-null   object 
 12  popularity            4809 non-null   float64
 13  production_companies  4809 non-null   object 
 14  production_countries  4809 non-null   object 
 15  release_date         

In [34]:
#as only want usefull columns so remove the unnessecery ones(budget, homepage, id, popularity, revenue, runtime, status, vote_count)
df = df[["movie_id", "title", "overview" , "cast", "crew", "genres", "keywords", "original_language", "production_companies", "tagline","production_countries", "release_date",  "vote_average"]]
df["year"] = df["release_date"].apply(lambda x: str(x).split("-")[0])
df.drop("release_date",axis = 1,inplace = True)

In [35]:
df["vote_average"] = df["vote_average"].apply(lambda x : str(round(int(x))))
df.production_countries[0]

'[{"iso_3166_1": "US", "name": "United States of America"}, {"iso_3166_1": "GB", "name": "United Kingdom"}]'

In [36]:

def convert(obj):
    out = []
    for i in literal_eval(obj):
        out.append(i['name'])
    return out


In [37]:
df["genres"] = df["genres"].apply(convert)

In [38]:
df.dropna(axis=1, inplace=True)

In [39]:
#format cast,crew, keywords, production_companies,production_countries

df["keywords"] = df["keywords"].apply(convert)
df["production_companies"] = df["production_companies"].apply(convert)
df["production_countries"] = df["production_countries"].apply(convert)

In [40]:
#Fetching the director
def directorf(obj):
    out = []
    for i in literal_eval(obj):
        if i['job'] == 'Director':
            out.append(i['name'])
            break
    return out

In [41]:
#Fetching top 5 crew members
def castf(obj):
    out = []
    for i in literal_eval(obj)[:5]:
        out.append(i['name']) 
    return out

In [42]:
df["cast"] = df["cast"].apply(castf)
df["crew"] = df["crew"].apply(directorf)

In [43]:
df["original_language"] = df["original_language"].apply(lambda x : x.split())
df["vote_average"] = df["vote_average"].apply(lambda x : x.split())
df["year"] = df["year"].apply(lambda x : x.split())

In [44]:
df["cast"] = df["cast"].apply(lambda x : [i.replace(" ", "") for i in x])
df["crew"] = df["crew"].apply(lambda x : [i.replace(" ", "") for i in x])
df["genres"] = df["genres"].apply(lambda x : [i.replace(" ", "") for i in x])
df["keywords"] = df["keywords"].apply(lambda x : [i.replace(" ", "") for i in x])
df["production_companies"] = df["production_companies"].apply(lambda x : [i.replace(" ", "") for i in x])
df["production_countries"] = df["production_countries"].apply(lambda x : [i.replace(" ", "") for i in x])

In [45]:
df["tag"] = df["cast"] + df["crew"] + df["genres"] + df["keywords"] + df["original_language"] + df["production_companies"] + df["production_countries"] + df["vote_average"] + df["year"]

In [46]:
df = df[["movie_id", "title", "tag"]]
df.head()

Unnamed: 0,movie_id,title,tag
0,19995,Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver, ..."
1,285,Pirates of the Caribbean: At World's End,"[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste..."
2,206647,Spectre,"[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp..."
3,49026,The Dark Knight Rises,"[ChristianBale, MichaelCaine, GaryOldman, Anne..."
4,49529,John Carter,"[TaylorKitsch, LynnCollins, SamanthaMorton, Wi..."


In [47]:
df["tag"] = df["tag"].apply(lambda x : " ".join(x))


In [48]:
#as Sam and sam are treated as 2 different things
df["tag"] = df["tag"].apply(lambda x : x.lower())
df.tag[0]

'samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez jamescameron action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d en ingeniousfilmpartners twentiethcenturyfoxfilmcorporation duneentertainment lightstormentertainment unitedstatesofamerica unitedkingdom 7 2009'

In [49]:
#to remove similar words like actor and actors 
from nltk.stem.porter import PorterStemmer   #class PS
ps = PorterStemmer()
def pstem(text):
    s = []
    for i in text.split():
            s.append(ps.stem(i))
    return " ".join(s)
df["tag"] = df["tag"].apply(pstem)
df.tag[0]

'samworthington zoesaldana sigourneyweav stephenlang michellerodriguez jamescameron action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d en ingeniousfilmpartn twentiethcenturyfoxfilmcorpor duneentertain lightstormentertain unitedstatesofamerica unitedkingdom 7 2009'

In [50]:
#Now to convert these tags to vectors we use "bag of words - combines all the tag to form 1 big tag and the n(max_features) most common words in this are then compared with each single tag".
from sklearn.feature_extraction.text import CountVectorizer   #this is a class and not a fun therefore make an object 1st
cv = CountVectorizer(max_features = 5000 ,stop_words = 'english')  #stop_words means (is,a,or,etc)
vectors = cv.fit_transform(df['tag']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [51]:
#now to compare the similarity between each vectors(row/movie) with every other vector/row/movie we use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)
similarity

array([[1.        , 0.15430335, 0.21622499, ..., 0.14638501, 0.15118579,
        0.10192944],
       [0.15430335, 1.        , 0.15569979, ..., 0.10540926, 0.16329932,
        0.11009638],
       [0.21622499, 0.15569979, 1.        , ..., 0.12309149, 0.19069252,
        0.12856487],
       ...,
       [0.14638501, 0.10540926, 0.12309149, ..., 1.        , 0.25819889,
        0.17407766],
       [0.15118579, 0.16329932, 0.19069252, ..., 0.25819889, 1.        ,
        0.26967994],
       [0.10192944, 0.11009638, 0.12856487, ..., 0.17407766, 0.26967994,
        1.        ]])

In [102]:
def recommend(movie_name):
    index = df[df['title'] == movie_name].index[0]
    dis = sorted(enumerate(similarity[index]), key=lambda x: x[1], reverse=True)[1:6]

    for i in dis:
        print(df.iloc[i[0]].title)
    

In [103]:
recommend('Batman Begins')

The Dark Knight
The Dark Knight Rises
Batman & Robin
Batman v Superman: Dawn of Justice
Batman


In [81]:
import pickle
pickle.dump(df.to_dict(),open('df_pycharm.pkl','wb'))

In [83]:
pickle.dump(similarity,open('similarity_pycharm.pkl','wb'))