In [66]:
# Imports
import numpy as np
import pandas as pd
import ast # String Conversions
from sklearn.feature_extraction.text import CountVectorizer # To Create Vectors
from nltk.stem.porter import PorterStemmer # Remove Related-Words Problem
from sklearn.metrics.pairwise import cosine_similarity # Cosine Distance between Vectors
import pickle


In [52]:
# Creating the Data Frame
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')
data = movies.merge(credits, on = 'title')

In [53]:
# We will be using the Information about Genres, ID, Keywords, Title, Overview, Cast and Crew to build
# our recommender system

data_final2 = data[['id', 'title', 'overview', 'keywords', 'genres', 'cast', 'crew']]
data_final2.head()

Unnamed: 0,id,title,overview,keywords,genres,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [54]:
# Checking for Missing Data

data_final2.isnull().sum()

data_final2.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final2.dropna(inplace=True)


In [55]:
# Checking for Duplicate Data

data_final2.duplicated().sum()

0

In [56]:
# Extract Names from the Genres and Keywords
def nameExtract(name_dict):
    names = []
    for name in ast.literal_eval(name_dict):
        names.append(name['name'])
    return names

# Done Once

data_final2['genres'] = data_final2['genres'].apply(nameExtract)

data_final2['keywords'] = data_final2['keywords'].apply(nameExtract)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final2['genres'] = data_final2['genres'].apply(nameExtract)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final2['keywords'] = data_final2['keywords'].apply(nameExtract)


In [57]:
# Extract Top Actor Names from Cast
def nameExtract2(name_dict):
    actor_names = []
    top5 = 0 # Top 5 Actors of the Movie will be used
    for name in ast.literal_eval(name_dict):
        actor_names.append(name['name'])
        top5 += 1
        if (top5 == 5):
            break
    return actor_names

# Done Once

data_final2['cast'] = data_final2['cast'].apply(nameExtract2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final2['cast'] = data_final2['cast'].apply(nameExtract2)


In [58]:
# Extract Director Name from Crew
# IDEA - Maybe include the Producers for recommendation as well
def directorName(name_dict):
    dir_name = []
    for name in ast.literal_eval(name_dict):
        if name['job'] == 'Director':
            dir_name.append(name['name'])
            break
    return dir_name

# Done Once

data_final2['crew'] = data_final2['crew'].apply(directorName)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final2['crew'] = data_final2['crew'].apply(directorName)


In [59]:
# Done Once
data_final2['overview'] = data_final2['overview'].apply(lambda x: str(x).split())

#Correct Tokenisation is Important, Space between Words is harmful to the Recommender due to Ambiguity

data_final2['genres'] = data_final2['genres'].apply(lambda x : [i.replace(" ","") for i in x])

data_final2['keywords'] = data_final2['keywords'].apply(lambda x : [i.replace(" ","") for i in x])

data_final2['cast'] = data_final2['cast'].apply(lambda x : [i.replace(" ","") for i in x])

data_final2['crew'] = data_final2['crew'].apply(lambda x : [i.replace(" ","") for i in x])

#~~

# Making all the Tags 

data_final2['tags'] = data_final2['overview'] + data_final2['genres'] + data_final2['keywords'] + data_final2['cast'] + data_final2['crew']

data_final = data_final2[['id', 'title', 'tags']]

# Advisable to have Tokens in Lowercase

data_final['tags'] = data_final['tags'].apply(lambda x: " ".join(x).lower())

data_final.head()

data_final['tags'][1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final2['overview'] = data_final2['overview'].apply(lambda x: str(x).split())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final2['genres'] = data_final2['genres'].apply(lambda x : [i.replace(" ","") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final2['keywords'] = da

"captain barbossa, long believed to be dead, has come back to life and is headed to the edge of the earth with will turner and elizabeth swann. but nothing is quite as it seems. adventure fantasy action ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger johnnydepp orlandobloom keiraknightley stellanskarsgård chowyun-fat goreverbinski"

In [60]:
# Stemming the Words

ps = PorterStemmer()

def stemmer(text):
    stemmed_text = []

    for word in text.split():
        stemmed_text.append(ps.stem(word))

    return " ".join(stemmed_text)

data_final['tags'] = data_final['tags'].apply(stemmer)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final['tags'] = data_final['tags'].apply(stemmer)


In [61]:
# Vectorisation of Movie Tags and Report Nearest Vectors
# Using SKLearn Library, implementing the Bag Of Words Strategy - Try out tfidf and word2vec

# Max Features, Maybe try plotting it out or something?

cv = CountVectorizer(max_features=5000, stop_words='english')

In [62]:
word_vectors = cv.fit_transform(data_final['tags']).toarray()

In [63]:
similarity = cosine_similarity(word_vectors) # Similarity Matrix

In [65]:
# Top 5 Movie Recommendations will be given
def recommender(movie):
    movie_index = data_final[data_final['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse = True ,key = lambda x: x[1])[1:6]
    for movies in movies_list:
        print(data_final.iloc[movies[0]].title)

recommender('Batman Begins')

The Dark Knight
Batman
Batman
The Dark Knight Rises
Rockaway


In [67]:
pickle.dump(data_final, open("Movies.pkl", 'wb')) # Pickle File to store Movie Data; Can do via to_dict as well

pickle.dump(similarity, open("Similarity.pkl", 'wb'))