In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for found_path in (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
):
    print(found_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading Libraries

In [1]:
import ast

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Load the two TMDB tables (movie metadata first, then credits) in one pass.
movies_data, credits_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv",
        r"/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv",
    )
)

In [3]:
# Overview of the movies table. A notebook cell renders only print() output and
# its final expression, so the scalar summaries are printed and the frame
# preview is kept as the last line (which also shows the column names).
print(movies_data.shape)  # expected: (4803, 20)
# Columns: 'budget','genres','homepage','id','keywords','original_language','original_title','overview','popularity','production_companies','production_countries','release_date','revenue','runtime','spoken_languages','status','tagline','title','vote_average','vote_count'
print(movies_data.isna().sum())  # missing values per column
movies_data.head()


In [4]:
# Overview of the credits table. A notebook cell renders only print() output
# and its final expression, so the scalar summaries are printed and the frame
# preview is kept as the last line (which also shows the column names).
print(credits_data.shape)  # expected: (4803, 4) -> 'movie_id', 'title', 'cast', 'crew'
print(credits_data.isna().sum())  # missing values per column
credits_data.head()

These datasets have many columns that are not essential, so we keep only the most important ones.

In [5]:
# Movie-metadata columns kept for the recommender; every other column of
# movies_data is left out of the working frame.
selected_columns=['genres','keywords','overview','production_companies','title']
# Working frame restricted to the selected columns.
movies=movies_data[selected_columns]


# Merging 2 data-sets: 

In [6]:
# Join each movie to its credits on the TMDB identifier ('id' in the movies
# table, 'movie_id' in the credits table) instead of on "title". Titles are not
# unique, so a title join pairs same-named films with each other's cast/crew
# and inflates the row count (4809 rows from two 4803-row tables).
# The credits copy of 'title' is dropped so the result keeps a single 'title'
# column, and the helper 'id' column is removed after the join, leaving the
# same eight columns as before.
movies = (
    movies_data[selected_columns + ["id"]]
    .merge(
        credits_data.drop(columns="title"),
        left_on="id",
        right_on="movie_id",
    )
    .drop(columns="id")
)


## Data-set insights

In [7]:
# Size and missing-value summary of the merged frame. These are printed because
# a notebook cell renders only print() output and its final expression.
print(movies.shape)         # (rows, columns) after the merge
print(movies.isna().sum())  # nulls per column, before filling

# Handling null values.
# A missing overview becomes empty text: a placeholder word such as "unknown"
# would later be stemmed and vectorised like a real word and make otherwise
# unrelated movies look alike. Any other missing value keeps the "unknown"
# placeholder. Re-assigning (instead of inplace=True) leaves the merged input
# untouched if the cell is run again.
movies = movies.fillna({"overview": ""}).fillna("unknown")

# Number of fully duplicated rows (shown as the cell output).
movies.duplicated().sum()

## Preprocessing Data(Column-wise)

In [8]:
# 'movie_id' column
# Preview of the first rows (evaluated but not rendered: only a cell's final
# expression is displayed).
movies['movie_id'].head()
# Value stored under index label 0; as the final expression it is the cell output.
movies['movie_id'][0]  

# This column needs no preprocessing.

In [9]:
# 'title' column
# Preview of the first rows (evaluated but not rendered: only a cell's final
# expression is displayed).
movies['title'].head()
# Value stored under index label 0; as the final expression it is the cell output.
movies['title'][0]  

# This column needs no preprocessing.



In [10]:
## 'genres' column
# Peek at the raw values. Both expressions are evaluated but not rendered,
# because a notebook cell only displays its final expression.
movies['genres'].head()
movies['genres'][0]

# Lower-case the whole serialized cell text before it is parsed.
movies["genres"]=movies["genres"].str.lower()

# Only the "name" field of each genre entry is needed.
# Spaces inside a name are replaced so that a multi-word genre such as
# 'Science Fiction' is not confused with the separate words 'Science' and 'Fiction'.
def genre_extractor(df):
    """Return the genre names stored in one serialized ``genres`` cell.

    Args:
        df: Despite its name, a single cell value: a string holding a list of
            dicts that each have a "name" key,
            e.g. '[{"id": 18, "name": "drama"}]'.

    Returns:
        list[str]: Every "name" in order, with spaces replaced by "-" so a
        multi-word genre stays one string.

    Raises:
        ValueError, SyntaxError: If ``df`` is not a plain Python literal.
    """
    # ast.literal_eval only parses literals; unlike eval() it cannot execute
    # code that happens to be embedded in the CSV text.
    genres=[entry["name"].replace(" ","-") for entry in ast.literal_eval(df)]
    return genres

# Replace each serialized cell with the list returned by genre_extractor.
movies["genres"]=movies["genres"].apply(genre_extractor)
# Final expression: displays the transformed column.
movies["genres"]

In [11]:
# 'keywords' column
# Peek at the raw values. Both expressions are evaluated but not rendered,
# because a notebook cell only displays its final expression.
movies['keywords'].head()
movies['keywords'][0]

# Lower-case the whole serialized cell text before it is parsed.
movies["keywords"]=movies["keywords"].str.lower()

# Only the "name" field of each keyword entry is needed.
# Spaces inside a name are replaced so that a multi-word keyword such as
# 'Science Fiction' is not confused with the separate words 'Science' and 'Fiction'.
def keywords_extractor(df):
    """Return the keyword names stored in one serialized ``keywords`` cell.

    Args:
        df: Despite its name, a single cell value: a string holding a list of
            dicts that each have a "name" key,
            e.g. '[{"id": 1463, "name": "culture clash"}]'.

    Returns:
        list[str]: Every "name" in order, with spaces replaced by "-" so a
        multi-word keyword stays one string.

    Raises:
        ValueError, SyntaxError: If ``df`` is not a plain Python literal.
    """
    # ast.literal_eval only parses literals; unlike eval() it cannot execute
    # code that happens to be embedded in the CSV text.
    keywords=[entry["name"].replace(" ","-") for entry in ast.literal_eval(df)]
    return keywords

# Replace each serialized cell with the list returned by keywords_extractor.
movies["keywords"]=movies["keywords"].apply(keywords_extractor)
# Final expression: displays the transformed column.
movies["keywords"]

In [12]:
# 'overview'
# Peek at the raw text. Both expressions are evaluated but not rendered,
# because a notebook cell only displays its final expression.
movies["overview"].head()
movies["overview"][0]


# make all words lower-case
movies["overview"]=movies["overview"].str.lower()

# Remove punctuation: drop every character that is neither a word character
# nor whitespace. regex=True is required because pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave the text untouched.
# The raw string keeps "\w" and "\s" from being read as string escapes.
movies["overview"]=movies['overview'].str.replace(r'[^\w\s]','',regex=True)

# Stop-words carry little meaning, so they are removed by preprocess().
# Words are also stemmed, so that "play", "plays" and "playing" are treated
# as the same term.
stop_words=stopwords.words('english')
ps=PorterStemmer()

def preprocess(words):
    """Drop stop-words from ``words``, stem what remains and re-join with spaces.

    ``words`` is a whitespace-separated string; the module-level ``stop_words``
    collection and ``ps`` stemmer are used for filtering and stemming.
    """
    stemmed_tokens = (
        ps.stem(token)              # stemming
        for token in words.split()
        if token not in stop_words  # removing stop-words
    )
    return ' '.join(stemmed_tokens)  # join words back into one sentence

# Replace each overview with its stop-word-free, stemmed version.
movies['overview']=movies['overview'].apply(preprocess)
# Final expression: displays the transformed column.
movies['overview']

In [13]:
# 'cast' column
# Peek at the raw values. Both expressions are evaluated but not rendered,
# because a notebook cell only displays its final expression.
movies['cast'].head()
movies['cast'][0]

# Lower-case the whole serialized cell text before it is parsed.
movies['cast']=movies['cast'].str.lower()

# Only the "name" field of each cast entry is needed.
# A movie can list many cast members, so only the first 5 entries are kept.
# Spaces inside a name are replaced so that 'johnny depp' is not confused with
# the separate words 'johnny' and 'depp'.
def casts_extractor(df, top_n=5):
    """Return the first ``top_n`` cast names from one serialized ``cast`` cell.

    Args:
        df: Despite its name, a single cell value: a string holding a list of
            dicts that each have a "name" key,
            e.g. '[{"name": "johnny depp", "order": 0}]'.
        top_n: Maximum number of names to return, in listed order (default 5).

    Returns:
        list[str]: Up to ``top_n`` names with spaces replaced by "-".

    Raises:
        ValueError, SyntaxError: If ``df`` is not a plain Python literal.
    """
    # ast.literal_eval only parses literals; unlike eval() it cannot execute
    # code that happens to be embedded in the CSV text.
    cast=[entry["name"].replace(" ","-") for entry in ast.literal_eval(df)]
    return cast[:top_n]

# Replace each serialized cell with the list returned by casts_extractor.
movies['cast']=movies['cast'].apply(casts_extractor)
# Final expression: displays the extracted cast list stored under index label 1.
movies['cast'][1]

In [14]:
# 'crew' column
# Peek at the raw values. Both expressions are evaluated but not rendered,
# because a notebook cell only displays its final expression.
movies['crew'].head()
movies['crew'][0]

# Lower-case the whole serialized cell text before it is parsed; this is why
# the extractors below compare the job against lower-case "director"/"producer".
movies['crew']=movies['crew'].str.lower()

# The crew list is long and not every member matters equally for this task,
# so only the "director" and the "producer" are kept.
# Spaces inside a name are replaced so that 'johnny depp' is not confused with
# the separate words 'johnny' and 'depp'.
def director_extractor(df):
    """Extract the first director's name from one serialized ``crew`` cell.

    Args:
        df: Despite its name, a single cell value: a string holding a list of
            dicts with "job" and "name" keys. Jobs are compared against the
            lower-case string "director".

    Returns:
        list[str]: A one-element list holding the first director's name with
        spaces replaced by "-", or ``[""]`` when no director is listed.

    Raises:
        ValueError, SyntaxError: If ``df`` is not a plain Python literal.
    """
    # ast.literal_eval only parses literals; unlike eval() it cannot execute
    # code that happens to be embedded in the CSV text.
    name=[
        entry["name"].replace(" ","-")
        for entry in ast.literal_eval(df)
        if entry["job"]=="director"
    ]
    # Several directors -> keep the first; none -> keep an empty placeholder.
    return [name[0] if name else ""]

# New 'director' column: the one-element list returned by director_extractor for each row.
movies['director']=movies['crew'].apply(director_extractor)

def producer_extractor(df):
    """Extract the first producer's name from one serialized ``crew`` cell.

    Args:
        df: Despite its name, a single cell value: a string holding a list of
            dicts with "job" and "name" keys. Jobs are compared against the
            lower-case string "producer".

    Returns:
        list[str]: A one-element list holding the first producer's name with
        spaces replaced by "-", or ``[""]`` when no producer is listed.

    Raises:
        ValueError, SyntaxError: If ``df`` is not a plain Python literal.
    """
    # ast.literal_eval only parses literals; unlike eval() it cannot execute
    # code that happens to be embedded in the CSV text.
    name=[
        entry["name"].replace(" ","-")
        for entry in ast.literal_eval(df)
        if entry["job"]=="producer"
    ]
    # Several producers -> keep the first; none -> keep an empty placeholder.
    return [name[0] if name else ""]

# New 'producer' column: the one-element list returned by producer_extractor.
producer_lists = movies['crew'].apply(producer_extractor)
movies['producer'] = producer_lists

# Director and producer have been pulled out of "crew", so the raw column is
# no longer needed.
movies = movies.drop(columns=['crew'])

# NOTE: some movies list the same person as producer and director; nothing
# here de-duplicates them, so such a name appears in both columns.
movies[['producer','director']]


In [15]:
# 'production_companies' column
# Peek at the raw values. Both expressions are evaluated but not rendered,
# because a notebook cell only displays its final expression.
movies['production_companies'].head()
movies['production_companies'][0]

# Lower-case the whole serialized cell text before it is parsed.
movies['production_companies']=movies['production_companies'].str.lower()

# Whether the production company matters is a judgement call: the author
# sometimes skips movies from companies they do not know, so the column is
# kept here, but it can be left out.
# Only the first 2 listed companies are kept.
# Spaces inside a name are replaced so that a multi-word company name is not
# confused with its separate words.
def production_companies_extractor(df, top_n=2):
    """Return the first ``top_n`` company names from one serialized cell.

    Args:
        df: Despite its name, a single ``production_companies`` cell value: a
            string holding a list of dicts that each have a "name" key,
            e.g. '[{"name": "walt disney pictures", "id": 2}]'.
        top_n: Maximum number of names to return, in listed order (default 2).

    Returns:
        list[str]: Up to ``top_n`` names with spaces replaced by "-".

    Raises:
        ValueError, SyntaxError: If ``df`` is not a plain Python literal.
    """
    # ast.literal_eval only parses literals; unlike eval() it cannot execute
    # code that happens to be embedded in the CSV text.
    name=[entry["name"].replace(" ","-") for entry in ast.literal_eval(df)]
    return name[:top_n]

# Replace each serialized cell with the list returned by production_companies_extractor.
movies['production_companies']=movies['production_companies'].apply(production_companies_extractor)
# Final expression: displays the transformed column.
movies['production_companies']

# Prepare data-set for Recommender-System

In [16]:
## Merge ['genres','keywords','overview','production_companies','cast','director','producer']
## into a single text column, "tags".

# The six list-valued columns are concatenated element-wise into one list per
# row, then joined into a space-separated string (the vectorizers need text).
movies["tags"]=movies['genres']+movies['keywords']+movies['cast']+movies['director']+movies['producer']+movies['production_companies']
movies['tags']=movies['tags'].apply(' '.join)   # converts list to string

# Append the cleaned overview. The explicit " " separator matters: without it
# the last tag and the first overview word fuse into one meaningless token.
movies['tags']=movies['tags']+" "+movies['overview']

movies['tags']


In [17]:
## The recommender only needs 3 columns: "movie_id", "title" and "tags".
movie_refined_data=movies[["movie_id","title","tags"]]
# Preview (evaluated but not rendered: only a cell's final expression is displayed).
movie_refined_data.head()
# Final expression: (rows, 3).
movie_refined_data.shape  #4809,3

In [20]:
# Bag-of-words counts over the 5000 most frequent terms.
# The default token_pattern (r"(?u)\b\w\w+\b") splits on "-", which would break
# the hyphen-joined names ("science-fiction", "johnny-depp") back into separate
# words. This pattern keeps inner hyphens, so each joined name is one term.
cv=CountVectorizer(max_features=5000, token_pattern=r"(?u)\b\w[\w-]*\w\b")
word_vectors=cv.fit_transform(movie_refined_data["tags"]).toarray()
# cv.get_feature_names_out() lists the learned vocabulary.

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights for the "tags" text; word_vector is what the similarity step uses.
# The default token_pattern (r"(?u)\b\w\w+\b") splits on "-", which would break
# the hyphen-joined names ("science-fiction", "johnny-depp") back into separate
# words. This pattern keeps inner hyphens, so each joined name is one term.
tfidf=TfidfVectorizer(token_pattern=r"(?u)\b\w[\w-]*\w\b")
# NOTE(review): cosine_similarity also accepts the sparse matrix directly, so
# .toarray() could be dropped to save memory if nothing else needs a dense array.
word_vector=tfidf.fit_transform(movie_refined_data["tags"]).toarray()

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() keeps `word` a plain list of strings.
word=tfidf.get_feature_names_out().tolist()
# Inspect `word` to see the learned vocabulary.

## Calculating Similarity or distance-metrics
* Here we need to calculate similarity of each movie with rest of movies
* In high-dimensional space, Euclidean distance is not a good measure, so we use cosine similarity instead.
* 

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of word_vector.
similarity_matrix=cosine_similarity(word_vector)
# The result holds the similarity score of each movie with every other movie.
# Final expression: displays the matrix shape.
similarity_matrix.shape

In [78]:
## Recommendation Engine
def recommend(movie_name, top_n=5):
    """Print the titles of the movies most similar to ``movie_name``.

    Args:
        movie_name: Title to look up. It must match a value of
            ``movie_refined_data["title"]`` exactly (case-sensitive).
        top_n: How many recommendations to print (default 5).

    Returns:
        None. The recommendations, or an apology when the title is not in the
        data, are printed.
    """
    titles = list(movie_refined_data["title"])
    if movie_name not in titles:  # checks if movie name is in the data or not
        print("Sorry!!! Try with another Movie name")
        return

    # Row *position* of the first matching title. similarity_matrix is
    # addressed by position, so a DataFrame index label must not be used here.
    movie_pos = titles.index(movie_name)
    scores = similarity_matrix[movie_pos]

    # Rank every other row by similarity, best first. The queried row is
    # excluded explicitly rather than by dropping rank 0, which picks the wrong
    # row whenever another movie ties with it or its own score is not the maximum.
    ranked_positions = sorted(
        (pos for pos in range(len(scores)) if pos != movie_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )[:top_n]

    print("Your Recommended Movies:-")
    for pos in ranked_positions:
        print(titles[pos])

    


In [81]:
## Predict your next movie
# Prints the recommendations for this title; the title must match the data exactly.
recommend("Pirates of the Caribbean: On Stranger Tides")
