In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Recommender Systems

## Types of Recommender Systems
#### Content Based Recommender Systems
#### Collaborative Filtering based Systems
#### Hybrid Recommender Systems

In [None]:
# Example of Content Based Filtering Recommender System
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Reading the Data and  placing it into a dataframe
movies_df=pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')
credits_df=pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv')

In [None]:
#Examining the Data
movies_df.head(10)

In [None]:
#Datatypes of the Features
movies_df.dtypes
# movies_df.size
# A lot of them are object columns holding JSON-like strings

In [None]:
# Examining the Credits
credits_df.head()
# credits_df.size

In [None]:
# Merge the credits (cast/crew) into the movies frame to get one common dataframe.
# Join on the TMDB id rather than on the title: titles are not guaranteed to be
# unique, so a title join can pair a movie with another film's credits and
# create extra rows.
# The credits' own 'title' column is dropped first so that the merged frame keeps
# a single, unsuffixed 'title' column.
movies_df=movies_df.merge(credits_df.drop(columns='title'),left_on='id',right_on='movie_id')

In [None]:
# Here we can see that the Dataframes have been merged
movies_df.shape

In [None]:
movies_df.info()

In [None]:
# For recommendation systems it is essential to create tags

# So for any dataset we have to examine which of the features are useful for creating tags and which are not

# So we will list the Columns that are essential and the Columns not Listed will be dropped off

# All of the features that may influence a viewers decision are kept 

# 1) Genres
# 2) id
# 3) Keywords
# 4) Title
# 5) Overview
# 6) Cast 
# 7) Crew

# Language is dropped because it is heavily skewed in favour of English
# Other features such as Production House, Revenue, Release Date, Vote average and vote count can also be important factors
# We keep the approach simple and do not use them for the tags; a few numeric columns are still kept below for reference

#Dropping the rest of the Columns

movies_df=movies_df[['genres','movie_id','title','overview','keywords','cast','crew','vote_average','vote_count','popularity','revenue']]


In [None]:
# Re-examining the Data
movies_df.info()

## Data Pre-Processing

In [None]:
# Cleaning, Processing, Imputing steps
movies_df.isnull().sum()

In [None]:
# Dropping the rows with Null Values (the missing entries in the Overview column).
# The index is renumbered afterwards so that row labels equal row positions:
# the similarity matrices computed later are addressed by position, and the
# lookup in recommend() reads the row label of the matching title.
# A non-inplace dropna is used because movies_df is a column subset of the merged frame.
movies_df=movies_df.dropna().reset_index(drop=True)

In [None]:
# Checking for Duplicates
movies_df.duplicated().sum()
# No Duplicates and so it is good to go

In [None]:
movies_df.iloc[0].genres

In [None]:
import ast
def convertdicttolist(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `obj` is parsed with ast.literal_eval and the 'name' of each parsed
    element is collected, in order, into a new list.
    """
    parsed_entries=ast.literal_eval(obj)
    return [entry['name'] for entry in parsed_entries]
        

In [None]:
movies_df['genres']=movies_df['genres'].apply(convertdicttolist)

In [None]:
# Applying this to the Keywords and Cast columns (Crew is handled separately further down)
movies_df['keywords']=movies_df['keywords'].apply(convertdicttolist)
movies_df['cast']=movies_df['cast'].apply(convertdicttolist)

In [None]:
#  Checking the Final Output
movies_df.head()

In [None]:
# Now we also want to limit the Number of entries for a feature
movies_df.iloc[0].cast
# So we do not need such a humongous cast and only need the top 3-4 of them

In [None]:
# So now we will reduce the Number of entries in the Cast to 3 Only
def getTop3(list):
    """Return a new list with at most the first three items of `list`.

    Note: the parameter name shadows the built-in `list`; it is kept so the
    signature stays unchanged.
    """
    return [item for position,item in enumerate(list) if position<3]

In [None]:
movies_df['cast']=movies_df['cast'].apply(getTop3)

In [None]:
movies_df['cast']

In [None]:
# It gets slightly trickier for crew, as we only want to keep the Directors and Producers
movies_df['crew'][0]
# We are only Interested in Director and Producer so we will cut off the rest 

In [None]:
# Getting the Director and Producer
def fetchDandP(obj):
    """Return the names of the crew entries whose job is Director or Producer.

    `obj` is a stringified list of dicts; it is parsed with ast.literal_eval
    and each entry's 'job' is compared against the two wanted job titles.
    The 'name' of every matching entry is returned, in the original order.
    """
    wanted_jobs=('Director','Producer')
    return [member['name'] for member in ast.literal_eval(obj) if member['job'] in wanted_jobs]

In [None]:
movies_df['crew']=movies_df['crew'].apply(fetchDandP)

In [None]:
movies_df.iloc[0].crew

In [None]:
# Now we will split the Overview words into a list as well
movies_df['overview']=movies_df['overview'].apply(lambda x:x.split())

In [None]:
# As we can see, the Overview has been split into a List
movies_df.iloc[0:10].overview

In [None]:
# Now that everything is in lists, we will concatenate the lists into one set of tags
# Before that, we remove the spaces inside each entry (e.g. "Science Fiction" -> "ScienceFiction") so that a multi-word name stays a single token
movies_df['genres']=movies_df['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [None]:
movies_df['keywords']=movies_df['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies_df['cast']=movies_df['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies_df['crew']=movies_df['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [None]:
# We can see that the transformation has been applied and the words have been joined
movies_df.head()

In [None]:
# Create a new columns tags and then add all the columns to that column
movies_df['tags']=movies_df['overview']+movies_df['genres']+movies_df['cast']+movies_df['crew']+movies_df['keywords']

In [None]:
# Creating a new dataframe with only the required features.
# .copy() makes it independent of movies_df, so the column assignments made on
# new_movies_df in the following cells do not raise pandas' SettingWithCopyWarning.
new_movies_df=movies_df[['movie_id','title','tags','vote_average','popularity']].copy()

In [None]:
new_movies_df.head(10)

In [None]:
# Now we will convert the List of tags into Strings for better Usability
new_movies_df['tags']=new_movies_df['tags'].apply(lambda x: " ".join(x))

In [None]:
# Converting everything to lowercase
new_movies_df['tags']=new_movies_df['tags'].apply(lambda x: x.lower())

## Text Vectorization

### Applying stemming to get a cleaner corpus

In [None]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [None]:
def stem(text):
    """Stem each whitespace-separated token of `text` with the module-level
    stemmer `ps` and return the stems joined by single spaces."""
    return " ".join(ps.stem(token) for token in text.split())

In [None]:
# Stemming the Column
new_movies_df['tags']=new_movies_df['tags'].apply(stem)

In [None]:
# Approaches
# 1) Bag of Words
# 2) Word2Vec
# 3) tfidf
#  We will be using both Bag of Words and TF * IDF and will take the better one
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Bag of Words Model
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')

In [None]:
vectors=cv.fit_transform(new_movies_df['tags']).toarray()

In [None]:
# This will be a pretty sparse Matrix
vectors.shape

In [None]:
# TF IDF

tfIdfVectorizer=TfidfVectorizer(use_idf=True,max_features=5000,stop_words='english')
tfIdf = tfIdfVectorizer.fit_transform(new_movies_df['tags'])
# tfIdf plays the same role as `vectors` (one row per movie), but it holds
# TF-IDF weights rather than raw counts and is not converted with .toarray().

# Sanity check: list the highest-weighted terms of the first movie.
# get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when it exists and fall back only on old versions.
if hasattr(tfIdfVectorizer, "get_feature_names_out"):
    feature_names = tfIdfVectorizer.get_feature_names_out()
else:
    feature_names = tfIdfVectorizer.get_feature_names()
# .toarray() yields a plain ndarray column (n_features x 1) instead of np.matrix.
df = pd.DataFrame(tfIdf[0].T.toarray(), index=feature_names, columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(25))

In [None]:
# Calculating the cosine similarity between movies in the Bag of Words and TF-IDF models

from sklearn.metrics.pairwise import cosine_similarity 

In [None]:
similarity_bow=cosine_similarity(vectors)
similarity_tfidf=cosine_similarity(tfIdf)

In [None]:
# Similarity of each Movie with every other Movie, so the shape will be n x n
print(similarity_bow.shape)
print(similarity_tfidf.shape)

## Creating the Recommendation Function (Final):

In [None]:
# How to not lose the index while sorting:
# call the enumerate function so each score stays paired with its position
print(sorted(list(enumerate(similarity_bow[0])),reverse=True,key=lambda x:x[1])[1:11])
print(sorted(list(enumerate(similarity_tfidf[0])),reverse=True,key=lambda x:x[1])[1:11])

In [None]:
def recommend(movie, movies=None, sim_bow=None, sim_tfidf=None, top_n=10):
    """Print the titles most similar to `movie` under both vectorisations.

    Parameters
    ----------
    movie : str
        Exact title to look up in the ``title`` column.
    movies : pandas.DataFrame, optional
        Frame with a ``title`` column whose row order matches the rows of the
        similarity matrices. Defaults to the notebook's ``new_movies_df``.
    sim_bow, sim_tfidf : 2-D array-like, optional
        Pairwise similarity matrices (Bag of Words / TF-IDF). Default to the
        notebook's ``similarity_bow`` and ``similarity_tfidf``.
    top_n : int, default 10
        Number of recommendations printed per method.

    Raises
    ------
    IndexError
        If no row has the requested title.
    """
    if movies is None:
        movies = new_movies_df
    if sim_bow is None:
        sim_bow = similarity_bow
    if sim_tfidf is None:
        sim_tfidf = similarity_tfidf

    # The similarity matrices are addressed by row *position*. The frame's
    # index labels can differ from positions (rows were dropped earlier), so
    # locate the title positionally instead of reading `.index[0]`.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError("Movie {!r} not found in the title column".format(movie))
    movie_pos = int(matches[0])

    def _most_similar(scores):
        # Rank every position by descending similarity, then drop the query
        # movie explicitly rather than assuming it is always ranked first.
        ranked = sorted(enumerate(scores), reverse=True, key=lambda pair: pair[1])
        return [pos for pos, _ in ranked if pos != movie_pos][:top_n]

    print("Movies recommended by Bag of Words Method")
    for pos in _most_similar(sim_bow[movie_pos]):
        print(movies['title'].iloc[pos])
    print("Movies recommended by TF-IDF")
    for pos in _most_similar(sim_tfidf[movie_pos]):
        print(movies['title'].iloc[pos])

In [None]:
recommend('Batman Begins')

# Collaborative Filtering 

### Building a Collaborative Filtering Recommender System on the Same Topic of Movie Recommendation 