# MOVIE RECOMMENDER SYSTEM

I am Rishabh Jain and I am building a movie recommender system on the dataset tmdb_5000

### STEP 1: Importing the libraries

In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
import ast

### STEP 2: Merge the datasets

In [7]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = movies.merge(credits,on='title')


### STEP 3: Doing a whole bunch of data preprocessing

- Extracting useful features
- Making a tags column
- Text preprocessing

In [8]:
# Extracting useful features only
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

# Making the functions for eassy implementation

def change(string):
    return [i['name'] for i in ast.literal_eval(string)]

def get_director(string):
    for i in ast.literal_eval(string):
        if i['job'] == 'Director':
            return [i['name']]
    return []

def get_top_cast(string):
    return [i['name'] for i in ast.literal_eval(string)][:3]

# Applying the changes and making the tags column

movies['genres'] = movies['genres'].apply(change)
movies['keywords'] = movies['keywords'].apply(change)
movies['cast'] = movies['cast'].apply(get_top_cast)
movies['crew'] = movies['crew'].apply(get_director)
movies['overview'] = movies['overview'].fillna('').apply(lambda x: x.split())

movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Creating a new dataset with the important tags column only in features
new_movies = movies[['movie_id','title','tags']]
new_movies['tags'] = new_movies['tags'].apply(lambda x: ' '.join(x))

# Stemming the data
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
new_movies['tags'] = new_movies['tags'].apply(lambda x: ' '.join([ps.stem(i) for i in x.split()]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags'] = new_movies['tags'].apply(lambda x: ' '.join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies['tags'] = new_movies['tags'].apply(lambda x: ' '.join([ps.stem(i) for i in x.split()]))


###STEP 4: Using Tfidfvectorizer

In [9]:
# Vectorizing the processed tags column conatining strings
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(max_features=5000,stop_words='english',lowercase=True)
vector = tf.fit_transform(new_movies['tags']).toarray()


###STEP 5: Computing the cosine similarity and Making the recommend function

In [18]:
# Calculating the cosine similarity in the main recommend function and returning the top 10 results recommended excluding the movie itself
from sklearn.metrics.pairwise import cosine_similarity
def recommend(movie):
    try:
        movie_index = new_movies[new_movies['title'].str.lower() == movie.lower()].index[0]
        distances = cosine_similarity(vector[movie_index].reshape(1,-1),vector).flatten()
        best_index = distances.argsort()[::-1][1:11]
        return new_movies['title'].iloc[best_index].tolist()
    except IndexError:
        return ["Movie not found"]


###STEP 6: Calling the recommend function

In [19]:
movie = input("Enter the movie name: ")
recommend(movie)

Enter the movie name: avatar


['Aliens',
 'Alien³',
 'Silent Running',
 'Alien',
 'Spaceballs',
 'Moonraker',
 'Mission to Mars',
 'Lifeforce',
 'Treasure Planet',
 'Star Trek Into Darkness']