# **Movie Recommender System**

In [1]:


import numpy as np
import pandas as pd

In [2]:
# importing datasets

movies = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
credits = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')

In [3]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [4]:
# Merging Tables - movies and credits on the basis of 'title'

movies = movies.merge(credits,on='title') 

In [5]:
movies.shape

(4809, 23)

In [6]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [7]:
# Selecting necessary columns from the table

# genres 
# id
# keywords
# title
# overview
# cast 
# crew

movies = movies[['movie_id', 'genres', 'keywords', 'title', 'overview', 'cast', 'crew']]

In [8]:
# finding any null value

movies.isnull().sum()

movie_id    0
genres      0
keywords    0
title       0
overview    3
cast        0
crew        0
dtype: int64

In [9]:
# dropping null value

movies.dropna(inplace=True)

In [10]:
# Finding any duplicates

movies.duplicated().sum()

0

In [11]:
# in genres, we need to make it as a list. extract the name

movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
# For this, we need to call a library called ast. 
# ast.literal_eval will make the string as a list. 

import ast
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')


[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [13]:
# creating a function that will extract the name from genres

def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [14]:
# Applying the function in genres

movies['genres'] = movies['genres'].apply(convert)

In [15]:
# same thing for keywords

movies['keywords'] = movies['keywords'].apply(convert)

In [16]:
# For cast, we will take the first 3 names. 
# we will create a counter in the function
# counter will start at 0 and break on 3

def counter(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter !=3:
            L.append(i['name'])
        else:
            break
    return L

In [17]:
movies['cast'] = movies['cast'].apply(counter)

In [18]:
# fetching director from crew

def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L
    

In [19]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [20]:
# Converting overview string to list
# using a lambda function

movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [21]:
movies.head(1)

Unnamed: 0,movie_id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]


In [22]:
# deleting space between words

movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [23]:
movies.head(1)

Unnamed: 0,movie_id,genres,keywords,title,overview,cast,crew
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]


In [24]:
# Creating tags combining overview, genre, keywords, cast and crew

movies['tags'] = movies['genres']+movies['keywords']+movies['cast']+movies['crew']+movies['overview']

In [25]:
# Creating a new dataframe with id, title, and tags

new_df = movies[['movie_id','title','tags']]

In [26]:
# Combining tags list to string

new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [29]:
new_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,action adventure fantasy sciencefiction cultur...


In [28]:
# converting all tags to lowercase

new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


 # # *sklearn* to vectorization and Finding Similarity

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000, stop_words='english')

In [31]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [32]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

In [34]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zoo', 'zooeydeschanel', 'zoëkravitz'],
      dtype=object)

In [35]:
# stemming 
# nltk

!pip install nltk



In [36]:
import nltk

In [37]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()


In [39]:
def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [40]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [41]:
new_df['tags'][0]

'action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav stephenlang michellerodriguez giovanniribisi joeldavidmoor cchpounder wesstudi lazalonso dileeprao mattgerald seananthonymoran jasonwhyt scottlawr kellykilgour jamespatrickpitt seanpatrickmurphi peterdillon kevindorman kelsonhenderson davidvanhorn jacobtomuri michaelblain-rozgay joncurri lukehawk woodyschultz petermensah soniaye jahnelcurfman ilramchoi kylawarren lisaroumain debrawilson chrismala taylorkibbi jodielandau julielamm cullenb.madden josephbradymadden frankietorr austinwilson sarawilson tamicawashington-mil lucybri nathanmeist gerryblair matthewchamberlain paulyat wraywilson jamesgaylyn melvinlenoclarkiii carvonfutrel brandonjelk micahmoch hanniyahmuhammad christophernolen christaoliv aprilmariethoma bravitaa.threatt colinbleasda

In [42]:
# calculating cosine similarity between vectors

from sklearn.metrics.pairwise import cosine_similarity

In [43]:
similarity = cosine_similarity(vectors)

In [44]:
sorted(similarity[0], reverse = True)

[1.0000000000000002,
 0.23473823893078552,
 0.23294541397390256,
 0.2309782890611944,
 0.2252817784447915,
 0.21912524504463887,
 0.21398024625545647,
 0.21398024625545647,
 0.2123976976214366,
 0.21170244960998524,
 0.20935894733965596,
 0.20256711147285467,
 0.19839002137983244,
 0.19738550848793068,
 0.1930468356263361,
 0.19258222162991084,
 0.1904761904761905,
 0.1889822365046136,
 0.18824832213400472,
 0.1860968420796942,
 0.18609684207969418,
 0.1852396434087371,
 0.18523964340873708,
 0.18257418583505539,
 0.18257418583505539,
 0.18257418583505536,
 0.18184824186332701,
 0.18156825980064073,
 0.1800205749557739,
 0.17817416127494962,
 0.17699808135119716,
 0.17699808135119713,
 0.17636890537566463,
 0.17496355305594127,
 0.17295817388759027,
 0.17251638983558856,
 0.1711841970043652,
 0.17118419700436519,
 0.17078251276599332,
 0.1690308509457033,
 0.1683587574253685,
 0.16718346377260584,
 0.1649572197684645,
 0.1648119469795901,
 0.16448792373994225,
 0.16366341767699427,
 0.

In [47]:
# Recommend function

def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(new_df.iloc[i[0]].title)


# **Recommend**

**recommend('*movie name*')**



In [49]:
recommend('Avatar')

Lifeforce
Aliens vs Predator: Requiem
Battle: Los Angeles
Titan A.E.
Independence Day
