# Movie Recommender System

## Imports

In [140]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset

In [141]:
movies = pd.read_csv('../data/tmdb_5000_movies.csv')
credits = pd.read_csv('../data/tmdb_5000_credits.csv')

In [142]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [143]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [144]:
movies.shape

(4803, 20)

In [145]:
credits.shape

(4803, 4)

In [146]:
movies = movies.merge(credits, on='title')
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [147]:
movies.shape

(4809, 23)

In [148]:
features = ['id', 'title', 'release_date', 'runtime', 'overview', 'genres', 'keywords', 'cast', 'crew', 'production_countries', 'budget', 'revenue', 'popularity', 'vote_average', 'vote_count']
movies = movies[features]
movies.head(1)

Unnamed: 0,id,title,release_date,runtime,overview,genres,keywords,cast,crew,production_countries,budget,revenue,popularity,vote_average,vote_count
0,19995,Avatar,2009-12-10,162.0,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",237000000,2787965087,150.437577,7.2,11800


## EDA

In [149]:
movies.isnull().sum()

id                      0
title                   0
release_date            1
runtime                 2
overview                3
genres                  0
keywords                0
cast                    0
crew                    0
production_countries    0
budget                  0
revenue                 0
popularity              0
vote_average            0
vote_count              0
dtype: int64

In [150]:
movies.dropna(inplace=True)

In [151]:
movies['release_date'] = movies['release_date'].apply(lambda x: x[:4])
movies = movies.rename(columns={"release_date": "year"})
movies.head(1)

Unnamed: 0,id,title,year,runtime,overview,genres,keywords,cast,crew,production_countries,budget,revenue,popularity,vote_average,vote_count
0,19995,Avatar,2009,162.0,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",237000000,2787965087,150.437577,7.2,11800


In [152]:
movies['paid_off'] = movies['budget'] - movies['revenue']
movies['paid_off'] = movies['paid_off'].apply(lambda x: 1 if x < 0 else 0)

movies = movies.drop(['budget', 'revenue'], axis = 1)
movies.head(1)

Unnamed: 0,id,title,year,runtime,overview,genres,keywords,cast,crew,production_countries,popularity,vote_average,vote_count,paid_off
0,19995,Avatar,2009,162.0,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",150.437577,7.2,11800,1


In [153]:
import ast

def convert(item):
    result = []
    for i in ast.literal_eval(item):
        result.append(i['name'])
    return result

In [154]:
movies['genres'] = movies['genres'].apply(convert)
movies.head(1)

Unnamed: 0,id,title,year,runtime,overview,genres,keywords,cast,crew,production_countries,popularity,vote_average,vote_count,paid_off
0,19995,Avatar,2009,162.0,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",150.437577,7.2,11800,1


In [155]:
movies['keywords'] = movies['keywords'].apply(convert)
movies.head(1)

Unnamed: 0,id,title,year,runtime,overview,genres,keywords,cast,crew,production_countries,popularity,vote_average,vote_count,paid_off
0,19995,Avatar,2009,162.0,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",150.437577,7.2,11800,1


In [156]:
def convert_five(item):
    result = []
    counter = 0
    for i in ast.literal_eval(item):
        if counter != 5:
            result.append(i['name'])
            counter += 1
        else:
            break
    return result

In [157]:
movies['cast'] = movies['cast'].apply(convert_five)
movies.head(1)

Unnamed: 0,id,title,year,runtime,overview,genres,keywords,cast,crew,production_countries,popularity,vote_average,vote_count,paid_off
0,19995,Avatar,2009,162.0,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",150.437577,7.2,11800,1


In [158]:
def convert_director(item):
    result = []
    for i in ast.literal_eval(item):
        if i['job'] == 'Director':
            result.append(i['name'])
            break
    return result

In [159]:
movies['crew'] = movies['crew'].apply(convert_director)
movies = movies.rename(columns={"crew": "director"})
movies.head(1)

Unnamed: 0,id,title,year,runtime,overview,genres,keywords,cast,director,production_countries,popularity,vote_average,vote_count,paid_off
0,19995,Avatar,2009,162.0,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",150.437577,7.2,11800,1


In [160]:
def convert_first_iso(item):
    result = []
    for i in ast.literal_eval(item):
        result.append(i['iso_3166_1'])
        break
    return result

In [161]:
movies['production_countries'] = movies['production_countries'].apply(convert_first_iso)
movies = movies.rename(columns={"production_countries": "country"})
movies.head(1)

Unnamed: 0,id,title,year,runtime,overview,genres,keywords,cast,director,country,popularity,vote_average,vote_count,paid_off
0,19995,Avatar,2009,162.0,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],[US],150.437577,7.2,11800,1


In [162]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies.head(1)

Unnamed: 0,id,title,year,runtime,overview,genres,keywords,cast,director,country,popularity,vote_average,vote_count,paid_off
0,19995,Avatar,2009,162.0,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],[US],150.437577,7.2,11800,1


In [164]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(' ', '') for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(' ', '') for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(' ', '') for i in x])
movies['director'] = movies['director'].apply(lambda x: [i.replace(' ', '') for i in x])
movies.head(1)

Unnamed: 0,id,title,year,runtime,overview,genres,keywords,cast,director,country,popularity,vote_average,vote_count,paid_off
0,19995,Avatar,2009,162.0,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],[US],150.437577,7.2,11800,1


In [165]:
movies['tags'] = movies['overview'] + movies['keywords'] + movies['cast'] + movies['director'] + movies['country']
movies.head(1)

Unnamed: 0,id,title,year,runtime,overview,genres,keywords,cast,director,country,popularity,vote_average,vote_count,paid_off,tags
0,19995,Avatar,2009,162.0,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],[US],150.437577,7.2,11800,1,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [166]:
df = movies[['id', 'title', 'year', 'runtime', 'popularity', 'vote_average', 'vote_count', 'paid_off', 'tags']]
df.head(1)

Unnamed: 0,id,title,year,runtime,popularity,vote_average,vote_count,paid_off,tags
0,19995,Avatar,2009,162.0,150.437577,7.2,11800,1,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [168]:
df['tags'] = df['tags'].apply(lambda x: ' '.join(x))
df.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x: ' '.join(x))


Unnamed: 0,id,title,year,runtime,popularity,vote_average,vote_count,paid_off,tags
0,19995,Avatar,2009,162.0,150.437577,7.2,11800,1,"In the 22nd century, a paraplegic Marine is di..."


In [170]:
df['tags'] = df['tags'].apply(lambda x: x.lower())
df.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x: x.lower())


Unnamed: 0,id,title,year,runtime,popularity,vote_average,vote_count,paid_off,tags
0,19995,Avatar,2009,162.0,150.437577,7.2,11800,1,"in the 22nd century, a paraplegic marine is di..."


## Features

In [176]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

In [177]:
ps = PorterStemmer()

In [178]:
def stem(text):
    result = []
    for i in text.split():
        result.append(ps.stem(i))
    return ' '.join(result)

In [None]:
df['tags'] = df['tags'].apply(stem)
df.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(stem)


In [315]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [316]:
vectors = cv.fit_transform(df['tags'])

In [317]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      shape=(5000,), dtype=object)

In [318]:
from sklearn.metrics.pairwise import cosine_similarity

In [319]:
similarity = cosine_similarity(vectors)

In [320]:
def recommend(movie):
    movie_index = df[df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:11]

    for i in movie_list:
        print(df.iloc[i[0]].title)

In [322]:
recommend("Eraserhead")

Hotel Transylvania 2
Coyote Ugly
Chuck & Buck
Ghost Rider: Spirit of Vengeance
The Hills Have Eyes 2
Circle
A Hard Day's Night
Disaster Movie
Teenage Mutant Ninja Turtles II: The Secret of the Ooze
Slither


In [323]:
import pickle

In [325]:
pickle.dump(df.to_dict(), open('movies.pkl', 'wb'))

In [326]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))