# K-nearest neighbors: Movie recommendation system

## Notebooks set-up

In [1]:
# Standard library imports
import json

# Third party imports
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

## 1. Data loading
### 1.1. Load

In [2]:
movies = pd.read_csv('../data/raw/tmdb_5000_movies.csv')
credits = pd.read_csv('../data/raw/tmdb_5000_credits.csv')

### 1.2. Inspect

In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [5]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


### 1.3. Join

In [7]:
# Combine the datasets (hint: you don't need SQL here - Pandas can do SQL-like joins directly). See documentation
# for Pandas pd.merge(). Another hint: the 'movie_id' in the credits data and the 'id' in the movies data are the same

credits.rename({'movie_id': 'id'}, axis=1, inplace=True)

data_df = pd.merge(movies, credits, on='id', how='outer')
data_df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,4000000,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 35, ""name...",,5,"[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na...",en,Four Rooms,It's Ted the Bellhop's first night on the job....,22.87623,"[{""name"": ""Miramax Films"", ""id"": 14}, {""name"":...",...,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,6.5,530,Four Rooms,"[{""cast_id"": 42, ""character"": ""Ted the Bellhop...","[{""credit_id"": ""52fe420dc3a36847f800012d"", ""de..."
1,11000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 28, ""...",http://www.starwars.com/films/star-wars-episod...,11,"[{""id"": 803, ""name"": ""android""}, {""id"": 4270, ...",en,Star Wars,Princess Leia is captured and held hostage by ...,126.393695,"[{""name"": ""Lucasfilm"", ""id"": 1}, {""name"": ""Twe...",...,121.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"A long time ago in a galaxy far, far away...",Star Wars,8.1,6624,Star Wars,"[{""cast_id"": 3, ""character"": ""Luke Skywalker"",...","[{""credit_id"": ""52fe420dc3a36847f8000437"", ""de..."
2,94000000,"[{""id"": 16, ""name"": ""Animation""}, {""id"": 10751...",http://movies.disney.com/finding-nemo,12,"[{""id"": 494, ""name"": ""father son relationship""...",en,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",85.688789,"[{""name"": ""Pixar Animation Studios"", ""id"": 3}]",...,100.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"There are 3.7 trillion fish in the ocean, they...",Finding Nemo,7.6,6122,Finding Nemo,"[{""cast_id"": 8, ""character"": ""Marlin (voice)"",...","[{""credit_id"": ""52fe420ec3a36847f80006b1"", ""de..."
3,55000000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",,13,"[{""id"": 422, ""name"": ""vietnam veteran""}, {""id""...",en,Forrest Gump,A man with a low IQ has accomplished great thi...,138.133331,"[{""name"": ""Paramount Pictures"", ""id"": 4}]",...,142.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"The world will never be the same, once you've ...",Forrest Gump,8.2,7927,Forrest Gump,"[{""cast_id"": 7, ""character"": ""Forrest Gump"", ""...","[{""credit_id"": ""52fe420ec3a36847f800076b"", ""de..."
4,15000000,"[{""id"": 18, ""name"": ""Drama""}]",http://www.dreamworks.com/ab/,14,"[{""id"": 255, ""name"": ""male nudity""}, {""id"": 29...",en,American Beauty,"Lester Burnham, a depressed suburban father in...",80.878605,"[{""name"": ""DreamWorks SKG"", ""id"": 27}, {""name""...",...,122.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Look closer.,American Beauty,7.9,3313,American Beauty,"[{""cast_id"": 6, ""character"": ""Lester Burnham"",...","[{""credit_id"": ""52fe420ec3a36847f8000809"", ""de..."


## 2. EDA

### 2.1. Feature encoding

In [8]:
# Make a copy to work with while encoding so that we have the original to go back to if needed
encoded_data_df = data_df.copy()

Some of the features contain per-cell JSON formatted data. We can use our Python/Pandas chops to extract and parse any data we want into a useful format. This requires some item-by-item processing and is necessarily messy.

In the two cells below, I wrote a function to extract the cast names two different ways - one with a loop and one using .apply(). The apply version is better, but harder to read. I included both to help you understand what the a.apply() method is doing. Take a look and both and try to write two additional .apply() lambda functions that extract the keywords and genres.


#### 2.1.1. Extract cast names: loop

In [9]:
# Empty list to hold extracted values
extracted_values = []

# Loop on the elements of the cast column
for json_string in data_df['cast']:

    # Load the json string into a python dictionary
    json_list = json.loads(json_string)

    # Empty list to hold values from this element
    values = []

    # Loop on the first three elements of the json list
    for item in json_list[:3]:

        # Extract the value for the name key
        value = item['name']

        # Add it to the list
        values.append(value)

    extracted_values.append(values)

#### 2.1.2. Extract cast names: lambda apply()

In [10]:
encoded_data_df['cast'] = data_df['cast'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else None)

#### 2.1.3. Extract other features

In [11]:
# Same for the 'keywords' column

encoded_data_df['keyword'] = data_df['keywords'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else None)


In [14]:
# And the 'genres' column
encoded_data_df['genres'] = data_df['genres'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else None)

In [16]:
# Check the result
encoded_data_df.info(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

### 2.2. Missing and/or extreme values

In [17]:
# If you followed the example .apply() method to extract cast and genera, missing values should already
# be handled, but not a bad idea to double check

# See how many missing values are in each column
encoded_data_df.isnull().sum().sort_values(ascending=False)


homepage                3091
tagline                  844
overview                   3
runtime                    2
release_date               1
id                         0
genres                     0
budget                     0
original_title             0
original_language          0
keywords                   0
production_companies       0
production_countries       0
revenue                    0
spoken_languages           0
popularity                 0
status                     0
title_x                    0
vote_average               0
vote_count                 0
title_y                    0
cast                       0
crew                       0
keyword                    0
dtype: int64

### 2.3. Combine features

In [24]:
# Combine the cats, keywords and genres features to one single string feature called 'tags'. This way, we
# Have one string feature that contains a bunch of relevant information about the movie.

# Safe concatenation of features into 'tags'
def make_tags(row):
    overview = row['overview'] if isinstance(row['overview'], str) else ''
    genres = row['genres'] if isinstance(row['genres'], list) else []
    keywords = row['keywords'] if isinstance(row['keywords'], list) else []
    cast = row['cast'] if isinstance(row['cast'], list) else []

    return [overview] + genres + keywords + cast

# Apply safely
encoded_data_df['tags'] = encoded_data_df.apply(make_tags, axis=1)

# Join into a string (make sure all items are str)
encoded_data_df['tags'] = encoded_data_df['tags'].apply(
    lambda x: ', '.join([str(i) for i in x])
)

# Example row
print(encoded_data_df.iloc[0].tags)





It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another.


## 3. Model training

In [None]:
# Use TFIDFVectorizer() from Scikit-learn to encode the tags feature, use the result to train
# a Scikit-learn NearestNeighbors() model.

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'tags' feature
tfidf_matrix = tfidf.fit_transform(encoded_data_df['tags'])

# Initialize Nearest Neighbors
model = NearestNeighbors(metric='cosine', algorithm='brute')

# Fit the model on the TF-IDF matrix
model.fit(tfidf_matrix)



0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


## 4. Recommender

In [26]:
# Recommender function

def get_movie_recommendations(movie_title):
    '''Takes a movie title string, looks up TFIDF feature vector for that movie
    and returns title of top 5 most similar movies.'''

    # Find the query movie in the encoded data, get the index
    movie_index = encoded_data_df[encoded_data_df["title"] == movie_title].index[0]

    # Get the distances and indexes of similar movies
    distances, indices = model.kneighbors(tfidf_matrix[movie_index])

    # Extract the titles of the similar movie
    similar_movies = [(encoded_data_df["title"][i], distances[0][j]) for j, i in enumerate(indices[0])]
    
    return similar_movies[1:]

In [35]:
# Fix column name from title_x to title
if 'title_x' in encoded_data_df.columns and 'title' not in encoded_data_df.columns:
    encoded_data_df.rename(columns={'title_x': 'title'}, inplace=True)

# 'Target' movie
input_movie = "Mission: Impossible"

# Call the recommendation function
recommendations = get_movie_recommendations(input_movie)

# Print the results
print("Film recommendations '{}'".format(input_movie))
for movie, distance in recommendations:
    print("- Film: {}".format(movie))

Film recommendations 'Mission: Impossible'
- Film: Mission: Impossible III
- Film: Mission: Impossible II
- Film: The Sentinel
- Film: Compadres


## 5. Save the assets

Next week, we will be deploying this model as a web app, so save the assets needed for the model to work.

In [36]:
# Save the assets
import pickle

encoded_data_df.to_parquet('../data/processed/movies.parquet')
pickle.dump(model, open('../models/model.pkl', 'wb'))
pickle.dump(tfidf_matrix, open('../data/processed/tfidf_matrix.pkl', 'wb'))
pickle.dump(encoded_data_df, open('../data/processed/encoded_features_df.pkl', 'wb'))