# K-nearest neighbors: Movie recommendation system

## Notebook set-up

In [None]:
# Standard library imports
import json
import pickle

# Third party imports
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

## 1. Data loading
### 1.1. Load

In [None]:
# Relative paths to the two raw TMDB CSV files
movies_csv = '../data/raw/tmdb_5000_movies.csv'
credits_csv = '../data/raw/tmdb_5000_credits.csv'

# Read each file into its own dataframe
movies = pd.read_csv(movies_csv)
credits = pd.read_csv(credits_csv)

### 1.2. Inspect

In [None]:
# Your code here...

### 1.3. Join

In [1]:
# Combine the datasets (hint: you don't need SQL here - Pandas can do SQL-like joins directly). See documentation
# for Pandas pd.merge(). Another hint: the 'movie_id' in the credits data and the 'id' in the movies data are the same

## 2. EDA

### 2.1. Feature encoding

In [None]:
# Encode on an independent copy so the original data_df stays available as a fallback
encoded_data_df = data_df.copy(deep=True)

Some of the features contain per-cell JSON formatted data. We can use our Python/Pandas chops to extract and parse any data we want into a useful format. This requires some item-by-item processing and is necessarily messy.

In the two cells below, I extract the cast names two different ways - one with a loop and one using .apply(). The apply version is better, but harder to read. I included both to help you understand what the .apply() method is doing. Take a look at both and try to write two additional .apply() lambda functions that extract the keywords and genres.


#### 2.1.1. Extract cast names: loop

In [None]:
# Empty list to hold the extracted cast names, one entry per movie
extracted_values = []

# Loop on the elements of the cast column
for json_string in data_df['cast']:

    # A missing cell cannot be parsed as JSON; record None for it, which is
    # what the .apply() version of this extraction produces for missing cells
    if pd.isna(json_string):
        extracted_values.append(None)
        continue

    # Parse the JSON string into a Python list of dictionaries
    json_list = json.loads(json_string)

    # Empty list to hold values from this element
    values = []

    # Loop on the first three elements of the json list
    for item in json_list[:3]:

        # Extract the value for the name key and add it to the list
        values.append(item['name'])

    # Store this movie's names; entries stay in the same order as the rows
    extracted_values.append(values)

#### 2.1.2. Extract cast names: lambda apply()

In [None]:
# Parse each cell's JSON and keep the names from its first three entries;
# missing cells become None
encoded_data_df['cast'] = data_df['cast'].apply(
    lambda cell: [member['name'] for member in json.loads(cell)][:3] if pd.notna(cell) else None
)

#### 2.1.3. Extract other features

In [None]:
# Same for the 'keywords' column


In [None]:
# And the 'genres' column


In [None]:
# Preview the first three rows to check the encoded columns
encoded_data_df.head(n=3)

### 2.2. Missing and/or extreme values

In [None]:
# If you followed the example .apply() method to extract cast and genres, missing values should already
# be handled, but it's not a bad idea to double check

### 2.3. Combine features

In [None]:
# Combine the cast, keywords and genres features into one single string feature called 'tags'. This way, we
# have one string feature that contains a bunch of relevant information about the movie.

## 3. Model training

In [None]:
# Use TfidfVectorizer() from Scikit-learn to encode the tags feature, then use the result to train
# a Scikit-learn NearestNeighbors() model.

## 4. Recommender

In [None]:
# Recommender function

def get_movie_recommendations(movie_title: str):
    '''Takes a movie title string, looks up TFIDF feature vector for that movie
    and returns title of top 5 most similar movies.

    Not implemented yet: the body below is only a placeholder, so calling
    this function currently returns None.

    NOTE(review): the cell that calls this function iterates over the result
    as `movie, distance` pairs, so the return value presumably needs to be an
    iterable of (title, distance) 2-tuples rather than bare titles - confirm
    when implementing.'''

    # Your code here...

In [None]:
# Title of the film we want recommendations for
input_movie = "How to Train Your Dragon"

# Ask the recommender for similar films
recommendations = get_movie_recommendations(input_movie)

# Report the query title, then one line per recommended film
header = "Film recommendations '{}'".format(input_movie)
print(header)

# Each recommendation is unpacked as a (movie, distance) pair; only the title is printed
for movie, distance in recommendations:
    line = "- Film: {}".format(movie)
    print(line)

## 5. Save the assets

Next week, we will be deploying this model as a web app, so save the assets needed for the model to work.

In [None]:
# Save the assets

# Processed movie features as a parquet file
encoded_data_df.to_parquet('../data/processed/movies.parquet')

# Objects to pickle, each paired with its destination path
pickle_targets = [
    (model, '../models/model.pkl'),
    (tfidf_matrix, '../data/processed/tfidf_matrix.pkl'),
    (encoded_data_df, '../data/processed/encoded_features_df.pkl'),
]

# Open each file in a with-block so it is flushed and closed as soon as the
# dump finishes, instead of leaving the handle open
for asset, output_path in pickle_targets:
    with open(output_path, 'wb') as output_file:
        pickle.dump(asset, output_file)