In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
pd.options.mode.chained_assignment = None
from sklearn.neighbors import NearestNeighbors

In [2]:
# The pre-processed movie table is expected under ./data/the-movies-dataset,
# resolved against the current working directory.
data_path = os.path.join(os.getcwd(), 'data', 'the-movies-dataset')
movies_csv = os.path.join(data_path, 'movies_preprocessed.csv')
# Keep only the first row seen for each movie id.
movies = pd.read_csv(movies_csv).drop_duplicates(subset='id')

## NN_Recommender

In [3]:
class NNRecommender():
    """Nearest-neighbour movie recommender built on scikit-learn's ``NearestNeighbors``.

    The neighbour index is fitted on every column listed in ``features``
    except ``'id'``; ``recommend`` then returns the ids of the movies whose
    feature rows are closest to the row of a given movie id.
    """

    # Columns taken from the input frame. 'id' identifies the movie; all the
    # others form the feature space (presumably one-hot genre / language /
    # keyword flags next to four numeric columns -- verify against the
    # preprocessing step that produced the CSV).
    features = ['id', 'popularity', 'runtime', 'vote_average', 'vote_count', 'Action',
       'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Family', 'Fantasy', 'Foreign', 'History', 'Horror', 'Music', 'Mystery',
       'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western',
       'en', 'fr', 'it', 'ja', 'de', 'es', 'ru', 'woman director',
       'independent film', 'murder', 'based on novel', 'musical', 'sex',
       'violence', 'nudity', 'biography', 'revenge', 'suspense', 'love',
       'female nudity', 'sport', 'police', 'teenager', 'duringcreditsstinger',
       'sequel', 'friendship', 'world war ii']

    # Numeric columns that are divided by their column maximum before fitting.
    _scaled_columns = ('popularity', 'runtime', 'vote_average', 'vote_count')

    def __init__(self, data, k=6, metric='minkowski', leaf_size=30):
        """Prepare the feature matrix and fit the neighbour index.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain every column listed in ``features``. The frame is
            NOT modified: scaling and imputation happen on an internal copy.
        k : int
            Passed to ``NearestNeighbors`` as ``n_neighbors``. ``recommend``
            removes one neighbour (the query movie), so it yields at most
            ``k - 1`` ids.
        metric : str
            Passed to ``NearestNeighbors`` as ``metric``.
        leaf_size : int
            Passed to ``NearestNeighbors`` as ``leaf_size``.

        Raises
        ------
        KeyError
            If ``data`` lacks one of the columns in ``features``.
        """
        # Copy first so that rescaling never leaks into the caller's frame.
        prepared = data[self.features].copy()
        for column in self._scaled_columns:
            prepared[column] = prepared[column] / prepared[column].max()
        # Only 'runtime' is imputed here: gaps are filled with the column mean.
        prepared['runtime'] = prepared['runtime'].fillna(prepared['runtime'].mean())
        self.data = prepared
        self.nn = NearestNeighbors(n_neighbors=k, metric=metric, leaf_size=leaf_size)
        self.nn.fit(self.data.drop('id', axis=1))

    def _indices_to_movie_id(self, indices, exclude_id=None):
        """Translate neighbour row positions into movie ids, minus the query.

        Parameters
        ----------
        indices : 2-D array-like of int
            Row positions into ``self.data`` as returned by ``kneighbors``.
            Only the first row (first query point) is translated.
        exclude_id : optional
            Id of the query movie. When given, that id is removed wherever it
            ranks among the neighbours; if it is not among them at all (more
            exact ties than neighbours requested) the farthest neighbour is
            dropped instead so the result length stays the same. When omitted,
            the first neighbour is dropped on the assumption that it is the
            query itself.

        Returns
        -------
        numpy.ndarray
            Movie ids ordered as the neighbours were ordered.
        """
        # Positional lookup: kneighbors reports row positions, not index labels.
        neighbours = self.data['id'].iloc[indices[0]]
        neighbour_ids = neighbours.values
        if exclude_id is None:
            return neighbour_ids[1:]
        keep = (neighbours != exclude_id).values
        if keep.all():
            return neighbour_ids[:-1]
        return neighbour_ids[keep]

    def recommend(self, movie_id, return_distance=False):
        """Return the ids of the movies most similar to ``movie_id``.

        Parameters
        ----------
        movie_id
            Value to look up in the ``'id'`` column of the fitted data.
        return_distance : bool
            When true, also return the distance array produced by
            ``kneighbors``. That array is returned unmodified, so it still
            contains one entry per requested neighbour (query included).

        Returns
        -------
        numpy.ndarray or (numpy.ndarray, numpy.ndarray)
            Recommended movie ids, optionally followed by the distances.

        Raises
        ------
        ValueError
            If ``movie_id`` does not occur in the fitted data.
        """
        matches = self.data[self.data['id'] == movie_id]
        if matches.empty:
            raise ValueError(
                'movie_id {!r} is not present in the fitted data'.format(movie_id))
        movie = matches.drop('id', axis=1).values
        distances, indices = self.nn.kneighbors(movie)
        # Exclude the query by id rather than by rank: with tied distances the
        # query is not guaranteed to be the first neighbour.
        recommendations = self._indices_to_movie_id(indices, exclude_id=movie_id)
        if return_distance:
            return recommendations, distances
        return recommendations

In [4]:
def translate_id_to_title(ids, data):
    """Map each movie id in ``ids`` to a title taken from ``data``.

    ``data`` needs an ``'id'`` and a ``'title'`` column. For every id the
    title of the first matching row is used; an id with no matching row
    raises ``IndexError``. The result is a list in the same order as ``ids``.
    """
    return [
        data.loc[data['id'] == movie_id, 'title'].values[0]
        for movie_id in ids
    ]

In [5]:
# Fit the recommender on the loaded movies: cosine metric, 10 neighbours
# requested per query.
nnr = NNRecommender(
    movies,
    k=10,
    metric='cosine',
)

## Sample Tests

In [6]:
# Spot-check the recommender on a fixed list of movie ids, printing the query
# title followed by the titles of its recommendations.
test_movies = [603, 13, 1726, 680, 665, 583, 103, 128]
for movie_id in test_movies:
    print('Movie: ', translate_id_to_title([movie_id], movies))
    recommendations = nnr.recommend(movie_id=movie_id)
    titles = translate_id_to_title(recommendations, movies)
    # end='\n\n' leaves one blank line between consecutive movies.
    print('Recommendations:\n{}'.format(titles), end='\n\n')

Movie:  ['The Matrix']
Recommendations:
['Æon Flux', "Independents' Day", 'Lucy', 'Edge of Tomorrow', 'Superman: Doomsday', 'Green Lantern: First Flight', 'Testament', 'The Chronicles of Riddick', 'Battle: Los Angeles']

Movie:  ['Forrest Gump']
Recommendations:
['Silver Linings Playbook', 'Three Steps Above Heaven', 'Looking: The Movie', 'Spring', 'Hands Across the Table', 'The Low Down', 'Feast of Love', 'Manhattan', 'Miss You Already']

Movie:  ['Iron Man']
Recommendations:
['Iron Man 3', 'Star Wars', 'The Empire Strikes Back', 'X-Men: First Class', 'Rogue One: A Star Wars Story', 'Return of the Jedi', 'Star Trek', 'Star Wars: Episode I - The Phantom Menace', '2012']

Movie:  ['Pulp Fiction']
Recommendations:
['Now You See Me', 'Reservoir Dogs', "Ocean's Eleven", 'Headhunters', 'The Frozen Ground', 'Get Carter', 'Kalifornia', 'Cohen and Tate', 'If I Die Before I Wake']

Movie:  ['Ben-Hur']
Recommendations:
['The Count of Monte-Cristo', 'The Message', 'The Mission', 'Scott of the Ant