# Nearest Neighbor Recommender: People behind Movies

In [12]:
import os
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 600)
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer

In [13]:
data_path = os.path.join(os.getcwd(), 'data')
movies = pd.read_csv(os.path.join(data_path, 'movies_preprocessed.csv'))
movies = movies.fillna("")

### Remove unimportant features

In [14]:
features = ['id','title','star_1','star_2', 'star_3', 'writer', 'producer', 'director', 'Action', 'Adventure', 'Animation', 'Comedy', 'Crime','Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western','woman director', 'independent film', 'murder', 'based on novel', 'musical', 'sex', 'violence', 'nudity', 'biography', 'revenge', 'suspense', 'love', 'female nudity', 'sport', 'police', 'teenager', 'duringcreditsstinger', 'sequel', 'friendship', 'world war ii']
movies = movies[features]

### Remove unimportant actors

In [15]:
def remove_unimportant_items(movie_dataset, column, file, top, filter_=True):
    popular_items = pd.read_json(os.path.join(data_path, file))[:top]
        
    #set all non popular actors null
    for idx, movie in movie_dataset.iterrows():
        if(filter_ and (not any(popular_items['name'] == movie[column]))):
            movie_dataset.at[idx, column] = []
        else:
            movie_dataset.at[idx, column] = [movie_dataset.at[idx, column]]
            
    return movie_dataset

In [17]:
#transform actors
FILTER_DATA = False
movies = remove_unimportant_items(movies, 'star_1', 'popular_actors.json', top=500, filter_ = FILTER_DATA)
movies = remove_unimportant_items(movies, 'star_2', 'popular_actors.json', top=300, filter_ = FILTER_DATA)
movies = remove_unimportant_items(movies, 'star_3', 'popular_actors.json', top=100, filter_ = FILTER_DATA)

In [18]:
#transform writers
movies = remove_unimportant_items(movies, 'writer', 'top_writer.json', top=14)

In [19]:
#transform directors
movies = remove_unimportant_items(movies, 'director', 'top_directors.json', top=100)

In [20]:
#transform producers
movies = remove_unimportant_items(movies, 'producer', 'top_producers.json', top=12)

### Merge Actor columns

In [21]:
movies['actor'] = movies['star_1']+movies['star_2']+movies['star_3']

del movies['star_1']
del movies['star_2']
del movies['star_3']

### Merge Writers, Producers and Directors

In [22]:
movies['director'] = movies['writer'] + movies['director'] + movies['producer']

del movies['producer']
del movies['writer']

### Transforming features

In [25]:
def feature_one_hot(movie_dataset, column): 
    'Returns the given movie dataset with a transformed to one-hot encoded genre column.'

    # MultiLabelBinarizer is a very fast solution for one hot encoding on large dataframes
    mlb = MultiLabelBinarizer()
        
    one_hot_genre = pd.DataFrame(mlb.fit_transform(movie_dataset[column]),
                                 columns= [column + ' ' + str(class_) for class_ in mlb.classes_], 
                                 index=movie_dataset[column].index)

    movie_dataset = pd.concat([movie_dataset, one_hot_genre], sort=False, axis=1)
    movie_dataset.drop(column, axis=1)
    
    return movie_dataset

### A: Actors separated

In [26]:
#Actors
movies = feature_one_hot(movies, 'star_1')
movies = feature_one_hot(movies, 'star_2')
movies = feature_one_hot(movies, 'star_3')
del movies['star_1']
del movies['star_2']
del movies['star_3']

KeyError: 'star_1'

### B: Actors together

In [416]:
movies = feature_one_hot(movies, 'actor')
del movies['actor']

### Binarize all the other stuff

In [400]:
#Writers
movies = feature_one_hot(movies, 'writer')
del movies['writer']

In [420]:
#Directors
movies = feature_one_hot(movies, 'director')
del movies['director']

In [417]:
#Producers
movies = feature_one_hot(movies, 'producer')
del movies['producer']

KeyError: 'producer'

## NN_Recommender

In [421]:
class NNRecommender():
    
    def __init__(self, data, k=6, metric='minkowski', leaf_size=30):
        self.data = data.drop('title', axis=1)
        self.nn = NearestNeighbors(n_neighbors=k, metric=metric, leaf_size=leaf_size)
        self.nn.fit(self.data.drop('id', axis=1))
    
    def _indices_to_movie_id(self, indices):
        movie_ids = []
        for index in indices:
            movie_ids.append(self.data.iloc[index]['id'].values)
        return movie_ids[0][1:]
    
    def recommend(self, movie_id, return_distance=False):
        movie = self.data[self.data['id']==movie_id].drop('id', axis=1).values
        distances, indices = self.nn.kneighbors(movie)
        recommendations = self._indices_to_movie_id(indices)
        if return_distance:
            return recommendations, distances
        else:
            return recommendations

In [422]:
def translate_id_to_title(ids, data):
    titles = []
    for i in ids:
        titles.append(data[data['id'] == i]['title'].values[0])
    return titles

In [423]:
nnr = NNRecommender(movies,
                    k=6, 
                    metric='minkowski', 
                    leaf_size=30)

## Only Actors Top 100

In [887]:
test_movies = [13, 1726, 680, 665, 583, 103, 128]
for movie_id in test_movies:
    print('Movie: ', translate_id_to_title([movie_id], movies))
    recommendations = nnr.recommend(movie_id=movie_id)
    titles = translate_id_to_title(recommendations, movies)
    print('Recommendations:\n{}'.format(titles))
    print()

Movie:  ['Forrest Gump']
Recommendations:
["The 'Burbs", 'Forrest Gump', 'Toy Story', 'Apollo 13', 'Sleepless in Seattle']

Movie:  ['Iron Man']
Recommendations:
['The Singing Detective', 'Heart and Souls', 'Restoration', 'Two Girls and a Guy', '1969']

Movie:  ['Pulp Fiction']
Recommendations:
['Changing Lanes', 'Jackie Brown', 'Pulp Fiction', 'The Long Kiss Goodnight', 'Unbreakable']

Movie:  ['Ben-Hur']
Recommendations:
['A Cruel Romance', 'The Jungle Book', "Things to Do in Denver When You're Dead", 'Jury Duty', 'Train of Life']

Movie:  ['Life of Brian']
Recommendations:
['A Cruel Romance', 'The Jungle Book', "Things to Do in Denver When You're Dead", 'Jury Duty', 'Train of Life']

Movie:  ['Taxi Driver']
Recommendations:
['A Cruel Romance', 'The Jungle Book', "Things to Do in Denver When You're Dead", 'Jury Duty', 'Train of Life']

Movie:  ['Princess Mononoke']
Recommendations:
['A Cruel Romance', 'The Jungle Book', "Things to Do in Denver When You're Dead", 'Jury Duty', 'Train o

## Only Actors Top 200

In [859]:
test_movies = [13, 1726, 680, 665, 583, 103, 128]
for movie_id in test_movies:
    print('Movie: ', translate_id_to_title([movie_id], movies))
    recommendations = nnr.recommend(movie_id=movie_id)
    titles = translate_id_to_title(recommendations, movies)
    print('Recommendations:\n{}'.format(titles))
    print()

Movie:  ['Forrest Gump']
Recommendations:
["The 'Burbs", 'Forrest Gump', 'Toy Story', 'Apollo 13', 'Sleepless in Seattle']

Movie:  ['Iron Man']
Recommendations:
['Chaplin', 'Heart and Souls', 'Two Girls and a Guy', '1969', 'The Singing Detective']

Movie:  ['Pulp Fiction']
Recommendations:
['xXx: State of the Union', 'Jackie Brown', 'Pulp Fiction', 'The Long Kiss Goodnight', 'Twisted']

Movie:  ['Ben-Hur']
Recommendations:
['Testament of Orpheus', 'Dreaming of Joseph Lees', "Things to Do in Denver When You're Dead", 'Kids', 'Turbo: A Power Rangers Movie']

Movie:  ['Life of Brian']
Recommendations:
['Testament of Orpheus', 'Dreaming of Joseph Lees', "Things to Do in Denver When You're Dead", 'Kids', 'Turbo: A Power Rangers Movie']

Movie:  ['Taxi Driver']
Recommendations:
['Testament of Orpheus', 'Dreaming of Joseph Lees', "Things to Do in Denver When You're Dead", 'Kids', 'Turbo: A Power Rangers Movie']

Movie:  ['Princess Mononoke']
Recommendations:
['Testament of Orpheus', 'Dreamin

## Top 100 Actors and Top 14 Writer

In [964]:
test_movies = [13, 1726, 680, 665, 583, 103, 128]
for movie_id in test_movies:
    print('Movie: ', translate_id_to_title([movie_id], movies))
    recommendations = nnr.recommend(movie_id=movie_id)
    titles = translate_id_to_title(recommendations, movies)
    print('Recommendations:\n{}'.format(titles))
    print()

Movie:  ['Forrest Gump']
Recommendations:
["The 'Burbs", 'Forrest Gump', 'Toy Story', 'Apollo 13', 'Philadelphia']

Movie:  ['Iron Man']
Recommendations:
['The Singing Detective', 'Heart and Souls', 'Restoration', 'Two Girls and a Guy', '1969']

Movie:  ['Pulp Fiction']
Recommendations:
['Changing Lanes', 'Jackie Brown', 'Pulp Fiction', 'The Long Kiss Goodnight', 'Unbreakable']

Movie:  ['Ben-Hur']
Recommendations:
['True Confessions', 'I Like It Like That', "Things to Do in Denver When You're Dead", 'Jury Duty', 'The Guardian']

Movie:  ['Life of Brian']
Recommendations:
['True Confessions', 'I Like It Like That', "Things to Do in Denver When You're Dead", 'Jury Duty', 'The Guardian']

Movie:  ['Taxi Driver']
Recommendations:
['True Confessions', 'I Like It Like That', "Things to Do in Denver When You're Dead", 'Jury Duty', 'The Guardian']

Movie:  ['Princess Mononoke']
Recommendations:
['True Confessions', 'I Like It Like That', "Things to Do in Denver When You're Dead", 'Jury Duty',

## Top 500, 300, 100 Actors, Top 14 Writer

In [981]:
test_movies = [13, 1726, 680, 665, 583, 103, 128]
for movie_id in test_movies:
    print('Movie: ', translate_id_to_title([movie_id], movies))
    recommendations = nnr.recommend(movie_id=movie_id)
    titles = translate_id_to_title(recommendations, movies)
    print('Recommendations:\n{}'.format(titles))
    print()

Movie:  ['Forrest Gump']
Recommendations:
["The 'Burbs", 'Forrest Gump', 'Toy Story', 'Apollo 13', 'Sleepless in Seattle']

Movie:  ['Iron Man']
Recommendations:
['The Singing Detective', 'Black and White', 'Two Girls and a Guy', 'Heart and Souls', '1969']

Movie:  ['Pulp Fiction']
Recommendations:
['Zambezia', "National Lampoon's Loaded Weapon 1", 'Pulp Fiction', 'The Long Kiss Goodnight', '1408']

Movie:  ['Ben-Hur']
Recommendations:
['Earthquake', 'Soylent Green', 'Ben-Hur', 'Touch of Evil', 'Planet of the Apes']

Movie:  ['Life of Brian']
Recommendations:
['Personal Velocity', 'Rawhead Rex', "Things to Do in Denver When You're Dead", 'Kids', 'Selena']

Movie:  ['Taxi Driver']
Recommendations:
['Once Upon a Time in America', 'A Bronx Tale', 'Taxi Driver', "Mary Shelley's Frankenstein", 'GoodFellas']

Movie:  ['Princess Mononoke']
Recommendations:
['Personal Velocity', 'Rawhead Rex', "Things to Do in Denver When You're Dead", 'Kids', 'Selena']



## Top 500, 300, 100 Actors, Top 14 Writer, Top 20 Directors

In [1067]:
test_movies = [13, 1726, 680, 665, 583, 103, 128]
for movie_id in test_movies:
    print('Movie: ', translate_id_to_title([movie_id], movies))
    recommendations = nnr.recommend(movie_id=movie_id)
    titles = translate_id_to_title(recommendations, movies)
    print('Recommendations:\n{}'.format(titles))
    print()

Movie:  ['Forrest Gump']
Recommendations:
["The 'Burbs", 'Forrest Gump', 'Toy Story', 'Apollo 13', 'Sleepless in Seattle']

Movie:  ['Iron Man']
Recommendations:
['The Singing Detective', 'Black and White', 'Two Girls and a Guy', 'Heart and Souls', '1969']

Movie:  ['Pulp Fiction']
Recommendations:
['Death Proof', 'Jackie Brown', "National Lampoon's Loaded Weapon 1", 'The Long Kiss Goodnight', 'Twisted']

Movie:  ['Ben-Hur']
Recommendations:
['Earthquake', 'Soylent Green', 'Ben-Hur', 'Touch of Evil', 'Planet of the Apes']

Movie:  ['Life of Brian']
Recommendations:
['Personal Velocity', 'Rawhead Rex', "Things to Do in Denver When You're Dead", 'Kids', 'Selena']

Movie:  ['Taxi Driver']
Recommendations:
['The King of Comedy', 'Taxi Driver', 'Raging Bull', 'A Bronx Tale', 'The Age of Innocence']

Movie:  ['Princess Mononoke']
Recommendations:
['Personal Velocity', 'Rawhead Rex', "Things to Do in Denver When You're Dead", 'Kids', 'Selena']



## Top 500, 300, 100 Actors, Top 14 Writer, Top 20 Directors, 12 Producers

In [1179]:
test_movies = [13, 1726, 680, 665, 583, 103, 128, 15121, 1984, 562, 176]
for movie_id in test_movies:
    print('Movie: ', translate_id_to_title([movie_id], movies))
    recommendations = nnr.recommend(movie_id=movie_id)
    titles = translate_id_to_title(recommendations, movies)
    print('Recommendations:\n{}'.format(titles))
    print()

Movie:  ['Forrest Gump']
Recommendations:
["The 'Burbs", 'Forrest Gump', 'Toy Story', 'Apollo 13', 'Sleepless in Seattle']

Movie:  ['Iron Man']
Recommendations:
['The Singing Detective', 'Black and White', 'Two Girls and a Guy', 'Heart and Souls', '1969']

Movie:  ['Pulp Fiction']
Recommendations:
['Death Proof', 'Jackie Brown', "National Lampoon's Loaded Weapon 1", 'The Long Kiss Goodnight', 'Twisted']

Movie:  ['Ben-Hur']
Recommendations:
['Earthquake', 'Soylent Green', 'Ben-Hur', 'Touch of Evil', 'Planet of the Apes']

Movie:  ['Life of Brian']
Recommendations:
['Personal Velocity', 'Rawhead Rex', "Things to Do in Denver When You're Dead", 'Kids', 'Selena']

Movie:  ['Taxi Driver']
Recommendations:
['The King of Comedy', 'Taxi Driver', 'Raging Bull', 'A Bronx Tale', 'The Age of Innocence']

Movie:  ['Princess Mononoke']
Recommendations:
['Personal Velocity', 'Rawhead Rex', "Things to Do in Denver When You're Dead", 'Kids', 'Selena']

Movie:  ['The Sound of Music']
Recommendations:


## All Actors together

In [174]:
test_movies = [13, 1726, 680, 665, 583, 103, 128, 15121, 1984, 562, 176]
for movie_id in test_movies:
    print('Movie: ', translate_id_to_title([movie_id], movies))
    recommendations = nnr.recommend(movie_id=movie_id)
    titles = translate_id_to_title(recommendations, movies)
    print('Recommendations:\n{}'.format(titles))
    print()

Movie:  ['Forrest Gump']
Recommendations:
["The 'Burbs", 'Philadelphia', 'Apollo 13', 'Forrest Gump', 'That Thing You Do!']

Movie:  ['Iron Man']
Recommendations:
['Heart and Souls', 'Two Girls and a Guy', 'Home for the Holidays', 'Only You', 'The Gingerbread Man']

Movie:  ['Pulp Fiction']
Recommendations:
['Amos & Andrew', 'Fresh', 'The Great White Hype', 'Fluke', 'The Long Kiss Goodnight']

Movie:  ['Ben-Hur']
Recommendations:
['Earthquake', 'Soylent Green', 'Touch of Evil', 'Ben-Hur', 'Planet of the Apes']

Movie:  ['Life of Brian']
Recommendations:
['The Face Behind the Mask', 'Smokers Only', "Antonia's Line", 'Living in Oblivion', 'Princess Mononoke']

Movie:  ['Taxi Driver']
Recommendations:
['The King of Comedy', 'Taxi Driver', 'Raging Bull', 'A Bronx Tale', 'The Age of Innocence']

Movie:  ['Princess Mononoke']
Recommendations:
['The Face Behind the Mask', 'Smokers Only', "Antonia's Line", 'Living in Oblivion', 'Princess Mononoke']

Movie:  ['The Sound of Music']
Recommendatio

## With every actor kept in one list

In [314]:
test_movies = [13, 1726, 680, 665, 583, 103, 128, 15121, 1984, 562, 176, 11]
for movie_id in test_movies:
    print('Movie: ', translate_id_to_title([movie_id], movies))
    recommendations = nnr.recommend(movie_id=movie_id)
    titles = translate_id_to_title(recommendations, movies)
    print('Recommendations:\n{}'.format(titles))
    print()

Movie:  ['Forrest Gump']
Recommendations:
['That Thing You Do!', 'Forrest Gump', 'Toy Story', 'Apollo 13', 'Sleepless in Seattle']

Movie:  ['Iron Man']
Recommendations:
['Heart and Souls', 'Two Girls and a Guy', 'Home for the Holidays', 'Only You', 'The Gingerbread Man']

Movie:  ['Pulp Fiction']
Recommendations:
['Amos & Andrew', 'Fresh', 'The Great White Hype', 'Fluke', 'The Long Kiss Goodnight']

Movie:  ['Ben-Hur']
Recommendations:
['Earthquake', 'Soylent Green', 'Touch of Evil', 'Ben-Hur', 'Planet of the Apes']

Movie:  ['Life of Brian']
Recommendations:
['The Face Behind the Mask', 'Smokers Only', "Antonia's Line", 'Living in Oblivion', 'Princess Mononoke']

Movie:  ['Taxi Driver']
Recommendations:
['The King of Comedy', 'Taxi Driver', 'Raging Bull', 'A Bronx Tale', 'The Age of Innocence']

Movie:  ['Princess Mononoke']
Recommendations:
['The Face Behind the Mask', 'Smokers Only', "Antonia's Line", 'Living in Oblivion', 'Princess Mononoke']

Movie:  ['The Sound of Music']
Recomm

### People plus Genre

In [356]:
test_movies = [13, 1726, 680, 665, 583, 103, 128, 15121, 1984, 562, 176, 11]
for movie_id in test_movies:
    print('Movie: ', translate_id_to_title([movie_id], movies))
    recommendations = nnr.recommend(movie_id=movie_id)
    titles = translate_id_to_title(recommendations, movies)
    print('Recommendations:\n{}'.format(titles))
    print()

Movie:  ['Forrest Gump']
Recommendations:
['Sleepless in Seattle', 'Nothing in Common', 'Together', 'Sidewalks of New York', 'That Thing You Do!']

Movie:  ['Iron Man']
Recommendations:
['Iron Man 2', 'Iron Man 3', 'America 3000', '20,000 Leagues Under the Sea', 'The Time Machine']

Movie:  ['Pulp Fiction']
Recommendations:
['Physical Evidence', 'The Hitch-Hiker', 'Snatch', 'Incognito', 'Fresh']

Movie:  ['Ben-Hur']
Recommendations:
['Scott of the Antarctic', 'The Lost City of Z', 'The Message', 'The Guillotines', 'Memories of the Sword']

Movie:  ['Life of Brian']
Recommendations:
['Haiku Tunnel', 'Splendor', 'So Fine', 'Jay and Silent Bob Strike Back', 'Abbott and Costello Meet the Mummy']

Movie:  ['Taxi Driver']
Recommendations:
['Taxi Driver', 'Raging Bull', 'True Confessions', 'Casino', 'A Bronx Tale']

Movie:  ['Princess Mononoke']
Recommendations:
['Pom Poko', 'Princess Mononoke', 'Yu☆Gi☆Oh!', 'Valhalla', 'Pokémon Origins']

Movie:  ['The Sound of Music']
Recommendations:
['Rol

### Actors merged Writer, Producer, Director merged plus Genre plus Keywords 

In [424]:
test_movies = [13, 1726, 680, 665, 583, 103, 128, 15121, 1984, 562, 176, 11]
for movie_id in test_movies:
    print('Movie: ', translate_id_to_title([movie_id], movies))
    recommendations = nnr.recommend(movie_id=movie_id)
    titles = translate_id_to_title(recommendations, movies)
    print('Recommendations:\n{}'.format(titles))
    print()

Movie:  ['Forrest Gump']
Recommendations:
['Looking: The Movie', 'Jules and Jim', 'Three Steps Above Heaven', 'The Low Down', 'Hands Across the Table']

Movie:  ['Iron Man']
Recommendations:
['Iron Man 3', 'America 3000', 'The Time Machine', 'Power Rangers', 'Mr. India']

Movie:  ['Pulp Fiction']
Recommendations:
['Heaven', 'T-Men', 'Reservoir Dogs', 'Knockaround Guys', 'The Formula']

Movie:  ['Ben-Hur']
Recommendations:
['The Last of the Mohicans', 'Memories of the Sword', 'Scott of the Antarctic', "Arn: The Kingdom at Road's End", 'Mountains of the Moon']

Movie:  ['Life of Brian']
Recommendations:
['Denise Calls Up', 'Splendor', 'So Fine', 'Jay and Silent Bob Strike Back', 'Haiku Tunnel']

Movie:  ['Taxi Driver']
Recommendations:
['True Confessions', 'A Bronx Tale', 'Casino', 'Bang the Drum Slowly', 'Detour']

Movie:  ['Princess Mononoke']
Recommendations:
['Mei and the Kittenbus', "Howl's Moving Castle", 'Yu☆Gi☆Oh!', "Kiki's Delivery Service", 'Valhalla']

Movie:  ['The Sound of M

In [431]:
#Print all column names with values != 0
id_ = 680
print(movies[movies['id'] == id_]['title'])
for mov in movies[movies['id']== id_]:
    if(movies[movies['id']== id_][mov].item() != 0):
        print(mov)


291    Pulp Fiction
Name: title, dtype: object
id
title
Crime
Thriller
actor Samuel L. Jackson
director Quentin Tarantino


In [433]:
#Print all column names with values != 0
movie_name = "The Prince"
for mov in movies[movies['title']== movie_name]:
    if(movies[movies['title']==movie_name][mov].item() != 0):
        print(mov)


id
title
Action
Thriller
actor Bruce Willis


In [24]:
movies[movies['title']=='Forrest Gump']

Unnamed: 0,id,title,director,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,woman director,independent film,murder,based on novel,musical,sex,violence,nudity,biography,revenge,suspense,love,female nudity,sport,police,teenager,duringcreditsstinger,sequel,friendship,world war ii,actor
350,13,Forrest Gump,[],0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,"[Tom Hanks, Robin Wright, Gary Sinise]"
