## Neighboors method

In [1]:
# data processing libraries
import numpy as np
import pandas as pd

# plotting libraries
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

# machine-learning libraries
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
#from scipy.sparse import csr_matrix

# miscellaneous
import pickle
from sklearn.metrics.pairwise import cosine_similarity

Read movie ratings in

In [31]:
df_R = pd.read_csv('user_rating.csv', index_col=0)
titles = pd.read_csv('processed_titles.csv', index_col=0)

In [3]:
# initialize the unsupervised model NearestNeighbors
cosin_model = NearestNeighbors(metric='cosine')

In [4]:
# fit it to the Ratings matrix
cosin_model.fit(df_R)

NearestNeighbors(metric='cosine')

### Save and load the model

In [5]:
with open('cosin_recommender.pkl', 'wb') as file:
    pickle.dump(cosin_model, file)

In [6]:
with open('cosin_recommender.pkl', 'rb') as file:
    cosin_model = pickle.load(file)

### Add user info

In [7]:
#user info
new_user_query = {
    10: 4,  # Billy Madison (1995)
    100: 3, # Bambi (1942)
    555: 3.5,  # Mortal Kombat (1995)
    756: 2,  # Inside Man (2006)
    1225: 5,  # Babe: Pig in the City (1998)
}

In [8]:
new_user_dataframe = pd.DataFrame(new_user_query,columns = df_R.columns, index = ['new user'])
new_user_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
new user,,,,,,,,,,,...,,,,,,,,,,


In [9]:
#impute NaNs with mean 
new_user_imputed = new_user_dataframe.fillna(df_R.mean())
new_user_imputed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
new user,1.381967,0.277869,0.659836,1.322951,1.417213,0.316393,0.142623,1.566393,0.255738,0.385246,...,0.12623,0.087705,0.118033,0.128689,0.139344,0.136885,0.098361,0.081967,0.108197,0.106557


**calculate the score**

1. find the neighborhood of $n$ similar users
2. use their ratings to calculate a score

In [10]:
# calculates the distances to all other users in the data!
similarity_scores, neighbor_ids = cosin_model.kneighbors(
    new_user_imputed,
    n_neighbors=5,
    return_distance=True
)


In [11]:
neighbor_ids[0]

array([413, 598, 479, 273,  67])

In [12]:
# only look at ratings for users that are similar!
neighborhood = df_R.iloc[neighbor_ids[0]]
neighborhood

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
413,4.0,4.0,3.0,4.0,5.0,0.0,4.0,5.0,5.0,4.0,...,3.5,2.0,4.0,3.0,4.0,0.0,0.0,2.0,2.0,0.0
598,3.0,1.5,4.5,4.0,3.5,3.5,2.5,3.5,0.0,3.5,...,2.5,1.5,3.0,0.0,2.5,3.0,1.5,1.5,4.0,0.0
479,3.0,2.5,4.0,4.5,3.5,0.0,5.0,5.0,0.0,3.0,...,0.0,0.0,0.0,3.5,0.0,0.0,2.5,4.0,0.0,0.0
273,4.0,0.0,4.0,4.0,4.0,4.5,3.5,4.5,0.0,3.5,...,2.0,3.0,0.0,3.5,0.0,0.0,3.0,3.0,3.5,3.5
67,2.5,2.0,4.0,4.0,3.0,4.0,0.0,2.5,0.0,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
new_user_query.keys()

dict_keys([10, 100, 555, 756, 1225])

In [14]:
neighborhood.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '1225', '1226', '1227', '1228', '1229', '1230', '1231', '1232', '1233',
       '1234'],
      dtype='object', length=1235)

In [15]:
# Convert keys in new_user_query to strings
query_keys = [str(key) for key in new_user_query.keys()]

# Filter out the movies already seen
neighborhood_filtered = neighborhood.drop(query_keys, axis=1)

In [16]:
neighborhood_filtered

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1224,1226,1227,1228,1229,1230,1231,1232,1233,1234
413,4.0,4.0,3.0,4.0,5.0,0.0,4.0,5.0,5.0,4.0,...,0.0,2.0,4.0,3.0,4.0,0.0,0.0,2.0,2.0,0.0
598,3.0,1.5,4.5,4.0,3.5,3.5,2.5,3.5,0.0,3.5,...,0.0,1.5,3.0,0.0,2.5,3.0,1.5,1.5,4.0,0.0
479,3.0,2.5,4.0,4.5,3.5,0.0,5.0,5.0,0.0,3.0,...,0.0,0.0,0.0,3.5,0.0,0.0,2.5,4.0,0.0,0.0
273,4.0,0.0,4.0,4.0,4.0,4.5,3.5,4.5,0.0,3.5,...,4.5,3.0,0.0,3.5,0.0,0.0,3.0,3.0,3.5,3.5
67,2.5,2.0,4.0,4.0,3.0,4.0,0.0,2.5,0.0,4.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# calculate the summed up rating for each movie
# summing up introduces a bias for popular movies
# averaging introduces bias for movies only seen by few users in the neighboorhood

df_score = neighborhood_filtered.sum()
df_score

0       16.5
1       10.0
2       19.5
3       20.5
4       19.0
        ... 
1230     3.0
1231     7.0
1232    10.5
1233     9.5
1234     3.5
Length: 1230, dtype: float64

In [18]:
df_score_ranked = df_score.sort_values(ascending=False).index.tolist()
df_score_ranked

['58',
 '144',
 '130',
 '181',
 '373',
 '126',
 '63',
 '14',
 '60',
 '81',
 '300',
 '151',
 '52',
 '59',
 '322',
 '474',
 '156',
 '57',
 '292',
 '425',
 '180',
 '77',
 '593',
 '19',
 '672',
 '182',
 '62',
 '1022',
 '484',
 '712',
 '101',
 '22',
 '118',
 '15',
 '721',
 '65',
 '61',
 '64',
 '668',
 '468',
 '69',
 '677',
 '493',
 '594',
 '717',
 '3',
 '109',
 '7',
 '171',
 '612',
 '83',
 '73',
 '459',
 '448',
 '24',
 '42',
 '215',
 '1105',
 '451',
 '280',
 '460',
 '676',
 '2',
 '76',
 '462',
 '614',
 '1093',
 '962',
 '175',
 '185',
 '596',
 '674',
 '23',
 '267',
 '995',
 '269',
 '465',
 '121',
 '610',
 '959',
 '96',
 '53',
 '32',
 '291',
 '177',
 '760',
 '298',
 '31',
 '615',
 '1145',
 '328',
 '747',
 '447',
 '316',
 '529',
 '140',
 '4',
 '1085',
 '387',
 '233',
 '170',
 '711',
 '142',
 '665',
 '160',
 '469',
 '41',
 '286',
 '68',
 '1090',
 '440',
 '1193',
 '1201',
 '9',
 '485',
 '153',
 '706',
 '29',
 '34',
 '138',
 '646',
 '72',
 '1084',
 '437',
 '75',
 '1077',
 '449',
 '40',
 '1062',
 

In [19]:
#top 10 recommendations (k)
recommendations = df_score_ranked[:10]
recommendations

['58', '144', '130', '181', '373', '126', '63', '14', '60', '81']

In [39]:
titles

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
52,2,Jumanji (1995),Adventure|Children|Fantasy
154,3,Grumpier Old Men (1995),Comedy|Romance
357,4,Waiting to Exhale (1995),Comedy|Drama|Romance
561,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
52238,1230,Annie Hall (1977),Comedy|Romance
52260,1231,"Right Stuff, The (1983)",Drama
52284,1232,Stalker (1979),Drama|Mystery|Sci-Fi
52305,1233,"Boot, Das (Boat, The) (1981)",Action|Drama|War


## Make a function out of this

In [40]:
# collaborative filtering = look at ratings only!
def recommend_neighborhood(query, model, ratings, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained nearest neighbors model. 
    Returns a list of k movie ids.
    """
    # 1. candiate generation
    # construct a user vector
    new_user_dataframe = pd.DataFrame(new_user_query,columns = df_R.columns, index = ['new user'])
    new_user_imputed = new_user_dataframe.fillna(df_R.mean())
   
    # 2. scoring
    # find n neighbors
    similarity_scores, neighbor_ids = cosin_model.kneighbors(
    new_user_imputed,
    n_neighbors=5,
    return_distance=True)
    neighborhood = df_R.iloc[neighbor_ids[0]]

    # 3. ranking
    # filter out movies allready seen by the user:
    # 1 - Convert keys in new_user_query to strings
    query_keys = [str(key) for key in new_user_query.keys()]
    # 2 - Filter out the movies already seen
    neighborhood_filtered = neighborhood.drop(query_keys, axis=1)

    # return the top-k highst rated movie ids or titles
    df_score = neighborhood_filtered.sum()
    df_score_ranked = df_score.sort_values(ascending=False).index.astype(int).tolist()[:k]
    recommendations = df_score_ranked[:k]
    recommended_movies = titles[titles['movieId'].isin(recommendations)]
    
    return recommended_movies[['movieId', 'title']].values.tolist()

In [41]:
recommend_neighborhood(new_user_query, cosin_model, df_R, k=10)

[[14, 'Nixon (1995)'],
 [58, 'Postman, The (Postino, Il) (1994)'],
 [60, 'Indian in the Cupboard, The (1995)'],
 [63,
  "Don't Be a Menace to South Central While Drinking Your Juice in the Hood (1996)"],
 [81, "Things to Do in Denver When You're Dead (1995)"],
 [126, 'NeverEnding Story III, The (1994)'],
 [144, 'Brothers McMullen, The (1995)'],
 [181, 'Mighty Morphin Power Rangers: The Movie (1995)'],
 [373, 'Red Rock West (1992)']]