In [1]:
import numpy as np
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import pickle

In [2]:
#load data
df_mov = pd.read_csv('../data/movies.csv')
df_rat = pd.read_csv('../data/ratings.csv')
df_mov.drop_duplicates(subset=['title'], keep='first', inplace=True)
df = pd.merge(df_rat, df_mov , on='movieId', how='inner')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


#### new user query

In [3]:
user = {'title' : ["Toy Story (1995)", "Jurassic Park (1993)" , "The Butterfly Effect (2004)", 
                "Braveheart (1995)", "Inception (2010)"],'rating' : [2,2,5,1,5]}

In [4]:
user = pd.DataFrame(user)
user

Unnamed: 0,title,rating
0,Toy Story (1995),2
1,Jurassic Park (1993),2
2,The Butterfly Effect (2004),5
3,Braveheart (1995),1
4,Inception (2010),5


In [5]:
uniq_mov = pd.DataFrame(df_rat['movieId'].unique())
uniq_mov

Unnamed: 0,0
0,1
1,3
2,6
3,47
4,50
...,...
9719,160341
9720,160527
9721,160836
9722,163937


In [6]:
df_uniq = pd.merge(uniq_mov, df_mov, left_on=0, right_on='movieId')
df_uniq

Unnamed: 0,0,movieId,title,genres
0,1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,3,3,Grumpier Old Men (1995),Comedy|Romance
2,6,6,Heat (1995),Action|Crime|Thriller
3,47,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,50,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...
9714,160341,160341,Bloodmoon (1997),Action|Thriller
9715,160527,160527,Sympathy for the Underdog (1971),Action|Crime|Drama
9716,160836,160836,Hazard (2005),Action|Drama|Thriller
9717,163937,163937,Blair Witch (2016),Horror|Thriller


In [7]:
user_ratings = pd.merge(df_uniq, user, how = 'outer')
user_ratings

Unnamed: 0,0,movieId,title,genres,rating
0,1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0
1,3,3,Grumpier Old Men (1995),Comedy|Romance,
2,6,6,Heat (1995),Action|Crime|Thriller,
3,47,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,
4,50,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,
...,...,...,...,...,...
9714,160341,160341,Bloodmoon (1997),Action|Thriller,
9715,160527,160527,Sympathy for the Underdog (1971),Action|Crime|Drama,
9716,160836,160836,Hazard (2005),Action|Drama|Thriller,
9717,163937,163937,Blair Witch (2016),Horror|Thriller,


In [8]:
query = user_ratings['rating']
query

0       2.0
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
9714    NaN
9715    NaN
9716    NaN
9717    NaN
9718    NaN
Name: rating, Length: 9719, dtype: float64

In [9]:
query = np.array(query)

In [10]:
query

array([ 2., nan, nan, ..., nan, nan, nan])

#### Transform into a matrix

In [11]:
R = df.pivot_table(values='rating', index='userId', columns='movieId')

In [12]:
R

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [13]:
R.loc['n_user'] = query
query

array([ 2., nan, nan, ..., nan, nan, nan])

In [14]:
R

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,
610,5.0,,,,,5.0,,,,,...,,,,,,,,,,


In [15]:
#R.fillna(R.median(),inplace=True)
R.fillna(1,inplace=True)
R

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,1.0,4.0,1.0,1.0,4.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
608,2.5,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
609,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
610,5.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Create cosim matrix

In [16]:
## pick an active user and find the top 10 most similar users
cosim = pd.DataFrame(cosine_similarity(R))
cosim #just to see what the cosim looks like

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,601,602,603,604,605,606,607,608,609,610
0,1.000000,0.887730,0.889981,0.861423,0.892659,0.842868,0.876601,0.892919,0.889276,0.860537,...,0.880744,0.791371,0.876220,0.870029,0.773919,0.876963,0.834077,0.894605,0.758580,0.896914
1,0.887730,1.000000,0.974474,0.915144,0.970195,0.906930,0.942630,0.970274,0.971654,0.948630,...,0.947345,0.821907,0.955889,0.935174,0.818596,0.921149,0.859691,0.977471,0.812522,0.984922
2,0.889981,0.974474,1.000000,0.916099,0.970685,0.906096,0.942787,0.970376,0.972731,0.946911,...,0.947425,0.823548,0.956989,0.936771,0.817234,0.921924,0.857499,0.977954,0.806318,0.985985
3,0.861423,0.915144,0.916099,1.000000,0.920149,0.860646,0.896617,0.914918,0.914129,0.891000,...,0.902095,0.828654,0.902465,0.888095,0.802621,0.880431,0.824470,0.920015,0.769829,0.926296
4,0.892659,0.970195,0.970685,0.920149,1.000000,0.923092,0.943751,0.980160,0.968058,0.943540,...,0.964383,0.828176,0.963360,0.940133,0.823742,0.926566,0.863589,0.980038,0.805204,0.981605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.876963,0.921149,0.921924,0.880431,0.926566,0.877436,0.909596,0.929075,0.919422,0.893639,...,0.914776,0.809679,0.915318,0.896364,0.795477,1.000000,0.854051,0.931245,0.781792,0.931905
607,0.834077,0.859691,0.857499,0.824470,0.863589,0.820368,0.872795,0.868562,0.862750,0.838747,...,0.856052,0.765629,0.855838,0.845103,0.770906,0.854051,1.000000,0.867066,0.781762,0.866931
608,0.894605,0.977471,0.977954,0.920015,0.980038,0.920562,0.948866,0.983936,0.975125,0.949888,...,0.963116,0.828454,0.967514,0.942715,0.824419,0.931245,0.867066,1.000000,0.809649,0.988641
609,0.758580,0.812522,0.806318,0.769829,0.805204,0.746386,0.805799,0.807173,0.808649,0.794621,...,0.791310,0.702440,0.792519,0.786260,0.708012,0.781792,0.781762,0.809649,1.000000,0.813344


In [17]:
cosim = pd.DataFrame(cosine_similarity(R)[-1])
u=pd.DataFrame(cosim)
u

Unnamed: 0,0
0,0.896914
1,0.984922
2,0.985985
3,0.926296
4,0.981605
...,...
606,0.931905
607,0.866931
608,0.988641
609,0.813344


In [18]:
u = u.sort_values(by=[0], ascending=[False]).head(11)  #order by most similar users
u

Unnamed: 0,0
610,1.0
441,0.997708
507,0.99535
292,0.995117
213,0.994368
244,0.99431
430,0.993842
310,0.993544
477,0.993359
162,0.993117


In [19]:
similar_users = list(u.index)[1:]
similar_users

[441, 507, 292, 213, 244, 430, 310, 477, 162, 332]

#### With these subset of users, calculate the average rating

In [20]:
users_sim = R.loc[similar_users, :]
users_sim

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
441,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
507,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
292,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
213,3.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
244,1.0,1.0,1.0,1.0,1.0,5.0,1.0,1.0,1.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
430,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
310,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
477,4.0,4.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
162,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
332,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
movie_ratings_avg = pd.DataFrame(users_sim.mean())
movie_ratings_avg

Unnamed: 0_level_0,0
movieId,Unnamed: 1_level_1
1,2.15
2,1.30
3,1.20
4,1.20
5,1.00
...,...
193581,1.00
193583,1.00
193585,1.00
193587,1.00


#### Recommend movies that the similar users liked most and that the active user has not seen yet.

In [22]:
recommended_movies = movie_ratings_avg.sort_values(by=[0], ascending=[False]).head(10)
recommended_movies

Unnamed: 0_level_0,0
movieId,Unnamed: 1_level_1
356,3.4
589,3.25
4306,3.2
2571,3.1
318,2.95
593,2.8
58559,2.7
1196,2.65
7153,2.6
4993,2.6


In [23]:
recommended_movies = pd.merge(recommended_movies, df_mov, on='movieId', how = 'inner')
recommended_movies

Unnamed: 0,movieId,0,title,genres
0,356,3.4,Forrest Gump (1994),Comedy|Drama|Romance|War
1,589,3.25,Terminator 2: Judgment Day (1991),Action|Sci-Fi
2,4306,3.2,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
3,2571,3.1,"Matrix, The (1999)",Action|Sci-Fi|Thriller
4,318,2.95,"Shawshank Redemption, The (1994)",Crime|Drama
5,593,2.8,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
6,58559,2.7,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
7,1196,2.65,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
8,7153,2.6,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
9,4993,2.6,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy


In [24]:
recommended_movies = recommended_movies[['title','genres']]
recommended_movies

Unnamed: 0,title,genres
0,Forrest Gump (1994),Comedy|Drama|Romance|War
1,Terminator 2: Judgment Day (1991),Action|Sci-Fi
2,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
3,"Matrix, The (1999)",Action|Sci-Fi|Thriller
4,"Shawshank Redemption, The (1994)",Crime|Drama
5,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
6,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX
7,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
8,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
9,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
