### Task: 

Calculate the average rating for each movie in the dataset - done

Filter out movies that have been watched by less than 20 users - done

Recommend the top ten movies that a user has not seen yet

Write a function recommend_popular(query, ratings, k=10) that gets a user query of rated movie-ids and the ratings table as input. It returns a list of k movie-ids.

The user query is a python dictionary that looks like this: {12: 5, 234: 1, 234: 4.5}.


In [1]:
import numpy as np
from sklearn.decomposition import NMF 
import pandas as pd

## Load and check the data

In [2]:
movies= pd.read_csv('ml-latest-small/movies.csv' , sep = ',')
ratings= pd.read_csv('ml-latest-small/ratings.csv' , sep = ',')
links = pd.read_csv('ml-latest-small/links.csv' , sep = ',')
tags = pd.read_csv('ml-latest-small/tags.csv' , sep = ',')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


### Data preprocessing

In [7]:
# calculate the number of ratings per movie
rating_count = ratings.groupby('movieId')[['rating']].count()

In [8]:
# filter for movies with more than 20 ratings and extract the index
popular_movies = rating_count[rating_count['rating']>20].index

In [9]:
av_rating = ratings.groupby('movieId')[['rating']].mean()

In [10]:
# filter the ratings matrix and only keep the popular movies
df = ratings[ratings['movieId'].isin(popular_movies)].copy()

In [11]:
# remake user ids since they are not sequential
user_ids = df['userId'].unique()
user_id_map = {v:k for k,v in enumerate(user_ids)}
df['userId'] = df['userId'].map(user_id_map)

### Need to be done: 
Create another df (df_with_titles or smth) where movie titles are joined to the table so they can be extracted later in the function.


Save to csv

In [12]:
# remake movie ids since they are not sequential
movie_ids = df['movieId'].unique()
movie_id_map = {v:k for k,v in enumerate(movie_ids)}
df['movieId'] = df['movieId'].map(movie_id_map)

In [13]:
#add same mapping to movies df
movies['old_movieId'] = movies['movieId']

In [14]:
df

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
4,0,4,5.0,964982931
...,...,...,...,...
100803,609,808,4.0,1493847175
100808,609,643,4.0,1493846503
100829,609,809,5.0,1493845631
100830,609,644,4.0,1493879365


In [15]:
df_titles = df.merge(movies, on= 'movieId')

In [53]:
titles = df_titles[['movieId','title', 'genres']]
titles = titles.drop_duplicates()

In [54]:
df.to_csv('processed_ids.csv')
titles.to_csv('processed_titles.csv')

In [18]:
# Initialize a sparse user-item rating matrix
from scipy.sparse import csr_matrix
# (data, (row_ind, col_ind) 
R = csr_matrix((df['rating'], (df['userId'], df['movieId'])))

In [19]:
R.shape

(610, 1235)

In [20]:
R = R.todense()

In [21]:
df_R = pd.DataFrame(R)
df_R

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
0,4.0,4.0,4.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.5,0.0,0.0,3.0,4.5,4.0,0.0,3.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
606,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,2.5,2.0,0.0,4.5,4.5,3.0,0.0,4.0,0.0,3.0,...,0.0,0.5,0.0,0.0,4.5,0.0,2.5,1.0,1.5,0.0
608,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
df_R.to_csv('user_rating.csv')

## NMF MODEL

In [23]:
## Calculate Q matrix
nmf_model = NMF(n_components=2000, max_iter=300, tol = 0.0001, verbose = 2)

In [24]:
nmf_model.fit(R)



violation: 1.0
violation: 0.1439834116753243
violation: 0.042616031756929865
violation: 0.01717750444238156
violation: 0.008756148476032642
violation: 0.005254803301527616
violation: 0.0035077784599203557
violation: 0.0025371177538820837
violation: 0.001908398067805715
violation: 0.0014940997759230177
violation: 0.0012111220118477947
violation: 0.0009911845628905811
violation: 0.0008299509736808292
violation: 0.0007038190081216436
violation: 0.0006005803513317439
violation: 0.0005246221553137346
violation: 0.00046118857303173384
violation: 0.00040671838616167125
violation: 0.0003609035320622684
violation: 0.00032333257390767677
violation: 0.00029153877522808684
violation: 0.0002627843745170314
violation: 0.00023765074916720205
violation: 0.00021680577157499574
violation: 0.00019917466343931202
violation: 0.00018351460994010072
violation: 0.00017027549678754806
violation: 0.0001578212138416929
violation: 0.00014574748243150384
violation: 0.00013597209410078838
violation: 0.0001270669723

NMF(max_iter=300, n_components=2000, verbose=2)

In [25]:
Q_matrix = nmf_model.components_

In [26]:
Q_matrix.shape

(2000, 1235)

In [27]:
Q = pd.DataFrame(Q_matrix)
Q

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
0,0.004368,0.0,0.000000,0.023536,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000e+00,0.001066,0.0,0.0
1,0.017820,0.0,0.002796,0.019399,0.009356,0.0,0.0,0.021952,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000e+00,0.000000,0.0,0.0
2,0.011248,0.0,0.023919,0.002214,0.061206,0.0,0.0,0.004302,0.0,0.004992,...,0.001346,0.000000,0.0,0.0,0.0,0.000000,0.000000e+00,0.002270,0.0,0.0
3,0.000000,0.0,0.000000,0.100310,0.000333,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000e+00,0.000000,0.0,0.0
4,0.047374,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000493,0.000000e+00,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,3.768523e-10,0.000000,0.0,0.0
1996,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000e+00,0.000000,0.0,0.0
1997,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000e+00,0.000000,0.0,0.0
1998,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000e+00,0.000000,0.0,0.0


In [28]:
P_matrix = nmf_model.transform(R)



violation: 1.0
violation: 0.9608658637851413
violation: 0.48842543227747537
violation: 0.21742612112130424
violation: 0.10452340542375751
violation: 0.05661238606840272
violation: 0.033084487449422266
violation: 0.022559432704261608
violation: 0.016268362959695096
violation: 0.011834453935999995
violation: 0.008822037417385383
violation: 0.0069509000458681195
violation: 0.005397204372227126
violation: 0.004397998062547433
violation: 0.003691660272716456
violation: 0.0030908539549444293
violation: 0.0026409950357121304
violation: 0.002261368651300166
violation: 0.0019510030203402885
violation: 0.0017301598427183792
violation: 0.0015306234088631
violation: 0.0013807499529762718
violation: 0.0012299134116160926
violation: 0.001115655742510109
violation: 0.001040804323992977
violation: 0.0009384280655652482
violation: 0.0008510460313869354
violation: 0.000762943636327082
violation: 0.0006822313230450484
violation: 0.000624460219824558
violation: 0.0005662430610552727
violation: 0.000526113

In [29]:
P_matrix.shape

(610, 2000)

### Reconstruct the ratings matrix 
$\hat{R} := P\cdot Q \sim R$

In [30]:
R_hat_matrix = np.dot(P_matrix, Q_matrix)
R_hat_matrix

array([[3.99999466e+00, 4.00102125e+00, 3.99812991e+00, ...,
        3.04092728e-06, 2.00301685e-04, 5.48854760e-04],
       [4.74326880e-04, 2.38210923e-10, 4.77779546e-05, ...,
        1.12266205e-09, 4.03670628e-04, 6.10770109e-07],
       [0.00000000e+00, 4.04153393e-08, 0.00000000e+00, ...,
        0.00000000e+00, 2.81411878e-09, 2.66624230e-07],
       ...,
       [2.50000233e+00, 2.00854188e+00, 1.21203698e-02, ...,
        9.99606552e-01, 1.49985244e+00, 8.92965451e-04],
       [3.00000000e+00, 2.17010509e-05, 5.25758259e-07, ...,
        4.23554672e-06, 0.00000000e+00, 1.28894896e-05],
       [4.99996339e+00, 5.93695455e-10, 5.00421067e+00, ...,
        3.00186804e+00, 3.65269898e-05, 5.93118496e-07]])

In [31]:
R_hat_df = pd.DataFrame(
    data=R_hat_matrix
)
R_hat_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
0,3.999995,4.001021e+00,3.998130e+00,5.000911,4.999999e+00,3.000982e+00,5.000625e+00,4.000047,4.999998e+00,5.000019e+00,...,2.687231e-11,1.609773e-04,2.818335e-07,3.657030e-04,1.064764e-10,4.478775e-04,1.897287e-10,3.040927e-06,2.003017e-04,5.488548e-04
1,0.000474,2.382109e-10,4.777795e-05,0.001284,1.451564e-03,2.633945e-03,8.704212e-10,0.001782,5.867135e-09,7.024763e-05,...,3.358318e-04,0.000000e+00,8.685181e-09,3.077745e-04,6.544488e-06,6.546638e-15,1.614255e-05,1.122662e-09,4.036706e-04,6.107701e-07
2,0.000000,4.041534e-08,0.000000e+00,0.000000,2.720466e-11,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,2.929772e-14,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,2.814119e-09,2.666242e-07
3,0.000587,1.480296e-04,1.665828e-04,1.999999,4.092738e-04,1.422434e-06,1.314635e-04,0.000971,1.901342e-05,1.145073e-04,...,1.875923e-05,3.667759e-04,3.325091e-07,6.201015e-05,2.643348e-04,1.116477e-04,7.588706e-06,4.093380e-08,3.000463e-04,3.931011e-04
4,4.000000,4.235620e-03,4.619355e-04,0.005291,4.000000e+00,5.608295e-07,2.325269e-06,4.000038,3.447496e-04,3.058888e-04,...,8.413309e-05,1.486974e-05,1.199423e-09,2.720934e-11,1.269776e-04,1.485914e-06,2.071380e-09,1.935891e-11,3.995898e-05,9.170200e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.499984,1.799596e-03,2.090309e-03,3.000831,4.499977e+00,4.000526e+00,2.962699e-06,3.499964,3.561720e-09,1.078797e-04,...,2.907165e-04,1.482881e-11,6.059784e-05,9.656534e-05,1.360447e-07,4.008877e+00,1.858826e-04,1.201692e-09,1.771035e-03,0.000000e+00
606,4.000000,6.400565e-06,1.170973e-04,0.001691,1.092605e-03,8.922278e-05,0.000000e+00,5.000000,1.044412e-13,3.195427e-05,...,4.690115e-08,0.000000e+00,0.000000e+00,0.000000e+00,4.638002e-09,0.000000e+00,2.747072e-09,6.752043e-08,5.676701e-09,2.160756e-08
607,2.500002,2.008542e+00,1.212037e-02,4.499271,4.499991e+00,2.995410e+00,5.947126e-06,3.999967,4.022417e-03,3.006376e+00,...,6.275683e-09,4.999640e-01,1.935681e-05,1.610836e-03,4.499321e+00,5.216258e-05,2.502016e+00,9.996066e-01,1.499852e+00,8.929655e-04
608,3.000000,2.170105e-05,5.257583e-07,0.003715,4.535862e-03,0.000000e+00,1.944261e-12,2.999993,2.352680e-04,5.996877e-07,...,2.515046e-04,0.000000e+00,4.660224e-06,0.000000e+00,7.376715e-13,2.428335e-05,9.109984e-07,4.235547e-06,0.000000e+00,1.288949e-05


### Calculate the error

In [32]:
nmf_model.reconstruction_err_

3.6427684102461018

### Save and load the model

In [33]:
import pickle

In [34]:


with open('nmf_model_week10.pkl',mode='wb') as file:
    pickle.dump(nmf_model,file)

### Model Deployment

In [35]:
with open('nmf_model_week10.pkl','rb') as file:
    loaded_model = pickle.load(file)

In [36]:
new_user_query = {
    10: 4,  # Billy Madison (1995)
    100: 3, # Bambi (1942)
    555: 3.5,  # Mortal Kombat (1995)
    756: 2,  # Inside Man (2006)
    1224: 5,  # Babe: Pig in the City (1998)
}

In [37]:
new_user_dataframe = pd.DataFrame(new_user_query,columns = df_R.columns, index = ['new user'])
new_user_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
new user,,,,,,,,,,,...,,,,,,,,,,


In [38]:
new_user_imputed = new_user_dataframe.fillna(df_R.mean())

In [39]:
new_user_imputed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
new user,1.381967,0.277869,0.659836,1.322951,1.417213,0.316393,0.142623,1.566393,0.255738,0.385246,...,0.12623,0.087705,0.118033,0.128689,0.139344,0.136885,0.098361,0.081967,0.108197,0.106557


### Create new matrixes for new users

In [40]:
P_new_user_matrix = loaded_model.transform(new_user_imputed)
P_new_user_matrix

violation: 1.0
violation: 1.2198613176822528
violation: 0.4721387892010086
violation: 0.34610784244592535
violation: 0.3147575104909552
violation: 0.2026175912986121
violation: 0.17267512535661192
violation: 0.1374743334602472
violation: 0.10731452375442832
violation: 0.09975866430115783
violation: 0.09268980309977888
violation: 0.09010145083654547
violation: 0.09774189491188694
violation: 0.08617085440098735
violation: 0.09917674181136125
violation: 0.13679922753206264
violation: 0.11423647048550561
violation: 0.09123105481841667
violation: 0.07041675138541609
violation: 0.06440039950900099
violation: 0.055793211223101574
violation: 0.05059244397367346
violation: 0.04530616787523075
violation: 0.043171006211117836
violation: 0.03821640282554736
violation: 0.04588236712320433
violation: 0.03722108319532786
violation: 0.031066396146604117
violation: 0.02925650711311332
violation: 0.02799987023965325
violation: 0.02765287194327797
violation: 0.02561041000848689
violation: 0.0248346576531

array([[0.97950392, 0.        , 0.19615586, ..., 0.73665676, 0.        ,
        0.        ]])

In [41]:
P_new_user_matrix.shape

(1, 2000)

In [42]:
P_new_user = pd.DataFrame(P_new_user_matrix, 
                         #columns = loaded_model.get_feature_names_out(),
                         index = ['new_user'])
P_new_user

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
new_user,0.979504,0.0,0.196156,0.247845,0.0,0.078983,0.0,0.0,0.029192,0.0,...,0.055096,0.0,0.086961,0.0,0.0,0.0,0.176138,0.736657,0.0,0.0


In [43]:
R_hat_new_user_matrix = np.dot(P_new_user, Q)
R_hat_new_user_matrix

array([[1.38198841, 0.28534463, 0.87673439, ..., 0.06871651, 0.14998441,
        0.0988383 ]])

In [44]:
#make dataframe
R_hat_new_user = pd.DataFrame(data=R_hat_new_user_matrix,
                         columns=df_R.columns,
                         index = ['new_user'])
R_hat_new_user

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
new_user,1.381988,0.285345,0.876734,1.32326,1.417424,0.306073,0.316137,1.566405,0.255731,0.388801,...,0.097752,0.005004,0.085093,0.144692,0.072369,0.173928,0.033889,0.068717,0.149984,0.098838


In [45]:
new_user_query.keys()

dict_keys([10, 100, 555, 756, 1224])

#### Step 4. Get a list of k-top rated movie to recommend to the new user
So which movies?

In [46]:
#filter out movies already seen
R_hat_new_user_filtered = R_hat_new_user.drop(new_user_query.keys(), axis =1)

In [47]:
R_hat_new_user_filtered.T.sort_values(by = ['new_user'], ascending=False).index.tolist()

[180,
 19,
 15,
 130,
 32,
 14,
 7,
 151,
 26,
 322,
 24,
 58,
 4,
 0,
 60,
 593,
 144,
 448,
 63,
 3,
 101,
 309,
 459,
 469,
 23,
 34,
 218,
 40,
 282,
 233,
 73,
 31,
 340,
 171,
 316,
 440,
 300,
 186,
 30,
 516,
 387,
 59,
 191,
 317,
 86,
 181,
 35,
 65,
 77,
 72,
 677,
 676,
 651,
 269,
 465,
 234,
 52,
 118,
 672,
 698,
 2,
 373,
 813,
 446,
 64,
 57,
 182,
 349,
 564,
 594,
 69,
 646,
 306,
 669,
 21,
 382,
 39,
 177,
 449,
 460,
 276,
 25,
 315,
 61,
 425,
 55,
 328,
 71,
 479,
 16,
 324,
 68,
 358,
 50,
 17,
 160,
 134,
 724,
 486,
 891,
 185,
 12,
 140,
 308,
 215,
 431,
 447,
 612,
 453,
 38,
 668,
 451,
 712,
 325,
 596,
 242,
 619,
 295,
 645,
 217,
 241,
 535,
 884,
 92,
 420,
 310,
 62,
 607,
 138,
 493,
 254,
 404,
 18,
 11,
 760,
 419,
 719,
 1157,
 588,
 755,
 280,
 109,
 156,
 706,
 393,
 458,
 126,
 209,
 88,
 305,
 311,
 348,
 83,
 256,
 239,
 426,
 232,
 142,
 962,
 221,
 1062,
 188,
 228,
 313,
 1121,
 615,
 372,
 378,
 41,
 66,
 702,
 376,
 298,
 51,
 538,
 47

In [48]:
ranked = R_hat_new_user_filtered.T.sort_values(by = ['new_user'], ascending=False).index.tolist()
ranked

[180,
 19,
 15,
 130,
 32,
 14,
 7,
 151,
 26,
 322,
 24,
 58,
 4,
 0,
 60,
 593,
 144,
 448,
 63,
 3,
 101,
 309,
 459,
 469,
 23,
 34,
 218,
 40,
 282,
 233,
 73,
 31,
 340,
 171,
 316,
 440,
 300,
 186,
 30,
 516,
 387,
 59,
 191,
 317,
 86,
 181,
 35,
 65,
 77,
 72,
 677,
 676,
 651,
 269,
 465,
 234,
 52,
 118,
 672,
 698,
 2,
 373,
 813,
 446,
 64,
 57,
 182,
 349,
 564,
 594,
 69,
 646,
 306,
 669,
 21,
 382,
 39,
 177,
 449,
 460,
 276,
 25,
 315,
 61,
 425,
 55,
 328,
 71,
 479,
 16,
 324,
 68,
 358,
 50,
 17,
 160,
 134,
 724,
 486,
 891,
 185,
 12,
 140,
 308,
 215,
 431,
 447,
 612,
 453,
 38,
 668,
 451,
 712,
 325,
 596,
 242,
 619,
 295,
 645,
 217,
 241,
 535,
 884,
 92,
 420,
 310,
 62,
 607,
 138,
 493,
 254,
 404,
 18,
 11,
 760,
 419,
 719,
 1157,
 588,
 755,
 280,
 109,
 156,
 706,
 393,
 458,
 126,
 209,
 88,
 305,
 311,
 348,
 83,
 256,
 239,
 426,
 232,
 142,
 962,
 221,
 1062,
 188,
 228,
 313,
 1121,
 615,
 372,
 378,
 41,
 66,
 702,
 376,
 298,
 51,
 538,
 47

In [49]:
recommended = ranked[:3]
recommended


[180, 19, 15]

### Next steps: 
- convert the above to function 
- make it spit movieIds mapped to movie names

#### Example: Baseline Recommender

In [50]:
def recommend_popular(query, ratings, k=10):
    rated_movies = set(query.keys())
    recommendations = ratings[ratings.index.isin(rated_movies)].sort_values(by='rating', ascending=False).head(k)
    recommended_movies = movies[movies['movieId'].isin(recommendations.index)]
    return recommended_movies[['movieId', 'title']].values.tolist()


# Usage example:
user_query = {12: 5, 234: 1, 567: 4.5}
recommended_movies = recommend_popular(user_query, ratings, k=10)
recommended_movies

[[12, 'Dracula: Dead and Loving It (1995)'],
 [234, 'Exit to Eden (1994)'],
 [567, 'Kika (1993)']]

In [55]:
def recommend_nmf(query, model, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained NMF model. 
    Returns a list of k movie ids.
    """
    
    # 1. construct new_user-item dataframe given the query
    rated_movies = set(query.keys())
    recommendations = ratings[ratings.index.isin(rated_movies)].sort_values(by='rating', ascending=False).head(k)
    new_user_dataframe = pd.DataFrame(new_user_query,columns = df_R.columns, index = ['new user'])
    new_user_imputed = new_user_dataframe.fillna(df_R.mean())

    # 2. scoring
    # calculate the score with the NMF model
    P_new_user_matrix = model.transform(new_user_imputed)
    Q_matrix = model.components_
    R_hat_new_user_matrix = np.dot(P_new_user_matrix, Q_matrix)
    
    # 3. Ranking
    # Filter out movies already seen by the user
    R_hat_new_user_filtered = pd.DataFrame(R_hat_new_user_matrix, columns=df_R.columns)
    R_hat_new_user_filtered = R_hat_new_user_filtered.loc[:, ~R_hat_new_user_filtered.columns.isin(rated_movies)]
    
    # Return the top-k highest rated movie ids or titles
    recommended = R_hat_new_user_filtered.iloc[0].nlargest(k)
    recommended_movies = titles[titles['movieId'].isin(recommended.index)]

    return recommended_movies[['movieId', 'title']].values.tolist()

In [56]:
recommend_nmf(new_user_query, nmf_model, k=10)

violation: 1.0
violation: 1.2198613176822528
violation: 0.4721387892010086
violation: 0.34610784244592535
violation: 0.3147575104909552
violation: 0.2026175912986121
violation: 0.17267512535661192
violation: 0.1374743334602472
violation: 0.10731452375442832
violation: 0.09975866430115783
violation: 0.09268980309977888
violation: 0.09010145083654547
violation: 0.09774189491188694
violation: 0.08617085440098735
violation: 0.09917674181136125
violation: 0.13679922753206264
violation: 0.11423647048550561
violation: 0.09123105481841667
violation: 0.07041675138541609
violation: 0.06440039950900099
violation: 0.055793211223101574
violation: 0.05059244397367346
violation: 0.04530616787523075
violation: 0.043171006211117836
violation: 0.03821640282554736
violation: 0.04588236712320433
violation: 0.03722108319532786
violation: 0.031066396146604117
violation: 0.02925650711311332
violation: 0.02799987023965325
violation: 0.02765287194327797
violation: 0.02561041000848689
violation: 0.0248346576531

[[7, 'Sabrina (1995)'],
 [14, 'Nixon (1995)'],
 [15, 'Cutthroat Island (1995)'],
 [19, 'Ace Ventura: When Nature Calls (1995)'],
 [26, 'Othello (1995)'],
 [32, 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)'],
 [151, 'Rob Roy (1995)'],
 [180, 'Mallrats (1995)'],
 [322, 'Swimming with Sharks (1995)']]