### Task: 

Calculate the average rating for each movie in the dataset - done

Filter out movies that have been watched by less than 20 users - done

Recommend the top ten movies that a user has not seen yet

Write a function recommend_popular(query, ratings, k=10) that gets a user query of rated movie-ids and the ratings table as input. It returns a list of k movie-ids.

The user query is a python dictionary that looks like this: {12: 5, 234: 1, 234: 4.5}.


In [23]:
import numpy as np
from sklearn.decomposition import NMF 
import pandas as pd

## Load and check the data

In [24]:
movies= pd.read_csv('ml-latest-small/movies.csv' , sep = ',')
ratings= pd.read_csv('ml-latest-small/ratings.csv' , sep = ',')
links = pd.read_csv('ml-latest-small/links.csv' , sep = ',')
tags = pd.read_csv('ml-latest-small/tags.csv' , sep = ',')

In [25]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [26]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [27]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [28]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


### working with df

In [29]:
# calculate the number of ratings per movie
rating_count = ratings.groupby('movieId')[['rating']].count()

In [30]:
# filter for movies with more than 20 ratings and extract the index
popular_movies = rating_count[rating_count['rating']>20].index

In [31]:
av_rating = ratings.groupby('movieId')[['rating']].mean()

In [32]:
# filter the ratings matrix and only keep the popular movies
df = ratings[ratings['movieId'].isin(popular_movies)].copy()

In [33]:
# remake user ids since they are not sequential
user_ids = df['userId'].unique()
user_id_map = {v:k for k,v in enumerate(user_ids)}
df['userId'] = df['userId'].map(user_id_map)

In [34]:
# remake movie ids since they are not sequential
movie_ids = df['movieId'].unique()
movie_id_map = {v:k for k,v in enumerate(movie_ids)}
df['movieId'] = df['movieId'].map(movie_id_map)

In [35]:
df

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
4,0,4,5.0,964982931
...,...,...,...,...
100803,609,808,4.0,1493847175
100808,609,643,4.0,1493846503
100829,609,809,5.0,1493845631
100830,609,644,4.0,1493879365


In [None]:
# Initialize a sparse user-item rating matrix
from scipy.sparse import csr_matrix
# (data, (row_ind, col_ind) 
R = csr_matrix((df['rating'], (df['userId'], df['movieId'])))

In [None]:
R.shape

(610, 1235)

In [None]:
R = R.todense()

In [None]:
df_R = pd.DataFrame(R)
df_R

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
0,4.0,4.0,4.0,5.0,5.0,3.0,5.0,4.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.5,0.0,0.0,3.0,4.5,4.0,0.0,3.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
606,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,2.5,2.0,0.0,4.5,4.5,3.0,0.0,4.0,0.0,3.0,...,0.0,0.5,0.0,0.0,4.5,0.0,2.5,1.0,1.5,0.0
608,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## NMF MODEL

In [54]:
## Calculate Q matrix
nmf_model = NMF(n_components=2000, max_iter=300, tol = 0.0001, verbose = 2)

In [55]:
nmf_model.fit(R)



violation: 1.0
violation: 0.15501531749852895
violation: 0.0478985326191118
violation: 0.021157972936216844
violation: 0.010980530002105344
violation: 0.006286719130413587
violation: 0.004087740038228715
violation: 0.0028094900624516875
violation: 0.0020977935810678264
violation: 0.0016285497995166178
violation: 0.0012939665336330842
violation: 0.0010590229660795937
violation: 0.0008869064859768568
violation: 0.0007528532898775561
violation: 0.0006685523966687774
violation: 0.0005759228412523466
violation: 0.0004996447351959798
violation: 0.0004359375365296365
violation: 0.0003838808657965991
violation: 0.0003444645562205297
violation: 0.00030541964778159434
violation: 0.0002752070911625181
violation: 0.0002484575451329636
violation: 0.00022650653974707837
violation: 0.0002057687451636564
violation: 0.00019075629703940364
violation: 0.0001772607962890574
violation: 0.00016441275432001987
violation: 0.00015523345917466006
violation: 0.0001460606866531765
violation: 0.0001357267667264219

NMF(max_iter=300, n_components=2000, verbose=2)

In [56]:
Q_matrix = nmf_model.components_

In [57]:
Q_matrix.shape

(2000, 1235)

In [58]:
Q = pd.DataFrame(Q_matrix)
Q

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
0,0.000684,0.000000,0.000000,0.046439,0.000000,0.000000,0.00000,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.00002,0.000000,0.00000,0.0,0.000000,0.000000,0.0
1,0.014533,0.003862,0.018574,0.018158,0.015210,0.000000,0.00000,0.010680,0.0,0.030176,...,0.0,0.000071,0.0,0.00000,0.000000,0.00000,0.0,0.000003,0.000000,0.0
2,0.007163,0.000000,0.010413,0.000000,0.000000,0.000000,0.00732,0.008641,0.0,0.000000,...,0.0,0.000000,0.0,0.00000,0.008318,0.00000,0.0,0.000000,0.000028,0.0
3,0.127537,0.000000,0.000000,0.010224,0.000000,0.000000,0.00000,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.00000,0.000000,0.00000,0.0,0.000000,0.000000,0.0
4,0.000000,0.000000,0.000000,0.175559,0.226730,0.000000,0.00000,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.00000,0.000000,0.00000,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000472,0.000000,0.008080,0.171189,0.000000,0.266454,0.00000,0.055878,0.0,0.043055,...,0.0,0.000000,0.0,0.00000,0.000000,0.00000,0.0,0.000000,0.000000,0.0
1996,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.00000,0.000000,0.00000,0.0,0.000000,0.000000,0.0
1997,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.00000,0.000000,0.00000,0.0,0.000000,0.000000,0.0
1998,0.000000,0.000000,0.000000,0.000096,0.000000,0.064434,0.00000,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.00000,0.000000,0.00000,0.0,0.000000,0.000000,0.0


In [63]:
P_matrix = nmf_model.transform(R)



violation: 1.0
violation: 0.920454003527021
violation: 0.5622661244642034
violation: 0.28280224653390096
violation: 0.14588895125226148
violation: 0.08374790458677908
violation: 0.054187979920824934
violation: 0.03718388443236708
violation: 0.026690922495498193
violation: 0.018639571764861782
violation: 0.014487257399446734
violation: 0.011580556981529645
violation: 0.008843497317048643
violation: 0.00726235926017071
violation: 0.006053950332562434
violation: 0.004900081955582941
violation: 0.004152587910102046
violation: 0.0035935417858472844
violation: 0.003198184308820623
violation: 0.002884723042849456
violation: 0.0025742452340843445
violation: 0.002266853424808468
violation: 0.00197998795762114
violation: 0.0017132504961219893
violation: 0.001536527993337089
violation: 0.001384542564037297
violation: 0.0012639622696870325
violation: 0.0010920012131294787
violation: 0.0009483672653552619
violation: 0.0008879978496129058
violation: 0.0008344752334388256
violation: 0.000768172169715

In [64]:
P_matrix.shape

(610, 2000)

### Reconstruct the ratings matrix 
$\hat{R} := P\cdot Q \sim R$

In [65]:
R_hat_matrix = np.dot(P_matrix, Q_matrix)
R_hat_matrix

array([[4.00037118e+00, 4.00034034e+00, 4.00029686e+00, ...,
        1.79129741e-06, 1.31526320e-05, 2.62924534e-07],
       [1.14104967e-04, 0.00000000e+00, 9.62301217e-06, ...,
        4.84952868e-12, 6.24139476e-10, 0.00000000e+00],
       [1.04058724e-04, 5.98567669e-05, 4.60606320e-06, ...,
        5.37134150e-06, 1.88710131e-06, 2.73324272e-06],
       ...,
       [2.50321237e+00, 2.00046003e+00, 2.87572776e-02, ...,
        1.00059625e+00, 1.50035055e+00, 5.00299498e-04],
       [3.00002495e+00, 3.83993851e-11, 7.01451807e-04, ...,
        5.56344771e-11, 1.47112961e-07, 0.00000000e+00],
       [4.99999224e+00, 4.48957327e-06, 4.99996145e+00, ...,
        3.00143919e+00, 1.82562504e-05, 0.00000000e+00]])

In [66]:
R_hat_df = pd.DataFrame(
    data=R_hat_matrix
)
R_hat_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
0,4.000371,4.000340e+00,4.000297e+00,4.999770,5.000000,2.999952e+00,4.999985e+00,4.000000,4.999959e+00,5.000000e+00,...,1.219677e-04,6.085504e-05,3.411410e-11,3.189201e-08,2.804495e-04,2.729346e-07,3.469372e-06,1.791297e-06,1.315263e-05,2.629245e-07
1,0.000114,0.000000e+00,9.623012e-06,0.000000,0.000000,1.757943e-13,1.615813e-13,0.000001,0.000000e+00,5.251632e-07,...,8.619118e-11,0.000000e+00,0.000000e+00,0.000000e+00,2.658313e-09,0.000000e+00,0.000000e+00,4.849529e-12,6.241395e-10,0.000000e+00
2,0.000104,5.985677e-05,4.606063e-06,0.000076,0.000077,2.959752e-05,3.796609e-06,0.000098,3.880136e-05,2.041726e-04,...,4.562779e-05,6.579401e-05,9.749465e-14,1.063275e-05,9.880270e-06,6.983464e-06,3.452786e-12,5.371342e-06,1.887101e-06,2.733243e-06
3,0.001557,3.048074e-03,1.583303e-05,1.999963,0.000558,2.475425e-07,1.052256e-04,0.000572,2.108844e-12,3.016196e-04,...,6.502149e-07,2.211597e-14,0.000000e+00,4.646774e-04,1.533679e-04,1.296380e-03,3.216335e-04,1.616085e-08,5.688752e-10,1.471922e-05
4,4.000001,2.700025e-04,1.534661e-05,0.000580,4.000457,2.192428e-05,7.487948e-05,4.001516,1.285940e-04,2.233104e-04,...,0.000000e+00,0.000000e+00,0.000000e+00,4.061162e-05,3.681398e-05,2.650993e-05,5.631583e-06,2.454994e-05,2.001885e-06,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.500000,3.400264e-06,8.060930e-04,2.999943,4.500000,3.999878e+00,1.879599e-03,3.500000,1.570402e-03,2.765409e-08,...,2.256085e-07,8.868298e-05,1.482550e-05,2.053633e-04,1.747825e-05,4.005444e+00,5.181333e-06,9.012159e-05,9.993929e-04,0.000000e+00
606,4.000047,2.116350e-03,9.997953e-08,0.003845,0.000404,1.464887e-06,2.558146e-07,5.002817,1.603576e-04,0.000000e+00,...,0.000000e+00,0.000000e+00,4.722319e-07,0.000000e+00,1.885600e-04,3.028935e-08,7.496767e-06,0.000000e+00,0.000000e+00,0.000000e+00
607,2.503212,2.000460e+00,2.875728e-02,4.499712,4.500000,2.998097e+00,4.075070e-03,4.000140,5.949196e-03,3.000000e+00,...,5.448886e-02,5.003053e-01,1.666488e-10,1.005922e-03,4.494532e+00,0.000000e+00,2.505368e+00,1.000596e+00,1.500351e+00,5.002995e-04
608,3.000025,3.839939e-11,7.014518e-04,0.000005,0.014728,5.887365e-05,1.121507e-06,2.999996,2.659897e-08,2.152597e-05,...,3.612308e-07,0.000000e+00,0.000000e+00,0.000000e+00,1.757599e-05,1.545119e-05,9.864979e-06,5.563448e-11,1.471130e-07,0.000000e+00


### Calculate the error

In [67]:
nmf_model.reconstruction_err_

3.4851385305524167

### Save and load the model

In [68]:
import pickle

with open('nmf_model_week10.pkl',mode='wb') as file:
    pickle.dump(nmf_model,file)

### Model Deployment

In [69]:
with open('nmf_model_week10.pkl','rb') as file:
    loaded_model = pickle.load(file)

In [70]:
new_user_query = {
    10: 4,  # Billy Madison (1995)
    100: 3, # Bambi (1942)
    555: 3.5,  # Mortal Kombat (1995)
    756: 2,  # Inside Man (2006)
    1224: 5,  # Babe: Pig in the City (1998)
}

In [73]:
new_user_dataframe = pd.DataFrame(new_user_query,columns = df_R.columns, index = ['new user'])
new_user_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
new user,,,,,,,,,,,...,,,,,,,,,,


In [74]:
new_user_imputed = new_user_dataframe.fillna(df_R.mean())

In [75]:
new_user_imputed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
new user,1.381967,0.277869,0.659836,1.322951,1.417213,0.316393,0.142623,1.566393,0.255738,0.385246,...,0.12623,0.087705,0.118033,0.128689,0.139344,0.136885,0.098361,0.081967,0.108197,0.106557


### Create new matrixes for new users

In [76]:
P_new_user_matrix = loaded_model.transform(new_user_imputed)
P_new_user_matrix

violation: 1.0
violation: 0.7757178304053128
violation: 0.5165865599390655
violation: 0.49022209727680743
violation: 0.31458641248821617
violation: 0.31794329475469335
violation: 0.3244558136425374
violation: 0.3510086903472163
violation: 0.20361326289444706
violation: 0.24224904770868086
violation: 0.17772340294066696
violation: 0.19622136136672133
violation: 0.2714952176679524
violation: 0.2831052861466074
violation: 0.2312252353491389
violation: 0.12775399095265355
violation: 0.11545519045683797
violation: 0.13234600090460666
violation: 0.12986576112088788
violation: 0.13844027791809702
violation: 0.14484502598455992
violation: 0.14537237288548638
violation: 0.14002427119227384
violation: 0.1421460870489114
violation: 0.07559675519678313
violation: 0.0834809750917666
violation: 0.07422137590356492
violation: 0.06682559975385219
violation: 0.055672353808285495
violation: 0.049484463366625465
violation: 0.04436494930549194
violation: 0.04173962326745799
violation: 0.04099446481717805


array([[0.82833062, 0.48568394, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [77]:
P_new_user_matrix.shape

(1, 2000)

In [78]:
P_new_user = pd.DataFrame(P_new_user_matrix, 
                         #columns = loaded_model.get_feature_names_out(),
                         index = ['new_user'])
P_new_user

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
new_user,0.828331,0.485684,0.0,0.309818,0.107489,0.0,0.037124,0.0,0.0,0.0,...,0.0,0.205121,0.0,0.26706,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
R_hat_new_user_matrix = np.dot(P_new_user, Q)
R_hat_new_user_matrix

array([[1.38207986, 0.27797192, 0.65983657, ..., 0.11555505, 0.09716418,
        0.1262582 ]])

In [83]:
#make dataframe
R_hat_new_user = pd.DataFrame(data=R_hat_new_user_matrix,
                         columns=df_R.columns,
                         index = ['new_user'])
R_hat_new_user

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1225,1226,1227,1228,1229,1230,1231,1232,1233,1234
new_user,1.38208,0.277972,0.659837,1.325568,1.43732,0.288377,0.412611,1.56945,0.287799,0.385245,...,0.094587,0.089922,0.056459,0.20138,0.02847,0.191191,0.16397,0.115555,0.097164,0.126258


In [80]:
new_user_query.keys()

dict_keys([10, 100, 555, 756, 1224])

#### Step 4. Get a list of k-top rated movie to recommend to the new user
So which movies?

In [84]:
#filter out movies already seen
R_hat_new_user_filtered = R_hat_new_user.drop(new_user_query.keys(), axis =1)

In [85]:
R_hat_new_user_filtered.T.sort_values(by = ['new_user'], ascending=False).index.tolist()

[180,
 19,
 15,
 130,
 32,
 14,
 7,
 26,
 151,
 24,
 58,
 322,
 4,
 60,
 0,
 144,
 593,
 448,
 63,
 3,
 101,
 309,
 23,
 469,
 459,
 34,
 233,
 218,
 186,
 40,
 73,
 282,
 316,
 171,
 440,
 300,
 646,
 31,
 35,
 465,
 181,
 30,
 52,
 72,
 382,
 317,
 387,
 59,
 86,
 191,
 65,
 340,
 57,
 77,
 516,
 676,
 651,
 486,
 677,
 234,
 672,
 118,
 449,
 50,
 64,
 373,
 17,
 535,
 310,
 446,
 182,
 62,
 69,
 21,
 669,
 39,
 594,
 215,
 177,
 315,
 25,
 349,
 479,
 61,
 564,
 425,
 16,
 306,
 431,
 160,
 55,
 328,
 813,
 276,
 295,
 185,
 140,
 324,
 83,
 612,
 71,
 269,
 38,
 447,
 698,
 134,
 686,
 668,
 451,
 348,
 18,
 724,
 596,
 645,
 420,
 460,
 619,
 453,
 325,
 267,
 588,
 419,
 242,
 311,
 241,
 11,
 337,
 12,
 92,
 2,
 323,
 712,
 68,
 695,
 424,
 691,
 404,
 287,
 88,
 232,
 156,
 755,
 890,
 221,
 711,
 126,
 627,
 706,
 308,
 615,
 109,
 607,
 839,
 33,
 305,
 390,
 153,
 458,
 378,
 1062,
 472,
 188,
 393,
 228,
 457,
 618,
 75,
 239,
 426,
 342,
 256,
 66,
 380,
 702,
 649,
 67,


In [86]:
ranked = R_hat_new_user_filtered.T.sort_values(by = ['new_user'], ascending=False).index.tolist()
ranked

[180,
 19,
 15,
 130,
 32,
 14,
 7,
 26,
 151,
 24,
 58,
 322,
 4,
 60,
 0,
 144,
 593,
 448,
 63,
 3,
 101,
 309,
 23,
 469,
 459,
 34,
 233,
 218,
 186,
 40,
 73,
 282,
 316,
 171,
 440,
 300,
 646,
 31,
 35,
 465,
 181,
 30,
 52,
 72,
 382,
 317,
 387,
 59,
 86,
 191,
 65,
 340,
 57,
 77,
 516,
 676,
 651,
 486,
 677,
 234,
 672,
 118,
 449,
 50,
 64,
 373,
 17,
 535,
 310,
 446,
 182,
 62,
 69,
 21,
 669,
 39,
 594,
 215,
 177,
 315,
 25,
 349,
 479,
 61,
 564,
 425,
 16,
 306,
 431,
 160,
 55,
 328,
 813,
 276,
 295,
 185,
 140,
 324,
 83,
 612,
 71,
 269,
 38,
 447,
 698,
 134,
 686,
 668,
 451,
 348,
 18,
 724,
 596,
 645,
 420,
 460,
 619,
 453,
 325,
 267,
 588,
 419,
 242,
 311,
 241,
 11,
 337,
 12,
 92,
 2,
 323,
 712,
 68,
 695,
 424,
 691,
 404,
 287,
 88,
 232,
 156,
 755,
 890,
 221,
 711,
 126,
 627,
 706,
 308,
 615,
 109,
 607,
 839,
 33,
 305,
 390,
 153,
 458,
 378,
 1062,
 472,
 188,
 393,
 228,
 457,
 618,
 75,
 239,
 426,
 342,
 256,
 66,
 380,
 702,
 649,
 67,


In [87]:
recommended = ranked[:3]
recommended


[180, 19, 15]

### Next steps: 
- convert the above to function 
- make it spit movieIds mapped to movie names

#### Example: Baseline Recommender

In [None]:
def recommend_popular(query, ratings, k=10):
    rated_movies = set(query.keys())
    recommendations = ratings[ratings.index.isin(rated_movies)].sort_values(by='rating', ascending=False).head(k)
    recommended_movies = movies[movies['movieId'].isin(recommendations.index)]
    return recommended_movies[['movieId', 'title']].values.tolist()


# Usage example:
user_query = {12: 5, 234: 1, 567: 4.5}
recommended_movies = recommend_popular(user_query, ratings, k=10)
recommended_movies

[[12, 'Dracula: Dead and Loving It (1995)'],
 [234, 'Exit to Eden (1994)'],
 [567, 'Kika (1993)']]

In [98]:
def recommend_nmf(query, model, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained NMF model. 
    Returns a list of k movie ids.
    """
    
    # 1. construct new_user-item dataframe given the query
    rated_movies = set(query.keys())
    recommendations = ratings[ratings.index.isin(rated_movies)].sort_values(by='rating', ascending=False).head(k)
    new_user_dataframe = pd.DataFrame(new_user_query,columns = df_R.columns, index = ['new user'])
    new_user_imputed = new_user_dataframe.fillna(df_R.mean())

    # 2. scoring
    # calculate the score with the NMF model
    P_new_user_matrix = model.transform(new_user_imputed)
    Q_matrix = model.components_
    R_hat_new_user_matrix = np.dot(P_new_user_matrix, Q_matrix)
    
    # 3. Ranking
    # Filter out movies already seen by the user
    R_hat_new_user_filtered = pd.DataFrame(R_hat_new_user_matrix, columns=df_R.columns)
    R_hat_new_user_filtered = R_hat_new_user_filtered.loc[:, ~R_hat_new_user_filtered.columns.isin(rated_movies)]
    
    # Return the top-k highest rated movie ids or titles
    recommended = R_hat_new_user_filtered.iloc[0].nlargest(k)
    recommended_movies = movies[movies['movieId'].isin(recommended.index)]

    return recommended_movies[['movieId', 'title']].values.tolist()

In [99]:
recommend_nmf(new_user_query, nmf_model, k=10)

violation: 1.0
violation: 0.7757178304053128
violation: 0.5165865599390655
violation: 0.49022209727680743
violation: 0.31458641248821617
violation: 0.31794329475469335
violation: 0.3244558136425374
violation: 0.3510086903472163
violation: 0.20361326289444706
violation: 0.24224904770868086
violation: 0.17772340294066696
violation: 0.19622136136672133
violation: 0.2714952176679524
violation: 0.2831052861466074
violation: 0.2312252353491389
violation: 0.12775399095265355
violation: 0.11545519045683797
violation: 0.13234600090460666
violation: 0.12986576112088788
violation: 0.13844027791809702
violation: 0.14484502598455992
violation: 0.14537237288548638
violation: 0.14002427119227384
violation: 0.1421460870489114
violation: 0.07559675519678313
violation: 0.0834809750917666
violation: 0.07422137590356492
violation: 0.06682559975385219
violation: 0.055672353808285495
violation: 0.049484463366625465
violation: 0.04436494930549194
violation: 0.04173962326745799
violation: 0.04099446481717805


[[7, 'Sabrina (1995)'],
 [14, 'Nixon (1995)'],
 [15, 'Cutthroat Island (1995)'],
 [19, 'Ace Ventura: When Nature Calls (1995)'],
 [24, 'Powder (1995)'],
 [26, 'Othello (1995)'],
 [32, 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)'],
 [151, 'Rob Roy (1995)'],
 [180, 'Mallrats (1995)']]