In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error

In [3]:
# Read the four input tables (user metadata, movie metadata, and the
# train/test rating files) from the local data/ directory, in that order.
users, movies, train, test = map(
    pd.read_csv,
    ('data/users.csv', 'data/movies.csv', 'data/train.csv', 'data/test.csv'),
)

In [17]:
# Show the first rows of every table; a single print call with a newline
# separator emits the same text as four consecutive print calls.
print(
    f'Users: \n{users.head()}\n',
    f'Movies: \n{movies.head()}\n',
    f'Train: \n{train.head()}\n',
    f'Test: \n{test.head()}',
    sep='\n',
)


Users: 
   uID gender  age  accupation    zip
0    1      F    1          10  48067
1    2      M   56          16  70072
2    3      M   25          15  55117
3    4      M   45           7  02460
4    5      M   25          20  55455

Movies: 
   mID                        title  year  Doc  Com  Hor  Adv  Wes  Dra  Ani  \
0    1                    Toy Story  1995    0    1    0    0    0    0    1   
1    2                      Jumanji  1995    0    0    0    1    0    0    0   
2    3             Grumpier Old Men  1995    0    1    0    0    0    0    0   
3    4            Waiting to Exhale  1995    0    1    0    0    0    1    0   
4    5  Father of the Bride Part II  1995    0    1    0    0    0    0    0   

   ...  Chi  Cri  Thr  Sci  Mys  Rom  Fil  Fan  Act  Mus  
0  ...    1    0    0    0    0    0    0    0    0    0  
1  ...    1    0    0    0    0    0    0    1    0    0  
2  ...    0    0    0    0    0    1    0    0    0    0  
3  ...    0    0    0    0    0    0 

Load the ratings data, use matrix factorization to predict the missing ratings in the test data, and measure the RMSE.

In [18]:
# Lookup tables from raw movie/user ID to a 0-based position that follows the
# row order of the `movies` and `users` tables.
allusers = users['uID'].tolist()
allmovies = movies['mID'].tolist()
mid2idx = {mid: col for col, mid in enumerate(movies.mID)}
uid2idx = {uid: row for row, uid in enumerate(users.uID)}

# Turn every training rating into a (row, column, value) triplet. An ID that
# is absent from the lookup tables raises KeyError.
ind_movie = list(map(mid2idx.__getitem__, train.mID))
ind_user = list(map(uid2idx.__getitem__, train.uID))
rating_train = train.rating.tolist()

# Dense users x movies matrix; pairs without a training rating are left at 0.
Mr = coo_matrix(
    (rating_train, (ind_user, ind_movie)),
    shape=(len(allusers), len(allmovies)),
).toarray()

In [19]:
# Display the dense training matrix (0 marks a user-movie pair with no training rating).
Mr

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0]])

In [20]:
# Dimensions of the training matrix: (number of users, number of movies).
Mr.shape

(6040, 3883)

In [21]:
# Fraction of (user, movie) cells that hold a training rating. This is the
# matrix density; the sparsity is 1 minus this value.
np.count_nonzero(Mr) / Mr.size

0.029852745794625237

In [22]:
# Factorize the training matrix as Mr ~= W @ H using 20 non-negative latent
# components.
# NOTE: Mr stores unrated (user, movie) pairs as 0, so the fit treats them as
# observed ratings of 0 rather than as missing values.
# random_state is fixed so that W, H and every metric derived from them are
# reproducible across kernel restarts.
model = NMF(n_components=20, random_state=0)
W = model.fit_transform(Mr)
H = model.components_



In [23]:
# W holds one row per user and one column per latent component.
W.shape

(6040, 20)

In [24]:
# H holds one row per latent component and one column per movie.
H.shape

(20, 3883)

In [25]:
# Map each test rating to its (row, column) position using the same ID
# lookup tables as the training matrix. Unknown IDs raise KeyError.
ind_movie_test = list(map(mid2idx.__getitem__, test.mID))
ind_user_test = list(map(uid2idx.__getitem__, test.uID))
rating_test = test.rating.tolist()

# Dense users x movies matrix; pairs without a test rating are left at 0.
Mr_test = coo_matrix(
    (rating_test, (ind_user_test, ind_movie_test)),
    shape=(len(allusers), len(allmovies)),
).toarray()


In [26]:
# Display the dense test matrix (0 marks a user-movie pair with no test rating).
Mr_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [27]:
# Dimensions of the test matrix: (number of users, number of movies).
Mr_test.shape

(6040, 3883)

In [28]:
# Fraction of (user, movie) cells that hold a test rating (matrix density).
np.count_nonzero(Mr_test) / Mr_test.size

0.012794052185362243

In [29]:
# Reconstruct the full users x movies matrix of predicted ratings from the
# NMF factors. (H.T @ W.T).T is algebraically W @ H; the direct product is
# easier to read and returns a contiguous array instead of a transposed view.
Mr_pred = W @ H

In [30]:
# Display the reconstructed matrix: one predicted value for every user-movie pair.
Mr_pred

array([[1.78561924e+00, 5.35822187e-01, 1.59740139e-02, ...,
        1.30735004e-02, 6.42771938e-03, 9.26652383e-02],
       [1.26557355e+00, 3.72001854e-01, 1.34497823e-01, ...,
        1.58572552e-02, 0.00000000e+00, 3.85520995e-02],
       [6.85354227e-01, 1.45091941e-01, 2.53994011e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [6.58618577e-01, 1.33013509e-02, 8.07949334e-04, ...,
        1.29707703e-03, 0.00000000e+00, 8.92649818e-05],
       [1.25687321e+00, 2.87928423e-01, 9.35396638e-02, ...,
        4.93460949e-02, 0.00000000e+00, 0.00000000e+00],
       [1.34773976e+00, 9.75702074e-02, 5.90561503e-03, ...,
        9.29470025e-02, 9.13006316e-02, 4.16696932e-01]])

In [31]:
# Score only the cells that hold a test rating. Mr_test uses 0 for "no rating",
# so its non-zero positions are exactly the held-out ratings.
# The index tuple is computed once and reused for both matrices. Indexing with
# it already returns 1-D arrays, so no flatten() is needed.
test_idx = Mr_test.nonzero()

# mean_squared_error expects (y_true, y_pred); MSE is symmetric, so the value
# is the same as with the arguments swapped, but the call now matches the API.
rmse = np.sqrt(mean_squared_error(Mr_test[test_idx], Mr_pred[test_idx]))


In [32]:
# Root-mean-squared error of the NMF predictions on the test ratings.
rmse

2.8624319319601716

Discuss the results and why they did not work well compared to the simple baseline or similarity-based methods we used in Module 3. Can you suggest ways to fix it?

- The resulting RMSE of 2.86 is poor: the simple baseline from the Week 3 assignment achieved an RMSE of about 1.26.
- NMF does not take the actual movie and user features into account; it relies purely on the user-movie rating matrix.
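- Also, NMF's default L2 (Frobenius) loss does not work well on a sparse matrix with many zeros: unrated entries are stored as 0 and fitted as if they were real ratings, which pulls the predictions toward 0.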
- Tuning `n_components` (e.g., with `GridSearchCV`) may improve performance, and using the KL-divergence loss (`beta_loss='kullback-leibler'` with `solver='mu'`) is better suited to data with many zeros.