In [1]:
import pandas as pd
# import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from scipy.sparse import dok_matrix

from sklearn.decomposition import NMF

from sklearn.metrics import mean_squared_error

In [2]:
# Read the four CSV tables (users, movies, train ratings, test ratings),
# in that order, and bind each resulting DataFrame to its own name.
users, movies, train, test = map(
    pd.read_csv,
    (
        'data/movie-ratings-data/users.csv',
        'data/movie-ratings-data/movies.csv',
        'data/movie-ratings-data/train.csv',
        'data/movie-ratings-data/test.csv',
    ),
)

Map user and movie IDs to row/column indices of the rating matrix

In [3]:
# Map every distinct user / movie ID to a dense 0-based position.
# Enumerating the unique IDs (instead of reusing the DataFrame index labels
# from iterrows()) guarantees positions in range(nunique()), even if the
# frames carry a non-default index or repeated IDs.
uid2idx = {uid: pos for pos, uid in enumerate(users['uID'].unique().tolist())}
mid2idx = {mid: pos for pos, mid in enumerate(movies['mID'].unique().tolist())}

user-movie rating matrix

In [4]:
# Sparse user x movie matrix holding the training ratings; pairs without a
# training rating are simply not stored.
num_users = users['uID'].nunique()
num_movies = movies['mID'].nunique()
rating_matrix_train = dok_matrix((num_users, num_movies), dtype=np.float32)

# Iterate the three columns directly rather than with iterrows(): iterrows()
# builds a Series per row and upcasts the values to a common dtype, which is
# both slow and turns integer IDs into floats.
for user_id, movie_id, rating in zip(train['uID'], train['mID'], train['rating']):
    rating_matrix_train[uid2idx[user_id], mid2idx[movie_id]] = rating

NMF model

In [5]:
# Factorise the training rating matrix as W @ H with 20 latent components.
# NOTE: NMF treats every entry that is not stored in the sparse matrix as an
# actual 0, not as a missing value.
nmf_model = NMF(n_components=20, init='random', random_state=0)

# fit_transform learns H and returns the matching W in a single solve, instead
# of fitting and then re-solving for W with a separate transform() call.
W = nmf_model.fit_transform(rating_matrix_train)
H = nmf_model.components_

# Dense reconstruction: one predicted rating for every (user, movie) pair.
rating_matrix_train_pred = np.dot(W, H)

In [6]:
# Attach the reconstructed rating to every training row.
train_pred = train.copy()

# Resolve the matrix position of each row once (a KeyError still signals an
# unknown ID), then fetch all predictions with a single fancy-indexing lookup
# instead of one .loc assignment per row.
user_idx = np.array([uid2idx[uid] for uid in train_pred['uID']], dtype=np.intp)
movie_idx = np.array([mid2idx[mid] for mid in train_pred['mID']], dtype=np.intp)

# Stored as float64, the dtype the column had when it was initialised with NaN.
train_pred['rating_pred'] = rating_matrix_train_pred[user_idx, movie_idx].astype(np.float64)

In [7]:
# Clip predictions to the valid 1-5 rating range
# Bound predictions to the interval [1, 5], then round to the nearest whole rating.
clipped_pred = train_pred['rating_pred'].clip(lower=1., upper=5.)
train_pred['rating_pred'] = clipped_pred.round()

In [9]:
# RMSE on the training ratings. The `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
# this works on every scikit-learn version.
float(np.sqrt(mean_squared_error(train_pred['rating'], train_pred['rating_pred'])))

2.501832517453663

test data

In [10]:
# Evaluate the reconstruction on the held-out test ratings.
test_pred = test.copy()

# Resolve the matrix position of each row once (a KeyError still signals an
# unknown ID), then fetch all predictions with a single fancy-indexing lookup
# instead of one .loc assignment per row.
user_idx = np.array([uid2idx[uid] for uid in test_pred['uID']], dtype=np.intp)
movie_idx = np.array([mid2idx[mid] for mid in test_pred['mID']], dtype=np.intp)
raw_pred = pd.Series(
    rating_matrix_train_pred[user_idx, movie_idx].astype(np.float64),
    index=test_pred.index,
)

# Bound predictions to the interval [1, 5], then round to the nearest whole rating.
test_pred['rating_pred'] = raw_pred.clip(lower=1., upper=5.).round()

# RMSE on the test ratings. The `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly.
float(np.sqrt(mean_squared_error(test_pred['rating'], test_pred['rating_pred'])))

2.5597143169663665