In [13]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation, digits
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, csr_matrix
from pytest import approx
import itertools
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_columns', None)
color = ['#247BA0', '#F6511D', '#7FB800', '#FFB400', '#F25F5C', '#50514F']

1. Load the movie ratings data (as in the HW3-recommender-system) and use matrix factorization technique(s) and predict the missing ratings from the test data. Measure the RMSE. You should use sklearn library. [10 pts]

Make sure that your notebook includes the following:

use's sklearn's non-negative matrix factorization

notebook shows the RMSE with an analysis of what that RMSE means

In [6]:
MV_users = pd.read_csv('users.csv')
MV_movies = pd.read_csv('movies.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [7]:
from collections import namedtuple
Data = namedtuple('Data', ['users','movies','train','test'])
data = Data(MV_users, MV_movies, train, test)

In [18]:
class RecSys():
    def __init__(self,data):
        self.data=data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        self.mid2idx = dict(zip(self.data.movies.mID,list(range(len(self.data.movies)))))
        self.uid2idx = dict(zip(self.data.users.uID,list(range(len(self.data.users)))))
        self.Mr=self.rating_matrix()
        self.Mm=None 
        self.sim=np.zeros((len(self.allmovies),len(self.allmovies)))
        
    def rating_matrix(self):
        """
        Convert the rating matrix to numpy array of shape (#allusers,#allmovies)
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID] 
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(train.rating)
        return np.array(coo_matrix((rating_train, (ind_user, ind_movie)), shape=(len(self.allusers), len(self.allmovies))).toarray())
    
    def rmse(self,yp):
        yp[np.isnan(yp)]=3 #In case there is nan values in prediction, it will impute to 3.
        yt=np.array(self.data.test.rating)
        return np.sqrt(((yt-yp)**2).mean())

In [74]:
rs = RecSys(data)

Trying different combinations of attributes in the NMF model to find out the best model

In [75]:
best_init = ''
best_solver = ''
best_beta_loss = ''
best_rmse = 100

for init in ['nndsvd', 'nndsvda', 'nndsvdar']:
    for solver in ['cd', 'mu']:
        for beta_loss in ['frobenius', 'kullback-leibler', 'itakura-saito']:
            try:
                nmf = NMF(n_components = 5, random_state = 1, l1_ratio = 0.5,
                      init = init,
                      solver = solver,
                      beta_loss = beta_loss).fit(rs.Mr)
                nmf_matrix = nmf.transform(rs.Mr)
                nmf_component_matrix = nmf_model.components_
                rating_matrix = np.dot(nmf_matrix, nmf_component_matrix)
                nmf_prediction =[]
                for uid, mid in zip(rs.data.test['uID'], rs.data.test['mID']):
                    nmf_prediction.append(rating_matrix[rs.uid2idx[uid], rs.mid2idx[mid]])
                nmf_prediction = np.array(nmf_prediction)
                rmse = rs.rmse(nmf_prediction)
                print(init, solver, beta_loss, rmse)
                
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_init = init
                    best_solver = solver
                    best_beta_loss = beta_loss
            except:    
                pass

nndsvd cd frobenius 2.991422658477686
nndsvd mu frobenius 3.120610583675257
nndsvd mu kullback-leibler 3.1897507750557383
nndsvda cd frobenius 3.064032365162081
nndsvda mu frobenius 3.134147817903623
nndsvda mu kullback-leibler 3.2611967252752065
nndsvdar cd frobenius 2.9916708798576046
nndsvdar mu frobenius 3.0955019795549497
nndsvdar mu kullback-leibler 3.1790577003096097


In [76]:
nmf = NMF(n_components = 5, random_state = 1, l1_ratio = 0.5,
          init = best_init,
          solver = best_solver,
          beta_loss = best_beta_loss).fit(rs.Mr)
nmf_matrix = nmf.transform(rs.Mr)
nmf_component_matrix = nmf_model.components_
rating_matrix = np.dot(nmf_matrix, nmf_component_matrix)
nmf_prediction =[]
for uid, mid in zip(rs.data.test['uID'], rs.data.test['mID']):
    nmf_prediction.append(rating_matrix[rs.uid2idx[uid], rs.mid2idx[mid]])
nmf_prediction = np.array(nmf_prediction)
rmse = rs.rmse(nmf_prediction)
print(best_init, best_solver, best_beta_loss, ' leads the best performance.')
print('RMSE is ' +  str(rmse))

nndsvd cd frobenius  leads the best performance.
RMSE is 2.991422658477686


The rmse of the best model is 2.99, which is much worse than Jaccard and Cosines similarity, even worse than predicting everything to 3.

2. Discuss the results and why they did not work well compared to simple baseline or similarity-based methods weâ€™ve done in Module 3. Can you suggest a way(s) to fix it? [10 pts]

The poor result is because of there are too many missing ratings in the dataset, and we keep every missing rating as zero in the matrix. One way to fix it is to fill the 0s with 3s or mean ratings of movies first, and then use the NMD model to predict the result.