In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine 

In [2]:
# Load the four CSV inputs in one pass: user table, movie table, and the
# train / test rating splits. Paths are relative to the notebook's directory.
MV_users, MV_movies, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Files/data/users.csv',
        '../Files/data/movies.csv',
        '../Files/data/train.csv',
        '../Files/data/test.csv',
    )
)

In [3]:
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import NMF

class RecSys():
    """Recommender that factorises a dense user x movie rating matrix with NMF.

    `data` must expose four pandas DataFrames as attributes:

    * `users`  - one row per user, with a `uID` column;
    * `movies` - one row per movie, with `mID`, `title`, `year` columns; every
      remaining column is collected into `self.genres`;
    * `train` / `test` - one row per rating, with `uID`, `mID`, `rating` columns.
    """

    def __init__(self, data):
        """Build the id lookups and the training rating matrix from `data`."""
        self.data = data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        # Raw id -> row (user) / column (movie) position in the rating matrix.
        self.mid2idx = dict(zip(self.data.movies.mID, range(len(self.data.movies))))
        self.uid2idx = dict(zip(self.data.users.uID, range(len(self.data.users))))
        self.Mr = self.rating_matrix()
        # Mm and sim are not read by any method of this class; they are kept so
        # that code relying on these attributes keeps working.
        self.Mm = None
        self.sim = np.zeros((len(self.allmovies), len(self.allmovies)))
        # Filled in by fit(); None means "not fitted yet".
        self.W = None
        self.H = None
        self.generated_rankings = None

    def rating_matrix(self):
        """
        Convert the training ratings to a numpy array of shape (#allusers, #allmovies).

        Cells without a training rating are 0. Raises KeyError when a training
        row references a uID / mID missing from the users / movies tables.
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)

        sparse_ratings = coo_matrix(
            (rating_train, (ind_user, ind_movie)),
            shape=(len(self.allusers), len(self.allmovies)),
        )
        return sparse_ratings.toarray()

    def fit(self, **kwargs):
        """Factorise `self.Mr` with NMF and store W, H and their product.

        Args:
            **kwargs: forwarded unchanged to the `NMF` constructor.

        Returns:
            self, so that calls can be chained (`RecSys(data).fit().predict()`).
        """
        model = NMF(**kwargs)
        self.W = model.fit_transform(self.Mr)
        self.H = model.components_
        # Dense reconstruction: one predicted score per (user, movie) pair.
        self.generated_rankings = np.dot(self.W, self.H)
        return self

    def predict(self):
        """Return the reconstructed score for every row of the test set.

        Rows are read positionally, in the same order `rmse` reads the true
        ratings, so the result lines up with `data.test` whatever its index is.

        Returns:
            1-D numpy array with one prediction per test row.

        Raises:
            RuntimeError: if `fit` has not been called yet.
            KeyError: if a test row references an unknown uID / mID.
        """
        if self.generated_rankings is None:
            raise RuntimeError("fit() must be called before predict()")
        test = self.data.test
        n_rows = len(test)
        user_rows = np.fromiter(
            (self.uid2idx[uid] for uid in test.uID), dtype=np.intp, count=n_rows
        )
        movie_cols = np.fromiter(
            (self.mid2idx[mid] for mid in test.mID), dtype=np.intp, count=n_rows
        )
        # Paired integer indexing picks element (user_rows[k], movie_cols[k])
        # for each k and returns a fresh array.
        return self.generated_rankings[user_rows, movie_cols]

    def rmse(self, yp, fill_value=3):
        """Root mean squared error of predictions `yp` against the test ratings.

        Args:
            yp: array-like of predictions, one per test row, in test-row order.
            fill_value: value substituted for NaN predictions (default 3).

        Returns:
            The RMSE as a float. `yp` itself is left unmodified.
        """
        # Work on a float copy so the caller's array is not overwritten.
        yp = np.array(yp, dtype=float)
        yp[np.isnan(yp)] = fill_value
        yt = np.asarray(self.data.test.rating, dtype=float)
        return np.sqrt(((yt - yp) ** 2).mean())



In [4]:
from collections import namedtuple
# Lightweight record bundling the four frames that RecSys reads as
# data.users / data.movies / data.train / data.test.
Data = namedtuple(
    'Data',
    ['users', 'movies', 'train', 'test'],
)
data = Data(users=MV_users, movies=MV_movies, train=train, test=test)

In [5]:
# Small subset for quick experiments: the first 300 train and test ratings,
# plus only those users / movies that appear in either slice.
np.random.seed(42)
sample_train = train.iloc[:300]
sample_test = test.iloc[:300]
sample_MV_users = MV_users.loc[
    MV_users.uID.isin(pd.concat([sample_train.uID, sample_test.uID]))
]
sample_MV_movies = MV_movies.loc[
    MV_movies.mID.isin(pd.concat([sample_train.mID, sample_test.mID]))
]
sample_data = Data(
    users=sample_MV_users,
    movies=sample_MV_movies,
    train=sample_train,
    test=sample_test,
)

In [7]:
# Fit NMF with its default settings on the full training matrix, then score
# the reconstructed ratings on the test split.
rs = RecSys(data)
rs.fit()  # stores the factorisation on `rs`; predictions come from predict()
yp = rs.predict()

print(rs.rmse(yp))



3.0387445821997803


|Method|RMSE|
|:----|:--------:|
|Baseline, $Y_p$=3| 1.2585510334053043 |
|Baseline, $Y_p=\mu_u$| 1.0352910334228647 |
|Content based, item-item| 1.0128116783754684 |
|Collaborative, cosine| 1.0301524420757868 |
|Collaborative, jaccard, $M_r\geq 3$| 0.9819058692126349 |
|Collaborative, jaccard, $M_r\geq 1$| 0.991363571262366 |
|Collaborative, jaccard, $M_r$| 0.9509126236828654 |
|NMF| 3.0387445821997803 |

# Discussion

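We can definitely see that NMF performs very poorly on this dataset: its RMSE of about 3.04 is more than twice that of even the constant baseline ($Y_p=3$, RMSE of about 1.26).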

RMSE, which stands for Root Mean Squared Error, is a widely used metric for assessing the performance of predictive models, particularly in regression and recommendation systems. This metric quantifies the average magnitude of errors between the predicted values and the actual observed values.

- A lower RMSE indicates that the predicted values are closer to the actual values, reflecting better model accuracy.
- A higher RMSE signifies greater discrepancies between the predicted values and the actual values, pointing to poorer model performance.

When dealing with sparse datasets, NMF may struggle to identify meaningful patterns, potentially resulting in a higher RMSE due to a lack of sufficient information. Additionally, NMF can be sensitive to the initialization of matrices; inappropriate initial values may cause the algorithm to converge to a local minimum, leading to suboptimal outcomes.

In scenarios involving sparse datasets, collaborative filtering methods, such as those based on Jaccard similarity, may be more effective in handling sparsity by utilizing more insightful user-item interactions.

NMF struggles in this scenario because it interprets the unknown ratings as zeros within the matrix being factorized ($X$ in the approximation $X \approx WH$). By default, the loss is the Frobenius norm of $X - WH$, and assigning a value of zero where the actual value should be between 1 and 5 skews the gradient toward reproducing those zeros. This results in inaccurate updates to the matrices $W$ and $H$.

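I recommend starting by altering the loss function to Kullback-Leibler divergence (in scikit-learn, `beta_loss='kullback-leibler'` together with `solver='mu'`, which can be passed through `rs.fit(...)`), as this approach proved effective in the BBC assignment.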