In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF

####

In [44]:
def ratings_matrix(path="movies/ratings.csv"):
    """Load a ratings CSV and return it as a dense item-by-user matrix.

    The CSV must provide the columns ``movieId``, ``userId`` and ``rating``.
    Each distinct ``movieId`` becomes a row and each distinct ``userId`` a
    column; (movie, user) pairs without a rating are filled with 0.

    :param path: location of the ratings CSV; defaults to the file used by
        this notebook so existing zero-argument calls keep working.
    :return: 2-D ``numpy.ndarray`` of shape (n_movies, n_users).
    :raises ValueError: propagated from ``DataFrame.pivot`` when the same
        (movieId, userId) pair appears more than once.
    """
    ratings = pd.read_csv(path)
    pivoted = ratings.pivot(index='movieId', columns='userId', values='rating')
    # Missing (movie, user) pairs come out of the pivot as NaN; the
    # factorisation below expects plain numbers, so they are mapped to 0.
    return np.nan_to_num(pivoted.to_numpy())

In [45]:
def books_data_matrix():
    """Return the Book-Crossing training interactions as a dense, transposed matrix.

    A ``BookCrossingReader`` is built with the positional arguments
    (0.8, 0.9); its ``URM_train`` sparse matrix is densified and transposed
    before being returned.
    """
    reader = BookCrossingReader(0.8, 0.9)
    dense_train = reader.URM_train.todense()
    return dense_train.T

In [46]:
# Matrix to factorise, built from movies/ratings.csv (rows: movieId, columns: userId).
X = ratings_matrix()
# Alternative input: swap in the Book-Crossing training matrix instead.
# X = books_data_matrix()

In [47]:
# Sanity check on the size of the matrix that will be factorised.
X.shape

(9724, 610)

In [48]:
# Dimensions of the matrix being factorised: h rows by w columns.
h, w = X.shape
# Inner (latent) dimension shared by the two factors: W is h x inner, H is inner x w.
inner = 50

In [49]:
# Seeded generator so that Restart & Run All reproduces the same factorisation;
# change the seed to explore a different random start.
rng = np.random.default_rng(42)
# Random starting factors drawn uniformly from [0, 1): W is (h, inner), H is (inner, w).
W = rng.random((h, inner))
H = rng.random((inner, w))

In [50]:
def cost(X, W, H):
    """Mean squared reconstruction error between ``X`` and ``W @ H``.

    :param X: target matrix, shape (h, w); ``numpy.ndarray`` or ``numpy.matrix``.
    :param W: left factor, shape (h, inner).
    :param H: right factor, shape (inner, w).
    :return: scalar mean of the element-wise squared residuals.
    """
    # np.asarray strips any np.matrix subclass (e.g. the result of a sparse
    # ``.todense()``): on a matrix, ``**2`` means matrix power, not an
    # element-wise square, and fails outright for non-square input.
    residual = np.asarray(X) - np.asarray(np.dot(W, H))
    return np.mean(np.square(residual))

In [51]:
def update(X, W, H, eps=1e-9):
    """Apply one multiplicative update to ``W`` while ``H`` is held fixed.

    Computes ``W * (X H^T) / (W H H^T)`` with element-wise multiply and
    divide.

    :param X: matrix being factorised, shape (h, w).
    :param W: factor to refresh, shape (h, inner).
    :param H: fixed factor, shape (inner, w).
    :param eps: lower bound applied to the denominator. Without it, an
        all-zero row of ``X`` drives the matching row of ``W`` to zero and the
        next sweep evaluates 0/0, filling the factors with NaN. Denominators
        already above ``eps`` are left untouched.
    :return: the updated ``W``, same shape as the input ``W``.
    """
    numerator = np.dot(X, H.T)
    denominator = np.dot(W, H).dot(H.T)
    ratio = np.divide(numerator, np.maximum(denominator, eps))
    return np.multiply(W, ratio)

In [52]:
def optimize(X, W, H):
    """Run one alternating sweep and report the resulting cost.

    ``W`` is refreshed first with ``H`` fixed; ``H`` is then refreshed against
    the new ``W`` by applying the same rule to the transposed problem.

    :return: tuple ``(new_W, new_H, cost_value)``.
    """
    new_W = update(X, W, H)
    # X.T is approximated by H.T @ W.T, so the rule for W also updates H.T.
    new_H = update(X.T, H.T, new_W.T).T
    return new_W, new_H, cost(X, new_W, new_H)

In [53]:
# 150 alternating sweeps; the cost is reported on every even-numbered sweep.
for i in range(150):
    W, H, c = optimize(X, W, H)
    if i % 2:
        continue
    print("Cost value: {}".format(c))

Cost value: 0.18074025399936222
Cost value: 0.17610526900973308
Cost value: 0.17185425745054927
Cost value: 0.1615174552479625
Cost value: 0.14856125188690025
Cost value: 0.13883902656726022
Cost value: 0.1318997053858367
Cost value: 0.12678966185222912
Cost value: 0.12274740109037288
Cost value: 0.11944747501367592
Cost value: 0.11671920078727503
Cost value: 0.11447673636257662
Cost value: 0.1126700276407033
Cost value: 0.11120495841739103
Cost value: 0.11000296559563953
Cost value: 0.10902132891435233
Cost value: 0.10821357968533783
Cost value: 0.10755169283572853
Cost value: 0.10700753472636483
Cost value: 0.10655648759970054
Cost value: 0.10617913873823656
Cost value: 0.10585980097856139
Cost value: 0.10558364781571275
Cost value: 0.10534254491247996
Cost value: 0.10513174715470498
Cost value: 0.10494840672504255
Cost value: 0.10478734344909614
Cost value: 0.10464473639996724
Cost value: 0.10452003074241333
Cost value: 0.10440985449447059
Cost value: 0.10431030402364942
Cost value:

In [54]:
# Keep only two decimals in each learned factor before rebuilding the matrix.
W, H = np.round(W, 2), np.round(H, 2)

In [55]:
# Reconstruction of X from the two (rounded) factors.
Xr = W.dot(H)

In [59]:
# Reconstruction rounded to whole numbers, for eyeballing against the original X.
np.round(Xr, 0)

array([[3., 0., 0., ..., 2., 1., 5.],
       [1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [60]:
# Original matrix, shown for side-by-side comparison with the reconstruction.
X

array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [3]:
import numpy as np
import scipy.sparse as sps
import os
import random
import sys
import pandas as pd


class BookCrossingReader(object):
    """Read the Book-Crossing ratings dump and split it into sparse matrices.

    After construction the instance exposes:

    * ``users``, ``movies``, ``ratings`` - parallel 1-D arrays, one entry per
      row of the CSV (``movies`` holds the factorised ISBN codes);
    * ``unique_users``, ``unique_movies`` - sorted distinct codes;
    * ``URM_all`` - every interaction as a COO matrix (users x items);
    * ``URM_train``, ``URM_test`` - CSR matrices restricted to the first
      1000 user rows;
    * ``URM_validation`` - CSR matrix, only when ``train_validation_split``
      is given (not restricted to the first 1000 rows).

    All split matrices share the column count of ``URM_all``.
    """

    # TODO: add a proper validation option (the split value is currently only a flag).
    def __init__(self, train_test_split=None, train_validation_split=None, delete_popular=None, top_popular_threshold=0.33):
        '''
        :param train_test_split: probability with which each interaction is
            assigned to the training set; required.
        :param train_validation_split: when not None, the held-out
            interactions are divided at random, half to test and half to
            validation. The numeric value itself is not used as a proportion.
        :param delete_popular: when truthy, keep only the interactions of the
            least popular items.
        :param top_popular_threshold: fraction of items, counted from the most
            popular, that is discarded when ``delete_popular`` is set.
        :raises ValueError: if ``train_test_split`` is None.
        '''
        # Fail early with a clear message; previously a missing split crashed
        # later on ``1 - None`` before the intended error could be raised.
        if train_test_split is None:
            raise ValueError("train_test_split is required: pass the probability of assigning an interaction to the training set")

        filename = os.path.join('BX-CSV-Dump', 'BX-Book-Ratings.csv')
        ratings_df = pd.read_csv(filename, sep=";", encoding="ISO-8859-1")

        # Map raw ISBNs / user ids to consecutive integer codes starting at 0.
        item_codes, _ = pd.factorize(ratings_df['ISBN'])
        user_codes, _ = pd.factorize(ratings_df['User-ID'])

        # Parallel arrays, one entry per CSV row, in file order.
        self.users = np.asarray(user_codes).astype(int)
        self.movies = np.asarray(item_codes).astype(int)
        self.ratings = np.asarray(ratings_df['Book-Rating']).astype(float)

        if delete_popular:
            items, counts = np.unique(self.movies, return_counts=True)
            # Stable sort by interaction count, least popular first.
            by_popularity = items[np.argsort(counts, kind="stable")]
            cutting_index = round(len(by_popularity) * (1 - top_popular_threshold))
            least_popular_items = by_popularity[:cutting_index]

            # Vectorised membership test instead of a per-row list lookup.
            keep_mask = np.isin(self.movies, least_popular_items)

            self.movies = self.movies[keep_mask]
            self.users = self.users[keep_mask]
            self.ratings = self.ratings[keep_mask]

        self.unique_movies = np.unique(self.movies).astype(int)
        self.unique_users = np.unique(self.users)

        URM_all_partial = sps.csr_matrix((self.ratings, (self.users, self.movies)), dtype=np.float32)
        self.URM_all = URM_all_partial.tocoo()
        full_shape = self.URM_all.shape

        # The masks index the raw arrays, so they are sized from those arrays.
        # URM_all.nnz can be smaller because duplicate (user, item) pairs are
        # summed when the sparse matrix is built.
        num_interactions = len(self.ratings)

        train_mask = np.random.choice([True, False], num_interactions, p=[train_test_split, 1 - train_test_split])
        held_out_mask = np.logical_not(train_mask)

        if train_validation_split is not None:
            # Each held-out interaction goes to exactly one of test/validation,
            # so the two sets never overlap.
            goes_to_test = np.random.choice([True, False], num_interactions)
            test_mask = np.logical_and(held_out_mask, goes_to_test)
            validation_mask = np.logical_and(held_out_mask, np.logical_not(goes_to_test))

            # NOTE(review): unlike train/test, validation is not cut to the
            # first 1000 users - confirm whether that is intended.
            self.URM_validation = sps.csr_matrix(
                (self.ratings[validation_mask], (self.users[validation_mask], self.movies[validation_mask])),
                shape=full_shape)
        else:
            test_mask = held_out_mask

        # An explicit shape keeps every split aligned with URM_all and lets an
        # empty split be built without error.
        self.URM_test = sps.csr_matrix(
            (self.ratings[test_mask], (self.users[test_mask], self.movies[test_mask])),
            shape=full_shape)
        print(self.URM_test.nnz)
        print(self.URM_test.shape[0])
        # Only the first 1000 user rows are kept for train and test.
        self.URM_test = self.URM_test[0:1000, :]
        print(self.URM_test.nnz)

        self.URM_train = sps.csr_matrix(
            (self.ratings[train_mask], (self.users[train_mask], self.movies[train_mask])),
            shape=full_shape)
        self.URM_train = self.URM_train[0:1000, :]