In [3]:
import sys
import pandas as pd

import dask.array as da
import dask.dataframe as dd

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 

In [4]:
class MF(object):
    """docstring for CF"""
    def __init__(self, Y_data, K, lam = 0.1, Xinit = None, Winit = None, 
            learning_rate = 0.5, max_iter = 1000, print_every = 100, user_based = 1):
        self.Y_raw_data = Y_data
        self.K = K
        # regularization parameter
        self.lam = lam
        # learning rate for gradient descent
        self.learning_rate = learning_rate
        # maximum number of iterations
        self.max_iter = max_iter
        # print results after print_every iterations
        self.print_every = print_every
        # user-based or item-based
        self.user_based = user_based
        # number of users, items, and ratings. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(Y_data[:, 0])) + 1 
        self.n_items = int(np.max(Y_data[:, 1])) + 1
        self.n_ratings = Y_data.shape[0]
        
        if Xinit is None: # new
            self.X = np.random.randn(self.n_items, K)
        else: # or from saved data
            self.X = Xinit 
        
        if Winit is None: 
            self.W = np.random.randn(K, self.n_users)
        else: # from daved data
            self.W = Winit
            
        # normalized data, update later in normalized_Y function
        self.Y_data_n = self.Y_raw_data.copy()


    def normalize_Y(self):
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.n_users

        # if we want to normalize based on item, just switch first two columns of data
        else: # item bas
            user_col = 1
            item_col = 0 
            n_objects = self.n_items

        users = self.Y_raw_data[:, user_col] 
        self.mu = np.zeros((n_objects,))
        for n in range(n_objects):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data_n[ids, item_col] 
            # and the corresponding ratings 
            ratings = self.Y_data_n[ids, 2]
            # take mean
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Y_data_n[ids, 2] = ratings - self.mu[n]
    
    #Loss function
    def loss(self):
        L = 0 
        for i in range(self.n_ratings):
            # user, item, rating
            n, m, rate = int(self.Y_data_n[i, 0]), int(self.Y_data_n[i, 1]), self.Y_data_n[i, 2]
            L += 0.5*(rate - self.X[m, :].dot(self.W[:, n]))**2
        
        # take average
        L /= self.n_ratings
        # regularization, don't ever forget this 
        L += 0.5*self.lam*(np.linalg.norm(self.X, 'fro') + np.linalg.norm(self.W, 'fro'))
        return L 
    
    
    #Xác định các items được đánh giá bởi 1 user, và users đã đánh giá 1 item và các ratings tương ứng:
    def get_items_rated_by_user(self, user_id):
        """
        get all items which are rated by user user_id, and the corresponding ratings
        """
        ids = np.where(self.Y_data_n[:,0] == user_id)[0] 
        item_ids = self.Y_data_n[ids, 1].astype(np.int32) # indices need to be integers
        ratings = self.Y_data_n[ids, 2]
        return (item_ids, ratings)
    
    def get_users_who_rate_item(self, item_id):
        """
        get all users who rated item item_id and get the corresponding ratings
        """
        ids = np.where(self.Y_data_n[:,1] == item_id)[0] 
        user_ids = self.Y_data_n[ids, 0].astype(np.int32)
        ratings = self.Y_data_n[ids, 2]
        return (user_ids, ratings)
    
    def get_users_who_rate_item(self, item_id):
        """
        get all users who rated item item_id and get the corresponding ratings
        """
        ids = np.where(self.Y_data_n[:,1] == item_id)[0] 
        user_ids = self.Y_data_n[ids, 0].astype(np.int32)
        ratings = self.Y_data_n[ids, 2]
        return (user_ids, ratings)
    
    #Cập nhật X, W
    def updateX(self):
        for m in range(self.n_items):
            user_ids, ratings = self.get_users_who_rate_item(m)
            Wm = self.W[:, user_ids]
            # gradient
            grad_xm = -(ratings - self.X[m, :].dot(Wm)).dot(Wm.T)/self.n_ratings + \
                                               self.lam*self.X[m, :]
            self.X[m, :] -= self.learning_rate*grad_xm.reshape((self.K,))
            
            
    def updateW(self):
        for n in range(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(n)
            Xn = self.X[item_ids, :]
            # gradient
            grad_wn = -Xn.T.dot(ratings - Xn.dot(self.W[:, n]))/self.n_ratings + \
                        self.lam*self.W[:, n]
            self.W[:, n] -= self.learning_rate*grad_wn.reshape((self.K,))
            
            
    def fit(self):
        self.normalize_Y()
        for it in range(self.max_iter):
            self.updateX()
            self.updateW()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y_raw_data)
                print('iter =', it + 1, ', loss =', self.loss(), ', RMSE train =', rmse_train)
                
                
    def pred(self, u, i):
        """ 
        predict the rating of user u for item i 
        if you need the un
        """
        u = int(u)
        i = int(i)
        if self.user_based:
            bias = self.mu[u]
        else: 
            bias = self.mu[i]
        pred = self.X[i, :].dot(self.W[:, u]) + bias 
        # truncate if results are out of range [0, 5]
        if pred < 0:
            return 0 
        if pred > 5: 
            return 5 
        return pred 
    
    
    def pred_for_user(self, user_id):
        """
        predict ratings one user give all unrated items
        """
        ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
        items_rated_by_u = self.Y_data_n[ids, 1].tolist()              
        
        y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id]
        predicted_ratings= []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((i, y_pred[i]))
        
        return predicted_ratings
    
    
    # Đánh giá kết quả bằng cách đo Root Mean Square Error:
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0]
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        return RMSE
    
    
    

In [6]:
%%time
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('/home/suv/Documents/dask/dask/DASK/ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('/home/suv/Documents/dask/dask/DASK/ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

CPU times: user 16.5 ms, sys: 8.76 ms, total: 25.3 ms
Wall time: 36.4 ms


  
  import sys


In [15]:
%%time
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base1 = dd.read_csv('/home/suv/Documents/dask/dask/DASK/ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test2 = dd.read_csv('/home/suv/Documents/dask/dask/DASK/ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

CPU times: user 12.3 ms, sys: 5.3 ms, total: 17.6 ms
Wall time: 18.2 ms


####  Loss function 

In [8]:
import numpy as np 

In [9]:
%%time
rs = MF(rate_train, K = 10, lam = .1, print_every = 10, 
    learning_rate = 0.75, max_iter = 100, user_based = 1)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print('\nUser-based MF, RMSE =', RMSE)

iter = 10 , loss = 5.653604351845312 , RMSE train = 1.2071831319091437
iter = 20 , loss = 2.640334859902972 , RMSE train = 1.0383224360028536
iter = 30 , loss = 1.3437243767962102 , RMSE train = 1.029636283648093
iter = 40 , loss = 0.7529904193809682 , RMSE train = 1.0292351430055953
iter = 50 , loss = 0.4822789981461216 , RMSE train = 1.0292153639518722
iter = 60 , loss = 0.3581467610695107 , RMSE train = 1.0292143288095914
iter = 70 , loss = 0.30122295306465086 , RMSE train = 1.0292143057863021
iter = 80 , loss = 0.2751188670563678 , RMSE train = 1.0292143214347151
iter = 90 , loss = 0.26314803216778476 , RMSE train = 1.0292143281301387
iter = 100 , loss = 0.25765842859321425 , RMSE train = 1.0292143302141175

User-based MF, RMSE = 1.0603798998837122
CPU times: user 43.3 s, sys: 1.14 s, total: 44.5 s
Wall time: 42.2 s


In [10]:
%%time
rs = MF(rate_train, K = 10, lam = .1, print_every = 10, learning_rate = 0.75, max_iter = 100, user_based = 0)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print('\nItem-based MF, RMSE =', RMSE)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


iter = 10 , loss = 5.632585979560632 , RMSE train = 1.1808680117705348
iter = 20 , loss = 2.619307174658946 , RMSE train = 1.0057083688793385
iter = 30 , loss = 1.3248813334941383 , RMSE train = 0.9965955871250328
iter = 40 , loss = 0.7351822408286656 , RMSE train = 0.9961959094556818
iter = 50 , loss = 0.46493279152013245 , RMSE train = 0.9961805191743438
iter = 60 , loss = 0.3410085424559327 , RMSE train = 0.9961805933198813
iter = 70 , loss = 0.2841790932916618 , RMSE train = 0.9961808130322629
iter = 80 , loss = 0.2581179845392985 , RMSE train = 0.9961808803965905
iter = 90 , loss = 0.24616676082866285 , RMSE train = 0.9961808980492374
iter = 100 , loss = 0.24068611384626598 , RMSE train = 0.9961809024518246

Item-based MF, RMSE = 1.0498047573574536
CPU times: user 43.6 s, sys: 1.58 s, total: 45.2 s
Wall time: 43 s


In [12]:
rate_test

array([[        0,        16,         3, 875073198],
       [        0,        46,         4, 875072125],
       [        0,        63,         5, 875072404],
       ...,
       [      942,       594,         2, 875502597],
       [      942,       684,         4, 875502042],
       [      942,      1010,         2, 875502560]])

In [11]:
%%time
rs = MF(rate_train, K = 2, lam = 0, print_every = 10, learning_rate = 1, max_iter = 100, user_based = 0)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print('\nItem-based MF, RMSE =', RMSE)

iter = 10 , loss = 1.1904389337747212 , RMSE train = 1.498904003693602
iter = 20 , loss = 1.122899659665539 , RMSE train = 1.4775190397284794
iter = 30 , loss = 1.0628468882473376 , RMSE train = 1.4574931785064922
iter = 40 , loss = 1.0091839082518004 , RMSE train = 1.4386970730205273
iter = 50 , loss = 0.9610100141979995 , RMSE train = 1.4210351247975737
iter = 60 , loss = 0.9175796675508457 , RMSE train = 1.4044660788100862
iter = 70 , loss = 0.8782712907882321 , RMSE train = 1.38883428080351
iter = 80 , loss = 0.8425631747433698 , RMSE train = 1.3740697179394024
iter = 90 , loss = 0.8100146992102124 , RMSE train = 1.3601609709177591
iter = 100 , loss = 0.7802515646751954 , RMSE train = 1.3470493257692813

Item-based MF, RMSE = 1.425823682236154
CPU times: user 42.2 s, sys: 1.48 ms, total: 42.2 s
Wall time: 42.2 s


#### ===================================================================================================

####  1M

###### read csv with dd

In [14]:
%%time 
r_cols1 = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base1= dd.read_csv('/home/suv/Documents/dask/dask/DASK/ml-1m/ratings.dat', sep='::', names=r_cols, encoding='latin-1')

CPU times: user 46 ms, sys: 53 µs, total: 46 ms
Wall time: 46.3 ms


####  20M

In [27]:
%%time 
r_cols20 = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base20 = dd.read_csv('/home/suv/Documents/dask/dask/DASK/ml-20m/rating.csv')

CPU times: user 15.9 ms, sys: 0 ns, total: 15.9 ms
Wall time: 15.6 ms


In [18]:
%time
ratings_base20.count()

Dask Series Structure:
npartitions=1
movieId    int64
userId       ...
dtype: int64
Dask Name: dataframe-count-agg, 45 tasks