In [1]:
from module import *

In [2]:
class MatrixFactorizationCF(object):
    def __init__(self, data, K, lam = 0.1, Xinit = None, Winit = None, learning_rate = 0.5, max_iter = 1000, print_every = 100, user_based = 1):
        
        # data: utility matrix
        self.data = data

        # K: number of latent features
        self.K = K

        # lam: regularization parameter
        self.lam = lam

        # learning_rate: alpha
        self.learning_rate = learning_rate

        # max_iter: number of iterations
        self.max_iter = max_iter

        # print_every: print loss every print_every iterations
        self.print_every = print_every

        # user_based: 1 if we are working with user-based CF, 0 if item-based
        self.user_based = user_based

        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(data[:, 0])) + 1
        self.n_items = int(np.max(data[:, 1])) + 1

        # number of ratings
        self.n_ratings = data.shape[0]

        # Xinit: item matrix initialization with row = number of items, col = latent features
        if Xinit is None:
            self.X = np.random.randn(self.n_items, K)
        else:
            self.X = Xinit

        # Winit: user matrix initialization with col = number of users, row = latent features
        if Winit is None:
            self.W = np.random.randn(K, self.n_users)
        else:
            self.W = Winit

        # copied data
        self.data_n = data.copy()

    def normalize(self):
        # if we are working with user-based CF
        if self.user_based:
            user_col = 0
            item_col = 1
            n_object = self.n_users
        
        # if we are working with item-based CF
        else:
            user_col = 1
            item_col = 0
            n_object = self.n_items

        users = self.data[:, user_col]
        # init mean matrix
        self.mu = np.zeros((n_object,))
        
        for n in range(n_object):
            # get all ratings of user n
            ids = np.where(users == n)[0].astype(int)
            item_ids = self.data[ids, item_col]
            ratings = self.data[ids, 2]

            # calculate mean
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[n] = m

            # normalize
            self.data_n[ids, 2] = ratings - self.mu[n]
        return self.data_n

    def loss(self):
        L = 0
        # for each rating in data
        for i in range(self.n_ratings):
            # get user, item, rating
            n, m, _rating = int(self.data_n[i, 0]), int(self.data_n[i, 1]), self.data_n[i, 2]
            # UPDATE LOSS
            xm = self.X[m, :]
            wn = self.W[:, n]
            L += 0.5 * (_rating - xm.dot(wn))**2
        
        L /= self.n_ratings
        # regularization
        L += .5*self.lam*(np.linalg.norm(self.X, 'fro') + np.linalg.norm(self.W, 'fro'))
        return L

    def get_items_rated_by_user(self, user_id):
        # get all items which are rated by user n
        ids = np.where(self.data[:, 0] == user_id)[0]
        item_ids = self.data[ids, 1].astype(int)
        ratings = self.data[ids, 2]
        return (item_ids, ratings)
    
    def get_users_who_rate_item(self, item_id):
        # get all users who rated item m
        ids = np.where(self.data[:, 1] == item_id)[0]
        user_ids = self.data[ids, 0].astype(int)
        ratings = self.data[ids, 2]
        return (user_ids, ratings)
    
    '''
        Update X
            The method optimize item matrix X, given user matrix W
    '''
    def updateX(self):
        # Walk through each item to update
        for m in range(self.n_items):
            # Get all users who rated item m and rating of them
            user_ids, ratings = self.get_users_who_rate_item(m)
            
            # Get Wm matrix related to user_ids
            Wm = self.W[:, user_ids]

            xm = self.X[m, :]
            # Calculate gradient
            grad_xm = -(ratings - xm.dot(Wm)).dot(Wm.T)/self.n_ratings + self.lam*xm
            
            # Update X
            xm -= self.learning_rate*grad_xm.reshape((self.K,))
            self.X[m, :] = xm
    
    def updateW(self):
        # Walk through each user to update
        for n in range(self.n_users):
            # Get all items rated by user n and rating of them
            item_ids, ratings = self.get_items_rated_by_user(n)
            
            # Get Xn matrix related to item_ids
            Xn = self.X[item_ids, :]

            wn = self.W[:, n]
            # Calculate gradient
            grad_wn = -Xn.T.dot(ratings - Xn.dot(wn))/self.n_ratings + self.lam*wn
            
            # Update W
            wn -= self.learning_rate*grad_wn.reshape((self.K,))
            self.W[:, n] = wn

    def predict(self, u, i):
        u = int(u)
        i = int(i)

        # create bias to make rate in range [0, 5]
        if self.user_based:
            bias = self.mu[u]
        else:
            bias = self.mu[i]
        
        # compute the pred rate y_hat = x*w
        pred = self.X[i, :].dot(self.W[:, u]) + bias

        if pred < 0 :
            return 0
        if pred > 5: 
            return 5
        return pred
    
    def predict_for_user(self, user_id):
        # get all items which are rated by user
        ids = np.where(self.data[:, 0] == user_id)[0]
        items_rated_by_u = self.data[ids, 1].tolist()
        
        # predict ratings for items that are not rated by user
        y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id]
        predicted_ratings = []

        # return list of (item, rating) of items that are not rated by user
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((i, y_pred[i]))
        return predicted_ratings

    def RMSE(self, rate_test):
        n_tests = rate_test.shape[0]
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.predict(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2
        RMSE = np.sqrt(SE/n_tests)
        return RMSE

    def fit(self):
        self.normalize()
        for it in range(self.max_iter):
            self.updateX()
            self.updateW()
            if (it + 1) % self.print_every == 0:
                print('iter =', it + 1, ', loss =', self.loss())

    def score(self):
        pass

## __Hàm chức năng sử dụng cho quá trình__:

### __Hàm tìm các items được rate bởi các user__

__Mục đích__: Tìm tất cả cặp (items, rates) mà các items này được đánh giá bởi user đầu vào.<br>
__Đầu vào__: user_id mà cần để truy xuất các giá trị.

+ Hàm `where_user_in_data` để tìm __index__ của các user có trong data ban đầu.
+ Hàm `get_items_rated_by_user` truy xuất các items được rate và giá trị rate.

In [3]:
def where_user_in_data(data, user_id):
    valid_index = []

    for i in range(data.shape[0]):
        if data[i][0] == user_id:
            valid_index.append(i)
    return np.array(valid_index)

def get_items_rated_by_user(data, user_id):
    idx = where_user_in_data(data, user_id)
    items_ids = []
    ratings = []
    for i in idx:
        items_ids.append(data[i][1])
        ratings.append(data[i][2])
    return (np.array(idx), np.array(items_ids), np.array(ratings))

### __Hàm tìm các users đã rated cho item__:

__Mục đích__: Tìm tất cả cặp (users, rates) mà các users này đánh giá cho item đầu vào.<br>
__Đầu vào__: item_id mà cần để truy xuất các giá trị.

In [4]:
def where_item_in_data(data, item_id):
    valid_index = []

    for i in range(data.shape[0]):
        if data[i][1] == item_id:
            valid_index.append(i)
    return np.array(valid_index)

def get_users_who_rate_item(data, item_id):
    idx = where_item_in_data(data, item_id)
    user_ids = []
    ratings = []
    for i in idx:
        user_ids.append(data[i][0])
        ratings.append(data[i][2])
    return (np.array(idx), np.array(user_ids), np.array(ratings))

## __Normalize__

In [5]:
def swap_cols(data, frm, to):
    data[:,[frm, to]] = data[:,[to, frm]]

def normalize(data, user_based = 1):
    # create copy_data
    cdata = data.copy()
    if user_based == 0:
        swap_cols(cdata, 0, 1)
    user_col = 0
    item_col = 1
    n_object = int(np.max(cdata[:, user_col])) + 1
    
    users = cdata[:, user_col]
    mu = np.zeros((n_object,))

    for n in range(n_object):
        idx, items_ids, ratings = get_items_rated_by_user(cdata, n)

        m = np.mean(ratings)
        if np.isnan(m):
            m = 0
        mu[n] = m

        cdata[list(idx), 2] = ratings - mu[n]
    return cdata, mu


In [6]:
def frobenius_norm_without_numpy(X):
    # Initialize sum of squares to zero
    sum_of_squares = 0
    
    # Iterate through each row of the matrix
    for row in X:
        # Iterate through each element in the row
        for element in row:
            # Add the square of the element to the sum of squares
            sum_of_squares += element ** 2
    
    # Take the square root of the sum of squares to get the Frobenius norm
    frobenius_norm = sum_of_squares ** 0.5
    
    return frobenius_norm

In [7]:

def loss(data, X, W, lam):
    L = 0
    for i in range(data.shape[0]):
        # get user, item, rating
        n, m, _rating = int(data[i, 0]), int(data[i, 1]), data[i, 2]
        # Update loss
        xm = X[m, :]
        wn = W[:, n]
        L += 0.5 * (_rating - xm.dot(wn))**2
    L /= data.shape[0]
    # # regularization
    L += .5*lam*(np.linalg.norm(X, 'fro') + np.linalg.norm(W, 'fro'))
    return L

In [8]:
def updateX(data, X, W, lam, learning_rate):
    for m in range(X.shape[0]):
        # Get all users who rated item m and rating of them
        idx, user_ids, ratings = get_users_who_rate_item(data, m)
        Wm = W[:, list(user_ids)]
        xm = X[m, :]
        # Calculate gradient
        grad_xm = -(ratings - xm.dot(Wm)).dot(Wm.T)/data.shape[0] + lam*xm
        # Update X
        xm -= learning_rate*grad_xm.reshape((X.shape[1],))
        X[m, :] = xm
    return X

def updateW(data, X, W, lam, learning_rate):
    for n in range(W.shape[1]):
        # Get all items rated by user n and rating of them
        idx, item_ids, ratings = get_items_rated_by_user(data, n)
        Xn = X[list(item_ids), :]
        wn = W[:, n]
        # Calculate gradient
        grad_wn = -Xn.T.dot(ratings - Xn.dot(wn))/data.shape[0] + lam*wn
        # Update W
        wn -= learning_rate*grad_wn.reshape((W.shape[0],))
        W[:, n] = wn
    return W

In [9]:
def predict(data, X, W, mu, user_id, item_id):
    u = int(user_id)
    i = int(item_id)
    # create bias to make rate in range [0, 5]
    bias = mu[u]
    # compute the pred rate y_hat = x*w
    pred = X[i, :].dot(W[:, u]) + bias
    if pred < 0 :
        return 0
    if pred > 5: 
        return 5
    return pred

In [10]:
def predict_for_user(data, X, W, mu, user_id):
    # get all items which are rated by user
    idx, items_rated_by_u, _ = get_items_rated_by_user(data, user_id)
    # predict ratings for items that are not rated by user
    y_pred = X.dot(W[:, user_id]) + mu[user_id]
    predicted_ratings = []
    # return list of (item, rating) of items that are not rated by user
    for i in range(X.shape[0]):
        if i not in items_rated_by_u:
            predicted_ratings.append((i, y_pred[i]))
    return predicted_ratings

In [11]:
def RMSE(data, X, W, mu, rate_test):
    n_tests = rate_test.shape[0]
    SE = 0 # squared error
    for n in range(n_tests):
        pred = predict(data, X, W, mu, rate_test[n, 0], rate_test[n, 1])
        SE += (pred - rate_test[n, 2])**2
    RMSE = np.sqrt(SE/n_tests)
    return RMSE

In [12]:
def fit(data, X, W, lam = 0.1, learning_rate = 0.5, max_iter = 10, print_every = 1):
    data, mu = normalize(data)
    for it in range(max_iter):
        X = updateX(data, X, W, lam, learning_rate)
        W = updateW(data, X, W, lam, learning_rate)
        if (it + 1) % print_every == 0:
            print('iter =', it + 1, ', loss =', loss(data, X, W, lam))
    return X, W, mu

In [13]:
def score(data, X, W, mu, rate_test):
    return RMSE(data, X, W, mu, rate_test)

In [14]:
def run_mfcf(data, X, W, lam = 0.1, learning_rate = 0.5, max_iter = 1000, print_every = 100, user_based = 1, rate_test = None):
    X, W = fit(data, X, W, lam, learning_rate, max_iter, print_every)
    data, mu = normalize(data)
    return score(data, X, W, mu, rate_test)

In [15]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('../../data/ml-100k/ub.base', sep='\t', names=r_cols)
ratings_test = pd.read_csv('../../data/ml-100k/ub.test', sep='\t', names=r_cols)

rate_train = np.array(ratings_base, dtype= object)
rate_test = np.array(ratings_test, dtype= object)

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [16]:
rate_train

array([[0, 0, 5, 874965758],
       [0, 1, 3, 876893171],
       [0, 2, 4, 878542960],
       ...,
       [942, 1187, 3, 888640250],
       [942, 1227, 3, 888640275],
       [942, 1329, 3, 888692465]], dtype=object)

In [17]:
rs = MatrixFactorizationCF(rate_train, K = 10, lam = .1, print_every = 2, learning_rate = 0.5, max_iter = 100, user_based = 1)

In [18]:
normalize_matrix, mu = normalize(rate_train, user_based=1)

In [19]:
X, W, mu = fit(rate_train, rs.X, rs.W, max_iter=50, print_every=10)

iter = 10 , loss = 7.895038660661221
iter = 20 , loss = 4.646712690139914
iter = 30 , loss = 2.957491433780671
iter = 40 , loss = 1.9785938371098972
iter = 50 , loss = 1.3966209349628678


In [20]:
rate_test[3]

array([0, 89, 4, 878542300], dtype=object)

In [21]:
predict_for_user(rate_train, X, W, mu, 3)

[(0, 4.2794576334126875),
 (1, 4.243576066332711),
 (2, 4.278326788491015),
 (3, 4.2580447118195695),
 (4, 4.294057887805732),
 (5, 4.26204661003183),
 (6, 4.296754762073303),
 (7, 4.2791908875209295),
 (8, 4.281146852475538),
 (9, 4.30767286974109),
 (10, 4.310978123564038),
 (11, 4.32236969762393),
 (12, 4.329607759591645),
 (13, 4.31211804882516),
 (14, 4.315553287790926),
 (15, 4.279671173192545),
 (16, 4.329788785516527),
 (17, 4.292191225381375),
 (18, 4.278218410047923),
 (19, 4.25565544779496),
 (20, 4.283097176469984),
 (21, 4.24331472840655),
 (22, 4.270590496302643),
 (23, 4.258464363916753),
 (24, 4.271327669602351),
 (25, 4.268727343169753),
 (26, 4.318447462635749),
 (27, 4.278831862975834),
 (28, 4.321694968778485),
 (29, 4.270173408896758),
 (30, 4.291203879583904),
 (31, 4.279756272701415),
 (32, 4.304760457083138),
 (33, 4.303522064859528),
 (34, 4.27451054409223),
 (35, 4.266095870741149),
 (36, 4.257478303292716),
 (37, 4.289633648921544),
 (38, 4.277883584787758),
