# Matrix factorization on the MovieLens dataset

In [1]:
%matplotlib inline
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

# Load the MovieLens ratings data

In [2]:
# Dense 943 x 1682 float table, zero-filled; cells stay 0 until a rating is written into them.
df = pd.DataFrame(data=np.zeros(shape=(943, 1682), dtype=np.float64))

In [3]:
# Sanity check: show the (rows, columns) of the freshly allocated table.
print(np.shape(df))

(943, 1682)


In [4]:
# Fill the rating table from the data file. Each record is split on whitespace
# and read as: row id, column id, rating. Any further fields are ignored.
with open("movielens_100K.data" , "r") as data:
    for line in data:
        line_l = line.split()
        if not line_l:
            # A blank line (e.g. a trailing newline at end of file) carries no record.
            continue
        # Ids are shifted by one to become 0-based positions; .iat is the
        # scalar positional setter, so no slice machinery is involved per cell.
        df.iat[int(line_l[0])-1 , int(line_l[1])-1] = int(line_l[2])

In [5]:
# DataFrame.as_matrix() was deprecated and later removed from pandas;
# .values yields the same underlying 2-D numpy array on old and new versions.
rating_data = df.values

# matrix factorization

In [8]:
class MF():
    '''
    Biased matrix factorization trained with stochastic gradient descent.

    A rating for user i and item j is predicted as
        bias + user_bias[i] + item_bias[j] + dot(U[i], V[j])
    Cells of the rating matrix equal to 0 are treated as "not rated": they
    are excluded from the global mean and from the training pairs.

    k = number of latent dim
    U = (num_user * k) : user latent matrix
    V = (num_item * k) : item latent matrix
    R = (num_user * num_item) : user item rating matrix
    '''
    def __init__(self , k , rating , iteration , alpha , beta):
        '''
        k         : number of latent dimensions
        rating    : 2-D numpy array (users x items), 0 marks a missing rating
        iteration : number of passes over the observed ratings done by train()
        alpha     : SGD step size
        beta      : L2 regularization weight applied to biases and latent vectors
        '''
        self._num_user , self._num_item = rating.shape

        self._R = rating
        self._iteration = iteration

        self._alpha = alpha
        self._beta = beta

        # Initialize the user and item latent matrices. np.random.normal
        # already returns float64, so no cast is needed; 1.0 / k forces
        # float division whatever the integer-division rules in effect.
        self._U = np.random.normal(loc=0.0, scale=1.0 / k, size=(self._num_user, k))
        self._V = np.random.normal(loc=0.0, scale=1.0 / k, size=(self._num_item, k))

        # Initialize the user bias, item bias and global bias.
        self._user_bias = np.zeros(self._num_user)
        self._item_bias = np.zeros(self._num_item)
        observed = self._R[self._R != 0]
        # With no observed rating the mean would be nan; fall back to 0.
        self._bias = np.mean(observed) if observed.size else 0.0

    def train(self):
        '''Run `iteration` shuffled SGD passes over every observed (non-zero) rating.'''
        # np.nonzero walks the matrix in row-major order, giving the same
        # (i, j) pairs as a nested loop over users then items.
        rows, cols = np.nonzero(self._R)
        self._training_data = list(zip(rows.tolist(), cols.tolist()))

        for _ in range(self._iteration):
            np.random.shuffle(self._training_data)
            self.sgd()

    def get_prediction(self , i , j):
        '''Return the predicted rating of user i for item j.'''
        return (
            self._bias
            + self._user_bias[i]
            + self._item_bias[j]
            + np.dot(self._U[i, :], self._V[j, :])
        )

    def sgd(self):
        '''Do one pass over self._training_data, updating biases and latent vectors in place.'''
        for i, j in self._training_data:
            # Error is computed once, before any parameter of this pair moves.
            e = self._R[i, j] - self.get_prediction(i, j)

            # update bias
            self._user_bias[i] += self._alpha * (e - self._beta * self._user_bias[i])
            self._item_bias[j] += self._alpha * (e - self._beta * self._item_bias[j])

            # update user item latent vector
            # Keep the pre-update user vector: the item gradient must be taken
            # at the same point as the user gradient, not at the moved U[i].
            user_vec = self._U[i, :].copy()
            self._U[i, :] += self._alpha * (e * self._V[j, :] - self._beta * self._U[i, :])
            self._V[j, :] += self._alpha * (e * user_vec - self._beta * self._V[j, :])

    def result(self):
        '''Return the full (num_user x num_item) matrix of predicted ratings.'''
        return (
            self._bias
            + self._user_bias[:, np.newaxis]   # column vector: one bias per user
            + self._item_bias[np.newaxis, :]   # row vector: one bias per item
            + np.dot(self._U, self._V.T)
        )

In [9]:
# 100 latent dimensions, 100 training passes, step size 0.1, regularization 0.01.
mf = MF(
    k=100,
    rating=rating_data,
    iteration=100,
    alpha=0.1,
    beta=0.01,
)