In [1]:
from BPM_MF_algo_numba_class import BPM_MatrixFactorization, fit
import pandas as pd
import numpy as np
from ypstruct import structure
from scipy.sparse import coo_matrix
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import json

In [None]:
#Import data
colnames = ['userId', 'movieId', 'rating', 'timeStamp']
data_df = pd.read_csv('ml-100k/u.data',sep='\t', 
                        names = colnames, header = None)
# data_df = pd.read_csv('E:/DW/확률그래프모델/project/ml-25m/ratings.csv')
#rating_df = data_df.pivot(index='userId', columns='movieId', values='rating')

userId_list = Counter(data_df['userId'])
movieId_list = Counter(data_df['movieId'])
ratings_list = Counter(data_df['rating'])

userId_arr  = data_df['userId'].values.copy()
movieId_arr = data_df['movieId'].values.copy()
rating_arr = data_df['rating'].values.copy()

#coordinate transformation minus 1
userId_arr -= 1
movieId_arr -= 1

#problem setting
problem = structure()
problem.maxiter = 250
problem.rows = max(userId_arr)+1
problem.cols = max(movieId_arr)+1


X_train, X_test = train_test_split(data_df, test_size=0.2, random_state=42)

#Cross validation setting
cv = KFold(5, shuffle=True, random_state=42)
cv_output = []

In [None]:
for k in range(5) :
    #rmse
    temp_list = []
    for i, (idx_train, idx_validation) in enumerate(cv.split(X_train)):
        print("{}_th cv computing".format(i))
        df_train = data_df.iloc[idx_train]
        df_validation = data_df.iloc[idx_validation]

        userId_tr  = df_train['userId'].values
        movieId_tr = df_train['movieId'].values
        rating_tr = df_train['rating'].values

        userId_vd  = df_validation['userId'].values
        movieId_vd = df_validation['movieId'].values
        rating_vd = df_validation['rating'].values

        userId_tr -= 1
        movieId_tr -= 1

        userId_vd -= 1
        movieId_vd -= 1

        sparse_train_m = coo_matrix((rating_tr, (userId_tr, movieId_tr)), \
                        shape=(problem.rows, problem.cols))

        sparse_validation_m = coo_matrix((rating_vd, (userId_vd, movieId_vd)), \
                        shape=(problem.rows, problem.cols))

        problem.data_m = sparse_train_m
        problem.test_m = sparse_validation_m

        #Parameters setting
        params = structure()
        params.alpha = 0.2
        params.beta = 5
        params.R = 4
        params.normal_loc = 0
        params.normal_var = 0.5
        params.latent_k = 5*(k+1)

        #run algorithm
        temp_list.append(fit(problem,params))

        #print result
        print("{}_th latent ".format(params.latent_k), "cv_{}_th MAE :".format(i), temp_list[i].MAE)
        print("{}_th latent ".format(params.latent_k), "cv_{}_th CMAE :".format(i), temp_list[i].CMAE)
        print("{}_th latent ".format(params.latent_k), "cv_{}_th 0_1_loss :".format(i), temp_list[i].zero_one_loss)

---

In [2]:
#Import data
colnames = ['userId', 'movieId', 'rating', 'timeStamp']
data_df = pd.read_csv('ml-100k/u.data',sep='\t', 
                        names = colnames, header = None)

In [3]:
data_df

Unnamed: 0,userId,movieId,rating,timeStamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [4]:
userId_list = Counter(data_df['userId'])
movieId_list = Counter(data_df['movieId'])
ratings_list = Counter(data_df['rating'])

userId_arr  = data_df['userId'].values.copy()
movieId_arr = data_df['movieId'].values.copy()
rating_arr = data_df['rating'].values.copy()

In [6]:
X_train, X_test = train_test_split(data_df, test_size=0.2, random_state=42)

In [7]:
idx_train = X_train.index
idx_test = X_test.index

In [8]:
userId_train = X_train['userId'].values - 1
movieId_train = X_train['movieId'].values - 1
rating_train = X_train['rating'].values

sparse_train_matrix = coo_matrix((rating_train, (userId_train, movieId_train)), \
                           shape=(problem.rows, problem.cols))

In [9]:
userId_test = X_test['userId'].values - 1
movieId_test = X_test['movieId'].values - 1
rating_test = X_test['rating'].values

sparse_test_matrix = coo_matrix((rating_test, (userId_test, movieId_test)), \
                           shape=(problem.rows, problem.cols))

In [10]:
problem.data_m = sparse_train_matrix
problem.test_m = sparse_test_matrix

In [11]:
# parameters setting
params = structure()
params.alpha = 0.2
params.beta = 5.0
params.R = 4
params.normal_loc = 0.0
params.normal_var = 0.5
params.latent_k = 5

In [37]:
problem.data_m

<943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 80000 stored elements in COOrdinate format>

In [46]:
from numba.experimental import jitclass
from numba import int32, int64, float32, jit
spec = [
    ('data_u', int32[:]),
    ('data_i', int32[:]),  
    ('data_v', int64[:]),             
    ('rows', int32),
    ('cols', int32),
    ('test_u', int32[:]),
    ('test_i', int32[:]),
    ('test_v', float32[:]),
    ('latent_k', int32),
    ('alpha', float32),
    ('beta', float32),
    ('R', int32),
    ('normal_loc', float32),
    ('normal_var', float32),
    ('gamma_m', float32[:,:]),
    ('eps_plus_m', float32[:,:]),
    ('eps_minus_m', float32[:,:]),
    ('lambda_m', float32[:,:]),
    ('a_m', float32[:,:]),
    ('b_m', float32[:,:]),
    ('p_m', float32[:,:]),
    ('q_m', float32[:,:]),
    ('r_plus_v', int32[:]),
    ('r_minus_v', int32[:])
]

@jitclass(spec)
class BPM_MatrixFactorization1 :
    def __init__(self, problem, params):
        self.data_u = problem.data_m.row
        self.data_i = problem.data_m.col
        self.data_v = problem.data_m.data
        self.test_u = problem.test_m.row
        self.test_i = problem.test_m.col
        self.test_v = problem.test_m.data
        self.rows = problem.rows
        self.cols = problem.cols
        self.latent_k = params.latent_k
        self.alpha = params.alpha
        self.beta = params.beta
        self.R = params.R
        self.normal_loc = params.normal_loc
        self.normal_var = params.normal_var
        self.gamma_m = np.zeros((self.rows, self.latent_k), dtype=np.float32)
        self.eps_plus_m = np.zeros((self.cols, self.latent_k), dtype=np.float32)
        self.eps_minus_m = np.zeros((self.cols, self.latent_k), dtype=np.float32)
        self.lambda_m = np.zeros((self.rows, self.cols, self.latent_k), dtype=np.float32)
        self.a_m = np.zeros((self.rows, self.latent_k), dtype=np.float32)
        self.b_m = np.zeros((self.cols, self.latent_k), dtype=np.float32)
        self.p_m = np.zeros((self.rows, self.cols), dtype=np.float32)
        self.q_m = np.zeros((self.rows, self.cols), dtype=np.float32)
        self.r_plus_v = self.data_v.copy() - 1
        self.r_minus_v = 5 - self.data_v.copy()

In [47]:
factorization = BPM_MatrixFactorization1(problem, params)

AttributeError: 'NoneType' object has no attribute '_code'

In [45]:
problem

structure({'maxiter': 250, 'rows': 943, 'cols': 1682, 'data_m': <943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 80000 stored elements in COOrdinate format>, 'test_m': <943x1682 sparse matrix of type '<class 'numpy.int64'>'
	with 20000 stored elements in COOrdinate format>})

In [None]:
factorization.normal_var

In [None]:
add(1, 2)

In [39]:
import numpy as np
from numba import int32, float32    # import the types
from numba.experimental import jitclass

spec = [
    ('value', int32),               # a simple scalar field
    ('array', float32[:]),          # an array field
]

@jitclass(spec)
class Bag(object):
    def __init__(self, value1):
        self.value = value1
        self.array = np.zeros(self.value, dtype=np.float32)

    @property
    def size(self):
        return self.array.size

    def increment(self, val):
        for i in range(self.size):
            self.array[i] += val
        return self.array

    def add(x, y):
        return x + y

n = 21
mybag = Bag(n)

In [41]:
mybag.size

21