In [227]:
import pandas as pd
import numpy as np

%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [228]:
train = pd.read_parquet('../datamovies/train.parquet')

In [229]:
test = pd.read_parquet('../datamovies/test.parquet')

In [230]:
train.head()

Unnamed: 0,rating,date,userId,movieId
0,3.0,2000-11-22,0,0
1,5.0,2000-11-21,0,1
2,4.0,2000-11-21,0,2
3,4.0,2000-11-21,0,3
4,5.0,2000-11-21,0,4


In [231]:
train.shape

(5566601, 4)

In [232]:
test.shape

(281497, 4)

In [233]:
test.head(50)

Unnamed: 0,rating,date,userId,movieId
0,3.0,2011-10-29,6,8
1,4.0,2010-09-22,60,8
2,4.5,2013-12-24,133,8
3,3.5,2010-10-15,140,8
4,3.5,2013-04-08,153,8
5,2.0,2013-05-08,246,8
6,4.0,2010-03-13,458,8
7,3.5,2011-08-30,537,8
8,4.5,2010-05-23,687,8
9,1.5,2010-04-11,791,8


In [234]:
#calc global average
mu = train['rating'].mean()
mu

3.383387277083448

In [235]:
train['userId'].unique().shape, train['movieId'].unique().shape

((8481,), (4237,))

In [236]:
test['userId'].unique().shape, test['movieId'].unique().shape

((2543,), (4227,))

# Creating URM

In [238]:
##### Train

In [239]:
from scipy import sparse
from pandas.api.types import CategoricalDtype
import gc

In [275]:
#Create URM
userId_c = CategoricalDtype(sorted(train.userId.unique()), ordered=True)
movieId_c = CategoricalDtype(sorted(train.movieId.unique()), ordered=True)

row = train.userId.astype(userId_c).cat.codes
col = train.movieId.astype(movieId_c).cat.codes

In [241]:
urm_sparse = sparse.csr_matrix((train["rating"], (row, col)), \
                           shape=(userId_c.categories.size, movieId_c.categories.size), dtype=np.float32)

In [242]:
urm_sparse

<8481x4237 sparse matrix of type '<class 'numpy.float32'>'
	with 5566601 stored elements in Compressed Sparse Row format>

In [243]:
with open('urm_sparse.pkl', 'wb') as f:
    pickle.dump(urm_sparse, f)

In [244]:
##### Test

In [276]:
#Create URM
row = test.userId.astype(userId_c).cat.codes
col = test.movieId.astype(movieId_c).cat.codes

In [280]:
urm_sparse_test = sparse.csr_matrix((test["rating"], (row, col)), \
                           shape=(userId_c.categories.size, movieId_c.categories.size), dtype=np.float32)

In [281]:
urm_sparse_test

<8481x4237 sparse matrix of type '<class 'numpy.float32'>'
	with 281497 stored elements in Compressed Sparse Row format>

In [282]:
with open('urm_sparse_test.pkl', 'wb') as f:
    pickle.dump(urm_sparse_test, f)

### Initializing matrix and coeffs

In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy import sparse

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [57]:
with open('urm_sparse.pkl', 'rb') as f:
    urm_sparse = pickle.load(f)

In [58]:
with open('urm_sparse_test.pkl', 'rb') as f:
    urm_sparse_test = pickle.load(f)

In [59]:
mu = 3.383421030314803

In [60]:
indices = urm_sparse.nonzero()

In [61]:
indices[0].shape, indices[1].shape

((5566601,), (5566601,))

In [62]:
indices[0], indices[1]

(array([   0,    0,    0, ..., 8480, 8480, 8480]),
 array([   0,    1,    2, ..., 4232, 4234, 4235]))

%%time
user_films = []

for userid in np.unique(indices[0]):
    ind_mov_ind_1 = np.where(indices[0]==userid)
    user_films.append(indices[1][ind_mov_ind_1])

with open('user_films.pkl', 'wb') as f:
    pickle.dump(user_films, f)

In [64]:
with open('user_films.pkl', 'rb') as f:
    user_films = pickle.load(f)

%%time
films_users = []

for movieid in np.unique(indices[1]):
    ind_usr_ind_1 = np.where(indices[1]==movieid)
    films_users.append(indices[0][ind_usr_ind_1])

with open('films_users.pkl', 'wb') as f:
    pickle.dump(films_users, f)

In [66]:
with open('films_users.pkl', 'rb') as f:
    films_users = pickle.load(f)

In [67]:
#size of latent 
k = 10

In [68]:
N = np.unique(indices[0]).shape[0]
M = np.unique(indices[1]).shape[0]

In [69]:
N, M

(8481, 4237)

In [74]:
w = np.random.rand(N, k)
u = np.random.rand(k, M)

In [75]:
b = np.zeros(N)
c = np.zeros(M)

In [76]:
lamb = 20

In [73]:
from scipy.sparse.linalg import norm

def get_loss(urm_sparse_st):
    N = urm_sparse_st.nonzero()[0].shape[0]
    
    b_sp = sparse.csr_matrix(b.reshape(-1,1))
    c_sp = sparse.csr_matrix(c.reshape(1,-1))
    
    urm_sparse_c = urm_sparse_st - sparse.csr_matrix(w.dot(u))
    
    urm_sparse_c.data -= mu    
    urm_sparse_c -= (urm_sparse_c != 0).multiply(c_sp)
    urm_sparse_c -= (urm_sparse_c != 0).multiply(b_sp)
    #urm_sparse_c -= c_sp
    #urm_sparse_c -= b_sp
    urm_sparse_c = urm_sparse_c.multiply(urm_sparse_st)
    
    loss = norm(urm_sparse_c, ord='fro')# + lamb*(np.linalg.norm(w, ord='fro') +\
           # np.linalg.norm(u, ord='fro') + np.linalg.norm(c) +\
            #np.linalg.norm(b))
    
    return loss**2/N

In [77]:
from datetime import datetime

num_epochs = 5

train_losses = []
test_losses = []

for ep in range(num_epochs):
    print("epoch: ", ep)
    
    t0 = datetime.now()
    for i in range(N): #по юзерам
        u_i = np.take(u, axis=1, indices=user_films[i])
        sum_u = np.zeros((k,k))
        
        for j in range(u_i.shape[1]): #по фильмам
            vec = u_i[:,j].reshape(-1,1)
            sum_u += vec.dot(vec.T)
            
        sum_u += lamb * np.eye(k)
        #sum_u_inv = np.linalg.inv(sum_u) 
        
        r_ij = np.array(urm_sparse[i].todense())[0][user_films[i]]
        
        c_j = c[user_films[i]].reshape(1,-1)
        f_st = (r_ij - b[i] - c_j - mu) * u_i
        s_nd = f_st.sum(axis=1)

        b[i] = (r_ij - w[i].dot(u_i) - c_j - mu).sum(axis=1)
        
        w[i] = np.linalg.solve(sum_u, s_nd)
        b[i] = b[i]/(lamb + u_i.shape[1])      
        
    print("updated W and b:", datetime.now() - t0)    
     
    t0 = datetime.now()
    for j in range(M):
        w_i = np.take(w, axis=0, indices=films_users[j])
        sum_w = np.zeros((k,k))
        
        for i in range(w_i.shape[0]): #по юзерам
            vec = w_i[i,:].reshape(-1,1)
            sum_u += vec.dot(vec.T)
            
        sum_u += lamb * np.eye(k)
        #sum_u_inv = np.linalg.inv(sum_u) 
        
        r_ij = np.array(urm_sparse[:,j].todense())[films_users[j]]
        
        b_i = b[films_users[j]].reshape(-1,1)
        f_st = (r_ij - b_i - c[j] - mu) * w_i
        s_nd = f_st.sum(axis=0)
        
        c[j] = (r_ij - w_i.dot(u[:,j]).reshape(-1,1) - b_i - mu).sum(axis=0)
        
        u[:,j] = np.linalg.solve(sum_u, s_nd)
        c[j] = c[j]/(lamb + w_i.shape[0])
        
    print("updated U and c:", datetime.now() - t0)
    
    t0 = datetime.now()
    #store train loss
    train_losses.append(get_loss(urm_sparse))
    
    #store test loss
    test_losses.append(get_loss(urm_sparse_test))
    
    print("calculate cost:", datetime.now() - t0)
    print("train loss:", train_losses[-1])
    print("test loss:", test_losses[-1])

epoch:  0
updated W and b: 0:00:37.211413
updated U and c: 0:01:24.111266
calculate cost: 0:00:14.822293
train loss: 10.480677577559007
test loss: 10.00980534040464
epoch:  1
updated W and b: 0:00:35.976953
updated U and c: 0:01:10.586746
calculate cost: 0:00:10.070061
train loss: 7.615033877392266
test loss: 7.60170223972117
epoch:  2
updated W and b: 0:00:29.610753
updated U and c: 0:01:14.368048
calculate cost: 0:00:14.357893
train loss: 7.557167338885358
test loss: 7.462099006195266
epoch:  3
updated W and b: 0:00:32.428326
updated U and c: 0:01:00.107308
calculate cost: 0:00:09.877880
train loss: 7.537753138843286
test loss: 7.336303697715944
epoch:  4
updated W and b: 0:00:28.318880


KeyboardInterrupt: 

In [None]:
# plot losses
plt.plot(train_losses, label="train loss")
plt.plot(test_losses, label="test loss")
plt.legend()
plt.show()