In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from time import time

from line_profiler import LineProfiler
%load_ext Cython

In [2]:
def profile_print(func_to_call, *args, **kwargs):
    """Run ``func_to_call`` under ``LineProfiler`` and print its line timings.

    Parameters
    ----------
    func_to_call : callable
        Function to profile line by line.
    *args, **kwargs
        Forwarded unchanged to ``func_to_call``.

    Returns
    -------
    object
        Whatever ``func_to_call`` returns, so the profiled result can be reused
        instead of running the (possibly expensive) call a second time.
    """
    profiler = LineProfiler()
    profiler.add_function(func_to_call)
    try:
        return profiler.runcall(func_to_call, *args, **kwargs)
    finally:
        # Print whatever was measured even when the profiled call raises.
        profiler.print_stats()

In [3]:
# Parse the header-less CSV with the C engine and keep only its first three columns
# as a NumPy array (presumably item id, user id, rating -- that is how loss/do_sgd
# index the columns; confirm against the data description).
train_data = np.array(
    pd.read_csv('./train.txt', header=None, engine='c').iloc[:, :3]
)

In [39]:
# Debug subset: keep only the first 50 rows.
# NOTE(review): this overwrites the full array in place, so every later cell sees
# the truncated data until the load cell is re-run.
train_data = train_data[:50]

In [5]:
# def read_gen():
#     fin = open('./train.txt', 'r')
#     cnt = 0
    
#     for line in fin:
#         cnt += 1
#         arr = line.strip().split(',')
#         if len(arr) != 4:
#             continue
    
#         m, u, r, d = arr
    
#         if cnt % 1000000 == 0:
#             print('Processed ' + str(cnt))
    
#         yield int(m) - 1, int(u) - 1, int(r)
    
#     fin.close()

In [5]:
# fin = open('./train.txt', 'r')

# t1 = time()

# for line in fin:
#     arr = line.strip().split(',')
#     if len(arr) != 4:
#         continue
    
#     m, u, r, d = arr
    
#     users.add(int(u))
#     films.add(int(m))
    
# print(time()-t1)

# fin.close()

169.8168444633484


In [57]:
%%cython -a

import numpy as np
cimport numpy as np

from time import time

def loss(np.ndarray[np.int64_t, ndim=2] data, np.ndarray[np.float64_t, ndim=2] p, np.ndarray[np.float64_t, ndim=2] q):
    """Mean squared error of the two-factor model on ``data``.

    Parameters
    ----------
    data : int64 array with at least 3 columns
        Column 0 is a 1-based row index into ``q``, column 1 a 1-based row index
        into ``p`` and column 2 the observed value (presumably item id, user id
        and rating).
    p, q : float64 arrays
        Factor matrices; only their first two columns are read.

    Returns
    -------
    float
        ``sum((r - p[u] . q[m]) ** 2) / n``; ``0.0`` when ``data`` has no rows.
    """
    # The buffer is declared as int64 rather than C ``long``: ``long`` is only
    # 32 bits wide on 64-bit Windows and would reject the int64 arrays pandas yields.
    cdef np.float64_t result, r, pred
    cdef Py_ssize_t i
    cdef np.int64_t u, m

    result = 0.0

    for i in range(data.shape[0]):
        # Ids are 1-based in the data, rows of p/q are 0-based.
        m = data[i, 0] - 1
        u = data[i, 1] - 1
        r = data[i, 2] * 1.0

        pred = p[u, 0] * q[m, 0] + p[u, 1] * q[m, 1]
        result += (r - pred) ** 2 / data.shape[0]
    return result

def predict(np.ndarray[np.float64_t, ndim=2] p, np.ndarray[np.float64_t, ndim=2] q, np.ndarray[np.int64_t, ndim=2] X):
    """Predict one value per row of ``X`` with the two-factor model.

    Parameters
    ----------
    p, q : float64 arrays
        Factor matrices; only their first two columns are read.
    X : int64 array with at least 2 columns
        Column 0 is a 1-based row index into ``q`` and column 1 a 1-based row
        index into ``p`` (presumably item id and user id).

    Returns
    -------
    numpy.ndarray
        1-D float64 array with ``p[u] . q[m]`` for every row of ``X``.
    """
    # Locals are declared with ``cdef``: ``cpdef`` on a variable was deprecated in
    # Cython 0.29 and is no longer accepted by Cython 3.
    cdef Py_ssize_t i
    cdef np.int64_t u, m
    cdef np.ndarray[np.float64_t, ndim=1] result

    result = np.zeros(X.shape[0])

    for i in range(X.shape[0]):
        # Ids are 1-based in the data, rows of p/q are 0-based.
        m = X[i, 0] - 1
        u = X[i, 1] - 1

        result[i] = p[u, 0] * q[m, 0] + p[u, 1] * q[m, 1]

    return result
        
def do_sgd(np.ndarray[np.int64_t, ndim=2] data, np.ndarray[np.float64_t, ndim=2] p, np.ndarray[np.float64_t, ndim=2] q,
           int n_epoch=1, np.float64_t lr=0.1, np.float64_t lmd=1.0):
    """Alternating full-batch gradient steps on the two-factor model.

    Even epochs (0, 2, ...) update ``p`` with ``q`` held fixed, odd epochs update
    ``q`` with ``p`` held fixed. Within an epoch every gradient is evaluated at the
    factors from the start of the epoch and accumulated in a scratch copy, so the
    update is a batch step rather than a per-sample one:

        new_row = (1 - lr * lmd) * old_row + lr * sum((r - p[u] . q[m]) * other_row)

    Parameters
    ----------
    data : int64 array with at least 3 columns
        Column 0 is a 1-based row index into ``q``, column 1 a 1-based row index
        into ``p`` and column 2 the observed value (presumably item id, user id
        and rating).
    p, q : float64 arrays
        Factor matrices; only their first two columns are used. Both are
        modified IN PLACE.
    n_epoch : int
        Number of epochs to run.
    lr : float
        Step size.
    lmd : float
        Shrinkage strength; each updated row is scaled by ``1 - lr * lmd`` once
        per epoch.

    Returns
    -------
    tuple
        ``(p, q)`` -- the same array objects that were passed in.

    Every 10th epoch the duration of that epoch and the current loss are printed.
    """
    # ``j`` is now declared as a C index (it was never declared before), locals use
    # ``cdef`` instead of the ``cpdef`` form Cython 3 rejects, and the data buffer
    # is int64 rather than the platform-dependent C ``long``.
    cdef Py_ssize_t i, j
    cdef np.int64_t u, m
    cdef np.float64_t r, dt, decay
    cdef np.ndarray[np.float64_t, ndim=2] p1, q1

    # Kept for backward compatibility: the function itself draws no random
    # numbers, but callers may rely on the global NumPy seed being reset here.
    np.random.seed(10)

    # Scratch buffers that accumulate the new factors during an epoch.
    p1 = np.copy(p)
    q1 = np.copy(q)

    decay = 1.0 - lr * lmd

    for i in range(n_epoch):
        t1 = time()

        if i % 2 == 0:
            # p_u stage: only the p scratch buffer is needed, so only it is decayed.
            for j in range(p.shape[0]):
                p1[j, 0] = decay * p[j, 0]
                p1[j, 1] = decay * p[j, 1]

            for j in range(data.shape[0]):
                m = data[j, 0] - 1
                u = data[j, 1] - 1
                r = data[j, 2] * 1.0

                # Prediction uses the factors from the start of the epoch.
                dt = p[u, 0] * q[m, 0] + p[u, 1] * q[m, 1]
                p1[u, 0] += lr * (r - dt) * q[m, 0]
                p1[u, 1] += lr * (r - dt) * q[m, 1]

            for j in range(p.shape[0]):
                p[j, 0] = p1[j, 0]
                p[j, 1] = p1[j, 1]
        else:
            # q_i stage: mirror image of the p_u stage.
            for j in range(q.shape[0]):
                q1[j, 0] = decay * q[j, 0]
                q1[j, 1] = decay * q[j, 1]

            for j in range(data.shape[0]):
                m = data[j, 0] - 1
                u = data[j, 1] - 1
                r = data[j, 2] * 1.0

                dt = p[u, 0] * q[m, 0] + p[u, 1] * q[m, 1]
                q1[m, 0] += lr * (r - dt) * p[u, 0]
                q1[m, 1] += lr * (r - dt) * p[u, 1]

            for j in range(q.shape[0]):
                q[j, 0] = q1[j, 0]
                q[j, 1] = q1[j, 1]

        if i % 10 == 0:
            print('Time: ' + str(time() - t1))
            print('Epoch:', i, 'Loss:', loss(data, p, q))
            print('\n')

    return p, q

In [13]:
# n_epoch = 1
# lr = 0.1
# lmd = 1.0

# for i in range(n_epoch):
#     t1 = time()
    
#     gen = read_gen()
    
#     if i % 2 == 0:
#         #p_u stage
        
#         pun = (p ** 2).sum(axis=1)
        
#         for m, u, r in gen:
#             delta = (r - p[u].dot(q[m])) * q[m] - lmd * p[u]
#             p[u] += lr * delta
#     else:
#         for m, u, r in gen:
#             delta = (r - p[u].dot(q[m])) * p[u]
#             q[m] += lr * delta
            
#     print('Time: ' + str(time() - t1))

In [111]:
# Warm-start from the factor matrices previously saved as text files.
p, q = (
    np.loadtxt('./als-plain/p-0931.txt'),
    np.loadtxt('./als-plain/q-0931.txt'),
)

In [126]:
# Continue training the loaded factors; the hyper-parameters are spelled out by name.
p, q = do_sgd(train_data, p, q, n_epoch=500, lr=0.01 * 1e-3, lmd=0.2)

Time: 3.78834867477417
Epoch: 0 Loss: 0.855911238050364


Time: 5.085688829421997
Epoch: 10 Loss: 0.85576368435409


Time: 3.596402883529663
Epoch: 20 Loss: 0.8556184311422534


Time: 4.491948843002319
Epoch: 30 Loss: 0.8554754268700022


Time: 5.316196918487549
Epoch: 40 Loss: 0.8553346228844689


Time: 4.770061254501343
Epoch: 50 Loss: 0.8551959720999648


Time: 5.139500141143799
Epoch: 60 Loss: 0.8550594288420967


Time: 3.639463186264038
Epoch: 70 Loss: 0.8549249487754185


Time: 4.156174659729004
Epoch: 80 Loss: 0.85479248884259


Time: 5.23666524887085
Epoch: 90 Loss: 0.8546620072175871


Time: 4.381762266159058
Epoch: 100 Loss: 0.854533463253414


Time: 4.1988513469696045
Epoch: 110 Loss: 0.8544068174399646


Time: 4.352834463119507
Epoch: 120 Loss: 0.8542820313621198


Time: 3.5894579887390137
Epoch: 130 Loss: 0.8541590676567732


Time: 4.513508319854736
Epoch: 140 Loss: 0.8540378899765335


Time: 3.71760630607605
Epoch: 150 Loss: 0.8539184629517509


Time: 5.240488767623901
Ep

In [16]:
# X_test = np.array(pd.read_csv('./test.txt', header=None).iloc[:, 0:2])

In [17]:
# result = pd.DataFrame(columns=['StringId', 'Mark'])

In [33]:
# index = [i+1 for i in range(X_test.shape[0])]

In [127]:
# Build every input this cell needs so it runs on a fresh kernel: the cells that
# used to define X_test, result and index are commented out in this notebook.
X_test = np.array(pd.read_csv('./test.txt', header=None).iloc[:, 0:2])
index = [i + 1 for i in range(X_test.shape[0])]  # 1-based row ids

result = pd.DataFrame(columns=['StringId', 'Mark'])
result['StringId'] = index
result['Mark'] = predict(p, q, X_test)

In [128]:
# Write the submission file without the DataFrame index, floats rounded to two decimals.
result.to_csv('result.txt', float_format='%.2f', index=False)

In [122]:
!head ./result.txt

StringId,Mark
1,3.28
2,3.89
3,4.01
4,3.23
5,3.40
6,3.59
7,2.96
8,4.46
9,3.65


In [129]:
!zip result.txt.zip result.txt 

updating: result.txt (deflated 72%)


In [130]:
!kg submit result.txt.zip -c movie-recomendation-competition-ts-spring-2017 -u 'dmitry103' -p 'iopkl5374'

Starting new HTTPS connection (1): www.kaggle.com
0.92876


In [131]:
!kg submit result.txt.zip -c movie-recomendation-competition-ts-spring-2018 -u 'dmitry103' -p 'iopkl5374'

Starting new HTTPS connection (1): www.kaggle.com
0.92862


In [132]:
# Checkpoint both factor matrices as plain-text files.
for checkpoint_path, factors in (
    ('./als-plain/p-0928.txt', p),
    ('./als-plain/q-0928.txt', q),
):
    np.savetxt(checkpoint_path, factors)

In [29]:
# Spot check: model prediction for row 0 of q against row 822108 of p.
np.dot(q[0], p[822108])

-1711.2168742363042

In [23]:
# Display the training array.
# NOTE(review): this shows the array's full repr; a slice such as train_data[:5]
# would keep the saved output short.
train_data

array([[      1,  822109,       5],
       [      1,  885013,       4],
       [      1,   30878,       4],
       [      1,  893988,       3],
       [      1,  124105,       4],
       [      1, 1248029,       3],
       [      1, 1842128,       4],
       [      1, 2238063,       3],
       [      1, 1503895,       4],
       [      1, 2207774,       5],
       [      1, 2590061,       3],
       [      1,    2442,       3],
       [      1,  804919,       4],
       [      1, 1086807,       3],
       [      1, 1711859,       4],
       [      1,  372233,       5],
       [      1, 1080361,       3],
       [      1, 1245640,       3],
       [      1,  558634,       4],
       [      1, 1181550,       3],
       [      1, 1227322,       4],
       [      1,  427928,       4],
       [      1,  814701,       5],
       [      1,  662870,       5],
       [      1,  337541,       5],
       [      1, 1133214,       4],
       [      1, 1537427,       4],
       [      1, 1209954,   

In [39]:
# Scratch check: draw a 2x2 array of uniform random floats in [0, 1).
np.random.random((2, 2))

array([[0.36475651, 0.80312363],
       [0.05780821, 0.89899718]])

In [44]:
# Demonstrates that np.dot rejects two (2, 3) operands. The error is caught and
# printed so the cell no longer aborts a "Restart & Run All".
dot_lhs = np.array([[1, 2, 3], [2, 3, 4]])
dot_rhs = [[1, 2, 3], [2, 3, 4]]
dot_error = None
try:
    np.dot(dot_lhs, dot_rhs)
except ValueError as err:
    dot_error = str(err)
    print('ValueError:', dot_error)

ValueError: shapes (2,3) and (2,3) not aligned: 3 (dim 1) != 2 (dim 0)

array([[0.77523924, 0.71312041],
       [0.19377476, 0.16827078],
       [0.40277501, 0.66918522],
       ...,
       [0.45671997, 0.40815862],
       [0.81002158, 0.57314929],
       [0.80613913, 0.11987462]])

array([[      1,  822109,       5],
       [      1,  885013,       4],
       [      1,   30878,       4],
       ...,
       [  17770, 1790158,       4],
       [  17770, 1608708,       3],
       [  17770,  453585,       2]])

In [47]:
# Check that a copy is independent of its source: editing `b` must not change `a`.
a = np.array([[1, 2], [3, 4]])
b = a.copy()
b[0, 0] = 10

In [48]:
# Display `a` to confirm that editing the copy `b` left the original untouched.
a

array([[1, 2],
       [3, 4]])