In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from time import time

from line_profiler import LineProfiler
%load_ext Cython

In [2]:
def profile_print(func_to_call, *args):
    profiler = LineProfiler()
    profiler.add_function(func_to_call)
    profiler.runcall(func_to_call, *args)
    profiler.print_stats()

In [3]:
train_data = np.array(pd.read_csv('./train.txt', header=None, engine='c').iloc[:, 0:3])

In [3]:
X_test = np.array(pd.read_csv('./test.txt', header=None).iloc[:, 0:2])

In [4]:
args = np.argsort(X_test[:, 1])
bounds = np.zeros((2649429, 2), dtype=int)

st = 0
end = 0
prev_u = X_test[args[0], 1] - 1

for i in tqdm(range(len(args))):
    pos = args[i]
    if X_test[pos, 1] - 1 != prev_u:
        bounds[prev_u][0] = st
        bounds[prev_u][1] = end
        st = i
        end = i + 1
        prev_u = X_test[pos, 1] - 1
    else:
        end += 1

100%|██████████| 20095978/20095978 [00:20<00:00, 962311.92it/s]


In [7]:
sm = 0
for i in tqdm(range(len(bounds))):
    sm += bounds[i, 1] - bounds[i, 0]

100%|██████████| 2649429/2649429 [00:02<00:00, 1219555.26it/s]


In [11]:
sm

20095928

In [13]:
%%cython -a

import numpy as np
cimport numpy as np
from libc.math cimport sqrt
from tqdm import tqdm
from time import time

cdef np.float64_t implicit_delta(long u, long m,
                    np.ndarray[np.float64_t, ndim=2] q,
                    np.ndarray[long, ndim=2] implicit_data, np.ndarray[long, ndim=2] bounds, 
                    np.ndarray[long, ndim=1] args, np.ndarray[np.float64_t, ndim=2] y):
    cdef np.float64_t result, N
    cdef long i, k, im_id
    
    result = 0.0
    
    N = sqrt(1.0 * (bounds[u, 1] - bounds[u, 0]))
        
    for j in range(bounds[u, 0], bounds[u, 1]):
        k = args[j]
        im_id = implicit_data[k, 0] - 1
            
        result += (q[m, 0] * y[im_id, 0] + q[m, 1] * y[im_id, 1]) / N
    
    return result

cdef np.float64_t[:] implicit_delta01(long u, long m,
                    np.ndarray[long, ndim=2] implicit_data, np.ndarray[long, ndim=2] bounds, 
                    np.ndarray[long, ndim=1] args, np.ndarray[np.float64_t, ndim=2] y):
    cdef np.float64_t result0, result1, N
    cdef long i, k, im_id
    
    result0 = 0.0
    result1 = 0.0
    
    N = sqrt(1.0 * (bounds[u, 1] - bounds[u, 0]))
        
    for j in range(bounds[u, 0], bounds[u, 1]):
        k = args[j]
        im_id = implicit_data[k, 0] - 1
            
        result0 += y[im_id, 0] / N
        result1 += y[im_id, 1] / N
    
    return result0, result1
        
def loss(np.ndarray[long, ndim=2] data, np.ndarray[np.float64_t, ndim=2] p, np.ndarray[np.float64_t, ndim=2] q,
        np.ndarray[np.float64_t, ndim=1] bu, np.ndarray[np.float64_t, ndim=1] bm, np.float64_t mv,
        np.ndarray[long, ndim=2] implicit_data, np.ndarray[long, ndim=2] bounds, 
        np.ndarray[long, ndim=1] args, np.ndarray[np.float64_t, ndim=2] y):
    cdef np.float64_t result, r, pred
    cdef long i, j, u, m, k
    
    result = 0.0
    
    for i in tqdm(range(data.shape[0])):
        m = data[i, 0] - 1
        u = data[i, 1] - 1
        r = data[i, 2] * 1.0
            
        pred = mv + bu[u] + bm[m] + p[u, 0] * q[m, 0] + p[u, 1] * q[m, 1]
        pred += implicit_delta(u, m, q, implicit_data, bounds, args, y)
        
        result += (r - pred) ** 2 / data.shape[0]
    return result

def predict(np.ndarray[np.float64_t, ndim=2] p, np.ndarray[np.float64_t, ndim=2] q, 
            np.ndarray[np.float64_t, ndim=1] bu, np.ndarray[np.float64_t, ndim=1] bm, np.float64_t mv,
            np.ndarray[long, ndim=2] X,
            np.ndarray[long, ndim=2] implicit_data, np.ndarray[long, ndim=2] bounds, 
            np.ndarray[long, ndim=1] args, np.ndarray[np.float64_t, ndim=2] y):
    cdef long i, u, m, k
    cpdef np.ndarray[np.float64_t, ndim=1] result
    
    
    result = np.zeros(X.shape[0])
    
    for i in range(X.shape[0]):
        m = X[i, 0] - 1
        u = X[i, 1] - 1
        
        result[i] = mv + bu[u] + bm[m] + p[u, 0] * q[m, 0] + p[u, 1] * q[m, 1]
        result[u] += implicit_delta(u, m, q, implicit_data, bounds, args, y)
        
    return result
        
def do_sgd(np.ndarray[long, ndim=2] data, np.ndarray[np.float64_t, ndim=2] p, np.ndarray[np.float64_t, ndim=2] q,
            np.ndarray[np.float64_t, ndim=1] bu, np.ndarray[np.float64_t, ndim=1] bm, np.float64_t mv,
            np.ndarray[long, ndim=2] implicit_data, np.ndarray[long, ndim=2] bounds, 
            np.ndarray[long, ndim=1] args, np.ndarray[np.float64_t, ndim=2] y,
            int n_epoch=1, np.float64_t lr=0.1, np.float64_t lmd=1.0):
    cdef long i, k, u, m
    cdef np.float64_t delta0, delta1, r, dt, mv1
    cpdef np.ndarray[np.float64_t, ndim=2] p1, q1
    cpdef np.ndarray[np.float64_t, ndim=1] cntu, cntm, bu1, bm1, buf
    
    cntu = np.zeros(p.shape[0])
    cntm = np.zeros(q.shape[0])
    
    for j in range(data.shape[0]):
        m = data[j, 0] - 1
        u = data[j, 1] - 1
        cntu[u] += 1.0
        cntm[m] += 1.0
    
    p1 = np.copy(p)
    q1 = np.copy(q)
    bu1 = np.zeros(bu.shape[0])
    bm1 = np.zeros(bm.shape[0])
    
    for i in range(n_epoch):
        t1 = time()
    
        if i % 4 == 0:
            for j in range(p.shape[0]):
                p1[j, 0] = (1.0 - lr * lmd) * p[j, 0]
                p1[j, 1] = (1.0 - lr * lmd) * p[j, 1]
        elif i % 4 == 1:
            for j in range(q.shape[0]):
                q1[j, 0] = (1.0 - lr * lmd) * q[j, 0]
                q1[j, 1] = (1.0 - lr * lmd) * q[j, 1]
        elif i % 4 == 2:
            for j in range(bu.shape[0]):
                bu1[j] = 0.0
        elif i % 4 == 3:
            for j in range(bm.shape[0]):
                bm1[j] = 0.0
            pass
    
        for j in range(data.shape[0]):
            m = data[j, 0] - 1
            u = data[j, 1] - 1
            r = data[j, 2] * 1.0
    
            if i % 5 == 0:
                #p_u stage
                dt = p[u, 0] * q[m, 0] + p[u, 1] * q[m, 1] + bu[u] + bm[m] + mv
                dt += implicit_delta(u, m, q, implicit_data, bounds, args, y)
                
                p1[u, 0] += lr * (r - dt) * q[m, 0] / cntu[u]
                p1[u, 1] += lr * (r - dt) * q[m, 1] / cntu[u]
            elif i % 5 == 1:
                # q_m stage
                dt = p[u, 0] * q[m, 0] + p[u, 1] * q[m, 1] + bu[u] + bm[m] + mv
                buf = implicit_delta01(u, m, implicit_data, bounds, args, y)
                
                q1[m, 0] += lr * (r - dt) * (p[u, 0] + buf[0]) / cntm[m]
                q1[m, 1] += lr * (r - dt) * (p[u, 1] + buf[1]) / cntm[m]
            elif i % 5 == 2:
                #bu stage
                dt = p[u, 0] * q[m, 0] + p[u, 1] * q[m, 1] + bm[m] + mv
                dt += implicit_delta(u, m, q, implicit_data, bounds, args, y)
                
                bu1[u] += (r - dt) / (lmd + cntu[u])
            elif i % 5 == 3:
                #bm stage
                dt = p[u, 0] * q[m, 0] + p[u, 1] * q[m, 1] + bu[u] + mv
                dt += implicit_delta(u, m, q, implicit_data, bounds, args, y)
                
                bm1[m] += (r - dt) / (lmd + cntm[m])
                pass
        
        if i % 5 == 0:
            for j in range(p.shape[0]):
                p[j, 0] = p1[j, 0]
                p[j, 1] = p1[j, 1]
        elif i % 5 == 1:
            for j in range(q.shape[0]):
                q[j, 0] = q1[j, 0]
                q[j, 1] = q1[j, 1] 
        elif i % 5 == 2:
            for j in range(bu.shape[0]):
                bu[j] = bu1[j]
        elif i % 5 == 3:
            for j in range(bm.shape[0]):
                bm[j] = bm1[j]
        
#         if i % 10 == 0:
        if True:
            print('Time: ' + str(time() - t1))
            print('Epoch:', i, 'Loss:', loss(data, p, q, bu, bm, mv, implicit_data, bounds, args, y))
            print('\n')
    
    return p, q, bu, bm, mv

In [7]:
np.random.seed(10)
p = np.random.random((2649429, 2))
q = np.random.random((17770, 2))
bu = np.random.random(2649429)
bm = np.random.random(17770)
y = np.random.random((2649429, 2))
mv = 3.7

In [111]:
# p = np.loadtxt('./als-bias/p-0923.txt')
# q = np.loadtxt('./als-bias/q-0923.txt')
# bu = np.loadtxt('./als-bias/bu-0923.txt')
# bm = np.loadtxt('./als-bias/bm-0923.txt')
# mv = 3.7#float(np.loadtxt('./als-bias/mv-0923.txt'))

In [9]:
# p, q, bu, bm, mv = do_sgd(train_data, p, q, bu, bm, mv, X_test, bounds, args, y, 1, 0.1, 0.2)

array([[      1, 1488844],
       [      1,  823519],
       [      1,  543865],
       ...,
       [  17770,  834323],
       [  17770,  234275],
       [  17770,  255278]])

In [19]:
np.argsort(X_test[0:10, 1])

array([2, 6, 5, 1, 3, 9, 0, 7, 4, 8])

In [5]:
result = pd.DataFrame(columns=['StringId', 'Mark'])
index = [i+1 for i in range(X_test.shape[0])]

In [116]:
result['StringId'] = index
result['Mark'] = predict(p, q, bu, bm, mv, X_test)

In [117]:
result.to_csv('result.txt', index=False, float_format='%.2f')

In [118]:
!zip result.txt.zip result.txt 

updating: result.txt (deflated 72%)


In [119]:
!kg submit result.txt.zip -c movie-recomendation-competition-ts-spring-2017 -u 'dmitry103' -p 'iopkl5374'

Starting new HTTPS connection (1): www.kaggle.com
0.92294


In [96]:
np.savetxt('./als-bias/p-0923.txt', p)
np.savetxt('./als-bias/q-0923.txt', q)

In [98]:
np.savetxt('./als-bias/bu-0923.txt', bu)
np.savetxt('./als-bias/bm-0923.txt', bm)
np.savetxt('./als-bias/mv-0923.txt', [mv])

In [21]:
implicit_delta(6, 0, q, X_test, bounds, args, y)

3.7418004103784255

In [20]:
q[0].dot(implicit_delta01(6, 0, X_test, bounds, args, y))

3.7418004103784264

In [14]:
loss(train_data, p, q, bu, bm, mv, X_test, bounds, args, y)

  5%|▌         | 4235869/80384529 [02:15<40:33, 31291.18it/s]

KeyboardInterrupt: 