In [1]:
from pathlib import Path
from time import time

import numpy as np
import pandas as pd
from line_profiler import LineProfiler
from tqdm import tqdm
%load_ext Cython

In [4]:
def profile_print(func_to_call, *args):
    """Run ``func_to_call(*args)`` once under a line profiler and print the report.

    Args:
        func_to_call: The callable to profile line by line.
        *args: Positional arguments forwarded to ``func_to_call``.

    Returns:
        None. The return value of ``func_to_call`` is discarded; only the
        profiler statistics are printed.
    """
    line_stats = LineProfiler()
    # Register the target first so its lines are the ones being timed.
    line_stats.add_function(func_to_call)
    line_stats.runcall(func_to_call, *args)
    line_stats.print_stats()

In [5]:
# Training ratings, loaded from a header-less CSV into a 2-D array.
# The Cython routines in this notebook (`loss`, `do_sgd`) read each row as:
#   column 0 -> 1-based id used to index `q` / `bm` (the `m` axis, presumably movies)
#   column 1 -> 1-based id used to index `p` / `bu` (the `u` axis, presumably users)
#   column 2 -> rating
#   column 3 -> 0-based date index
# NOTE(review): those routines declare the buffer as C `long`, so every column
# must parse as an integer -- confirm against the file.
train_data = np.array(pd.read_csv('./train30.txt', header=None, engine='c'))

In [17]:
# Rows to predict, loaded from a header-less CSV into a 2-D array.
# `predict` in this notebook reads columns 0 and 1 as 1-based ids and column 3
# as the 0-based date index; it never reads column 2.
# NOTE(review): `predict` declares the buffer as C `long`, so every column must
# parse as an integer -- confirm against the file.
test_data = np.array(pd.read_csv('./test30.txt', header=None, engine='c'))

In [13]:
# test_data[3] = test_data[2]
# test_data[2] = 0

In [16]:
# test_data.to_csv('./test30.txt', header=False, index=False)

In [73]:
# dates = np.unique(train_data[3])
# test_dates = np.unique(test_data[2])

In [34]:
# for i in range(1, dates.shape[0]):
#     f = date(*(map(int, dates[i].split('-')))) > date(*(map(int, dates[i - 1].split('-'))))
    
#     if not f:
#         print(i)

In [52]:
# import bisect

In [54]:
# bisect.bisect_left(dates, dates[100])

100

In [79]:
# dates2id = {}
# for i in range(dates.shape[0]):
#     idx = int(np.floor(bisect.bisect_left(dates, dates[i]) / 71))
#     dates2id[dates[i]] = idx

In [86]:
# train_data[4] = list(map(lambda x: dates2id[x], train_data[3]))
# train_data = train_data.drop([3], axis=1)
# train_data.to_csv('./train30.txt', header=False, index=False)

In [92]:
# test_data[2] = list(map(lambda x: dates2id[x], test_data[2]))
# test_data.to_csv('./test30.txt', header=False, index=False)

In [12]:
%%cython -a

cimport cython
import numpy as np
cimport numpy as np

import time

def loss(np.ndarray[long, ndim=2] data, np.ndarray[np.float64_t, ndim=3] p, np.ndarray[np.float64_t, ndim=3] q,
        np.ndarray[np.float64_t, ndim=2] bu, np.ndarray[np.float64_t, ndim=2] bm, np.float64_t mv):
    """Mean squared error of the model over the rows of ``data``.

    For each row the estimate is
    ``mv + bu[u, date] + bm[m, date] + sum_k p[u, k, date] * q[m, k, date]``
    where ``m = data[row, 0] - 1``, ``u = data[row, 1] - 1`` and
    ``date = data[row, 3]``; ``data[row, 2]`` is the observed rating.

    Returns:
        The average of the squared residuals as a float (0.0 for empty ``data``,
        because the division happens per row inside the loop).
    """
    cdef np.float64_t mse = 0.0
    cdef np.float64_t rating, estimate, dot
    cdef long row, user, movie, factor, bucket

    for row in range(data.shape[0]):
        # Ids are stored 1-based in the data and converted to 0-based here.
        movie = data[row, 0] - 1
        user = data[row, 1] - 1
        rating = data[row, 2]
        bucket = data[row, 3]

        dot = 0.0
        for factor in range(p.shape[1]):
            dot += p[user, factor, bucket] * q[movie, factor, bucket]

        estimate = mv + bu[user, bucket] + bm[movie, bucket] + dot
        # Divide every term by the row count so the running sum is already the mean.
        mse += (rating - estimate) ** 2 / data.shape[0]
    return mse

def predict(np.ndarray[np.float64_t, ndim=3] p, np.ndarray[np.float64_t, ndim=3] q,
            np.ndarray[np.float64_t, ndim=2] bu, np.ndarray[np.float64_t, ndim=2] bm, np.float64_t mv,
            np.ndarray[long, ndim=2] X):
    """Predict a rating for every row of ``X``, clamped to the range [1.0, 5.0].

    Args:
        p, q: Factor arrays indexed as ``[id, factor, date]``.
        bu, bm: Bias arrays indexed as ``[id, date]``.
        mv: Constant offset added to every prediction.
        X: Integer rows; column 0 is the 1-based index into ``q``/``bm``,
            column 1 the 1-based index into ``p``/``bu`` and column 3 the
            0-based date index. Column 2 is not read.

    Returns:
        A 1-D float64 array with one clamped prediction per row of ``X``.
    """
    cdef np.float64_t mlt, pred
    cdef long i, u, m, k, date
    # Local buffers must be declared with `cdef`; `cpdef` only applies to
    # functions/methods and is not accepted for variables by current Cython.
    cdef np.ndarray[np.float64_t, ndim=1] result = np.zeros(X.shape[0])

    for i in range(X.shape[0]):
        m = X[i, 0] - 1
        u = X[i, 1] - 1
        date = X[i, 3]

        mlt = 0.0
        for k in range(p.shape[1]):
            mlt += p[u, k, date] * q[m, k, date]

        pred = mv + bu[u, date] + bm[m, date] + mlt

        # Clamp on a C local so the buffer is written exactly once per row.
        if pred > 5.0:
            pred = 5.0
        elif pred < 1.0:
            pred = 1.0
        result[i] = pred

    return result
        
def _check_sgd_inputs(data, p, q, bu, bm):
    """Raise ValueError if any row of ``data`` would index outside the parameter arrays.

    ``do_sgd`` runs with bounds checking and negative-index wraparound disabled,
    so an out-of-range id would otherwise read or write arbitrary memory.
    """
    if p.shape[1] != q.shape[1]:
        raise ValueError('p and q must have the same number of factors (axis 1)')
    if data.shape[0] == 0:
        return
    if data.shape[1] < 4:
        raise ValueError('data must have at least 4 columns')

    lowest = data.min(axis=0)
    highest = data.max(axis=0)
    m_limit = min(q.shape[0], bm.shape[0])
    u_limit = min(p.shape[0], bu.shape[0])
    date_limit = min(p.shape[2], q.shape[2], bu.shape[1], bm.shape[1])

    if lowest[0] < 1 or highest[0] > m_limit:
        raise ValueError('data[:, 0] must lie in 1..%d' % m_limit)
    if lowest[1] < 1 or highest[1] > u_limit:
        raise ValueError('data[:, 1] must lie in 1..%d' % u_limit)
    if lowest[3] < 0 or highest[3] >= date_limit:
        raise ValueError('data[:, 3] must lie in 0..%d' % (date_limit - 1))


@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
@cython.cdivision_warnings(False)
@cython.cdivision(True)
def do_sgd(np.ndarray[long, ndim=2] data, np.ndarray[np.float64_t, ndim=3] p, np.ndarray[np.float64_t, ndim=3] q,
           np.ndarray[np.float64_t, ndim=2] bu, np.ndarray[np.float64_t, ndim=2] bm, np.float64_t mv,
           int n_epoch=1, np.float64_t lr=0.1, np.float64_t lmd=1.0):
    """Fit the factor and bias arrays in place by cycling through four update stages.

    Every "epoch" is one full pass over ``data`` that updates exactly one
    parameter group, chosen by ``epoch % 4``:

    * 0 -- ``p``: shrink by ``1 - lr * lmd``, then add the gradient step of each
      row divided by the number of rows sharing that ``u``.
    * 1 -- ``q``: the same, using the number of rows sharing that ``m``.
    * 2 -- ``bu``: reset to zero, then accumulate ``residual / (lmd + count_u)``.
    * 3 -- ``bm``: reset to zero, then accumulate ``residual / (lmd + count_m)``.

    All residuals within a pass are computed from the parameter values at the
    start of that pass; the updated group is copied back afterwards.

    Args:
        data: Integer rows ``[m_id (1-based), u_id (1-based), rating, date index]``.
        p, q: Factor arrays indexed ``[id, factor, date]``; modified in place.
        bu, bm: Bias arrays indexed ``[id, date]``; modified in place.
        mv: Constant offset of the model; returned unchanged.
        n_epoch: Number of passes (so a full cycle over all groups needs 4).
        lr: Step size for the ``p`` / ``q`` stages.
        lmd: Regularisation strength.

    Returns:
        The tuple ``(p, q, bu, bm, mv)``; the arrays are the same objects that
        were passed in.

    Raises:
        ValueError: If ``data`` holds an id or date index outside the parameter
            arrays, or ``p`` and ``q`` disagree on the number of factors.
    """
    # `j` is declared explicitly so the hot loops over `data` index the buffers
    # with a C integer rather than relying on type inference.
    cdef long i, j, k, u, m, date, stage
    cdef np.float64_t r, dt, mlt, step, decay
    # Local buffers use `cdef`; `cpdef` is only valid for functions/methods.
    cdef np.ndarray[np.float64_t, ndim=3] p1, q1
    cdef np.ndarray[np.float64_t, ndim=2] bu1, bm1
    cdef np.ndarray[np.float64_t, ndim=1] cntu, cntm

    # Bounds checks are off below, so validate every index once up front.
    _check_sgd_inputs(data, p, q, bu, bm)

    # Number of rows per `u` and per `m`, used to normalise the updates.
    cntu = np.zeros(p.shape[0])
    cntm = np.zeros(q.shape[0])
    for j in range(data.shape[0]):
        m = data[j, 0] - 1
        u = data[j, 1] - 1
        cntu[u] += 1.0
        cntm[m] += 1.0

    # Scratch copies: each stage accumulates here and is copied back at the end
    # of the pass.
    p1 = np.copy(p)
    q1 = np.copy(q)
    bu1 = np.zeros((bu.shape[0], bu.shape[1]))
    bm1 = np.zeros((bm.shape[0], bm.shape[1]))

    decay = 1.0 - lr * lmd

    for i in range(n_epoch):
        t1 = time.time()
        stage = i % 4

        # Start the scratch array of the active group from its regularised base.
        if stage == 0:
            np.multiply(p, decay, out=p1)
        elif stage == 1:
            # Operates on q's own shape (the original looped with p's shape).
            np.multiply(q, decay, out=q1)
        elif stage == 2:
            bu1.fill(0.0)
        else:
            bm1.fill(0.0)

        for j in range(data.shape[0]):
            m = data[j, 0] - 1
            u = data[j, 1] - 1
            r = data[j, 2]
            date = data[j, 3]

            mlt = 0.0
            for k in range(p.shape[1]):
                mlt += p[u, k, date] * q[m, k, date]

            if stage == 0:
                # p_u stage
                dt = mlt + bu[u, date] + bm[m, date] + mv
                step = lr * (r - dt)
                for k in range(p.shape[1]):
                    p1[u, k, date] += step * q[m, k, date] / cntu[u]
            elif stage == 1:
                # q_m stage
                dt = mlt + bu[u, date] + bm[m, date] + mv
                step = lr * (r - dt)
                for k in range(q.shape[1]):
                    q1[m, k, date] += step * p[u, k, date] / cntm[m]
            elif stage == 2:
                # bu stage: residual excludes bu itself.
                # NOTE(review): cntu[u] counts u's rows over *all* dates while
                # bu1[u, date] only accumulates rows of this date -- confirm the
                # normaliser is intended.
                dt = mlt + bm[m, date] + mv
                bu1[u, date] += (r - dt) / (lmd + cntu[u])
            else:
                # bm stage: residual excludes bm itself (same note as for bu).
                dt = mlt + bu[u, date] + mv
                bm1[m, date] += (r - dt) / (lmd + cntm[m])

        # Publish the updated group into the caller's array (in place).
        if stage == 0:
            np.copyto(p, p1)
        elif stage == 1:
            np.copyto(q, q1)
        elif stage == 2:
            np.copyto(bu, bu1)
        else:
            np.copyto(bm, bm1)

        print('Time: ' + str(time.time() - t1))
        print('Epoch:', i, 'Loss:', loss(data, p, q, bu, bm, mv))
        print('\n')
        # Short pause kept from the original, presumably to let the notebook
        # flush the progress output between passes.
        time.sleep(0.5)

    return p, q, bu, bm, mv

In [15]:
# Model dimensions. The axes follow how the Cython routines index the arrays:
# p[u, factor, date], q[m, factor, date], bu[u, date], bm[m, date].
N_USERS = 2649429
N_MOVIES = 17770
N_FACTORS = 2
N_DATE_BUCKETS = 31
INIT_STD = 0.1

# Fixed seed so the random start is reproducible; the draw order (p, q, bu, bm)
# determines the values and must not be rearranged.
np.random.seed(10)
p = np.random.normal(0, INIT_STD, (N_USERS, N_FACTORS, N_DATE_BUCKETS))
q = np.random.normal(0, INIT_STD, (N_MOVIES, N_FACTORS, N_DATE_BUCKETS))
bu = np.random.normal(0, INIT_STD, (N_USERS, N_DATE_BUCKETS))
bm = np.random.normal(0, INIT_STD, (N_MOVIES, N_DATE_BUCKETS))
# Constant offset added to every prediction.
mv = 3.7

In [111]:
# p = np.loadtxt('./als-bias/p-0923.txt')
# q = np.loadtxt('./als-bias/q-0923.txt')
# bu = np.loadtxt('./als-bias/bu-0923.txt')
# bm = np.loadtxt('./als-bias/bm-0923.txt')
# mv = 3.7#float(np.loadtxt('./als-bias/mv-0923.txt'))

In [17]:
# Five passes = one full p/q/bu/bm cycle plus a second p pass.
# Not idempotent: the arrays are updated in place, so re-running this cell
# continues training from the current state.
p, q, bu, bm, mv = do_sgd(
    train_data, p, q, bu, bm, mv,
    n_epoch=5, lr=0.001, lmd=0.01,
)

Time: 15.076218128204346
Epoch: 0 Loss: 1.0269635289961123


Time: 10.93113398551941
Epoch: 1 Loss: 1.0269635124062781


Time: 11.902992963790894
Epoch: 2 Loss: 1.0275813566857994


Time: 5.962985038757324
Epoch: 3 Loss: 1.0273938259207414


Time: 73.7034649848938
Epoch: 4 Loss: 1.0273935736157584




In [8]:
# Earlier submission files, loaded for blending. Paths are resolved against the
# current user's home directory instead of a hard-coded /Users/<name> prefix.
downloads_dir = Path.home() / 'Downloads'
answer1 = pd.read_csv(downloads_dir / 'submission_mcmc0112_25it_0130.txt.csv')
answer2 = pd.read_csv(downloads_dir / 'result.txt')
answer3 = pd.read_csv(downloads_dir / 'submission_mcmc0112_35it_0130.csv')

In [2]:
# Header-less prediction file; resolved against the user's home directory
# rather than a hard-coded /Users/<name> prefix.
answer4 = pd.read_csv(Path.home() / 'Downloads' / 'out.txt', header=None)

In [2]:
# Header-less prediction file; resolved against the user's home directory
# rather than a hard-coded /Users/<name> prefix.
answer5 = pd.read_csv(Path.home() / 'Downloads' / 'out-2.txt', header=None)

In [4]:
# Earlier submission with a header row; resolved against the user's home
# directory rather than a hard-coded /Users/<name> prefix.
# NOTE(review): if this notebook lives in that netflix/ folder, a path relative
# to the notebook would be simpler -- confirm.
answer0 = pd.read_csv(
    Path.home() / 'Documents' / 'Programing' / 'Technospere' / 'datamining-2' / 'netflix' / 'result-81424.txt'
)

In [3]:
# Header-less prediction file used for the final submission; resolved against
# the user's home directory rather than a hard-coded /Users/<name> prefix.
# NOTE(review): if this notebook lives in that netflix/ folder, a path relative
# to the notebook would be simpler -- confirm.
answer6 = pd.read_csv(
    Path.home() / 'Documents' / 'Programing' / 'Technospere' / 'datamining-2' / 'netflix' / 'out150.txt',
    header=None,
)

In [22]:
# X_test = np.array(pd.read_csv('./test.txt', header=None).iloc[:, 0:2])
# result = pd.DataFrame(columns=['StringId', 'Mark'])
# index = [i+1 for i in range(X_test.shape[0])]

In [4]:
# Submission skeleton: StringId runs 1..N_SUBMISSION_ROWS, Mark is filled in later.
# NOTE(review): the row count is hard-coded; presumably it equals the number of
# test rows -- confirm against the test file.
N_SUBMISSION_ROWS = 20095978
index = list(range(1, N_SUBMISSION_ROWS + 1))
# To score with the in-notebook model instead:
#   result['Mark'] = predict(p, q, bu, bm, mv, X_test)
result = pd.DataFrame(columns=['StringId', 'Mark'])
result['StringId'] = index

In [5]:
# Final marks: column 0 of answer6, used as-is.
# Blends that were tried here previously (kept for reference):
#   answer5[0] * 0.7 + answer0['Mark'] * 0.3
#   answer4[0] * 0.7 + 0.3 * (((answer1['Mark'] + answer2['Mark']) / 2.0) * 0.3 + answer3['Mark'] * 0.7)
result['Mark'] = answer6[0]

In [6]:
# Write the submission with a header row, no index column and marks rounded
# to three decimals.
result.to_csv(
    path_or_buf='result.txt',
    float_format='%.3f',
    index=False,
)

In [30]:
# Compress the submission file into result.txt.zip (updates the archive in
# place if it already exists).
!zip result.txt.zip result.txt 

updating: result.txt (deflated 72%)


In [6]:
!kg submit result.txt -c movie-recomendation-competition-ts-spring-2017 -u 'dmitry103' -p 'iopkl5374'

Starting new HTTPS connection (1): www.kaggle.com
'NoneType' object has no attribute 'group'


In [127]:
# np.savetxt only accepts 1-D or 2-D input, but p and q are now 3-D
# (id, factor, date). Flatten the trailing axes into one row per id; this is a
# no-op for the older 2-D arrays. When loading, restore the shape with
# `.reshape(n_ids, n_factors, n_dates)`.
np.savetxt('./als-bias/p-0921.txt', p.reshape(p.shape[0], -1))
np.savetxt('./als-bias/q-0921.txt', q.reshape(q.shape[0], -1))

In [128]:
# Persist the bias tables and the constant offset next to the factor files.
# `mv` is a scalar, so it is wrapped in a list to give savetxt a 1-D input.
for _target, _values in (
    ('./als-bias/bu-0921.txt', bu),
    ('./als-bias/bm-0921.txt', bm),
    ('./als-bias/mv-0921.txt', [mv]),
):
    np.savetxt(_target, _values)