In [2]:
import json
import os
import typing as tp
from datetime import date, datetime

import numpy as np
import optuna
import pandas as pd

from loguru import logger
from scipy import sparse
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [3]:
train1 = pd.read_csv('../data/train1level.csv')
test1 = pd.read_csv('../data/test1level.csv')
holdout1 = pd.read_csv('../data/holdout1level.csv')

In [4]:
items = train1.movieid.unique()
test1 = test1[test1.movieid.isin(items)]
users_test = test1.userid.unique()
holdout1 = holdout1[holdout1.userid.isin(users_test)]

In [5]:
train1['rating'], rating_idx_map = pd.factorize(train1['rating'], sort=True)

In [6]:
test1['rating'], rating_idx_map = pd.factorize(test1['rating'], sort=True)

In [7]:
rating_idx_map

Float64Index([0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0], dtype='float64')

In [8]:
test1['rating'].unique()

array([5, 3, 9, 0, 8, 7, 6, 4, 2, 1])

In [9]:
import numpy as np
from scipy.sparse.linalg import svds
from polara.lib.sparse import dttm_seq, dttm_par
from polara.lib.sparse import arrange_indices
from polara.tools.display import log_status

In [10]:
def core_growth_callback(growth_tol, verbose=True):
    def check_core_growth(step, core, factors):
        singular_values, _ = core
        core_norm = np.linalg.norm(singular_values)
        g_growth = (core_norm - check_core_growth.core_norm) / core_norm
        check_core_growth.core_norm = core_norm
        log_status(f'growth of the core: {g_growth}')
        if g_growth < growth_tol:
            log_status(f'Core is no longer growing. Norm of the core: {core_norm}.', verbose=verbose)
            raise StopIteration
    check_core_growth.core_norm = 0
    return check_core_growth


def ttm3d_seq(idx, val, shape, U, V, modes, dtype=None):
    mode1, mat_mode1 = modes[0]
    mode2, mat_mode2 = modes[1]

    u = U.T if mat_mode1 == 1 else U
    v = V.T if mat_mode2 == 1 else V

    mode0, = [x for x in (0, 1, 2) if x not in (mode1, mode2)]
    new_shape = (shape[mode0], U.shape[1-mat_mode1], V.shape[1-mat_mode2])

    res = np.zeros(new_shape, dtype=dtype)
    dttm_seq(idx, val, u, v, mode0, mode1, mode2, res)
    return res


def ttm3d_par(idx, val, shape, U, V, modes, unqs, inds, dtype=None):
    mode1, mat_mode1 = modes[0]
    mode2, mat_mode2 = modes[1]

    u = U.T if mat_mode1 == 1 else U
    v = V.T if mat_mode2 == 1 else V

    mode0, = [x for x in (0, 1, 2) if x not in (mode1, mode2)]
    new_shape = (shape[mode0], U.shape[1-mat_mode1], V.shape[1-mat_mode2])

    res = np.zeros(new_shape, dtype=dtype)
    dttm_par(idx, val, u, v, mode1, mode2, unqs, inds, res)
    return res


def initialize_factors(dims, ranks, seed):
    random_state = np.random if seed is None else np.random.RandomState(seed)
    factors = []
    for dim, rank in zip(dims, ranks):
        u_rnd = random_state.rand(dim, rank)
        u = np.linalg.qr(u_rnd, mode='reduced')[0]
        factors.append(u)
    return factors


def hooi(idx, val, shape, core_shape, return_core=True, num_iters=4,
         parallel_ttm=False, growth_tol=0.001, iter_callback=None,
         verbose=True, seed=None):
    '''
    Compute Tucker decomposition of a sparse tensor in COO format
    with the help of HOOI algorithm. Usage:
    u0, u1, u2, g = hooi(idx, val, shape, core_shape)
    '''
    tensor_data = idx, val, shape
    if not isinstance(parallel_ttm, (list, tuple)):
        parallel_ttm = [parallel_ttm] * len(shape)

    assert len(shape) == len(parallel_ttm)

    index_data = arrange_indices(idx, parallel_ttm)
    ttm = [ttm3d_par if par else ttm3d_seq for par in parallel_ttm]

    if iter_callback is None:
        iter_callback = core_growth_callback(growth_tol, verbose=verbose)
    iter_callback.stop_reason = 'Exceeded max iterations limit.'

    u1, u2 = initialize_factors(shape[1:], core_shape[1:], seed)
    g = None
    r0, r1, r2 = core_shape
    return_core_vectors = True if return_core else 'u'
    for i in range(num_iters):
        log_status('Step %i of %i' % (i+1, num_iters), verbose=verbose)

        u0 = ttm[0](*tensor_data, u2, u1, ((2, 0), (1, 0)), *index_data[0]).reshape(shape[0], r1*r2)
        uu, *_ = svds(u0, k=r0, return_singular_vectors='u')
        u0 = np.ascontiguousarray(uu[:, ::-1])

        u1 = ttm[1](*tensor_data, u2, u0, ((2, 0), (0, 0)), *index_data[1]).reshape(shape[1], r0*r2)
        uu, *_ = svds(u1, k=r1, return_singular_vectors='u')
        u1 = np.ascontiguousarray(uu[:, ::-1])

        u2 = ttm[2](*tensor_data, u1, u0, ((1, 0), (0, 0)), *index_data[2]).reshape(shape[2], r0*r1)
        uu, *core = svds(u2, k=r2, return_singular_vectors=return_core_vectors)
        u2 = np.ascontiguousarray(uu[:, ::-1])

        try:
            iter_callback(i, core, (u0, u1, u2))
        except StopIteration:
            iter_callback.stop_reason = 'Stopping criteria met.'
            break

    if return_core:
        ss, vv = core
        g = (
            np.ascontiguousarray((ss[:, np.newaxis] * vv)[::-1, :])
            .reshape(r2, r1, r0)
            .transpose(2, 1, 0)
        )
    log_status('Done')
    return u0, u1, u2, g

In [11]:
train = train1.drop(['timestamp'], axis=1)
idx = train.values
val = np.ones(idx.shape[0])
n_items = len(items)  # max(train1.movieid) 
n_users = max(train1.userid) 
shape = [n_users, n_items, 10]
core_shape = [70, 70,  5]

In [12]:
idx.shape

(7368284, 3)

In [13]:
val.shape

(7368284,)

In [14]:
u0, u1, u2, g = hooi(idx, val, shape, core_shape, num_iters=4, verbose=True, seed=1509)

Step 1 of 4
growth of the core: 1.0
Step 2 of 4
growth of the core: 0.2009608051839468
Step 3 of 4
growth of the core: 0.0151839099928466
Step 4 of 4
growth of the core: 0.00269574297867928
Done


In [15]:
u1.shape

(9576, 70)

In [16]:
u2.shape

(10, 5)

In [17]:
u0.shape

(64679, 70)

In [18]:
current_u2 = u2[-3:, :]

In [21]:
V_VT = u1 @ u1.T
V_VT.shape

(9576, 9576)

In [23]:
items.shape

(9350,)

In [20]:

for user in test1.userid.unique():
    current_test = test1[test1['userid'] == user]
    current_mat = load_train_data(current_test)
    print(current_mat.shape)
    break

(1, 9436)


In [99]:

W_WT = current_u2 @ current_u2.T

In [19]:
def load_train_data(
        train_data: pd.DataFrame
) -> sparse.csr_matrix:
    """
    Creates csr_matrix for train
    """
    
    n_items = max(train_data.movieid) + 1
    n_users = 1
    rows, cols = train_data["userid"], train_data["movieid"]
    data = sparse.csr_matrix((np.ones_like(rows), (rows, cols)), dtype="float64", shape=(n_users, n_items))
    return data

In [111]:
u1.shape, current_u2.shape

((9576, 70), (3, 5))

In [108]:
for user in test1.userid.unique():
    current_test = test1[test1['userid'] == user]
    current_mat = load_train_data(current_test)
    print(current_mat.shape)
    print(V_VT.shape)
    print(W_WT.shape)
    R = V_VT @ current_mat @ W_WT
    print(R.shape)

(1, 9754)
(9576, 9576)
(3, 3)


ValueError: dimension mismatch