In [2]:
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler

import json
import os
import typing as tp
from datetime import date, datetime

import numpy as np
import optuna
import pandas as pd
# from common_metrics.metrics.recsys import MAP, HitRate, NDCG, PrecisionRecall
from loguru import logger
from scipy import sparse
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [3]:
# Load the three first-level splits (train / test / holdout) from CSV.
train1, test1, holdout1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train1level.csv',
        '../data/test1level.csv',
        '../data/holdout1level.csv',
    )
)

In [4]:
# Drop test interactions whose movie never occurs in train, then keep only
# the holdout rows of users that still have at least one test interaction.
items = train1["movieid"].unique()
test1 = test1.loc[test1["movieid"].isin(items)]
users_test = test1["userid"].unique()
holdout1 = holdout1.loc[holdout1["userid"].isin(users_test)]

In [5]:
def load_train_data(
        train_data: pd.DataFrame,
        shape: tp.Optional[tp.Tuple[int, int]] = None,
) -> sparse.csr_matrix:
    """
    Creates csr_matrix for train

    Builds a user-by-item interaction matrix from the ``userid`` and
    ``movieid`` columns of ``train_data``. Every row of the frame contributes
    1.0 to cell ``(userid, movieid)``; repeated pairs are summed by the sparse
    constructor.

    Parameters
    ----------
    train_data : pd.DataFrame
        Interactions with integer ``userid`` and ``movieid`` columns that are
        used directly as row / column indices.
    shape : tuple of (n_users, n_items), optional
        Explicit matrix shape. When omitted the shape is inferred as
        ``(max(userid) + 1, max(movieid) + 1)``, so two different frames can
        yield matrices with different dimensions; pass ``shape`` to align them.

    Returns
    -------
    sparse.csr_matrix
        ``float64`` matrix of the requested (or inferred) shape.

    Raises
    ------
    ValueError
        If ``shape`` is omitted and ``train_data`` has no rows, because there
        is nothing to infer the dimensions from.
    """
    rows = train_data["userid"].to_numpy()
    cols = train_data["movieid"].to_numpy()

    if shape is None:
        if len(train_data) == 0:
            raise ValueError(
                "cannot infer the matrix shape from empty interaction data; "
                "pass `shape` explicitly"
            )
        # Vectorised max instead of the builtin, which iterates element-wise.
        shape = (int(rows.max()) + 1, int(cols.max()) + 1)

    values = np.ones(len(rows), dtype="float64")
    return sparse.csr_matrix((values, (rows, cols)), shape=shape, dtype="float64")

In [6]:
# Build the user-item interaction matrices for the train and test splits.
data, test = (load_train_data(split) for split in (train1, test1))

logger.info("data ready")

2022-07-11 09:37:38.713 | INFO     | __main__:<cell line: 4>:4 - data ready


In [25]:
def nonzeros(m, row):
    """Yield ``(column index, stored value)`` pairs for one row of a CSR matrix.

    ``m`` must expose the CSR arrays ``indptr``, ``indices`` and ``data``;
    ``row`` is the row whose stored entries are iterated in storage order.
    """
    start, stop = m.indptr[row], m.indptr[row + 1]
    yield from zip(m.indices[start:stop], m.data[start:stop])
      
      
def implicit_als_cg(Cui, features=20, iterations=20, lambda_val=0.1, random_state=None):
    """Fit user and item factors by alternating conjugate-gradient sweeps.

    Each iteration first updates the user factors against the current item
    factors and then the item factors against the updated user factors, both
    through ``least_squares_cg``.

    Parameters
    ----------
    Cui : scipy sparse matrix, shape (n_users, n_items)
        Confidence matrix; converted to CSR here, together with its transpose.
    features : int
        Number of latent factors per user / item.
    iterations : int
        Number of alternating sweeps.
    lambda_val : float
        Regularisation weight forwarded to ``least_squares_cg``.
    random_state : int, optional
        Seed for the factor initialisation. ``None`` (default) draws from the
        global NumPy random state, exactly as before, so results then differ
        between runs unless the caller seeded NumPy.

    Returns
    -------
    tuple of (sparse.csr_matrix, sparse.csr_matrix)
        User factors ``(n_users, features)`` and item factors
        ``(n_items, features)``.
    """
    user_size, item_size = Cui.shape

    # Both generators expose ``rand``; the seeded one makes runs repeatable.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    X = rng.rand(user_size, features) * 0.01
    Y = rng.rand(item_size, features) * 0.01

    Cui, Ciu = Cui.tocsr(), Cui.T.tocsr()

    for _ in tqdm(range(iterations)):
        least_squares_cg(Cui, X, Y, lambda_val)
        least_squares_cg(Ciu, Y, X, lambda_val)

    return sparse.csr_matrix(X), sparse.csr_matrix(Y)
  
    
def least_squares_cg(Cui, X, Y, lambda_val, cg_steps=3):
    """Refine every row of ``X`` in place with a few conjugate-gradient steps.

    For row ``u`` of ``Cui`` with stored confidences ``c_i`` at columns ``i``,
    the system being solved is ``A x = b`` with
    ``A = Y.T @ Y + lambda_val * I + sum_i (c_i - 1) * outer(Y[i], Y[i])`` and
    ``b = sum_i c_i * Y[i]``. ``A`` is never formed; only its products with
    vectors are computed.

    Parameters
    ----------
    Cui : CSR matrix, shape (X.shape[0], Y.shape[0])
        Confidence values; read through ``indptr`` / ``indices`` / ``data``.
    X : np.ndarray, shape (n_rows, features)
        Factors to update. Modified in place, starting from their current
        values.
    Y : np.ndarray, shape (n_cols, features)
        Fixed factors of the other side.
    lambda_val : float
        Ridge regularisation added to the diagonal of ``Y.T @ Y``.
    cg_steps : int
        Maximum number of conjugate-gradient steps per row.

    Returns
    -------
    None
    """
    users, features = X.shape

    # Gram matrix shared by every row solve, with the ridge term folded in.
    YtY = Y.T.dot(Y) + lambda_val * np.eye(features)

    indptr, indices, data = Cui.indptr, Cui.indices, Cui.data
    # Squared-residual floor: below it the row is treated as converged, which
    # also prevents the 0/0 division that would write NaN into the factors.
    eps = 1e-20

    for u in range(users):
        start, stop = indptr[u], indptr[u + 1]
        Yu = Y[indices[start:stop]]   # factors of the columns stored in row u
        conf = data[start:stop]       # matching confidence values

        x = X[u]  # view into X, so the in-place updates below write through

        # Residual r = b - A x, computed for all stored entries at once.
        r = -YtY.dot(x) + Yu.T.dot(conf - (conf - 1) * Yu.dot(x))

        p = r.copy()
        rsold = r.dot(r)
        if rsold < eps:
            continue

        for _ in range(cg_steps):
            # A p without materialising A.
            Ap = YtY.dot(p) + Yu.T.dot((conf - 1) * Yu.dot(p))

            alpha = rsold / p.dot(Ap)
            x += alpha * p
            r -= alpha * Ap

            rsnew = r.dot(r)
            if rsnew < eps:
                break
            p = r + (rsnew / rsold) * p
            rsold = rsnew

        X[u] = x

# Weight the train interaction matrix by the confidence factor, then fit
# both user and item factors on it.
alpha_val = 15
conf_data = (alpha_val * data).astype('double')
user_vecs, item_vecs = implicit_als_cg(conf_data, features=20, iterations=10)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [43:33<00:00, 261.32s/it]


In [26]:
def nonzeros(m, row):
    """Yield ``(column index, stored value)`` pairs for one row of a CSR matrix.

    ``m`` must expose the CSR arrays ``indptr``, ``indices`` and ``data``;
    ``row`` is the row whose stored entries are iterated in storage order.
    """
    start, stop = m.indptr[row], m.indptr[row + 1]
    yield from zip(m.indices[start:stop], m.data[start:stop])
      
      
def implicit_als_cg(Cui, Y, features=20, iterations=20, lambda_val=0.1, random_state=None):
    """Fit user factors against fixed item factors ``Y``.

    Unlike the two-sided trainer of the same name defined in an earlier cell
    (which this definition shadows), only the user side is updated; ``Y`` is
    passed to ``least_squares_cg`` as the fixed side and returned unchanged.

    Parameters
    ----------
    Cui : scipy sparse matrix, shape (n_users, n_items)
        Confidence matrix; converted to CSR here. Its column indices are used
        to index rows of ``Y``.
    Y : np.ndarray, shape (n_items, features)
        Item factors to hold fixed.
    features : int
        Number of latent factors; must equal ``Y.shape[1]``.
    iterations : int
        Number of conjugate-gradient sweeps over the users.
    lambda_val : float
        Regularisation weight forwarded to ``least_squares_cg``.
    random_state : int, optional
        Seed for the user-factor initialisation. ``None`` (default) draws from
        the global NumPy random state, exactly as before.

    Returns
    -------
    tuple of (sparse.csr_matrix, sparse.csr_matrix)
        Fitted user factors and the given item factors.

    Raises
    ------
    ValueError
        If ``features`` does not match the width of ``Y``.
    """
    if Y.shape[1] != features:
        raise ValueError(
            f"features={features} does not match the item factor width {Y.shape[1]}"
        )

    user_size = Cui.shape[0]

    # Both generators expose ``rand``; the seeded one makes runs repeatable.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    X = rng.rand(user_size, features) * 0.01

    # Only the user-side solve runs, so the item-major transpose is not needed.
    Cui = Cui.tocsr()

    for _ in tqdm(range(iterations)):
        least_squares_cg(Cui, X, Y, lambda_val)

    return sparse.csr_matrix(X), sparse.csr_matrix(Y)
  
    
def least_squares_cg(Cui, X, Y, lambda_val, cg_steps=3):
    """Refine every row of ``X`` in place with a few conjugate-gradient steps.

    For row ``u`` of ``Cui`` with stored confidences ``c_i`` at columns ``i``,
    the system being solved is ``A x = b`` with
    ``A = Y.T @ Y + lambda_val * I + sum_i (c_i - 1) * outer(Y[i], Y[i])`` and
    ``b = sum_i c_i * Y[i]``. ``A`` is never formed; only its products with
    vectors are computed.

    Parameters
    ----------
    Cui : CSR matrix, shape (X.shape[0], Y.shape[0])
        Confidence values; read through ``indptr`` / ``indices`` / ``data``.
    X : np.ndarray, shape (n_rows, features)
        Factors to update. Modified in place, starting from their current
        values.
    Y : np.ndarray, shape (n_cols, features)
        Fixed factors of the other side.
    lambda_val : float
        Ridge regularisation added to the diagonal of ``Y.T @ Y``.
    cg_steps : int
        Maximum number of conjugate-gradient steps per row.

    Returns
    -------
    None
    """
    users, features = X.shape

    # Gram matrix shared by every row solve, with the ridge term folded in.
    YtY = Y.T.dot(Y) + lambda_val * np.eye(features)

    indptr, indices, data = Cui.indptr, Cui.indices, Cui.data
    # Squared-residual floor: below it the row is treated as converged, which
    # also prevents the 0/0 division that would write NaN into the factors.
    eps = 1e-20

    for u in range(users):
        start, stop = indptr[u], indptr[u + 1]
        Yu = Y[indices[start:stop]]   # factors of the columns stored in row u
        conf = data[start:stop]       # matching confidence values

        x = X[u]  # view into X, so the in-place updates below write through

        # Residual r = b - A x, computed for all stored entries at once.
        r = -YtY.dot(x) + Yu.T.dot(conf - (conf - 1) * Yu.dot(x))

        p = r.copy()
        rsold = r.dot(r)
        if rsold < eps:
            continue

        for _ in range(cg_steps):
            # A p without materialising A.
            Ap = YtY.dot(p) + Yu.T.dot((conf - 1) * Yu.dot(p))

            alpha = rsold / p.dot(Ap)
            x += alpha * p
            r -= alpha * Ap

            rsnew = r.dot(r)
            if rsnew < eps:
                break
            p = r + (rsnew / rsold) * p
            rsold = rsnew

        X[u] = x

# Apply the same confidence weight to the test interactions and fit user
# factors for them against the already trained item factors.
alpha_val = 15
conf_data = (alpha_val * test).astype('double')
user_vecs, item_vecs = implicit_als_cg(conf_data, item_vecs.toarray(), features=20, iterations=10)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:09<00:00,  6.94s/it]


In [27]:
# Predicted score of every item for every user: user factors times item factors transposed.
AA_ials = user_vecs @ item_vecs.T

In [28]:
# Sanity check: one row per row of user_vecs, one column per row of item_vecs.
AA_ials.shape

(4048, 9577)

In [29]:
# Densify the sparse score matrix to eyeball the predicted values.
AA_ials.toarray()

array([[ 3.64546247e-01,  1.58374819e-01,  6.73668254e-02, ...,
         6.65971554e-10,  9.19521758e-04,  8.29918317e-03],
       [ 1.71671574e-01,  4.82837509e-02,  1.85174768e-01, ...,
         6.04214690e-10,  2.11284335e-03,  1.72451423e-02],
       [ 9.05439862e-05,  1.11235490e-03,  1.03627991e-03, ...,
         9.09000282e-11,  1.79611884e-04,  1.23088042e-03],
       ...,
       [ 5.97558033e-01,  6.14115244e-01,  2.56316868e-01, ...,
         1.62605625e-09,  2.12360220e-03,  2.53384295e-02],
       [ 1.13762776e+00,  1.03432590e+00,  6.10457864e-01, ...,
         1.61390623e-09,  3.80733507e-03,  3.05354072e-02],
       [ 4.52109633e-02, -3.37236162e-02,  1.46415954e-02, ...,
         1.91657593e-09,  1.72518581e-03,  2.09891007e-02]])

In [30]:
def HR(scores, holdout1, count_of_rec = 10):
    """Hit rate at ``count_of_rec``: share of holdout rows whose movie is recommended.

    For every holdout row the ``count_of_rec`` highest-scoring item indices of
    that row's user are taken from ``scores``; the row counts as a hit when its
    ``movieid`` is among them.

    Parameters
    ----------
    scores : np.ndarray, shape (n_users, n_items)
        Dense score matrix whose row index is the user id and whose column
        index is the movie id.
    holdout1 : pd.DataFrame
        Held-out interactions with ``userid`` and ``movieid`` columns.
    count_of_rec : int
        Number of top-scored items considered per user.

    Returns
    -------
    float
        Fraction of holdout rows that were hit; ``0.0`` for an empty holdout.
    """
    n_rows = len(holdout1)
    if n_rows == 0:
        return 0.0

    # Select score rows by user id. The DataFrame index label is not a user id
    # (it is whatever survived earlier filtering), so it must not be used here.
    users = holdout1["userid"].to_numpy()
    movies = holdout1["movieid"].to_numpy()

    # Rank only the rows that are needed; negate so the best items come first.
    top_items = np.argsort(-scores[users], axis=1)[:, :count_of_rec]
    hits = (top_items == movies[:, None]).any(axis=1)
    return int(hits.sum()) / n_rows

In [31]:
# Hit rate of the default top-10 recommendations on the holdout interactions.
hr = HR(scores=AA_ials.toarray(), holdout1=holdout1)
hr

0.027527527527527528