In [1]:
import os
import random
import logging

import numpy as np
import pandas as pd
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight
from numpy.random import MT19937, RandomState, SeedSequence
from scipy.optimize import minimize
from scipy.sparse import csr_matrix
from tqdm import tqdm

In [2]:
# logging
# Every record goes both to a log file and to a console stream handler, using
# the same timestamped format.
logger = logging.getLogger("Implicit")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
# The log directory is only created near the end of the notebook, so make sure
# it exists here; otherwise FileHandler raises FileNotFoundError on a fresh
# checkout.
os.makedirs("../../data/alphas/Implicit", exist_ok=True)
# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack handlers and emit each record several times.
for stream in list(logger.handlers):
    logger.removeHandler(stream)
    stream.close()
for stream in [logging.FileHandler("../../data/alphas/Implicit/log"), logging.StreamHandler()]:
    stream.setFormatter(formatter)
    logger.addHandler(stream)

## Prepare Data

In [3]:
# Load the training and validation splits from their CSV files.
training, validation = (
    pd.read_csv(split_path)
    for split_path in (
        "../../data/splits/training.csv",
        "../../data/splits/validation.csv",
    )
)

## Evaluate model

In [4]:
# Distinct users present in the validation split; the loss samples from this list.
valid_users = list(set(validation["username"]))

In [5]:
def get_recommendations(model, mat, user):
    """Return the model's scores for ``user``, reordered by item id.

    Requests ``mat.shape[1]`` recommendations from ``model.recommend`` without
    filtering already-liked items, then sorts the returned scores by their
    item id. Assuming ``recommend`` returns ``(item ids, scores)`` covering
    every item exactly once, position ``i`` of the result is the score of
    item ``i``.
    """
    item_ids, scores = model.recommend(
        user,
        mat[user],
        filter_already_liked_items=False,
        N=mat.shape[1],
    )
    # argsort of the ids gives, for each id in increasing order, its position
    # in the returned ranking.
    return scores[np.argsort(item_ids)]

In [6]:
def get_items(user):
    """Return the ``anime_id`` values of every validation row whose ``username`` is ``user``."""
    is_user_row = validation["username"] == user
    return validation.loc[is_user_row, "anime_id"].tolist()

In [7]:
def distribution(preds):
    """Turn an array of raw scores into weights that sum to one.

    Parameters
    ----------
    preds : numpy.ndarray
        Raw scores. They are clipped to ``[1e-20, 1e6]`` first, so zero or
        negative scores become a tiny positive weight (keeping a later
        ``log`` finite) and very large scores are capped.

    Returns
    -------
    numpy.ndarray
        The clipped scores divided by their total.
    """
    clipped = preds.clip(min=1e-20, max=1e6)
    # ndarray.sum() rather than the builtin sum(): the builtin iterates over
    # the array one element at a time in Python, and this function runs once
    # per sampled user on a full item-score vector.
    return clipped / clipped.sum()

In [8]:
def get_random_crossentropy_loss(model, mat, rng):
    """Cross-entropy of the model on one randomly drawn validation user.

    Draws a user from ``valid_users`` with ``rng``, turns the model's item
    scores for that user into a distribution, and returns the mean negative
    log of the weights given to that user's validation items.
    """
    sampled_user = rng.choice(valid_users)
    item_scores = get_recommendations(model, mat, sampled_user)
    item_probs = distribution(item_scores)
    held_out_items = get_items(sampled_user)
    log_probs = np.log(item_probs[held_out_items])
    return -log_probs.mean()

In [9]:
def loss(model, mat, seed=20220302, iters=1000):
    """Average the single-user cross-entropy over ``iters`` random draws.

    A fresh ``RandomState`` built from ``seed`` is passed to every draw, so
    calls with the same ``seed`` consume the same random sequence.
    """
    rng = RandomState(MT19937(SeedSequence(seed)))
    per_user_losses = [
        get_random_crossentropy_loss(model, mat, rng) for _ in tqdm(range(iters))
    ]
    return np.array(per_user_losses).mean()

In [10]:
def train_model(a, K=None):
    """Fit an ALS model on the training interactions for one parameter set.

    Parameters
    ----------
    a : sequence
        ``a[0]`` scales ``training.my_score`` in the interaction weights
        (every observed pair gets ``1 + a[0] * my_score``), ``a[1]`` is the
        ALS regularization and ``a[2]`` the number of latent factors.
    K : int, optional
        Number of latent factors. When omitted, ``a[2]`` is used, so the
        function can be called with the parameter tuple alone.

    Returns
    -------
    tuple
        ``(model, mat)``: the fitted model and the sparse matrix it was
        fitted on.
    """
    factors = a[2] if K is None else K
    mat = csr_matrix(
        (1 + a[0] * training.my_score, (training.username, training.anime_id))
    )
    # iterations=10: faster settings so we can iterate more
    model = AlternatingLeastSquares(factors=factors, regularization=a[1], iterations=10)
    model.fit(mat)
    # Return the pair explicitly; a bare `model, mat` expression returns None.
    return model, mat
    
def model_loss(a):
    """Train a model for the parameters ``a`` and return its validation loss.

    Both the parameters and the resulting loss are written to ``logger`` at
    DEBUG level.
    """
    logger.debug(f"params: {a}")
    fitted_model, interactions = train_model(a)
    loss_value = loss(fitted_model, interactions)
    logger.debug(f"loss: {loss_value}")
    return loss_value

In [11]:
# Grid search over (number of factors, alpha, regularization). Each result is
# printed as soon as it is available because every evaluation trains a model.
# NOTE(review): assumes K = 100 is the intended setting for the final model
# trained further down (it is passed there as `factors=K`) — confirm.
K = 100
losses = []
# A dedicated loop variable is used on purpose: a `for` target stays bound
# after the loop, so iterating over `K` itself would leave K == 10 behind and
# silently change the model fitted in the final training cell.
for n_factors in [100, 10]:
    for alpha in [0, 1, 10]:
        for reg in [0, 0.1, 1]:
            losses.append((n_factors, alpha, reg, model_loss((alpha, reg, n_factors))))
            print(losses[-1])

In [13]:
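# Output of an earlier run of the grid search, kept for reference. The tuples
# appear to be (alpha, reg, loss); the current loop additionally records the
# number of factors as the first element.
# [(0, 0, 7.231668),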
#  (0, 0.1, 7.1817513),
#  (0, 1, 7.3454037),
#  (1, 0, 7.2985926),
#  (1, 0.1, 7.332408),
#  (1, 1, 7.147287),
#  (10, 0, 7.503974),
#  (10, 0.1, 7.5582485),
#  (10, 1, 7.4923224)]
#losses

In [17]:
# Fit the model that gets saved below, using the training split.
# `K` (number of factors) is not set here: it is whatever value an earlier
# cell left in the namespace. This fit uses the library's default iteration
# count, since `iterations` is not passed.
a = (1, 1)  # (score weight, regularization)
score_weight, regularization = a
mat = csr_matrix(
    (
        1 + score_weight * training["my_score"],
        (training["username"], training["anime_id"]),
    )
)
als_model = AlternatingLeastSquares(factors=K, regularization=regularization)
als_model.fit(mat)

  0%|          | 0/15 [00:00<?, ?it/s]

## Save model

In [19]:
# Output directory for the saved factors. makedirs(exist_ok=True) also creates
# missing parent directories and avoids the race between checking for the
# directory and creating it.
path = "../../data/alphas/Implicit"
os.makedirs(path, exist_ok=True)

In [20]:
# Write the item and user factor matrices as comma-separated text, item
# factors first. np.savetxt compresses the output with gzip because the file
# names end in ".gz".
for file_name, factors_attr in (
    ("als_item_factors.csv.gz", "item_factors"),
    ("als_user_factors.csv.gz", "user_factors"),
):
    np.savetxt(
        os.path.join(path, file_name), getattr(als_model, factors_attr), delimiter=","
    )