In [17]:
import typing as tp
from datetime import date, datetime

import numpy as np
import optuna
import pandas as pd
import scipy
# from common_metrics.metrics.recsys import MAP, HitRate, NDCG, PrecisionRecall
from lightfm import LightFM, data
from loguru import logger
from tqdm import tqdm

In [39]:
train1 = pd.read_csv('../data/train1level.csv')
test1 = pd.read_csv('../data/test1level.csv')
holdout1 = pd.read_csv('../data/holdout1level.csv')

In [40]:
items = train1.movieid.unique()
test1 = test1[test1.movieid.isin(items)]
users_test = test1.userid.unique()
holdout1 = holdout1[holdout1.userid.isin(users_test)]

In [41]:
train1 = train1[['userid','movieid']]
test1 = test1[['userid','movieid']]
holdout1 = holdout1[['userid','movieid']]

In [44]:
def build_interactions_matrix(
        train: pd.DataFrame, test: pd.DataFrame
) -> tp.Tuple[tp.Tuple[scipy.sparse.coo_matrix, scipy.sparse.coo_matrix], data.Dataset]:
    """
    Builds interactions matrix for train, test and creats dataset
    """
    logger.info("start building interactions")
    dataset = data.Dataset()
    dataset.fit((train["userid"].values), (train["movieid"].values))
    train_interact = dataset.build_interactions(train.to_numpy())
    
#     dataset.fit_partial((test["userid"].values), (test["movieid"].values))
#     test_interact = dataset.build_interactions(test.to_numpy())
    logger.info("end building interactions")
    return train_interact, dataset

In [45]:
train_interact, dataset = build_interactions_matrix(train1, test1)

2022-07-08 22:30:41.515 | INFO     | __main__:build_interactions_matrix:7 - start building interactions
2022-07-08 22:31:04.963 | INFO     | __main__:build_interactions_matrix:14 - end building interactions


In [46]:
def fit_lightfm(
        train_interact: tp.Tuple[scipy.sparse.coo_matrix, scipy.sparse.coo_matrix], params: tp.Dict[str, tp.Any],
    epoch_lightfm: int
) -> tp.Tuple[LightFM, tp.Dict[str, tp.Any]]:
    """
    Fits lightfm with given parameters
    """
    model = LightFM(**params)
    logger.info("start fitting")
    for i in tqdm(range(epoch_lightfm)):
        model.fit_partial(
            train_interact[0],
            sample_weight=train_interact[1],
            epochs=1,
            num_threads=20,
        )
    logger.info("end fitting")
    return model, dict(params)

In [47]:
epoch_lightfm = 1

params = {'no_components': 128,
  'loss': 'warp',
  'user_alpha': 9.99e-07, 
  'item_alpha': 5.44e-06, 
  'learning_rate': 0.024,
  'max_sampled': 248}

model, params = fit_lightfm(train_interact, params, epoch_lightfm)

2022-07-08 22:31:04.991 | INFO     | __main__:fit_lightfm:9 - start fitting
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:13<00:00, 73.80s/it]
2022-07-08 22:32:18.799 | INFO     | __main__:fit_lightfm:17 - end fitting


In [49]:
dataset.fit_partial((test1["userid"].values), (test1["movieid"].values))
# test_interact = dataset.build_interactions(test.to_numpy())
model.predict((test1["userid"].values), (test1["movieid"].values))

ValueError: The item feature matrix specifies more features than there are estimated feature embeddings: 9350 vs 9577.

In [50]:
len(train1.movieid.unique())

9350

In [51]:
len(test1.movieid.unique())

8028