In [None]:
%%capture

!pip install recbole
!pip install ray
!pip install kmeans_pytorch
!pip install torch

In [None]:
import ast
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import torch

import time

import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from random import randint, random
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances, cosine_similarity


In [None]:
import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec, Caser
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
from recbole.quick_start import run_recbole
from recbole.model.general_recommender.multivae import MultiVAE

Загрузим данные:

In [None]:
!unzip processed_data.zip -x

Archive:  processed_data.zip
  inflating: users_processed_kion.csv  
  inflating: interactions_processed_kion.csv  
  inflating: items_processed_kion.csv  


In [None]:
# The archive is extracted into the current working directory, so read the CSV
# files relative to it instead of from a hard-coded `/content` path.
interactions_df = pd.read_csv('interactions_processed_kion.csv')
users_df = pd.read_csv('users_processed_kion.csv')
items_df = pd.read_csv('items_processed_kion.csv')

In [None]:
# Parse the `last_watch_dt` strings (YYYY-MM-DD) into datetimes.
interactions_df['t_dat'] = pd.to_datetime(interactions_df['last_watch_dt'], format="%Y-%m-%d")
# View the datetimes as raw int64 ticks and integer-divide by 10**9
# (nanoseconds -> seconds, assuming the usual datetime64[ns] resolution).
raw_datetime_ticks = interactions_df['t_dat'].values.astype(np.int64)
interactions_df['timestamp'] = raw_datetime_ticks // 10 ** 9

In [None]:
# Keep only the three interaction columns and rename them to the `<name>:<type>`
# headers used in the `.inter` file written below.
inter_column_names = {
    'user_id': 'user_id:token',
    'item_id': 'item_id:token',
    'timestamp': 'timestamp:float',
}
df = interactions_df.loc[:, ['user_id', 'item_id', 'timestamp']].rename(columns=inter_column_names)

In [None]:
# Create the dataset directory; `exist_ok=True` keeps the cell re-runnable
# (a plain `mkdir` fails once the directory already exists).
os.makedirs('recbox_data', exist_ok=True)

In [None]:
# Write the interactions as a tab-separated file without the pandas index column.
df.to_csv(path_or_buf='recbox_data/recbox_data.inter', sep='\t', index=False)

Создаем датасет:

In [None]:
# Settings shared by every RecBole call in this notebook: used to build `config`
# here and passed unchanged to `run_recbole` in later cells.
parameter_dict = {
    'data_path': '',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    # NOTE(review): RecBole appears to derive `device` itself from its GPU settings,
    # so this value is probably overwritten - confirm against the installed version.
    'device': 'GPU',
    # Interaction-count filters for users and items
    # (presumably "keep those with at least 40 interactions" - TODO confirm).
    'user_inter_num_interval': "[40,inf)",
    'item_inter_num_interval': "[40,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp']},
    'neg_sampling': None,
    'epochs': 10,
    'eval_args': {
        # Ratio split 9:0:1 (presumably train:valid:test - confirm). The zero share
        # is consistent with `best_valid_result: None` in the run outputs.
        'split': {'RS': [9, 0, 1]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'}
}
config = Config(model='MultiVAE', dataset='recbox_data', config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
# Attach the extra console handler only once. Without the name check every re-run
# of this cell would add another StreamHandler to the root logger and each log
# record would be printed one more time.
CONSOLE_HANDLER_NAME = 'notebook_console'
c_handler = next(
    (handler for handler in logger.handlers if handler.get_name() == CONSOLE_HANDLER_NAME),
    None,
)
if c_handler is None:
    c_handler = logging.StreamHandler()
    c_handler.set_name(CONSOLE_HANDLER_NAME)
    c_handler.setLevel(logging.INFO)
    logger.addHandler(c_handler)

# write config info into log
# logger.info(config)



In [None]:
# Build the dataset object described by `config`, then log its summary.
dataset = create_dataset(config=config)
logger.info(dataset)

Разделим данные на train, test и valid:

In [None]:
# Produce the train / validation / test data objects from the dataset.
train_data, valid_data, test_data = data_preparation(config=config, dataset=dataset)

Возьмем 5 моделей из recbole и посмотрим на их результаты (изначально моделей было 6, но RepeatNet пришлось исключить — она обучалась слишком долго):

In [None]:
%%time
model_list = ['MultiVAE', 'MultiDAE', 'MacridVAE',
              "NeuMF", "RecVAE"]

for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run_recbole(model=model_name, dataset = 'recbox_data', config_dict = parameter_dict)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

running MultiVAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-bb8f9d95-c270-41f6-8eba-1c5f91e70170.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:08<00:00,  1.15s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.12it/s]
Train     2: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.10s/it]
Train     3: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.12it/s]
Train     4: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.11s/it]
Train     5: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.10it/s]
Train     6: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.10s/it]
Train     7: 100%|████████████████████████████████████████████████

It took 2.85 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.084), ('mrr@10', 0.1695), ('ndcg@10', 0.0825), ('hit@10', 0.3503), ('precision@10', 0.0467)])}
running MultiDAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-bb8f9d95-c270-41f6-8eba-1c5f91e70170.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.11it/s]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.05s/it]
Train     2: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.09it/s]
Train     3: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.01s/it]
Train     4: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.06it/s]
Train     5: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.04s/it]
Train     6: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.04it/s]
Train     7: 100%|████████████████████████████████████████████████

It took 2.72 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0829), ('mrr@10', 0.1655), ('ndcg@10', 0.081), ('hit@10', 0.3438), ('precision@10', 0.0459)])}
running MacridVAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-bb8f9d95-c270-41f6-8eba-1c5f91e70170.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:53<00:00,  7.58s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:54<00:00,  7.76s/it]
Train     2: 100%|████████████████████████████████████████████████████| 7/7 [00:57<00:00,  8.22s/it]
Train     3: 100%|████████████████████████████████████████████████████| 7/7 [00:53<00:00,  7.66s/it]
Train     4: 100%|████████████████████████████████████████████████████| 7/7 [00:53<00:00,  7.63s/it]
Train     5: 100%|████████████████████████████████████████████████████| 7/7 [00:50<00:00,  7.20s/it]
Train     6: 100%|████████████████████████████████████████████████████| 7/7 [00:52<00:00,  7.54s/it]
Train     7: 100%|████████████████████████████████████████████████

It took 12.79 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0835), ('mrr@10', 0.1574), ('ndcg@10', 0.0788), ('hit@10', 0.3499), ('precision@10', 0.0461)])}
running NeuMF...


command line args [-f /root/.local/share/jupyter/runtime/kernel-bb8f9d95-c270-41f6-8eba-1c5f91e70170.json] will not be used in RecBole
Train     0: 100%|████████████████████████████████████████████████| 755/755 [00:55<00:00, 13.70it/s]
Train     1: 100%|████████████████████████████████████████████████| 755/755 [00:50<00:00, 15.01it/s]
Train     2: 100%|████████████████████████████████████████████████| 755/755 [00:49<00:00, 15.18it/s]
Train     3: 100%|████████████████████████████████████████████████| 755/755 [00:48<00:00, 15.42it/s]
Train     4: 100%|████████████████████████████████████████████████| 755/755 [00:48<00:00, 15.41it/s]
Train     5: 100%|████████████████████████████████████████████████| 755/755 [00:49<00:00, 15.41it/s]
Train     6: 100%|████████████████████████████████████████████████| 755/755 [00:49<00:00, 15.40it/s]
Train     7: 100%|████████████████████████████████████████████████| 755/755 [00:49<00:00, 15.38it/s]
Train     8: 100%|███████████████████████████████████████

It took 10.89 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.069), ('mrr@10', 0.1173), ('ndcg@10', 0.0605), ('hit@10', 0.3009), ('precision@10', 0.0381)])}
running RecVAE...


command line args [-f /root/.local/share/jupyter/runtime/kernel-bb8f9d95-c270-41f6-8eba-1c5f91e70170.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:12<00:00,  1.75s/it]
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:13<00:00,  1.88s/it]
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.71s/it]
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.67s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.69s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.66s/it]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:11<00:00,  1.63s/it]
Train     1: 100%|████████████████████████████████████████████████

It took 9.77 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0849), ('mrr@10', 0.1697), ('ndcg@10', 0.0828), ('hit@10', 0.3532), ('precision@10', 0.047)])}
CPU times: user 37min 9s, sys: 1min 15s, total: 38min 24s
Wall time: 39min 1s


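Прервала модель RepeatNet, очень долго выполнялось. Из остальных по метрикам чуть лучше всех RecVAE: [('recall@10', 0.0849), ('mrr@10', 0.1697), ('ndcg@10', 0.0828), ('hit@10', 0.3532), ('precision@10', 0.047)], но разница с MultiVAE минимальна, а обучается MultiVAE заметно быстрее (2.85 мин против 9.77 мин). Дальше работаем с MultiVAE: [('recall@10', 0.084), ('mrr@10', 0.1695), ('ndcg@10', 0.0825), ('hit@10', 0.3503), ('precision@10', 0.0467)]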

In [None]:
# Train and evaluate MultiVAE once more on its own, keeping the returned metrics in `result`.
result = run_recbole(
    model='MultiVAE',
    dataset='recbox_data',
    config_dict=parameter_dict,
)

command line args [-f /root/.local/share/jupyter/runtime/kernel-bb8f9d95-c270-41f6-8eba-1c5f91e70170.json] will not be used in RecBole
Max value of user's history interaction records has reached 20.9471766848816% of the total.
Train     0: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.13it/s]
Train     1: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.03s/it]
Train     2: 100%|████████████████████████████████████████████████████| 7/7 [00:08<00:00,  1.18s/it]
Train     3: 100%|████████████████████████████████████████████████████| 7/7 [00:09<00:00,  1.31s/it]
Train     4: 100%|████████████████████████████████████████████████████| 7/7 [00:07<00:00,  1.01s/it]
Train     5: 100%|████████████████████████████████████████████████████| 7/7 [00:06<00:00,  1.10it/s]
Train     6: 100%|████████████████████████████████████████████████████| 7/7 [00:08<00:00,  1.17s/it]
Train     7: 100%|████████████████████████████████████████████████

In [None]:
# Display the dictionary returned by the last `run_recbole` call.
result

{'best_valid_score': -inf,
 'valid_score_bigger': True,
 'best_valid_result': None,
 'test_result': OrderedDict([('recall@10', 0.084),
              ('mrr@10', 0.1695),
              ('ndcg@10', 0.0825),
              ('hit@10', 0.3503),
              ('precision@10', 0.0467)])}

Попробуем обернуть в сервис:

In [None]:
# Mapping filled later: external user id -> list of recommended item ids.
reco = {}

# Build the model and put its weights on the device RecBole resolved for this config.
device = config["device"]
model = MultiVAE(config, dataset).to(device)

# Checkpoint file names carry a training timestamp, so a hard-coded name only
# matches the one run that produced it. Pick the most recently written MultiVAE
# checkpoint from the configured checkpoint directory instead.
checkpoint_dir = config["checkpoint_dir"]
checkpoint_files = [
    os.path.join(checkpoint_dir, file_name)
    for file_name in os.listdir(checkpoint_dir)
    if file_name.startswith("MultiVAE-") and file_name.endswith(".pth")
]
if not checkpoint_files:
    raise FileNotFoundError(
        f"No MultiVAE checkpoint (*.pth) found in {checkpoint_dir!r}; train the model first."
    )
checkpoint_path = max(checkpoint_files, key=os.path.getmtime)

# `map_location` lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint["state_dict"])

Max value of user's history interaction records has reached 23.254401942926535% of the total.


<All keys matched successfully>

In [None]:
def recommend_to_user(external_user_id, dataset, model, k=10):
    """Return the top-``k`` item tokens the model scores highest for one user.

    The function uses only its arguments; it no longer reads the notebook
    globals ``config`` and ``test_data``.

    Args:
        external_user_id: User token as stored in
            ``dataset.field2token_id[dataset.uid_field]``.
        dataset: Dataset object providing ``uid_field``, ``iid_field``,
            ``item_num``, ``field2token_id``, ``token2id``, ``id2token`` and
            indexing by field name / boolean mask.
        model: Torch module exposing ``full_sort_predict(interaction)``.
        k: Number of items to recommend (default 10, the previous fixed value).

    Returns:
        ``[[token_1, ..., token_k]]`` - a list holding a single list of item
        tokens, best first (callers take element ``[0]``). An empty list is
        returned when the user is unknown, is the ``"[PAD]"`` placeholder, or
        has no interaction rows in ``dataset``.
    """
    known_users = dataset.field2token_id[dataset.uid_field]
    if external_user_id == "[PAD]" or external_user_id not in known_users:
        return []

    # Feed the model on the device its own weights live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        index = np.isin(dataset[dataset.uid_field].numpy(), uid_series)
        if not index.any():
            # No rows for this user: indexing row 0 of the scores would fail.
            return []
        new_inter = dataset[index].to(device)
        new_scores = model.full_sort_predict(new_inter)
        new_scores = new_scores.view(-1, dataset.item_num)
        # Item index 0 is masked so it can never be recommended
        # (presumably the padding item - confirm).
        new_scores[:, 0] = -np.inf
        # Only the first row is used, as before, so rank just that row.
        top_item_ids = torch.topk(new_scores[0], k).indices.tolist()
        recommended_tokens = dataset.id2token(dataset.iid_field, [top_item_ids]).tolist()
    return recommended_tokens

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Ask the model for recommendations for every user token known to the dataset
# and store the non-empty answers in `reco`.
users = dataset.field2token_id[dataset.uid_field]
for user_id in tqdm(users):
    recos_for_user = recommend_to_user(user_id, dataset, model)
    if not recos_for_user:
        # Empty result: nothing to store for this user.
        continue
    # The function returns a one-element nested list; keep the inner list.
    reco[user_id] = recos_for_user[0]

  0%|          | 0/13355 [00:00<?, ?it/s]

In [None]:
# Persist the user -> recommendations mapping as a JSON file.
with open("/content/MultiVAE_recos.json", "w") as recos_file:
    recos_file.write(json.dumps(reco))

In [None]:
# Load the saved recommendations back into a DataFrame.
# NOTE(review): the file holds one JSON object keyed by user id rather than a
# list of records - confirm that `orient='records'` is the intended setting.
df = pd.read_json(path_or_buf='/content/MultiVAE_recos.json', orient='records')

In [None]:
# Swap rows and columns. Running this cell a second time swaps them back.
df = df.transpose()

In [None]:
# Export the recommendations table to an Excel workbook in the working directory.
df.to_excel(excel_writer='recos_recbole.xlsx')