# Конфигурация

In [1]:
from pprint import pprint
from copy import deepcopy
import time

import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import rectools 
from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.models import RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import MAP, NDCG, Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.model_selection import TimeRangeSplitter

In [2]:
K_RECOS = 10
RANDOM_SEED = 32

np.random.seed(RANDOM_SEED)

# Загрузка данных

In [3]:
def headtail(df):
    return pd.concat([df.head(), df.tail()])

In [4]:
interactions = pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])

In [5]:
interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    }, 
    inplace=True)

interactions = Interactions(interactions)

In [6]:
headtail(interactions.df)

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0
5476246,648596,12225,2021-08-13,76.0,0.0
5476247,546862,9673,2021-04-13,2308.0,49.0
5476248,697262,15297,2021-08-20,18307.0,63.0
5476249,384202,16197,2021-04-19,6203.0,100.0
5476250,319709,4436,2021-08-15,3921.0,45.0


In [7]:
interactions.df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       float64       
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 208.9 MB


In [8]:
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

# Инициализация моделей, метрик, сплиттера

Согласно заданию в качестве моделей используется рандомная и популярная модель

In [9]:
models = {'random': RandomModel(random_state=32),
          'popular': PopularModel()}

В качестве метрик используются:

2 ранжирующие -- MAP, NDCG

2 классификационные -- Precision, Recall

2 beyond-accuracy -- MIUF (Novelty), Serendipity

In [10]:
metrics = {'MAP@1': MAP(k=1), 'MAP@5': MAP(k=5), 'MAP@10': MAP(k=10),
           'NDCG@1': NDCG(k=1), 'NDCG@5': NDCG(k=5), 'NDCG@10': NDCG(k=10),
           'Precision@1': Precision(k=1), 'Precision@5': Precision(k=5), 'Precision@10': Precision(k=10),
           'Recall@1': Recall(k=1), 'Recall@5': Recall(k=5), 'Recall@10': Recall(k=10),
           'MeanInvUserFreq@1': MeanInvUserFreq(k=1), 'MeanInvUserFreq@5': MeanInvUserFreq(k=5), 'MeanInvUserFreq@10': MeanInvUserFreq(k=10),
           'Serendipity@1': Serendipity(k=1), 'Serendipity@5': Serendipity(k=5), 'Serendipity@10': Serendipity(k=10)}

Согласно заданию используется сплиттер на 3 фолда для кросс-валидации по неделе c исключением холодных юзеров, айтемов и просмотренных айтемов

In [11]:
splitter = TimeRangeSplitter(test_size="7D",
                             n_splits=3,
                             filter_cold_users = True,
                             filter_cold_items= True,
                             filter_already_seen = True)

# Проведение обучения и валидации с расчётом метрик

In [12]:
def computation_metrics(models: dict, metrics: dict, splitter, K: int):

    results = []

    fold_iterator = splitter.split(interactions, collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=splitter.n_splits):
        print(f"\n{'='*20} Fold {fold_info['i_split']} {'='*20}")
        pprint(fold_info)

        # создаём train часть
        df_train = interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        # создаём test часть
        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])
    
        catalog = df_train[Columns.Item].unique()
    
        for model_name, model in models.items():
            # инициализируем модель
            model = deepcopy(model)

            # обучаем модель
            start = time.time()
            model.fit(dataset)
            stop = time.time()

            # получаем рекомендации
            recos = model.recommend(
                users=test_users,
                dataset=dataset,
                k=K,
                filter_viewed=True,
            )

            # считаем метрики
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            
            res = {"fold": fold_info["i_split"],
                   "model": model_name,
                   "train_time": stop - start}
            res.update(metric_values)
            
            results.append(res)
    
    return results

In [13]:
results = computation_metrics(models, metrics, splitter, K_RECOS)

  0%|          | 0/3 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}

{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}

{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}


In [14]:
pivot_results = pd.DataFrame(results).drop(columns="fold").groupby(["model"], sort=False).agg(["mean", "std"])

In [15]:
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if metric != 'train_time' and agg == 'mean']
mean_time_subset = [(metric, agg) for metric, agg in pivot_results.columns if metric == 'train_time' and agg == 'mean']

In [16]:
pivot_results.style\
             .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)\
             .highlight_max(subset=mean_metric_subset, color='lightgreen', axis=0)\
             .highlight_max(subset=mean_time_subset, color='lightcoral', axis=0)\
             .highlight_min(subset=mean_time_subset, color='lightgreen', axis=0)

Unnamed: 0_level_0,train_time,train_time,Precision@1,Precision@1,Recall@1,Recall@1,Precision@5,Precision@5,Recall@5,Recall@5,Precision@10,Precision@10,Recall@10,Recall@10,NDCG@1,NDCG@1,NDCG@5,NDCG@5,NDCG@10,NDCG@10,MAP@1,MAP@1,MAP@5,MAP@5,MAP@10,MAP@10,MeanInvUserFreq@1,MeanInvUserFreq@1,MeanInvUserFreq@5,MeanInvUserFreq@5,MeanInvUserFreq@10,MeanInvUserFreq@10,Serendipity@1,Serendipity@1,Serendipity@5,Serendipity@5,Serendipity@10,Serendipity@10
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2
random,0.0,0.0,0.000221,3.3e-05,7.2e-05,1.9e-05,0.000202,2.8e-05,0.000365,8.4e-05,0.000193,1.9e-05,0.000693,7.6e-05,0.000221,3.3e-05,0.000208,2.5e-05,0.0002,2e-05,7.2e-05,1.9e-05,0.000169,3.3e-05,0.000211,3.2e-05,15.614137,0.022585,15.612989,0.01957,15.613009,0.019786,6e-06,2e-06,7e-06,1e-06,7e-06,0.0
popular,5.472562,0.662896,0.076432,0.006826,0.04272,0.004366,0.052402,0.001618,0.137413,0.005346,0.033903,0.001443,0.173492,0.007987,0.076432,0.006826,0.057932,0.002332,0.043084,0.001978,0.04272,0.004366,0.078295,0.00437,0.084109,0.004921,2.377055,0.023002,3.066979,0.012316,3.71339,0.002076,2e-06,0.0,3e-06,0.0,2e-06,0.0


# Визуальный анализ

In [17]:
def visual_analysis(model, dataset, user_ids, item_data):

    if model.is_fitted == False:
        raise Exception('model is not fitted')

    # получаем рекомендации
    recos = model.recommend(
        users=user_ids,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    recos.drop(columns="score", inplace=True)
    recos['is_reco'] = True

    # собираем историю просмотров
    history_view = dataset.interactions.df.query(f'user_id in {user_ids}')\
                                          .sort_values(['user_id', 'datetime'])
    ranks = history_view.sort_values('datetime').groupby(['user_id']).datetime.rank()
    history_view['rank'] = ranks
    history_view['is_reco'] = False
    history_view.drop(columns=['weight', 'datetime'], inplace=True)

    # склеиваем историю просмотров и рекомендации
    results = pd.concat([history_view, recos])

    # добавляем данные об айтемах
    num_views = interactions.df.groupby('item_id').count().user_id
    num_views.name = 'num_view'
    results = results.merge(num_views,
                            how='inner',
                            on='item_id')
    results = results.merge(item_data,
                            how='inner',
                            on='item_id')

    results.sort_values(['user_id', 'is_reco', 'rank'], inplace=True)
    results.set_index(['user_id', 'item_id'], inplace=True)

    return results

In [18]:
user_ids = [666262, 672861, 955527]
dataset = Dataset.construct(interactions.df)
item_data = items[['item_id', 'title', 'genres', 'countries']]

In [19]:
model = deepcopy(models['popular']).fit(dataset)

In [20]:
results = visual_analysis(model, dataset, user_ids, item_data)

In [21]:
results

Unnamed: 0_level_0,Unnamed: 1_level_0,rank,is_reco,num_view,title,genres,countries
user_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
666262,93,1.0,False,1,Дом ночных призраков,"зарубежные, криминал, детективы, ужасы",США
666262,10440,1.0,True,202457,Хрустальный,"триллеры, детективы",Россия
666262,15297,2.0,True,193123,Клиника счастья,"драмы, мелодрамы",Россия
666262,9728,3.0,True,132865,Гнев человеческий,"боевики, триллеры","Великобритания, США"
666262,13865,4.0,True,122119,Девятаев,"драмы, военные, приключения",Россия
666262,4151,5.0,True,91167,Секреты семейной жизни,комедии,Россия
666262,3734,6.0,True,74803,Прабабушка легкого поведения,комедии,Россия
666262,2657,7.0,True,68581,Подслушано,"драмы, триллеры",Россия
666262,4880,8.0,True,55043,Афера,комедии,Россия
666262,142,9.0,True,45367,Маша,"драмы, триллеры",Россия
