# Импорты

In [5]:
%%time
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O ml-1m.zip
!unzip -o ml-1m.zip
!rm ml-1m.zip

Archive:  ml-1m.zip
   creating: data_original/
  inflating: data_original/interactions.csv  
  inflating: __MACOSX/data_original/._interactions.csv  
  inflating: data_original/users.csv  
  inflating: __MACOSX/data_original/._users.csv  
  inflating: data_original/items.csv  
  inflating: __MACOSX/data_original/._items.csv  
CPU times: user 420 ms, sys: 101 ms, total: 521 ms
Wall time: 27.3 s


In [3]:
import copy
from pprint import pprint

import numpy as np
import pandas as pd
import collections

import time
from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Interactions, Dataset
from rectools.metrics.ranking import MAP, MRR
from rectools.metrics import Precision, Recall, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.models import RandomModel, PopularModel
from rectools.model_selection import TimeRangeSplitter

  from .autonotebook import tqdm as notebook_tqdm


# Подготовка данных

In [6]:
%%time
dataset = pd.read_csv("data_original/interactions.csv")
dataset.head()

CPU times: user 1.19 s, sys: 141 ms, total: 1.33 s
Wall time: 1.33 s


Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [7]:
dataset = dataset.rename(columns={'last_watch_dt': 'datetime', 'watched_pct': 'weight'})
interactions = Interactions(dataset)
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,total_dur,weight
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [8]:
n_splits = 3

cv = TimeRangeSplitter(
    test_size="14D",
    n_splits=n_splits,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

models = {
    "random": RandomModel(random_state=32),
    "popular": PopularModel()
}

metrics = {
    "prec@1": Precision(k=1),
    "prec@5": Precision(k=5),
    "prec@10": Precision(k=10),
    "recall@1": Recall(k=1),
    "recall@5": Recall(k=5),
    "recall@10": Recall(k=10),

    "novelty@1": MeanInvUserFreq(k=1),
    "novelty@5": MeanInvUserFreq(k=5),
    "novelty@10": MeanInvUserFreq(k=10),
    "serendipity@1": Serendipity(k=1),
    "serendipity@5": Serendipity(k=5),
    "serendipity@10": Serendipity(k=10),

    "MAP@1": MAP(k=1),
    "MAP@5": MAP(k=5),
    "MAP@10": MAP(k=10),
    "MRR@1": MRR(k=1),
    "MRR@5": MRR(k=5),
    "MRR@10": MRR(k=10),

}

K_RECOS = 10

# Расчет Метрик

In [157]:
def metrics_validation(splitter, interactions, models: dict, metrics: dict, k: int):
    
    results = []

    fold_iterator = splitter.split(interactions, collect_fold_stats=True)

    for train_ids, test_ids, fold_info in tqdm((fold_iterator), total=n_splits):
        print(f"\n==================== Fold {fold_info['i_split']}")
        # pprint(fold_info)

        df_train = interactions.df.iloc[train_ids]
        dataset = Dataset.construct(df_train)

        df_test = interactions.df.iloc[test_ids][Columns.UserItem]
        test_users = np.unique(df_test[Columns.User])

        catalog = df_train[Columns.Item].unique()

        for model_name, model in models.items():
            
            start_fit = time.time()
            model.fit(dataset)
            print('train time = ', time.time() - start_fit)

            models[model_name] = copy.deepcopy(model)

            recos = model.recommend(
                users=test_users,
                dataset=dataset,
                k=k,
                filter_viewed=True,
            )
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )
            res = {"fold": fold_info["i_split"], "model": model_name}
            res.update(metric_values)
            results.append(res)

    metrics_df = pd.DataFrame(results)

    # разделяем колонку metric на две отдельные колонки
    melted_df = metrics_df.melt(id_vars='model', var_name='metric', value_name='value')
    melted_df[['metric_type', 'lvl']] = melted_df['metric'].str.split('@', expand=True)
    melted_df.drop(columns='metric', inplace=True)
    melted_df = melted_df.sort_values(by=['model', 'lvl', 'metric_type'], ignore_index=True)

    pivot_results = pd.DataFrame(melted_df).groupby(["model", 'metric_type', 'lvl'], sort=False).agg(["mean"])
    mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
    (
        pivot_results.style
        .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
        .highlight_max(subset=mean_metric_subset, color='green', axis=0)
    )
    # выводим результат
    display(pivot_results)
    pivot_results.to_csv('Отчет_об_обучении.csv')
    return pivot_results

In [158]:
pivot_results = metrics_validation(cv, interactions=interactions, models=models, metrics=metrics, k=K_RECOS)

  0%|          | 0/3 [00:00<?, ?it/s]


train time =  3.8623809814453125e-05
train time =  0.8429033756256104


 33%|███▎      | 1/3 [00:18<00:36, 18.20s/it]


train time =  3.6716461181640625e-05
train time =  1.0456767082214355


 67%|██████▋   | 2/3 [00:38<00:19, 19.31s/it]


train time =  3.695487976074219e-05
train time =  1.2764780521392822


100%|██████████| 3/3 [01:01<00:00, 20.37s/it]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean
model,metric_type,lvl,Unnamed: 3_level_2
popular,MAP,1,0.047186
popular,MRR,1,0.097211
popular,novelty,1,2.422735
popular,prec,1,0.097211
popular,recall,1,0.047186
popular,serendipity,1,3e-06
popular,MAP,10,0.098549
popular,MRR,10,0.17672
popular,novelty,10,3.722852
popular,prec,10,0.045964


In [161]:
# визуализация с раскрасной, т.к. при использовании функции раскраска так и не сработала :<
pivot_results 
mean_metric_subset = [(metric, agg) for metric, agg in pivot_results.columns if agg == 'mean']
(
        pivot_results.style
        .highlight_min(subset=mean_metric_subset, color='lightcoral', axis=0)
        .highlight_max(subset=mean_metric_subset, color='green', axis=0)
    )

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean
model,metric_type,lvl,Unnamed: 3_level_2
popular,MAP,1,0.047186
popular,MRR,1,0.097211
popular,novelty,1,2.422735
popular,prec,1,0.097211
popular,recall,1,0.047186
popular,serendipity,1,3e-06
popular,MAP,10,0.098549
popular,MRR,10,0.17672
popular,novelty,10,3.722852
popular,prec,10,0.045964


# Визуальный анализ

In [20]:
items = pd.read_csv('data_original/items.csv')

In [165]:
items.iloc[1]['title']

'Голые перцы'

In [170]:
def visual_analysis(model, dataset, user_ids: list, item_data: list):
    dataset_val = Dataset.construct(dataset)
    recos = model.recommend(
                users=user_ids,
                dataset=dataset_val,
                k=10,
                filter_viewed=True,
            )
    
    visual_result = collections.defaultdict(list)

    items_counts = dataset['item_id'].value_counts()

    for user in user_ids:
        # get user history
        dataset['datetime'] = pd.to_datetime(dataset['datetime'])
        # append sort
        dataset =  dataset.sort_values(by=['datetime'])
        last_10_ids = dataset[dataset['user_id'] == user].tail(10)

        # get info about user history
        for i in list(last_10_ids['item_id']):
            
            visual_result['user'].append(user)
            visual_result['status'].append('просмотрено')
            visual_result['count of views'].append(items_counts[i])
            for j in item_data:
                visual_result[j].append(items.iloc[i][j])

        # get info about user recomendations
        for i in list(recos[recos['user_id'] == user]['item_id']):
            
            visual_result['user'].append(user)
            visual_result['status'].append('рекомендовано')
            visual_result['count of views'].append(items_counts[i])
            for j in item_data:
                visual_result[j].append(items.iloc[i][j])

    result_df = pd.DataFrame(visual_result)
    
    display(result_df)



In [172]:
visual_analysis(model=models['random'], dataset=dataset, user_ids=[666262, 672861, 955527], item_data=['title', 'genres'])

Unnamed: 0,user,status,count of views,title,genres
0,666262,просмотрено,746,Гастарбайтер,драмы
1,666262,просмотрено,485,Мамина любовь,мелодрамы
2,666262,просмотрено,10370,Они были первыми,"драмы, советские, военные"
3,666262,рекомендовано,99,Мои домашние питомцы,"русские, для детей, сериалы, хочу всё знать, п..."
4,666262,рекомендовано,1,Уральская рябинушка,"русские, мюзиклы"
5,666262,рекомендовано,51,Король воров,"драмы, триллеры"
6,666262,рекомендовано,262,"Люблю, значит верю","русские, криминал, мелодрамы"
7,666262,рекомендовано,19,Концерт SIROTKIN,"концерт, музыкальные"
8,666262,рекомендовано,4,Эпоха героев,"приключения, драмы, зарубежные, триллеры, воен..."
9,666262,рекомендовано,2,Пьянство. Зона риска,"фильмы hbo, документальное"
