# Подготовка

In [None]:
!pip -q install dill
!pip -q install lightfm
!pip -q install rectools
!pip -q install shap

In [None]:
import os
import warnings

warnings.simplefilter("ignore")

import gc
import json
import pickle
from typing import Any, Dict, Tuple
from zipfile import ZipFile

import numpy as np
import pandas as pd
import requests
import shap
from google.colab import drive
from lightfm import LightFM
from lightgbm import LGBMClassifier, LGBMRanker
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, NDCG, MeanInvUserFreq, Precision, Recall, calc_metrics
from rectools.models import LightFMWrapperModel, PopularModel
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# from tools import avg_user_metric, generate_lightfm_recs_mapper
from tqdm.auto import tqdm

In [None]:
drive.mount("/content/drive")

Mounted at /content/drive


# Подготовка данных

In [None]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='Downloading the kion dataset...',
                        total=total_size_in_bytes,
                        unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

Downloading the kion dataset...:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [None]:
!unzip kion_train.zip -x '__MACOSX/*'

Archive:  kion_train.zip
   creating: data_original/
  inflating: data_original/interactions.csv  
  inflating: data_original/users.csv  
  inflating: data_original/items.csv  


In [None]:
interactions = pd.read_csv('data_original/interactions.csv')
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

## `interactions`: взаимодействия пользователь - айтем

In [None]:
print(interactions.shape)
interactions.head(3)

(5476251, 5)


Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0


In [None]:
# Меняем названия колонок для использования rectools
interactions.rename(
    columns={
        "last_watch_dt": Columns.Datetime,
        "watched_pct": Columns.Weight,
    },
    inplace=True,
)
# Меняем тип данных
interactions["datetime"] = interactions["datetime"].astype(np.datetime64)

# Заполняем пропуски
interactions_default_values: Dict[str, Any] = {
    Columns.Datetime: interactions[Columns.Datetime].median(),
    Columns.Weight: 0.0,
    "total_dur": 0,
}
interactions.fillna(interactions_default_values, inplace=True)

In [None]:
# Смотрим что получилось
interactions.sample(10)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight
2072108,151530,4538,2021-05-01,832,17.0
3317954,813317,11237,2021-08-09,6629,100.0
1167847,16840,13927,2021-08-01,776,20.0
2931123,218933,11505,2021-08-04,3001,8.0
2470536,725973,7102,2021-08-21,114,2.0
2381430,840822,4266,2021-06-21,305,4.0
1148705,792557,1978,2021-06-29,29535,15.0
1698026,1063449,5724,2021-06-26,37,73.0
4369066,658879,884,2021-08-08,100,1.0
4682104,49147,16029,2021-07-02,228,4.0


Возьмем фичи для пользователей:

In [None]:
users_cat_cols = ["age",  "income", "sex", "kids_flg"]

In [None]:
def encode_cat_cols(df: pd.DataFrame, cat_cols) -> Tuple[pd.DataFrame, Dict]:
    cat_col_encoding = {}

    for col in cat_cols:
        cat_col = df[col].astype("category").cat
        cat_col_encoding[col] = cat_col.categories
        df[col] = cat_col.codes.astype("category")
    return df, cat_col_encoding

users, users_cat_col_encoding = encode_cat_cols(users, users_cat_cols)

# None уже кодируется как -1
users_cat_col_encoding["income"], users["income"].unique()

(Index(['income_0_20', 'income_150_inf', 'income_20_40', 'income_40_60',
        'income_60_90', 'income_90_150'],
       dtype='object'),
 [4, 2, 3, 0, -1, 5, 1]
 Categories (7, int64): [-1, 0, 1, 2, 3, 4, 5])

И фичи для айтемов:

In [None]:
items_cat_cols = ["content_type", "for_kids", "studios"]

Текстовые колонки удалим, категориальные закодируем:

In [None]:
items_text_cols = [
    "title",
    "title_orig",
    "genres",
    "countries",
    "directors",
    "actors",
    "description",
    "keywords",
]
items_num_cols = [
    "release_year",
    "age_rating",
]
default_values_items = {
    "release_year": items["release_year"].median(),
    "age_rating": items["age_rating"].median(),
}

items, items_cat_col_encoding = encode_cat_cols(items, items_cat_cols)
items = items.drop(items_text_cols, axis=1)
items.fillna(default_values_items, inplace=True)

# Трейн-вал-тест сплит

In [None]:
max_date = interactions[Columns.Datetime].max()
min_date = interactions[Columns.Datetime].min()

print(f"min дата в interactions: {min_date}")
print(f"max дата в interactions: {max_date}")
print(f"Продолжительность: {max_date - min_date}")

min дата в interactions: 2021-03-13 00:00:00
max дата в interactions: 2021-08-22 00:00:00
Продолжительность: 162 days 00:00:00


## Схема валидации с разбиением по юзерам

In [None]:
# Обучать ранжирование будем на последнем месяце (30 дней) не считая отложенной недели
ranker_days_count = 30

ranker_data = interactions[
    (interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=ranker_days_count))
]

# В дальнейшем ranker_data разбиваем по юзерам
#  на train val test для обучения, валидации и тестирования ранкера
train_size = 0.7
val_size = 0.15
test_size = 0.15

# В train_test_split очень удобно можно сохранить исходное распределение по нужным факторам,
#  задав параметр stratify. Правда мы на это пока забьем)

train_val_users, test_users = train_test_split(
    ranker_data["user_id"].unique(), random_state=42, test_size=test_size
)

train_users, val_users = train_test_split(
    train_val_users,
    random_state=42,
    test_size=val_size / (train_size + val_size),  # 15% от общего размера
)

In [None]:
ranker_data.sample(5)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight
4123534,272950,8636,2021-08-08,7707,100.0
1766071,661899,4436,2021-07-25,3017,34.0
3412475,160625,3150,2021-08-04,17,0.0
1104524,445599,15297,2021-07-24,27479,88.0
2716510,918133,12770,2021-08-22,149,3.0


In [None]:
# Для базовых моделей первого уровня (в нашем случае только lightfm)
#  оставим все оставшиеся взаимодействия для обучения

base_models_data = interactions[
    (interactions[Columns.Datetime] < max_date - pd.Timedelta(days=ranker_days_count))
]

In [None]:
base_models_data.sample(5)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight
4217644,678218,13723,2021-06-27,4,0.0
1439311,533487,13865,2021-06-22,21,0.0
3299105,78442,5803,2021-05-28,7418,100.0
1959216,840456,2823,2021-06-19,1,0.0
2980353,807516,3734,2021-06-24,6742,100.0


# Генерим кандидатов, которыми дополним датасет ранкера

## LigtFM

Не хватило оперативной памяти для обучения модели первого уровня здесь, сделала в отдельном ноутбуке. Сюда сразу подгружу результаты

In [None]:
candidates = pd.read_csv("/content/drive/MyDrive/candidates_lfm_feats.csv")

Обрежу до 50, слишком много памяти просит:

In [None]:
candidates = candidates[candidates['lfm_rank'] < 51]

Также попробуем добавить популярное из rectools

In [None]:
dataset = Dataset.construct(base_models_data)
popular_model = PopularModel()
popular_model.fit(dataset)

<rectools.models.popular.PopularModel at 0x7849a1ddc310>

In [None]:
n_items = items[Columns.Item].nunique()
pop_candidates = popular_model.recommend(
    dataset.user_id_map.external_ids[:1], dataset, n_items, True
)

pop_candidates.rename({"rank": "pop_rank", "score": "pop_score"}, axis=1, inplace=True)
pop_candidates.drop(Columns.User, axis=1, inplace=True)

In [None]:
candidates = candidates.merge(pop_candidates, how="left", on=[Columns.Item])
candidates.head()

Unnamed: 0,user_id,item_id,lfm_score,lfm_rank,pop_score,pop_rank
0,176549,10440,0.001207,1,141889.0,1.0
1,176549,2150,0.001033,2,14.0,6563.0
2,176549,12138,0.001013,3,1675.0,343.0
3,176549,4240,0.000903,4,1.0,13299.0
4,176549,10843,0.000795,5,101.0,3196.0


In [None]:
del pop_candidates
gc.collect()

8

## Метрики

In [None]:
# Считаем метрики
def calc_metrics_(candidates_df, rank_col: str) -> Dict[str, float]:
    metrics = {
        # "ndcg@10": NDCG(k=10),
        "map@10": MAP(k=10),
        "Precision@10": Precision(k=10),
        "recall@10": Recall(k=10),
        #"novelty@10": MeanInvUserFreq(k=10),
    }
    return calc_metrics(
        metrics=metrics,
        reco=(
            candidates_df.rename(columns={rank_col: Columns.Rank})[
                [Columns.User, Columns.Item, Columns.Rank]
            ][candidates_df[Columns.User].isin(test_users)]
        ),
        interactions=(
            ranker_data[[Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]][
                ranker_data[Columns.User].isin(test_users)
            ]
        ),
        prev_interactions=(
            base_models_data[[Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]][
                base_models_data[Columns.User].isin(test_users)
            ]
        ),
        catalog=items["item_id"].unique(),
    )

models_metrics: Dict[str, Dict[str, float]] = dict()
models_metrics["lfm"] = calc_metrics_(candidates, "lfm_rank")
models_metrics["lfm"]

models_metrics["pop"] = calc_metrics_(candidates, "pop_rank")
models_metrics["pop"]

{'Precision@10': 0.006348134391355733,
 'recall@10': 0.021812664977126063,
 'map@10': 0.014773339295411662}

# Формируем датасет для ранкера

## Генерим фичи для ранкера

In [None]:
len(base_models_data['user_id'].unique())

720875

In [None]:
# Получаем длину истории юзера
base_models_data["user_hist"] = base_models_data.groupby("user_id")["item_id"].transform("count")
# Получаем популярность контента
base_models_data["item_pop"] = base_models_data.groupby("item_id")["user_id"].transform("count")
# Получаем среднюю популярность контента, просматриваемого этим юзером
base_models_data["user_avg_pop"] = base_models_data.groupby("user_id")["item_pop"].transform("mean")
# Получаем среднюю длину истории пользователя, которые смотрит этот контент
base_models_data["item_avg_hist"] = base_models_data.groupby("item_id")["user_hist"].transform(
    "mean"
)
# Получаем популярность последнего просмотренного контента
base_models_data.sort_values(
    by=[Columns.User, Columns.Datetime],
    ascending=[True, False],
    ignore_index=True,
    inplace=True,
)
base_models_data["user_last_pop"] = base_models_data.groupby("user_id")["item_pop"].transform(
    "first"
)

Сделаем признак, определяющий холодность пользователя: количество фильмов, просмотренных больше, чем на половину:

In [None]:
viewed_items = (
    base_models_data[base_models_data['weight'] >= 50].groupby('user_id')
    .agg({"item_id": "count"})
    .rename(columns={"item_id": "viewed"})
)

viewed_items.head()

Unnamed: 0_level_0,viewed
user_id,Unnamed: 1_level_1
2,42
3,1
4,1
11,6
13,4


In [None]:
base_models_data = base_models_data.merge(viewed_items, how="left", on='user_id')
base_models_data['viewed'] = base_models_data['viewed'].fillna(0)

In [None]:
base_models_data.sample(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,user_hist,item_pop,user_avg_pop,item_avg_hist,user_last_pop,viewed
38273,11520,13865,2021-06-11,5452,86.0,12,93403,22826.333333,10.40852,50004,9.0
1236136,362888,12580,2021-07-16,8,0.0,6,265,1018.333333,29.049057,266,3.0
3393351,995743,14687,2021-05-26,4794,73.0,1,931,931.0,24.392052,931,1.0


In [None]:
# Добавляем новые фичи в соответствующие таблички
items = pd.merge(
    left=items,
    right=(base_models_data[["item_id", "item_pop", "item_avg_hist"]].drop_duplicates()),
    how="left",
    on="item_id",
)

users = pd.merge(
    left=users,
    right=(
        base_models_data[
            ["user_id", "user_hist", "user_avg_pop", "user_last_pop", "viewed"]
        ].drop_duplicates()
    ),
    how="left",
    on="user_id",
)
users.head(3)

Unnamed: 0,user_id,age,income,sex,kids_flg,user_hist,user_avg_pop,user_last_pop,viewed
0,973171,1,4,1,1,5.0,19550.8,93403.0,3.0
1,962099,0,2,1,0,13.0,1329.307692,260.0,7.0
2,1047345,3,3,0,0,,,,


In [None]:
# Для новых фичей айтемов
default_values_items["item_pop"] = base_models_data["item_pop"].median()
default_values_items["item_avg_hist"] = base_models_data["item_avg_hist"].median()

# Для новых фичей юзеров
default_values_users = {
    "user_hist": 0,
    "user_avg_pop": base_models_data["user_avg_pop"].median(),
    "user_last_pop": base_models_data["user_last_pop"].median(),
    "viewed": base_models_data["viewed"].median(),
}

### Джойним кандидатов и юзер/айтем фичи

In [None]:
# Вспоминаем про наши выборки интеракций для ранкера.
# Мы отобрали юзеров для обучения, валидации и теста.
# Оставляем среди них только тех, для кого есть и рекомы и таргеты


def users_filter(
    user_list: np.ndarray,
    candidates_df: pd.DataFrame,
    df: pd.DataFrame,
) -> pd.DataFrame:
    # Джойним интеракции на наших кандидатов для юзеров из трейна, вал и теста
    df = pd.merge(
        df[df["user_id"].isin(user_list)],
        candidates_df[candidates_df["user_id"].isin(user_list)],
        how="right",
        on=["user_id", "item_id"],
    )
    # Проставляем дефолтные значения интеракций
    lfm_min_score: float = df["lfm_score"].min() - 0.01
    lfm_max_rank: int = df["lfm_rank"].max() + 1  # 51
    pop_min_score: float = df["pop_score"].min() - 0.01
    pop_max_rank: int = df["pop_rank"].max() + 1  # 51

    default_values = {
        "lfm_score": lfm_min_score,
        "lfm_rank": lfm_max_rank,
        "pop_score": pop_min_score,
        "pop_rank": pop_max_rank,
        # Важно использовате те же дефолтные значения для интеракций,
        # чтобы не сделать утечку
        **interactions_default_values,
    }
    df.fillna(default_values, inplace=True)

    # Сортируем по user_id - это пригодится для вычисления рангов и групп для ранжирования
    df.sort_values(
        by=["user_id", "item_id"],
        inplace=True,
    )
    return df


ranker_train = users_filter(train_users, candidates, ranker_data)
ranker_val = users_filter(val_users, candidates, ranker_data)
ranker_test = users_filter(test_users, candidates, ranker_data)

In [None]:
ranker_train[ranker_train['total_dur'] == 2179]

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,pop_score,pop_rank
5491623,452429,6686,2021-07-23,2179.0,27.0,0.000659,24,1083.0,578.0


In [None]:
# Добавляем фичи
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    df = pd.merge(df, users, how="left", on=["user_id"])
    df = pd.merge(df, items, how="left", on=["item_id"])

    # При джойне могут получиться строки с несуществующими айтемами или юзерами.
    # Надо заполнить пропуски. Используем заготовленные дефолтные значения,
    # чтобы не сделать утечку
    df.fillna(default_values_items, inplace=True)
    df.fillna(default_values_users, inplace=True)

    # Категориальные фичи закодированы пандасом так, что None === -1
    # Если изначально пропусков не было, то нужно добавить такое значение категории
    for col in df.columns:
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            if -1 not in df[col].cat.categories:
                df[col] = df[col].cat.add_categories(-1)
            df.fillna({col: -1}, inplace=True)
    return df


# Не забываем добавить фичи в трейн, вал и тест
# Еще правильнее бы было сначала подготовить датасет,
# а потом его разбивать по юзерам - так бы мы избежали дублирования операций.
ranker_train = add_features(ranker_train)
ranker_val = add_features(ranker_val)
ranker_test = add_features(ranker_test)

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
ranker_train[ranker_train['total_dur'] > 0].head()

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,pop_score,pop_rank,age,income,sex,kids_flg,user_hist,user_avg_pop,user_last_pop,viewed,content_type,release_year,for_kids,age_rating,studios,item_pop,item_avg_hist
29,3,9728,2021-07-23,10448.0,100.0,0.001698,29,0.99,13300.0,-1,-1,-1,-1,0.0,11957.864865,2858.0,5.0,0,2021.0,-1,18.0,-1,76978.0,11.165736
32,3,10440,2021-07-23,44827.0,90.0,0.002361,1,141889.0,1.0,-1,-1,-1,-1,0.0,11957.864865,2858.0,5.0,1,2021.0,-1,18.0,-1,141889.0,8.068716
296,60,15297,2021-07-24,14896.0,88.0,0.000416,12,137128.0,2.0,2,2,1,1,15.0,18067.4,548.0,12.0,1,2021.0,-1,18.0,-1,137128.0,7.364295
332,65,10440,2021-07-24,26820.0,100.0,0.001254,1,141889.0,1.0,4,2,0,0,2.0,1485.0,2081.0,0.0,1,2021.0,-1,18.0,-1,141889.0,8.068716
396,75,15297,2021-08-18,10382.0,50.0,0.000641,13,137128.0,2.0,1,3,1,0,1.0,22769.0,22769.0,1.0,1,2021.0,-1,18.0,-1,137128.0,7.364295


In [None]:
del candidates
gc.collect()

0

In [None]:
# Датасеты готовы, остались только таргеты,
# которые можно посчитать на основе колонок total_dur и watched_pct

# Делаем еще один чекпоинт.
for name in ["train", "val", "test"]:
    path: str = f"/content/drive/MyDrive/ranker_{name}.csv"
    locals()[f"ranker_{name}"].to_csv(path, index=False)

# Обучаем ранкер

## Pointwise

In [None]:
# Загружаем данные
for name in ["train", "val", "test"]:
    path: str = f"/content/drive/MyDrive/ranker_{name}.csv"
    locals()[f"ranker_{name}"] = pd.read_csv(path)

In [None]:
# таргет бинарный, так что -
# будем считать просмотр хорошим если доля досмотра больше половины


def add_target(df: pd.DataFrame) -> pd.DataFrame:
    df["target"] = df[Columns.Weight] > 50  # 'watched_pct'
    df["target"] = df["target"].astype(int)
    return df


ranker_train = add_target(ranker_train)
ranker_val = add_target(ranker_val)
ranker_test = add_target(ranker_test)

In [None]:
# В train и val можно удалить 'плохих' пользователей,
# Например тех у кого слишком много или мало просмотров или
# тех для которых нет достаточного количества рекомендаций от LightFM
# Тестовую группу не меняем


def filter_group(df: pd.DataFrame) -> pd.DataFrame:
    df.sort_values(
        by=["user_id", "item_id"],
        inplace=True,
    )
    groups_df = (
        df[["user_id", "item_id"]]
        .groupby(by=["user_id"])
        .count()
        .rename(columns={"item_id": "group_size"})
    )
    df = pd.merge(df, groups_df, how="left", on=["user_id"])
    #Удаляем группы, без достаточного числа просмотров/кандидатов
    df = df[df["group_size"] >= 50]

    # Колонка больше не нужна
    df.drop(columns=["group_size"], inplace=True)
    return df


def filter_interations(df: pd.DataFrame) -> pd.DataFrame:
    return df[df["lfm_rank"] <= 50]


ranker_train = filter_interations(ranker_train)
ranker_val = filter_interations(ranker_val)

ranker_train = filter_group(ranker_train)
ranker_val = filter_group(ranker_val)
ranker_train = filter_interations(ranker_train)
ranker_val = filter_interations(ranker_val)
ranker_test = filter_group(ranker_test)

In [None]:
# Вспоминаем какие есть колонки
ranker_train.columns

Index(['user_id', 'item_id', 'datetime', 'total_dur', 'weight', 'lfm_score',
       'lfm_rank', 'pop_score', 'pop_rank', 'age', 'income', 'sex', 'kids_flg',
       'user_hist', 'user_avg_pop', 'user_last_pop', 'viewed', 'content_type',
       'release_year', 'for_kids', 'age_rating', 'studios', 'item_pop',
       'item_avg_hist', 'target'],
      dtype='object')

In [None]:
ranker_train.shape, ranker_val.shape, ranker_test.shape

((6760250, 25), (1450400, 25), (1441800, 25))

In [None]:
# Убираем ненужные айдишники, временные метки и таргеты.
# Для обучения используются только cols:
cols = [
    "lfm_score",
    "lfm_rank",
    "pop_score",
    "pop_rank",
    "age",
    "income",
    "sex",
    "kids_flg",
    "user_hist",
    "user_avg_pop",
    "user_last_pop",
    "viewed",
    "content_type",
    "release_year",
    "for_kids",
    "age_rating",
    "studios",
    "item_pop",
    "item_avg_hist",
]
# Из них категориальные:
cat_cols = [
    "age",
    "income",
    "sex",
    "kids_flg",
    "content_type",
    "for_kids",
    "studios",
]

In [None]:
early_stopping_rounds = 32

params = {
    'objective': 'binary',
    'n_estimators': 10000,  # максимальное число деревьев
    'max_depth': 4,  # максимальная глубина дерева
    'num_leaves': 10,  # число листьев << 2^max_depth
    'min_child_samples': 100,  # число примеров в листе
    'learning_rate': 0.25,  # шаг обучения
    'reg_lambda': 1,  # L2 регуляризация
    'colsample_bytree': 0.9,  # доля колонок, которая используется в каждом дереве
    'early_stopping_rounds': early_stopping_rounds,  # число итераций, в течение которых нет улучшения метрик
    'verbose': early_stopping_rounds // 8,  # период вывода метрик
    'random_state': 42,
}

fit_params = {
    "X": ranker_train[cols],
    "y": ranker_train["target"],
    "eval_set": [(ranker_val[cols], ranker_val["target"])],
    "eval_metric": "logloss",
    "categorical_feature": cat_cols,
    "feature_name": cols,
}
pointwise_model = LGBMClassifier(**params)
pointwise_model.fit(**fit_params)

[LightGBM] [Info] Number of positive: 29336, number of negative: 6730914
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.130270
[LightGBM] [Debug] init for col-wise cost 0.000023 seconds, init for row-wise cost 0.870336 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.416266 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 6760250, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004339 -> initscore=-5.435651
[LightGBM] [Info] Start training from score -5.435651
[LightGBM] [Debug] Trained a tree with leaves = 10 and depth = 4
Training until validation scores don't improve for 32 rounds
[LightGBM] [Debug] Trained a tree with leaves = 10 and depth = 4
[LightGBM] [Debug] Traine

In [None]:
# Смотрим на логлосс на валидации
pointwise_model.best_score_["valid_0"]["binary_logloss"]

0.018520795254510232

In [None]:
# # SHAP - values
# explainer = shap.Explainer(pointwise_model)
# shap_values = explainer(ranker_test[cols].iloc[:10_000])

# # lightGBM returns probabilities for both classes and I have to modify the SHAP values as
# shap_values.values = shap_values.values[:, :, 1]
# shap_values.base_values = shap_values.base_values[:, 1]

In [None]:
# # summarize the effects of all the features
# shap.plots.beeswarm(shap_values, max_display=len(cols))

Модель немного переобучилась на популярность айтемов.

In [None]:
# # mean shap-values
# shap.plots.bar(shap_values, max_display=len(cols))

Здесь опять же видим, что популярность айтема очень влияет на предсказания.

In [None]:
# Получаем предсказания для тестовых юзеров
y_pred: np.ndarray = pointwise_model.predict_proba(ranker_test[cols])[:, 1]
y_true: np.ndarray = np.array(ranker_test["target"])

y_pred

array([2.20136521e-05, 1.04969226e-03, 2.01191915e-03, ...,
       1.47168392e-05, 2.05649438e-04, 1.70044724e-04])

In [None]:
def add_score_and_rank(df: pd.DataFrame, y_pred_scores: np.ndarray, name: str) -> pd.DataFrame:
    # Добавляем скор модели второго уровня
    df[f"{name}_score"] = y_pred_scores
    # Добавляем ранг модели второго уровня
    df.sort_values(
        by=["user_id", f"{name}_score"],
        ascending=[True, False],
        inplace=True,
    )
    df[f"{name}_rank"] = df.groupby("user_id").cumcount() + 1

    # Исключаем айтемы, которые не были предсказаны на первом уровне
    mask = (df["lfm_rank"] < 101).to_numpy()
    # Добавляем общий скор двухэтапной модели
    eps: float = 0.001
    min_score: float = min(y_pred_scores) - eps
    df[f"{name}_hybrid_score"] = df[f"{name}_score"] * mask
    df[f"{name}_hybrid_score"].replace(
        0,
        min_score,
        inplace=True,
    )
    # Добавляем общий ранг двухэтапной модели
    df[f"{name}_hybrid_rank"] = df[f"{name}_rank"] * mask
    max_rank: int = 101
    df[f"{name}_hybrid_rank"].replace(
        0,
        max_rank,
        inplace=True,
    )
    return df

In [None]:
ranker_test = add_score_and_rank(ranker_test, y_pred, "pointwise")
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,pop_score,pop_rank,age,income,sex,kids_flg,user_hist,user_avg_pop,user_last_pop,viewed,content_type,release_year,for_kids,age_rating,studios,item_pop,item_avg_hist,target,pointwise_score,pointwise_rank,pointwise_hybrid_score,pointwise_hybrid_rank
31,53,9728,2021-07-01,0.0,0.0,0.000511,26,0.99,13300.0,3,2,1,0,10.0,39954.3,559.0,6.0,0,2021.0,-1,18.0,-1,76978.0,11.165736,0,0.081721,1,0.081721,1
21,53,5693,2021-07-01,0.0,0.0,0.000539,19,5841.0,60.0,3,2,1,0,10.0,39954.3,559.0,6.0,0,2019.0,-1,16.0,-1,5841.0,24.857387,0,0.004396,2,0.004396,2
44,53,12995,2021-07-01,0.0,0.0,0.000437,39,19394.0,17.0,3,2,1,0,10.0,39954.3,559.0,6.0,0,2020.0,-1,18.0,-1,19394.0,18.095133,0,0.004008,3,0.004008,3


In [None]:
# Считаем рок аук от sklearn (не усредненный по юзерам)
roc_auc_score(y_true, y_pred)  # Получилась ерунда

0.9506424618149191

In [None]:
from tools import generate_lightfm_recs_mapper, avg_user_metric

In [None]:
# усредненный рок аук по юзерам
# df должен быть отсортирован по user_id
(
    avg_user_metric(
        y_true=np.array(ranker_test["target"]),
        y_pred=np.array(ranker_test["pointwise_score"]),
        user_ids=np.array(ranker_test["user_id"]),
        metric_function=roc_auc_score,
    ),
    avg_user_metric(
        y_true=np.array(ranker_test["target"]),
        y_pred=np.array(ranker_test["pointwise_hybrid_score"]),
        user_ids=np.array(ranker_test["user_id"]),
        metric_function=roc_auc_score,
    ),
)

(0.956642200121392, 0.956642200121392)

In [None]:
# Получим значения метрик
models_metrics["lfm"] = calc_metrics_(ranker_test, "lfm_rank")
models_metrics["pointwise"] = calc_metrics_(ranker_test, "pointwise_rank")
models_metrics["pointwise_hybrid"] = calc_metrics_(ranker_test, "pointwise_hybrid_rank")

pd.DataFrame(models_metrics)[["lfm", "pointwise", "pointwise_hybrid"]]

Unnamed: 0,lfm,pointwise,pointwise_hybrid
Precision@10,0.006348,0.017853,0.017853
recall@10,0.021813,0.060187,0.060187
map@10,0.014773,0.040226,0.040226


Метрики чуть улучшились относительно LightFM

## Pairwise/Listwise

In [None]:
# Для обучения ранжированию нужно правильно сформировать группы.
# В нашем случае группа равна одному юзеру.
# Для LGBMRanker нужно задать отсортированный по юзерам (группам) датафрейм,
# для которого списком групп будет список из количества
# ранжируемых айтемов на каждого юзера (группу).


def get_group(df: pd.DataFrame) -> np.ndarray:
    return np.array(df[["user_id", "item_id"]].groupby(by=["user_id"]).count()["item_id"])

In [None]:
# Добавим таргет посложнее


def add_target(df: pd.DataFrame) -> pd.DataFrame:
    """
    0 - доля досмотра < 0.15
    1 - 0.15 <= доля досмотра < 0.75
    2 - 0.75 <= доля досмотра
    """
    df["target_ranker"] = (df[Columns.Weight] >= 15).astype(int)  # 'watched_pct'
    df["target_ranker"] += (df[Columns.Weight] >= 75).astype(int)
    return df


ranker_train = add_target(ranker_train)
ranker_val = add_target(ranker_val)
ranker_test = add_target(ranker_test)

In [None]:
ranker_train.sample(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,pop_score,pop_rank,age,income,sex,kids_flg,user_hist,user_avg_pop,user_last_pop,viewed,content_type,release_year,for_kids,age_rating,studios,item_pop,item_avg_hist,target,target_ranker
983028,158934,8579,2021-07-01,0.0,0.0,0.000725,27,399.0,1485.0,1,4,1,0,9.0,12691.444444,1444.0,2.0,0,2008.0,-1,16.0,-1,399.0,40.182957,0,0
937671,151651,5766,2021-07-01,0.0,0.0,0.001748,16,362.0,1588.0,-1,-1,-1,-1,0.0,11957.864865,2858.0,5.0,0,2016.0,-1,12.0,-1,362.0,31.190608,0,0
1685648,272899,15751,2021-07-01,0.0,0.0,0.000582,20,378.0,1548.0,1,3,1,0,4.0,36121.75,32399.0,2.0,0,2004.0,-1,16.0,-1,378.0,28.772487,0,0


In [None]:
early_stopping_rounds = 32

params = {
    'objective': 'binary',
    'n_estimators': 10000,  # максимальное число деревьев
    'max_depth': 4,  # максимальная глубина дерева
    'num_leaves': 10,  # число листьев << 2^max_depth
    'min_child_samples': 100,  # число примеров в листе
    'learning_rate': 0.25,  # шаг обучения
    'reg_lambda': 1,  # L2 регуляризация
    'colsample_bytree': 0.9,  # доля колонок, которая используется в каждом дереве
    'early_stopping_rounds': early_stopping_rounds,  # число итераций, в течение которых нет улучшения метрик
    'verbose': early_stopping_rounds // 8,  # период вывода метрик
    'random_state': 42,
}

fit_params = {
    "X": ranker_train[cols],
    "y": ranker_train["target_ranker"],
    "group": get_group(ranker_train),
    "eval_set": [(ranker_val[cols], ranker_val["target_ranker"])],
    "eval_group": [get_group(ranker_val)],
    "eval_metric": "ndcg",
    "eval_at": (3, 5, 10),
    "categorical_feature": cat_cols,
    "feature_name": cols,
}
listwise_model = LGBMRanker(**params)
listwise_model.fit(**fit_params)

[LightGBM] [Info] Number of positive: 40241, number of negative: 6720009
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.130270
[LightGBM] [Debug] init for col-wise cost 0.000070 seconds, init for row-wise cost 0.891104 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.419584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 6760250, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005953 -> initscore=-5.117958
[LightGBM] [Info] Start training from score -5.117958
[LightGBM] [Debug] Trained a tree with leaves = 10 and depth = 4
Training until validation scores don't improve for 32 rounds
[LightGBM] [Debug] Trained a tree with leaves = 10 and depth = 4
[LightGBM] [Debug] Traine

In [None]:
listwise_model.best_score_["valid_0"]

OrderedDict([('ndcg@3', 0.9130120683878239),
             ('ndcg@5', 0.9232134496062854),
             ('ndcg@10', 0.927098342162755),
             ('binary_logloss', 0.02398688292640981)])

In [None]:
y_pred: np.ndarray = listwise_model.predict(ranker_test[cols])
ranker_test = add_score_and_rank(ranker_test, y_pred, "listwise")
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,pop_score,pop_rank,age,income,sex,kids_flg,user_hist,user_avg_pop,user_last_pop,viewed,content_type,release_year,for_kids,age_rating,studios,item_pop,item_avg_hist,target,pointwise_score,pointwise_rank,pointwise_hybrid_score,pointwise_hybrid_rank,target_ranker,listwise_score,listwise_rank,listwise_hybrid_score,listwise_hybrid_rank
31,53,9728,2021-07-01,0.0,0.0,0.000511,26,0.99,13300.0,3,2,1,0,10.0,39954.3,559.0,6.0,0,2021.0,-1,18.0,-1,76978.0,11.165736,0,0.081721,1,0.081721,1,0,0.106969,1,0.106969,1
44,53,12995,2021-07-01,0.0,0.0,0.000437,39,19394.0,17.0,3,2,1,0,10.0,39954.3,559.0,6.0,0,2020.0,-1,18.0,-1,19394.0,18.095133,0,0.004008,3,0.004008,3,0,0.008675,2,0.008675,2
21,53,5693,2021-07-01,0.0,0.0,0.000539,19,5841.0,60.0,3,2,1,0,10.0,39954.3,559.0,6.0,0,2019.0,-1,16.0,-1,5841.0,24.857387,0,0.004396,2,0.004396,2,0,0.00684,3,0.00684,3


In [None]:
models_metrics["listwise"] = calc_metrics_(ranker_test, "listwise_rank")
models_metrics["listwise_hybrid"] = calc_metrics_(ranker_test, "listwise_hybrid_rank")
pd.DataFrame(models_metrics)[["listwise", "listwise_hybrid"]]

Unnamed: 0,listwise,listwise_hybrid
Precision@10,0.017921,0.017921
recall@10,0.060395,0.060395
map@10,0.040579,0.040579


In [None]:
(
    avg_user_metric(
        y_true=np.array(ranker_test["target"]),  # target_ranker
        y_pred=np.array(ranker_test["listwise_score"]),
        user_ids=np.array(ranker_test["user_id"]),
        metric_function=roc_auc_score,
    ),
    avg_user_metric(
        y_true=np.array(ranker_test["target"]),
        y_pred=np.array(ranker_test["listwise_hybrid_score"]),
        user_ids=np.array(ranker_test["user_id"]),
        metric_function=roc_auc_score,
    ),
)

(0.9570028763832641, 0.9570028763832641)

## Итоговые метрики

In [None]:
pd.DataFrame(models_metrics)[["lfm", "pointwise_hybrid", "listwise_hybrid"]]

Unnamed: 0,lfm,pointwise_hybrid,listwise_hybrid
Precision@10,0.006348,0.017853,0.017921
recall@10,0.021813,0.060187,0.060395
map@10,0.014773,0.040226,0.040579


In [None]:
MODEL_PATH = "/content/drive/MyDrive/listwise_hybrid.pkl"
pickle.dump(listwise_model, open(MODEL_PATH, "wb"))

# Сохраняем рекомендации для сервиса

In [156]:
for name in ["train", "val", "test"]:
    path: str = f"/content/drive/MyDrive/ranker_{name}.csv"
    locals()[f"ranker_{name}"] = pd.read_csv(path)

In [157]:
ranker_full = pd.concat([ranker_train, ranker_val, ranker_test]).sample(652450)
len(ranker_full)

652450

In [158]:
listwise_model = pickle.load(open("/content/drive/MyDrive/listwise_hybrid.pkl", "rb"))

In [159]:
y_pred: np.ndarray = listwise_model.predict(ranker_full[cols])
ranker_test = add_score_and_rank(ranker_full, y_pred, "listwise")
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,pop_score,pop_rank,age,...,release_year,for_kids,age_rating,studios,item_pop,item_avg_hist,listwise_score,listwise_rank,listwise_hybrid_score,listwise_hybrid_rank
46,3,15297,2021-07-01,0.0,0.0,0.001786,13,137128.0,2.0,-1,...,2021.0,-1,18.0,-1,137128.0,7.364295,0.123461,1,0.123461,1
13,3,4471,2021-07-01,0.0,0.0,0.001658,36,3658.0,106.0,-1,...,2017.0,-1,16.0,-1,3658.0,28.708037,0.002738,2,0.002738,2
6,3,2887,2021-07-01,0.0,0.0,0.001654,37,1238.0,491.0,-1,...,2017.0,-1,16.0,-1,1238.0,38.317447,0.001977,3,0.001977,3


In [160]:
K_RECS = 10
ranker_full = ranker_full[[Columns.User, Columns.Item, "listwise_hybrid_rank"]]

ranker_full = (
    ranker_full.sort_values([Columns.User, "listwise_hybrid_rank"])
    .groupby(Columns.User)
    .head(K_RECS)
)
ranker_full.drop("listwise_hybrid_rank", axis=1, inplace=True)

In [161]:
ranker_full['pop_rank'] = 11

In [162]:
ranker_full

Unnamed: 0,user_id,item_id,pop_rank
46,3,15297,11
13,3,4471,11
6,3,2887,11
35,3,10845,11
38,3,11268,11
...,...,...,...
6760185,1097534,10845,11
6760243,1097544,12995,11
6760222,1097544,6686,11
6760207,1097544,2887,11


In [None]:
from tqdm import tqdm

In [165]:
list_of_users = ranker_full['user_id'].unique()

In [167]:
a = (
    ranker_full.groupby('user_id')
    .agg({"item_id": "count"})
    .rename(columns={"item_id": "viewed"})
)
a

Unnamed: 0_level_0,viewed
user_id,Unnamed: 1_level_1
3,6
11,4
14,1
21,6
30,3
...,...
1097515,7
1097521,2
1097526,4
1097534,2


In [170]:
for user in tqdm(list_of_users):
  top = pop_candidates.head(10)[['item_id', 'pop_rank']]
  top['user_id'] = user
  ranker_full = pd.concat([ranker_full, top])

100%|██████████| 187289/187289 [1:01:07<00:00, 51.07it/s]


In [225]:
ranker_full_uniq = ranker_full.sort_values(by=['user_id', 'pop_rank'], ascending=False).drop_duplicates(keep='first')

In [231]:
ranker_full_uniq.shape, ranker_full.shape

((2525234, 3), (2525234, 3))

In [227]:
ranker_full.sort_values(by=['user_id', 'pop_rank'], ascending=False)

Unnamed: 0,user_id,item_id,pop_rank
6760243,1097544,12995,11
6760222,1097544,6686,11
6760207,1097544,2887,11
6760202,1097544,1627,11
9,1097544,4740,10
...,...,...,...
4,3,2657,5
3,3,4151,4
2,3,13865,3
1,3,15297,2


In [228]:
ranker_full_uniq.loc[ranker_full_uniq['pop_rank'] == 11, 'pop_rank'] = 0

In [245]:
ranker_full_uniq.sort_values(by=['user_id', 'pop_rank']).head(20)

Unnamed: 0,user_id,item_id,pop_rank,rank
46,3,15297,0,1
13,3,4471,0,2
6,3,2887,0,3
35,3,10845,0,4
38,3,11268,0,5
40,3,12089,0,6
0,3,10440,1,7
2,3,13865,3,8
3,3,4151,4,9
4,3,2657,5,10


In [242]:
ranker_full_uniq = ranker_full_uniq.sort_values(by=['user_id', 'pop_rank']).drop_duplicates(['user_id', 'item_id'])

In [244]:
ranker_full_uniq['rank'] = ranker_full_uniq.groupby('user_id').cumcount() + 1

In [246]:
ranker_full_uniq = ranker_full_uniq.sort_values(by=['user_id', 'pop_rank'])

In [247]:
ranker_full_uniq = ranker_full_uniq[ranker_full_uniq['rank'] < 11]

In [248]:
ranker_full_u = ranker_full_uniq[['user_id',	'item_id']]

In [249]:
ranker_full_u = ranker_full_u.drop_duplicates()

In [250]:
a = (
    ranker_full_u.groupby('user_id')
    .agg({"item_id": "count"})
    .rename(columns={"item_id": "viewed"})
)
a

Unnamed: 0_level_0,viewed
user_id,Unnamed: 1_level_1
3,10
11,10
14,10
21,10
30,10
...,...
1097515,10
1097521,10
1097526,10
1097534,10


In [251]:
ranker_full_uniq.head(30)

Unnamed: 0,user_id,item_id,pop_rank,rank
46,3,15297,0,1
13,3,4471,0,2
6,3,2887,0,3
35,3,10845,0,4
38,3,11268,0,5
40,3,12089,0,6
0,3,10440,1,7
2,3,13865,3,8
3,3,4151,4,9
4,3,2657,5,10


In [252]:
ranker_full.sort_values(by='user_id')

Unnamed: 0,user_id,item_id,pop_rank
46,3,15297,11
0,3,10440,1
1,3,15297,2
2,3,13865,3
3,3,4151,4
...,...,...,...
8,1097544,6809,9
6760243,1097544,12995,11
6760222,1097544,6686,11
1,1097544,15297,2


In [169]:
pop_candidates.head(10)

Unnamed: 0,item_id,pop_score,pop_rank
0,10440,141889.0,1
1,15297,137128.0,2
2,13865,93403.0,3
3,4151,69641.0,4
4,2657,55146.0,5
5,3734,50004.0,6
6,4880,43676.0,7
7,142,35862.0,8
8,6809,32399.0,9
9,4740,30708.0,10


In [253]:
recos_dict = ranker_full_uniq.groupby(Columns.User).agg(list).to_dict()[Columns.Item]

In [254]:
RECOS_PATH = "/content/ranker_reco.json"
with open(RECOS_PATH, "w") as jf:
    json.dump(recos_dict, jf)

In [255]:
recos = pd.read_json('/content/ranker_reco.json', orient='index')

In [258]:
recos = recos.T

In [259]:
recos

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
3,15297,4471,2887,10845,11268,12089,10440,13865,4151,2657
11,3594,11868,5136,10843,10440,15297,13865,4151,2657,3734
14,7554,10440,15297,13865,4151,2657,3734,4880,142,6809
21,6686,111,2981,11145,2950,14701,10440,15297,13865,4151
30,6210,4863,7554,10440,15297,13865,4151,2657,3734,4880
...,...,...,...,...,...,...,...,...,...,...
1097515,12995,2887,4702,2689,8579,1679,7554,10440,15297,13865
1097521,6964,11268,10440,15297,13865,4151,2657,3734,4880,142
1097526,10256,10843,7684,2950,10440,15297,13865,4151,2657,3734
1097534,4237,10845,10440,15297,13865,4151,2657,3734,4880,142


In [260]:
recos.to_excel('/content/reco_ranker.xlsx')