In [1]:
%pip install lightgbm >>None
%pip install xgboost >>None
%pip install catboost >>None
%pip install ipywidgets >>None


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import sys
sys.path.append("../")

In [3]:
import warnings
warnings.simplefilter('ignore')
from pathlib import Path
import dill
import numpy as np
import pandas as pd
import requests
# import shap

from lightfm import LightFM
from lightfm.data import Dataset
from lightgbm import LGBMRanker, LGBMClassifier, Booster
from xgboost import XGBRanker
from catboost import CatBoostRanker, Pool

import rectools
from rectools.metrics import calc_metrics, NDCG, MAP, Precision, Recall, MeanInvUserFreq
from rectools import Columns
from rectools.models import ImplicitALSWrapperModel, PopularModel
from implicit.als import AlternatingLeastSquares

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from typing import Any, Dict, Tuple
from tqdm.auto import tqdm
from zipfile import ZipFile

from utils.tools import generate_lightfm_recs_mapper, avg_user_metric

## Подготовка данных

In [4]:
interactions = pd.read_csv('../data_original/interactions.csv')
users = pd.read_csv('../data_original/users.csv')
items = pd.read_csv('../data_original/items.csv')

In [5]:
# Меняем названия колонок для использования rectools
interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'watched_pct': Columns.Weight,
    }, 
    inplace=True,
) 

# Меняем тип данных
interactions['datetime'] = interactions['datetime'].astype(np.datetime64)

# Заполняем пропуски
interactions_default_values: Dict[str, Any] = {
   Columns.Datetime: interactions[Columns.Datetime].median(),
    Columns.Weight: 0.,
    'total_dur': 0,
}
interactions.fillna(interactions_default_values, inplace=True)

In [6]:
def encode_cat_cols(df: pd.DataFrame, cat_cols) -> Tuple[pd.DataFrame, Dict]:
    cat_col_encoding = {}  # словарь с категориями

    for col in cat_cols:
        cat_col = df[col].astype('category').cat
        cat_col_encoding[col] = cat_col.categories
        df[col] = cat_col.codes.astype('category')
    return df, cat_col_encoding

users_cat_cols = [
     'age', 'income', 'sex', 'kids_flg'
]
users, users_cat_col_encoding = encode_cat_cols(users, users_cat_cols)

In [7]:
# Аналогичным образом кодируем категориальные колонки
items_cat_cols = [ 
    'content_type', 'for_kids', 'studios',
]

items_text_cols = [
    'title', 'title_orig', 'genres', 'countries', 'directors', 'actors', 'description', 'keywords',
]

items_num_cols = [
    'release_year', 'age_rating', 
]

default_values_items = {
    'release_year': items['release_year'].median(),
    'age_rating': items['age_rating'].median(),
}

items, items_cat_col_encoding = encode_cat_cols(items, items_cat_cols) 
items.fillna(default_values_items, inplace=True)

## Трейн-вал-тест сплит

In [8]:
max_date = interactions[Columns.Datetime].max()
min_date = interactions[Columns.Datetime].min()

print(f'min дата в interactions: {min_date}')
print(f'max дата в interactions: {max_date}')
print(f'Продолжительность: {max_date - min_date}')

min дата в interactions: 2021-03-13 00:00:00
max дата в interactions: 2021-08-22 00:00:00
Продолжительность: 162 days 00:00:00


In [9]:
ranker_days_count = 30

ranker_data = interactions[
    (interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=ranker_days_count))
]

train_size = 0.7
val_size = 0.15
test_size = 0.15 

train_val_users, test_users = train_test_split(
    ranker_data['user_id'].unique(), random_state=42, test_size=test_size
)

train_users, val_users = train_test_split(
    train_val_users, random_state=42, test_size=val_size / (train_size + val_size)  # 15% от общего размера
)

In [10]:
base_models_data = interactions[
    (interactions[Columns.Datetime] < max_date - pd.Timedelta(days=ranker_days_count))
]

## Обучаем модели первого уровня

### LightFM

возьмём lightfm без юзер и айтем фичей

In [11]:
lightfm_dataset = Dataset()
lightfm_user_ids = base_models_data['user_id'].unique()
lightfm_item_ids = base_models_data['item_id'].unique()
lightfm_dataset.fit(lightfm_user_ids, lightfm_item_ids)

In [12]:
interactions_matrix, weights_matrix = lightfm_dataset.build_interactions(
    zip(*base_models_data[['user_id', 'item_id', Columns.Weight]].values.T)
)
weights_matrix = weights_matrix.tocsr()

In [13]:
lfm_model = LightFM(
    no_components=64, 
    learning_rate=0.1, 

    loss='warp', 
    max_sampled=5, 
    random_state=42,
)

num_epochs = 10

for _ in tqdm(range(num_epochs)):
    lfm_model.fit_partial(weights_matrix)

100%|██████████| 10/10 [00:49<00:00,  4.92s/it]


In [14]:
# save model  
with open(f'../models/lfm_model.dill', 'wb') as f:
    dill.dump(lfm_model, f)

### Popular

Будем использовать для холодных пользователей

In [15]:
users = users.loc[users[Columns.User].isin(base_models_data[Columns.User])].copy()

user_features_frames = []
for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)

In [16]:
items = items.loc[items[Columns.Item].isin(base_models_data[Columns.Item])].copy()

items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"

item_features = pd.concat((genre_feature, content_feature))

In [17]:
items = items.drop(items_text_cols, axis=1)

In [18]:
dataset = rectools.dataset.Dataset.construct(
    interactions_df=base_models_data,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [19]:
pop_model = PopularModel()
pop_model.fit(dataset)

<rectools.models.popular.PopularModel at 0x7de3dca0d630>

In [20]:
# save model
with open('../models/model_popular.dill', 'wb') as f:
   dill.dump(pop_model, f)

## Генерим кандидатов, которыми дополним датасет ранкера

In [21]:
lightfm_mapping = lightfm_dataset.mapping()
lightfm_mapping = {
    'user_id_to_iid': lightfm_mapping[0],
    'item_id_to_iid': lightfm_mapping[2],
}

lightfm_mapping['user_iid_to_id'] = {v: k for k, v in lightfm_mapping['user_id_to_iid'].items()}
lightfm_mapping['item_iid_to_id'] = {v: k for k, v in lightfm_mapping['item_id_to_iid'].items()}

In [22]:
top_N = 50

user_lfm_index = np.array(list(lightfm_mapping['user_id_to_iid'].values()))
item_lfm_index = np.array(list(lightfm_mapping['item_id_to_iid'].values()))

mapper = generate_lightfm_recs_mapper(
    model=lfm_model, 
    N=top_N,
    item_iids=item_lfm_index, 
    user_id_to_iid=lightfm_mapping['user_id_to_iid'],
    item_iid_to_id=lightfm_mapping['item_iid_to_id'],
    known_item_ids=dict(),
    num_threads=32,
)

In [None]:
# скоры и ранги lightfm
lfm_candidates = pd.DataFrame({'user_id': lightfm_user_ids})
lfm_candidates['item_id'], lfm_candidates['lfm_score'] = zip(*lfm_candidates['user_id'].map(mapper))
lfm_candidates = lfm_candidates.explode(['item_id', 'lfm_score'], ignore_index=True)
lfm_candidates['lfm_rank'] = lfm_candidates.groupby('user_id').cumcount() + 1 

lfm_candidates.head(3)

In [None]:
lfm_candidates.to_csv('../processed_data/lfm_candidates.csv', index=False)

In [25]:
lfm_candidates = pd.read_csv('../processed_data/lfm_candidates.csv')

Создаем и добавляем информацию о холодных пользователях с помощью модели  popular

In [None]:
popular_candidates = pop_model.recommend(lightfm_user_ids, 
                                         dataset=dataset, 
                                         k=50, 
                                         filter_viewed=False)

popular_candidates.rename(columns={'score': 'popular_score',
                                   'rank': 'popular_rank',}, 
                      inplace=True,)

popular_candidates.head(3)

Unnamed: 0,user_id,item_id,popular_score,popular_rank
0,176549,10440,141889.0,1
1,176549,15297,137128.0,2
2,176549,13865,93403.0,3


In [None]:
candidates = pd.merge(lfm_candidates, popular_candidates,
                      on=['user_id', 'item_id'],
                      how='outer')
lfm_min_score: float =  candidates['lfm_score'].min() - 0.01
lfm_max_rank: int = candidates['lfm_rank'].max() + 1

pop_min_score: float =  candidates['popular_score'].min() - 0.01
pop_max_rank: int = candidates['popular_rank'].max() + 1

default_values = {
        'lfm_score': lfm_min_score, 'lfm_rank': lfm_max_rank,
        'popular_score': pop_min_score, 'popular_rank': pop_max_rank,
        **interactions_default_values,
    }

candidates.fillna(default_values, inplace=True)

candidates.head()

Unnamed: 0,user_id,item_id,lfm_score,lfm_rank,popular_score,popular_rank
0,176549,9728.0,1.180826,1.0,76978.0,4.0
1,176549,7571.0,1.132079,2.0,20407.0,16.0
2,176549,10440.0,1.090371,3.0,141889.0,1.0
3,176549,11237.0,1.056374,4.0,22769.0,14.0
4,176549,1785.0,0.922613,5.0,7415.99,51.0


In [None]:
candidates.to_csv('../processed_data/candidates.csv', index=False)

In [26]:
# candidates = pd.read_csv('../processed_data/candidates.csv')

In [27]:
def calc_metrics_(candidates_df, rank_col: str) -> Dict[str, float]:
    metrics = {
        'ndcg@10': NDCG(k = 10),
        'map@10': MAP(k = 10),
        'Precision@10': Precision(k = 10),
        'recall@10': Recall(k = 10),
        'novelty@10': MeanInvUserFreq(k = 10),
    }
    return calc_metrics(
        metrics=metrics,
        reco=(
            candidates_df
            .rename(columns={rank_col: Columns.Rank})
            [[Columns.User, Columns.Item, Columns.Rank]]
            [candidates_df[Columns.User].isin(test_users)]
        ),
        interactions=(
            ranker_data
            [[Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]]
            [ranker_data[Columns.User].isin(test_users)]
        ), 
        prev_interactions=(
            base_models_data
            [[Columns.User, Columns.Item, Columns.Datetime, Columns.Weight]]
            [base_models_data[Columns.User].isin(test_users)]
        ),
        catalog=items['item_id'].unique()
    )

In [28]:
models_metrics: Dict[str, Dict[str, float]] = dict()

In [29]:
models_metrics['lfm'] = calc_metrics_(candidates, 'lfm_rank')
models_metrics['lfm']

{'Precision@10': 0.022124844596565007,
 'recall@10': 0.07392885951675815,
 'ndcg@10': 0.02505424198029947,
 'map@10': 0.030817161793326404,
 'novelty@10': 4.063809779126102}

## Формируем датасет для ранкера

### Генерим фичи для ранкера

In [30]:
# Получаем длину истории юзера 
base_models_data['user_hist'] = (
    base_models_data.groupby('user_id')
    ['item_id'].transform('count')
)
# Получаем популярность контента
base_models_data['item_pop'] = (
    base_models_data.groupby('item_id')
    ['user_id'].transform('count')
)
# Получаем среднюю популярность контента, просматриваемого этим юзером
base_models_data['user_avg_pop'] = (
    base_models_data.groupby('user_id')
    ['item_pop'].transform('mean')
)
# Получаем среднюю длину истории пользователя, которые смотрит этот контент
base_models_data['item_avg_hist'] = (
    base_models_data.groupby('item_id')
    ['user_hist'].transform('mean')
)
# Получаем популярность последнего просмотренного контента
base_models_data.sort_values(
    by=[Columns.User, Columns.Datetime], 
    ascending=[True, False], 
    ignore_index=True,
    inplace=True,
)
base_models_data['user_last_pop'] = (
    base_models_data.groupby('user_id')
    ['item_pop'].transform('first')
)

In [31]:
# Определяем холодность пользователя
base_models_data['user_cold'] = (
    base_models_data.groupby('item_id')
    ['user_hist'].transform('sum') < 100
).astype(int)

In [32]:
base_models_data.head()

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,user_hist,item_pop,user_avg_pop,item_avg_hist,user_last_pop,user_cold
0,0,6006,2021-07-20,1,0.0,6,5208,41885.0,16.891897,5208,0
1,0,7102,2021-07-19,169,3.0,6,11626,41885.0,20.349475,5208,0
2,0,14359,2021-07-19,130,2.0,6,6053,41885.0,22.546836,5208,0
3,0,15297,2021-07-19,459,0.0,6,137128,41885.0,7.364295,5208,0
4,0,9728,2021-07-19,4,0.0,6,76978,41885.0,11.165736,5208,0


In [33]:
# Добавляем новые фичи в соответствующие таблички
items = pd.merge(
    left=items, 
    right=(
        base_models_data
        [['item_id', 'item_pop', 'item_avg_hist']]
        .drop_duplicates()
    ),
    how='left',
    on='item_id',
)

users = pd.merge(
    left=users, 
    right=(
        base_models_data
        [['user_id', 'user_hist', 'user_avg_pop', 'user_last_pop', 'user_cold']]
        .drop_duplicates()
    ),
    how='left',
    on='user_id',
)
users.head(3)

Unnamed: 0,user_id,age,income,sex,kids_flg,user_hist,user_avg_pop,user_last_pop,user_cold
0,973171,1,4,1,1,5,19550.8,93403,0
1,962099,0,2,1,0,13,1329.307692,260,0
2,721985,3,2,0,0,13,6009.461538,446,0


In [34]:
# Обновляем дефолтные значения
# Прямо сейчас обновлять таблички users и items не обязательно, 
# сделаем это при джойне с кандидатами

# Для новых фичей айтемов
default_values_items['item_pop'] = base_models_data['item_pop'].median()
default_values_items['item_avg_hist'] = base_models_data['item_avg_hist'].median()

# Для новых фичей юзеров
default_values_users = {
    'user_hist': 0,
    'user_avg_pop': base_models_data['user_avg_pop'].median(),
    'user_last_pop': base_models_data['user_last_pop'].median(),
}

### Джойним кандидатов и юзер/айтем фичи

In [35]:
# Вспоминаем про наши выборки интеракций для ранкера.
# Мы отобрали юзеров для обучения, валидации и теста.
# Оставляем среди них только тех, для кого есть и рекомы и таргеты

def users_filter(
    user_list: np.ndarray,
    candidates_df: pd.DataFrame, 
    df: pd.DataFrame,
) -> pd.DataFrame:
    # Джойним интеракции на наших кандидатов для юзеров из трейна, вал и теста
    df = pd.merge(
        df[df['user_id'].isin(user_list)], 
        candidates_df[candidates_df['user_id'].isin(user_list)], 
        how='outer',  # right ? 
        on=['user_id', 'item_id']
    )
    # Проставляем дефолтные значения интеракций
    min_score: float =  df['lfm_score'].min() - 0.01
    max_rank: int = df['lfm_rank'].max() + 1  # 101
    
    default_values = {
        'lfm_score': min_score, 'lfm_rank': max_rank,
        # Важно использовате те же дефолтные значения для интеракций, 
        # чтобы не сделать утечку
        **interactions_default_values,
    }
    df.fillna(default_values, inplace=True)
        
    # Сортируем по user_id - это пригодится для вычисления рангов и групп для ранжирования
    df.sort_values(
        by=['user_id', 'item_id'],
        inplace=True,
    )
    return df

ranker_train = users_filter(train_users, candidates, ranker_data)
ranker_val = users_filter(val_users, candidates, ranker_data)
ranker_test = users_filter(test_users, candidates, ranker_data)

ranker_train.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,popular_score,popular_rank
1180642,3,47.0,2021-08-16,2179.0,27.0,-7.196952,52.0,,
6964783,3,101.0,2021-07-01,0.0,0.0,3.295629,48.0,9542.0,36.0
998971,3,142.0,2021-08-13,5892.0,100.0,4.370899,7.0,35862.0,9.0


In [36]:
# Добавляем фичи
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    df = pd.merge(
        df, 
        users, 
        how='left', 
        on=['user_id']
    )
    df = pd.merge(
        df, 
        items, 
        how='left', 
        on=['item_id']
    )

    df.fillna(default_values_items, inplace=True)
    df.fillna(default_values_users, inplace=True)

    for col in df.columns:
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            if -1 not in df[col].cat.categories:
                df[col] = df[col].cat.add_categories(-1)
            df.fillna({col: -1}, inplace=True)
    return df

ranker_train = add_features(ranker_train)
ranker_val = add_features(ranker_val)
ranker_test = add_features(ranker_test)

## Обучаем ранкер

In [37]:
def filter_group(df: pd.DataFrame) -> pd.DataFrame:
    df.sort_values(
        by=['user_id', 'item_id'],
        inplace=True,
    )
    groups_df = (
        df[['user_id', 'item_id']]
        .groupby(by=['user_id']).count()
        .rename(columns={'item_id': 'group_size'})
    )
    df = pd.merge(
        df, 
        groups_df, 
        how='left', 
        on=['user_id']
    )
    # Удаляем группы, без достаточного числа просмотров/кандидатов
    df = df[df['group_size'] >= 100]

    # Колонка больше не нужна
    df.drop(columns=['group_size'], inplace=True)
    return df

ranker_train = filter_group(ranker_train)
ranker_val = filter_group(ranker_val)
ranker_test = filter_group(ranker_test)

In [38]:
cols = [
    'lfm_score', 'lfm_rank',
    'popular_score', 'popular_rank',
    'age', 'income', 'sex', 'kids_flg', 'user_hist', 'user_avg_pop', 'user_last_pop',
    'content_type', 'release_year', 'for_kids', 'age_rating', 'studios', 'item_pop', 'item_avg_hist',
]

cat_cols = [
    'age', 'income', 'sex', 'kids_flg',
    'content_type', 'for_kids', 'studios',
]

In [39]:
def add_score_and_rank(df: pd.DataFrame, y_pred_scores: np.ndarray, name: str) -> pd.DataFrame:
    df[f'{name}_score'] = y_pred_scores
    
    df.sort_values(
        by=['user_id', f'{name}_score'],
        ascending=[True, False],
        inplace=True,
    )
    df[f'{name}_rank'] = df.groupby('user_id').cumcount() + 1

    mask = (df['lfm_rank'] < 101).to_numpy()
    eps: float = 0.001
    min_score: float = min(y_pred_scores) - eps
    df[f'{name}_hybrid_score'] = df[f'{name}_score'] * mask
    df[f'{name}_hybrid_score'].replace(
        0,
        min_score,
        inplace=True,
    )
    
    df[f'{name}_hybrid_rank'] = df[f'{name}_rank'] * mask
    max_rank: int = 101
    df[f'{name}_hybrid_rank'].replace(
        0,
        max_rank,
        inplace=True,
    )
    return df

In [40]:
# таргет
def add_target(df: pd.DataFrame) -> pd.DataFrame:
    """
    0 - доля досмотра < 0.15
    1 - 0.15 <= доля досмотра < 0.75
    2 - 0.75 <= доля досмотра
    """
    df['target_ranker'] = (df[Columns.Weight] >= 15).astype(int)  # 'watched_pct'
    df['target_ranker'] += (df[Columns.Weight] >= 75).astype(int)
    return df

ranker_train = add_target(ranker_train)
ranker_val = add_target(ranker_val)
ranker_test = add_target(ranker_test)

ranker_train.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,popular_score,popular_rank,age,...,user_cold,content_type,release_year,for_kids,age_rating,studios,genre,item_pop,item_avg_hist,target_ranker
701,106,101.0,2021-07-01,0.0,0.0,-7.186952,51.0,9542.0,36.0,-1,...,,0,2019.0,-1,18.0,-1,"[историческое, мелодрамы]",9542.0,17.990673,0
702,106,142.0,2021-07-01,0.0,0.0,-7.186952,51.0,35862.0,9.0,-1,...,,0,2020.0,-1,16.0,-1,"[драмы, триллеры]",35862.0,15.251464,0
703,106,334.0,2021-07-01,0.0,0.0,3.740743,44.0,7415.99,51.0,-1,...,,0,2012.0,-1,6.0,-1,"[мультфильм, фэнтези, приключения, комедии]",2631.0,25.280882,0


### LGBMRanker

In [41]:
def get_group_lgbm(df: pd.DataFrame) -> np.ndarray:
    return np.array(
        df[['user_id', 'item_id']]
        .groupby(by=['user_id']).count()
        ['item_id']
    )

In [42]:
params = {
    'objective': 'lambdarank',
    'n_estimators': 1000,
    'max_depth': 10,
    'num_leaves': 10,
    'min_child_samples': 100,
    'learning_rate': 0.25,
    'reg_lambda': 1,
    'colsample_bytree': 0.9,
    'early_stopping_rounds': 100,
    'random_state': 42,
}
early_stopping_rounds = 32
fit_params = {
    'X': ranker_train[cols],
    'y': ranker_train['target_ranker'],
    'group': get_group_lgbm(ranker_train),
    'eval_set': [(ranker_val[cols], ranker_val['target_ranker'])],
    'eval_group': [get_group_lgbm(ranker_val)],
    'eval_metric': 'ndcg',
    'eval_at': (3, 5, 10),
    'feature_name': cols,
}
LGBMRanker_model = LGBMRanker(**params)

In [43]:
%%time
LGBMRanker_model.fit(**fit_params)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013730 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1784
[LightGBM] [Info] Number of data points in the train set: 625451, number of used features: 18
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[165]	valid_0's ndcg@3: 0.640498	valid_0's ndcg@5: 0.64572	valid_0's ndcg@10: 0.666694
CPU times: user 36.2 s, sys: 20.5 ms, total: 36.2 s
Wall time: 6.17 s


In [51]:
with open(f'../models/LGBMRanker_model.dill', 'wb') as f:
    dill.dump(LGBMRanker_model, f)

In [53]:
y_pred = LGBMRanker_model.predict(ranker_test[cols])
y_pred

array([-5.94620056, -5.94620056, -4.68826203, ..., -3.93900025,
        1.12467369, -4.37940766])

In [54]:
y_pred: np.ndarray = LGBMRanker_model.predict(ranker_test[cols])
ranker_test = add_score_and_rank(ranker_test, y_pred, 'LGBMRanker')
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,popular_score,popular_rank,age,...,age_rating,studios,genre,item_pop,item_avg_hist,target_ranker,LGBMRanker_score,LGBMRanker_rank,LGBMRanker_hybrid_score,LGBMRanker_hybrid_rank
1597,955,5693.0,2021-08-10,6354.0,90.0,-7.196952,52.0,,,1,...,16.0,-1,"[боевики, фантастика, триллеры, приключения]",5841.0,24.857387,2,1.653494,1,1.653494,1
1598,955,5693.0,2021-08-10,6354.0,90.0,-7.196952,52.0,,,1,...,16.0,-1,"[боевики, фантастика, триллеры, приключения]",5841.0,24.857387,2,1.653494,2,1.653494,2
1679,955,13915.0,2021-07-24,5899.0,100.0,-7.196952,52.0,,,1,...,6.0,-1,"[для детей, приключения, семейное, фэнтези, ко...",5982.0,23.285523,2,1.595705,3,1.595705,3


In [55]:
models_metrics['LGBMRanker'] = calc_metrics_(ranker_test, 'LGBMRanker_rank')

pd.DataFrame(models_metrics)[['LGBMRanker']]

Unnamed: 0,LGBMRanker
Precision@10,0.01482
recall@10,0.008383
ndcg@10,0.015265
map@10,0.008293
novelty@10,8.384046


### XGBRanker

In [56]:
def get_group_xgb(df: pd.DataFrame) -> np.ndarray:
    return np.array(df['user_id'].value_counts())

In [57]:
fit_params = {
    'X': ranker_train[cols].drop(cat_cols, axis=1),
    'y': ranker_train['target_ranker'],
    'group': get_group_xgb(ranker_train),
    'eval_set': [(ranker_val[cols].drop(cat_cols, axis=1), ranker_val['target_ranker'])],
    'eval_group': [get_group_xgb(ranker_val)],
    'eval_metric': 'ndcg',
    'verbose': early_stopping_rounds / 8,
}
params = {
    'objective': 'rank:ndcg',
    'n_estimators': 100,
    'max_depth': 4,
    'num_leaves': 10,
    'min_child_samples': 100,
    'learning_rate': 0.25,
    'reg_lambda': 1,
    'colsample_bytree': 0.9,
    'random_state': 42,
}

In [58]:
XGBRanker_model = XGBRanker(**params)

In [59]:
%%time
XGBRanker_model.fit(**fit_params)

[0]	validation_0-ndcg:0.71394
[4]	validation_0-ndcg:0.73666
[8]	validation_0-ndcg:0.73808
[12]	validation_0-ndcg:0.73842
[16]	validation_0-ndcg:0.73940
[20]	validation_0-ndcg:0.73936
[24]	validation_0-ndcg:0.74067
[28]	validation_0-ndcg:0.74230
[32]	validation_0-ndcg:0.74289
[36]	validation_0-ndcg:0.74364
[40]	validation_0-ndcg:0.74421
[44]	validation_0-ndcg:0.74445
[48]	validation_0-ndcg:0.74390
[52]	validation_0-ndcg:0.74403
[56]	validation_0-ndcg:0.74441
[60]	validation_0-ndcg:0.74505
[64]	validation_0-ndcg:0.74505
[68]	validation_0-ndcg:0.74618
[72]	validation_0-ndcg:0.74648
[76]	validation_0-ndcg:0.74664
[80]	validation_0-ndcg:0.74762
[84]	validation_0-ndcg:0.74681
[88]	validation_0-ndcg:0.74679
[92]	validation_0-ndcg:0.74730
[96]	validation_0-ndcg:0.74734
[99]	validation_0-ndcg:0.74716
CPU times: user 1min 32s, sys: 343 ms, total: 1min 32s
Wall time: 9.46 s


In [60]:
# save model  
with open(f'../models/xgb_ranker_model.dill', 'wb') as f:
    dill.dump(XGBRanker_model, f)

In [61]:
y_pred: np.ndarray = XGBRanker_model.predict(ranker_test[cols].drop(cat_cols, axis=1))
ranker_test = add_score_and_rank(ranker_test, y_pred, 'XGBRanker')
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,popular_score,popular_rank,age,...,item_avg_hist,target_ranker,LGBMRanker_score,LGBMRanker_rank,LGBMRanker_hybrid_score,LGBMRanker_hybrid_rank,XGBRanker_score,XGBRanker_rank,XGBRanker_hybrid_score,XGBRanker_hybrid_rank
1679,955,13915.0,2021-07-24,5899.0,100.0,-7.196952,52.0,,,1,...,23.285523,2,1.595705,3,1.595705,3,1.511064,1,1.511064,1
1680,955,13915.0,2021-07-24,5899.0,100.0,-7.196952,52.0,,,1,...,23.285523,2,1.595705,4,1.595705,4,1.511064,2,1.511064,2
1623,955,7612.0,2021-07-25,8169.0,100.0,-7.196952,52.0,,,1,...,29.339623,2,1.207769,13,1.207769,13,1.218168,3,1.218168,3


In [62]:
models_metrics['XGBRanker'] = calc_metrics_(ranker_test, 'XGBRanker_rank')
pd.DataFrame(models_metrics)[['XGBRanker']]

Unnamed: 0,XGBRanker
Precision@10,0.014828
recall@10,0.008412
ndcg@10,0.015269
map@10,0.008298
novelty@10,8.20611


### CatBoostRanker

In [63]:
params = {
    'n_estimators': 100,#00,
    'depth': 4,
    'learning_rate': 0.25,
    'reg_lambda': 1, 
    'random_seed': 42,
    'early_stopping_rounds': early_stopping_rounds,
    'custom_metric': 'NDCG:top=10',
    'verbose': 1,
}

fit_params = {
    'X': Pool(data=ranker_train[cols],
              label=ranker_train['target_ranker'],
              group_id=ranker_train['user_id'].values,
              cat_features=cat_cols),          
    'eval_set': Pool(data=ranker_val[cols],
              label=ranker_val['target_ranker'],
              group_id=ranker_val['user_id'].values,
              cat_features=cat_cols),
    'early_stopping_rounds': early_stopping_rounds,
    'plot': 1,
}

In [64]:
ranker_train.shape, ranker_test.shape, ranker_val.shape

((625451, 26), (132983, 34), (137371, 26))

In [67]:
%%time
CatBoostRanker_model = CatBoostRanker(**params)
CatBoostRanker_model.fit(**fit_params)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.6704435	best: 0.6704435 (0)	total: 452ms	remaining: 44.8s
1:	test: 0.7204536	best: 0.7204536 (1)	total: 868ms	remaining: 42.6s
2:	test: 0.7402139	best: 0.7402139 (2)	total: 1.28s	remaining: 41.3s
3:	test: 0.7525907	best: 0.7525907 (3)	total: 1.67s	remaining: 40s
4:	test: 0.7706675	best: 0.7706675 (4)	total: 2.05s	remaining: 38.9s
5:	test: 0.7706434	best: 0.7706675 (4)	total: 2.44s	remaining: 38.2s
6:	test: 0.7971443	best: 0.7971443 (6)	total: 2.79s	remaining: 37.1s
7:	test: 0.8044094	best: 0.8044094 (7)	total: 3.27s	remaining: 37.6s
8:	test: 0.8048548	best: 0.8048548 (8)	total: 3.98s	remaining: 40.3s
9:	test: 0.8096355	best: 0.8096355 (9)	total: 4.53s	remaining: 40.8s
10:	test: 0.8099049	best: 0.8099049 (10)	total: 5.06s	remaining: 41s
11:	test: 0.8104475	best: 0.8104475 (11)	total: 5.56s	remaining: 40.8s
12:	test: 0.8126452	best: 0.8126452 (12)	total: 6.05s	remaining: 40.5s
13:	test: 0.8139797	best: 0.8139797 (13)	total: 6.6s

<catboost.core.CatBoostRanker at 0x7de3e13d9090>

In [68]:
with open(f'../models/CatBoostRanker_model.dill', 'wb') as f:
    dill.dump(CatBoostRanker_model, f)

In [69]:
y_pred = CatBoostRanker_model.predict(ranker_test[cols])
ranker_test = add_score_and_rank(ranker_test, y_pred, 'CatBoostRanker')
ranker_test.head(3)

Unnamed: 0,user_id,item_id,datetime,total_dur,weight,lfm_score,lfm_rank,popular_score,popular_rank,age,...,LGBMRanker_hybrid_score,LGBMRanker_hybrid_rank,XGBRanker_score,XGBRanker_rank,XGBRanker_hybrid_score,XGBRanker_hybrid_rank,CatBoostRanker_score,CatBoostRanker_rank,CatBoostRanker_hybrid_score,CatBoostRanker_hybrid_rank
1679,955,13915.0,2021-07-24,5899.0,100.0,-7.196952,52.0,,,1,...,1.595705,3,1.511064,1,1.511064,1,4.130808,1,4.130808,1
1680,955,13915.0,2021-07-24,5899.0,100.0,-7.196952,52.0,,,1,...,1.595705,4,1.511064,2,1.511064,2,4.130808,2,4.130808,2
1699,955,15404.0,2021-07-23,8171.0,100.0,-7.196952,52.0,,,1,...,1.35047,9,1.156867,9,1.156867,9,3.893623,3,3.893623,3


In [70]:
models_metrics['CatBoostRanker'] = calc_metrics_(ranker_test, 'CatBoostRanker_rank')
pd.DataFrame(models_metrics)[['CatBoostRanker']]

Unnamed: 0,CatBoostRanker
Precision@10,0.014813
recall@10,0.008381
ndcg@10,0.01526
map@10,0.008285
novelty@10,8.292941


## Итоговые метрики

Время обучения:

LGBMRanker     - 6.17 s

XGBRanker      - 9.46 s

CatBoostRanker - 39.8s

In [81]:
models_metrics['metric'] = {'Precision@10':'Precision@10', 'recall@10':'recall@10', 'ndcg@10':'ndcg@10', 'map@10':'map@10', 'novelty@10':'novelty@10'}

In [83]:
models_metrics

{'lfm': {'Precision@10': 0.022124844596565007,
  'recall@10': 0.07392885951675815,
  'ndcg@10': 0.02505424198029947,
  'map@10': 0.030817161793326404,
  'novelty@10': 4.063809779126102},
 'LGBMRanker': {'Precision@10': 0.01482049943978021,
  'recall@10': 0.00838349936851658,
  'ndcg@10': 0.015265075416591146,
  'map@10': 0.008292713588206395,
  'novelty@10': 8.38404552206468},
 'XGBRanker': {'Precision@10': 0.014828173683483496,
  'recall@10': 0.008411502776640626,
  'ndcg@10': 0.015269162826731948,
  'map@10': 0.0082981172142241,
  'novelty@10': 8.206109835207583},
 'CatBoostRanker': {'Precision@10': 0.014812825196076927,
  'recall@10': 0.008381288838778535,
  'ndcg@10': 0.015259638919955056,
  'map@10': 0.008285351908722944,
  'novelty@10': 8.2929405699862},
 'metric': {'Precision@10': 'Precision@10',
  'recall@10': 'recall@10',
  'ndcg@10': 'ndcg@10',
  'map@10': 'map@10',
  'novelty@10': 'novelty@10'}}

In [71]:
pd.DataFrame(models_metrics)[['LGBMRanker', 'XGBRanker', 'CatBoostRanker']]

Unnamed: 0,LGBMRanker,XGBRanker,CatBoostRanker
Precision@10,0.01482,0.014828,0.014813
recall@10,0.008383,0.008412,0.008381
ndcg@10,0.015265,0.015269,0.01526
map@10,0.008293,0.008298,0.008285
novelty@10,8.384046,8.20611,8.292941


In [84]:
pivot_results = (pd.DataFrame(models_metrics)[['LGBMRanker', 'XGBRanker', 'CatBoostRanker', 'metric']].groupby(["metric"], sort=False).agg(["mean"]))
mean_metric_subset = [(metric, "mean") for metric in pivot_results.columns.levels[0]]
(
    pivot_results.style
    .highlight_min(subset=mean_metric_subset, color='coral', axis=1)
    .highlight_max(subset=mean_metric_subset, color='green', axis=1)
)

Unnamed: 0_level_0,LGBMRanker,XGBRanker,CatBoostRanker
Unnamed: 0_level_1,mean,mean,mean
metric,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Precision@10,0.01482,0.014828,0.014813
recall@10,0.008383,0.008412,0.008381
ndcg@10,0.015265,0.015269,0.01526
map@10,0.008293,0.008298,0.008285
novelty@10,8.384046,8.20611,8.292941


Вывод:

XGBRanker работает быстрее и выигрывает по большей части метрик, не сильно отставая от novelty LGBMRanker-а