In [None]:
from datetime import date, timedelta
from pathlib import Path

import polars as pl

## Ranker

In [None]:
# Full user-action log; it is filtered further down to the history window that precedes TEST_START.
user_actions_full = pl.read_parquet('../data/user_actions_full')

In [47]:
# Load the interactions used to build ranker labels and report the date range they cover.
ranker_data = pl.read_parquet('../data/user_actions_7_days_ranker')
ranker_data.select(
    pl.col('date').max().alias('max_date'),
    pl.col('date').min().alias('min_date'),
)

max_date,min_date
date,date
2024-06-30,2024-06-23


In [48]:
# First day of the ranker window (equals the min_date reported for ranker_data);
# only actions strictly before this date are used as feature history.
TEST_START = date(2024, 6, 23)

In [49]:
# Preview the raw ranker interactions (user, product, date, action type).
ranker_data

user_id,product_id,date,action_type
i32,i64,date,str
4453096,621177276,2024-06-27,"""view"""
2346229,390557159,2024-06-23,"""view"""
9589535,146396320,2024-06-23,"""view"""
2085760,149293540,2024-06-30,"""view"""
1289097,851213293,2024-06-27,"""view"""
…,…,…,…
9786236,138860210,2024-06-28,"""order"""
6661721,147740723,2024-06-30,"""view"""
8865354,261366874,2024-06-30,"""view"""
2391586,252410311,2024-06-26,"""view"""


In [50]:
# Number of interactions per action type in the ranker window.
(
    ranker_data
    .group_by('action_type')
    .agg(pl.col('product_id').count())
)

action_type,product_id
str,u32
"""click""",3240738
"""view""",24461070
"""order""",2103547
"""favorite""",193474
"""to_cart""",3912035


In [51]:
# Relevance weight assigned to each action type: a view contributes nothing,
# an order contributes the most.
map_target = dict(
    view=0,
    click=0.01,
    favorite=0.1,
    to_cart=0.3,
    order=0.59,
)

# The same mapping as a two-column frame so it can be joined onto the interaction log.
map_target_df = pl.DataFrame(
    data={
        'action_type': list(map_target.keys()),
        'target': list(map_target.values()),
    },
    schema={'action_type': pl.String, 'target': pl.Float32},
)

In [52]:
# create targets
ranker_data_target = (
    ranker_data
    .join(map_target_df, on='action_type')
    .select('user_id', 'product_id', 'target')
)

In [53]:
# Preview the interactions with their per-action target weight.
ranker_data_target

user_id,product_id,target
i32,i64,f32
4453096,621177276,0.0
2346229,390557159,0.0
9589535,146396320,0.0
2085760,149293540,0.0
1289097,851213293,0.0
…,…,…
9786236,138860210,0.59
6661721,147740723,0.0
8865354,261366874,0.0
2391586,252410311,0.0


In [54]:
# Users that have at least one interaction with a positive weight (anything
# stronger than a view) in the ranker window.
# Sorting by user_id before sampling makes the seeded sample reproducible,
# because group_by does not promise a stable row order.
pos_users_all = (
    ranker_data_target
    .group_by('user_id')
    .agg(pl.max('target').alias('max_target'))
    .filter(pl.col('max_target') > 0)
    .sort(by='user_id')
)

# Sample at most 100k users. Capping at the frame height avoids the error
# `sample` raises when asked for more rows than exist (sampling without replacement).
pos_users = pos_users_all.sample(min(100_000, pos_users_all.height), seed=0)

In [55]:
# Keep only the sampled users and collapse repeated interactions: the label of a
# (user, product) pair is the sum of the weights of all its actions in the window.
ranker_data_target_filtered = (
    ranker_data_target
    .join(pos_users, on='user_id', how='inner')
    .group_by(['user_id', 'product_id'])
    .agg(pl.col('target').sum().alias('target'))
)
ranker_data_target_filtered.shape

(2706980, 3)

In [56]:
# Inspect the aggregated labels ordered by user (display only; the frame itself is not modified).
ranker_data_target_filtered.sort('user_id')

user_id,product_id,target
i32,i64,f32
54,147887697,0.01
128,1562705601,0.0
128,149724906,0.89
128,1023814617,0.0
170,1528912372,0.0
…,…,…
11184165,686824985,0.0
11184165,138423936,0.01
11184165,583497558,0.0
11184165,312129116,0.89


In [57]:
# Echo the split date that bounds the feature history built next.
TEST_START

datetime.date(2024, 6, 23)

In [58]:
# Feature history: actions from the 90 days (3 * 30) immediately before TEST_START.
# The upper bound is exclusive, so nothing from the ranker window itself is included.
history_start = TEST_START - timedelta(days=3 * 30)
data = user_actions_full.filter(
    (pl.col('date') < TEST_START) & (pl.col('date') >= history_start)
)
data.shape

(102174245, 4)

In [59]:
# Product catalogue (name, brand, type, category); joined onto the training rows further down.
product_information_full = pl.read_parquet('../data/product_information_full')
product_information_full

product_id,name,brand,type,category_id,category_name
i64,str,str,str,i32,str
160839072,"""CeraVe Смягчающий крем для сух…","""CeraVe""","""Гель для ухода за кожей""",38,"""Сыворотки для лица"""
161689127,"""Yves Rocher / Ив Роше / Увлажн…","""Yves Rocher France""","""Гель для ухода за кожей""",38,"""Сыворотки для лица"""
221508445,"""Bioderma Эликсир для ухода за …","""Bioderma""","""Эликсир для ухода за кожей""",38,"""Сыворотки для лица"""
309017861,"""ART&FACT. / Сыворотка для лица…","""ART&FACT.""","""Сыворотка для лица""",38,"""Сыворотки для лица"""
793710195,"""Breylee Сыворотка для лица Ант…","""Breylee""","""Сыворотка для лица""",38,"""Сыворотки для лица"""
…,…,…,…,…,…
1154315599,"""Сушилка для овощей и фруктов 3…","""Великие реки""","""Дегидратор""",284,"""Сушилки для овощей"""
1196912369,"""GFGRIL Электрическая сушилка д…","""GFGRIL""","""Дегидратор""",284,"""Сушилки для овощей"""
1255681315,"""Дегидратор сушилка для овощей …","""Marta""","""Дегидратор""",284,"""Сушилки для овощей"""
483756641,"""Дегидратор Kitfort КТ-1915-1, …","""Kitfort""","""Дегидратор""",284,"""Сушилки для овощей"""


In [60]:
# Feature frames keyed by '<action>_ui_features' / '<action>_i_features'; filled by the loop in the next cell.
feature_dfs = {}

In [61]:
# Historical interaction counts for every non-view action type:
#   ui_num_<action> - rows where this user performed the action on this product
#   i_num_<action>  - rows where anyone performed the action on this product
#                     (a row count, not a count of distinct users)
for suf in ('click', 'favorite', 'to_cart', 'order'):
    # Filter once and reuse the subset for both aggregations.
    actions_of_type = data.filter(pl.col('action_type') == suf)

    feature_dfs[f'{suf}_ui_features'] = (
        actions_of_type
        .group_by(['user_id', 'product_id'])
        .agg(pl.col('product_id').count().alias(f'ui_num_{suf}'))
    )
    feature_dfs[f'{suf}_i_features'] = (
        actions_of_type
        .group_by(['product_id'])
        .agg(pl.col('user_id').count().alias(f'i_num_{suf}'))
    )

In [62]:
# Left-join every feature frame onto the labelled (user, product) pairs.
# Pairs with no history for a given action keep a null in that column.
# The join keys are chosen from the key's suffix, not from a bare 'ui'
# substring test, which would misroute any key whose action name happens
# to contain "ui".
ranker_data_target_filtered_with_features = ranker_data_target_filtered
for key, feature_df in feature_dfs.items():
    if key.endswith('_ui_features'):
        join_keys = ['user_id', 'product_id']
    elif key.endswith('_i_features'):
        join_keys = ['product_id']
    else:
        # Fail loudly instead of silently joining on the wrong keys.
        raise ValueError(f'unexpected feature frame key: {key!r}')
    ranker_data_target_filtered_with_features = (
        ranker_data_target_filtered_with_features
        .join(feature_df, on=join_keys, how='left')
    )

In [63]:
# The feature frames have been joined in; drop the dict (presumably to free memory).
# Re-running this cell alone raises NameError because the name is already gone.
del feature_dfs

In [64]:
# The filtered history is not referenced in the cells that follow; drop it (presumably to free memory).
del data

In [65]:
# Check the shape after adding the count features.
ranker_data_target_filtered_with_features.shape

(2706980, 11)

In [66]:
# Preview labels together with the ui_/i_ count features (nulls mean no matching history).
ranker_data_target_filtered_with_features

user_id,product_id,target,ui_num_click,i_num_click,ui_num_favorite,i_num_favorite,ui_num_to_cart,i_num_to_cart,ui_num_order,i_num_order
i32,i64,f32,u32,u32,u32,u32,u32,u32,u32,u32
1020859,965580704,0.0,,10001,,711,,1322,,331
7223791,240480230,0.3,,1387,,68,,771,,245
10875382,1525488864,0.0,,5256,,504,,8537,,3001
4719515,303734515,0.59,,3304,,109,1,16441,11,6307
3239820,1506192518,0.0,,2731,,355,,2233,,669
…,…,…,…,…,…,…,…,…,…,…
7718163,142120787,0.0,1,8798,,401,1,6916,,2158
126513,296399078,0.3,,3690,,122,,3860,,1175
9209518,1074808936,0.0,,3879,,425,,12504,,3997
3856441,261375229,0.0,,905,,50,,4710,,1862


In [68]:
# Add catalogue attributes (brand, type, category) to every training row.
# The join is an inner join, so rows whose product is missing from the catalogue
# are dropped here.
# NOTE(review): switching to how='left' would keep those rows but leave nulls in
# brand/type/category_id; those would need `fill_null` (not `fill_nan`, which only
# targets floating-point NaN).
# NOTE(review): this cell overwrites its own input, so re-running it joins the
# catalogue columns a second time.
product_attributes = product_information_full.select(
    ['product_id', 'brand', 'type', 'category_id']
)
ranker_data_target_filtered_with_features = ranker_data_target_filtered_with_features.join(
    product_attributes,
    on=['product_id'],
    how='inner',
)

In [70]:
# Shape after the catalogue join.
ranker_data_target_filtered_with_features.shape

(2703911, 14)

In [69]:
# Preview the final training table including brand, type and category_id.
ranker_data_target_filtered_with_features

user_id,product_id,target,ui_num_click,i_num_click,ui_num_favorite,i_num_favorite,ui_num_to_cart,i_num_to_cart,ui_num_order,i_num_order,brand,type,category_id
i32,i64,f32,u32,u32,u32,u32,u32,u32,u32,u32,str,str,i32
1020859,965580704,0.0,,10001,,711,,1322,,331,"""Crazy Getup""","""Комплект постельного белья""",345
7223791,240480230,0.3,,1387,,68,,771,,245,"""Fitstart""","""Хлебцы""",296
10875382,1525488864,0.0,,5256,,504,,8537,,3001,"""Lay's""","""Чипсы""",236
4719515,303734515,0.59,,3304,,109,1,16441,11,6307,"""ЭкоНива""","""Молоко""",420
3239820,1506192518,0.0,,2731,,355,,2233,,669,"""Ozon fresh""","""Замороженные ягоды""",910
…,…,…,…,…,…,…,…,…,…,…,…,…,…
7718163,142120787,0.0,1,8798,,401,1,6916,,2158,"""Индилайт""","""Птица охлажденная""",623
126513,296399078,0.3,,3690,,122,,3860,,1175,"""Городской батон""","""Хлеб""",858
9209518,1074808936,0.0,,3879,,425,,12504,,3997,"""Село Зеленое""","""Мороженое""",413
3856441,261375229,0.0,,905,,50,,4710,,1862,"""Простоквашино""","""Творог""",354


In [71]:
import catboost

In [72]:
# Pandas copy ordered by user, so each user's rows are contiguous when the
# frame is later passed to CatBoost with user_id as the group id.
df = (
    ranker_data_target_filtered_with_features
    .sort(by='user_id')
    .to_pandas()
)
# User-level split: user_id % 10 in 0..7 -> train, 8..9 -> eval.
mask = df['user_id'].mod(10).le(7)

In [73]:
# List the available columns before choosing the model features.
df.columns

Index(['user_id', 'product_id', 'target', 'ui_num_click', 'i_num_click',
       'ui_num_favorite', 'i_num_favorite', 'ui_num_to_cart', 'i_num_to_cart',
       'ui_num_order', 'i_num_order', 'brand', 'type', 'category_id'],
      dtype='object')

In [74]:
# Model features: for each action type, the user-item count followed by the item-level count.
# brand / type / category_id are intentionally not part of this list.
cols = [
    f'{scope}_num_{action}'
    for action in ('click', 'favorite', 'to_cart', 'order')
    for scope in ('ui', 'i')
]

In [75]:
def _build_pool(part):
    """Wrap one split of `df` in a CatBoost Pool, grouped by user.

    Uses the numeric count features in `cols` only; the categorical columns
    (brand, type, category_id) are not passed to the model.
    """
    return catboost.Pool(
        part[cols],
        label=part['target'],
        group_id=part['user_id'],
    )


# Rows selected by `mask` form the training pool, the remaining rows the evaluation pool.
train_pool = _build_pool(df.loc[mask])
eval_pool = _build_pool(df.loc[~mask])

In [76]:
# CatBoost settings for a pairwise ranking model evaluated with NDCG.
# (A Logloss / AUC classification setup was tried as an alternative and is not used.)
params = dict(
    iterations=200,
    thread_count=-1,
    depth=6,
    learning_rate=0.1,
    random_state=1,
    loss_function='YetiRankPairwise',
    eval_metric='NDCG',
    task_type='CPU',
)

In [32]:
# Train the ranker on the train pool; the eval pool is used for metric tracking,
# early stopping and best-iteration selection.
model = catboost.CatBoost(params)
model.fit(
    train_pool, 
    eval_set=eval_pool,
    use_best_model=True,  # keep the iteration with the best eval metric
    verbose=10,  # log every 10th iteration
    early_stopping_rounds=50,  # stop after 50 iterations without improvement
)

0:	test: 0.6282144	best: 0.6282144 (0)	total: 1.34s	remaining: 4m 26s
10:	test: 0.6691653	best: 0.6691653 (10)	total: 14.9s	remaining: 4m 15s
20:	test: 0.6777329	best: 0.6777329 (20)	total: 28s	remaining: 3m 58s
30:	test: 0.6883204	best: 0.6883204 (30)	total: 40.8s	remaining: 3m 42s
40:	test: 0.6919649	best: 0.6919649 (40)	total: 54.5s	remaining: 3m 31s
50:	test: 0.6931478	best: 0.6931614 (49)	total: 1m 7s	remaining: 3m 16s
60:	test: 0.6996091	best: 0.6996091 (60)	total: 1m 20s	remaining: 3m 2s
70:	test: 0.7000428	best: 0.7000428 (70)	total: 1m 33s	remaining: 2m 49s
80:	test: 0.7006474	best: 0.7007887 (78)	total: 1m 46s	remaining: 2m 36s
90:	test: 0.7033825	best: 0.7034330 (87)	total: 1m 59s	remaining: 2m 23s
100:	test: 0.7041027	best: 0.7041027 (100)	total: 2m 12s	remaining: 2m 10s
110:	test: 0.7052497	best: 0.7052497 (110)	total: 2m 25s	remaining: 1m 56s
120:	test: 0.7058822	best: 0.7061790 (117)	total: 2m 37s	remaining: 1m 43s
130:	test: 0.7062165	best: 0.7063451 (127)	total: 2m 50s

<catboost.core.CatBoost at 0x35c870f70>

In [33]:
# Persist the trained ranker under ../models/<name>.bin.
# The directory is created first so the save also works on a fresh checkout
# where ../models does not exist yet.
name = 'ranker_v1'
model_path = Path(f"../models/{name}.bin")
model_path.parent.mkdir(parents=True, exist_ok=True)
model.save_model(str(model_path))

In [34]:
# Per-feature importances computed with the evaluation pool; prettified=True returns a table
# with feature names.
# NOTE(review): negative values suggest a loss-change style importance — confirm the default
# importance type for this loss function.
fi = model.get_feature_importance(eval_pool, prettified=True)
fi.head(50)

Unnamed: 0,Feature Id,Importances
0,ui_num_to_cart,0.016169
1,ui_num_order,0.004055
2,ui_num_click,0.003388
3,i_num_favorite,0.00241
4,i_num_click,0.001336
5,ui_num_favorite,0.000278
6,i_num_to_cart,5.4e-05
7,i_num_order,-0.003105


In [77]:
# Echo the trained model object.
model

<catboost.core.CatBoost at 0x35c870f70>

In [78]:
# Sanity check: load the saved model file back into a fresh CatBoost object.
# Relies on `name` defined in the cell that saved the model.
model_load = catboost.CatBoost()
model_load.load_model(f"../models/{name}.bin")

<catboost.core.CatBoost at 0x547b239d0>