### 0. Loading data and importing libraries

In [None]:
!mkdir data
!mkdir data / baseline_catboost

!wget https: // storage.yandexcloud.net / datasouls-ods / materials / 0433a4ca / transactions.zip -P data
!wget https: // storage.yandexcloud.net / datasouls-ods / materials / 0554f0cf / clickstream.zip -P data
!wget https: // storage.yandexcloud.net / datasouls-ods / materials / acfacf11 / train_matching.csv -P data
!wget https: // storage.yandexcloud.net / datasouls-ods / materials / b949c04c / mcc_codes.csv -P data
!wget https: // storage.yandexcloud.net / datasouls-ods / materials / 705abbab / click_categories.csv -P data
!wget https: // storage.yandexcloud.net / datasouls-ods / materials / e33f2201 / currency_rk.csv -P data
!wget https: // storage.yandexcloud.net / datasouls-ods / materials / b99fed70 / puzzle.csv -P data
!wget https: // storage.yandexcloud.net / datasouls-ods / materials / f76e8087 / sample_submission.csv -P data
!wget https: // storage.yandexcloud.net / datasouls-ods / materials / 24687252 / baseline_catboost.zip -P data

!unzip data / transactions.zip -d data
!unzip data / clickstream.zip -d data
!unzip data / baseline_catboost.zip -d data / baseline_catboost

!rm data / transactions.zip
!rm data / clickstream.zip
!rm data / baseline_catboost.zip

In [3]:
!pip install catboost





In [1]:
import numpy as np
import pandas as pd
import sys
from catboost import CatBoostRanker
import catboost
from sklearn.preprocessing import LabelEncoder
from catboost import Pool
import multiprocessing
from psutil import virtual_memory

data, output_path = sys.argv[1:]
input_folder = 'data'
ram_gb = round(virtual_memory().total / 1024 ** 3, 1)

print(data)
print(output_path)
print(catboost.__version__, np.__version__, pd.__version__)
print('CPU:', multiprocessing.cpu_count())
print('RAM GB:', ram_gb)

-f
C:\Users\Romanov\AppData\Roaming\jupyter\runtime\kernel-b4ed262e-e037-4920-acd9-baa62cc37536.json
1.0.4 1.20.3 1.3.5
CPU: 12
RAM GB: 31.9


### 1. Read and preprocess data

### 1.1 Clickstream

In [2]:
all_dicts = {}

clickstream = pd.read_csv(r'data\clickstream.csv')
clickstream['timestamp'] = pd.to_datetime(clickstream['timestamp'])

# чтобы уменьшить объём файла, закодируем user_id числами
all_dicts['rtk_le'] = LabelEncoder().fit(clickstream['user_id'])
clickstream['user_id'] = all_dicts['rtk_le'].transform(clickstream['user_id']) + 1
clickstream_dtypes = {'user_id': np.int16, 'cat_id': np.int16, 'new_uid': np.int32}
clickstream = clickstream.astype(clickstream_dtypes)
clickstream['hour'] = clickstream['timestamp'].dt.hour

In [3]:
# Было
clickstream.head()

Unnamed: 0,user_id,cat_id,timestamp,new_uid,hour
0,1,165,2021-01-30 20:08:12,1873448,20
1,1,165,2021-01-31 20:06:29,1873448,20
2,1,308,2021-01-31 20:12:00,1873448,20
3,1,931,2021-01-31 22:12:00,1873448,22
4,1,931,2021-02-01 16:57:00,1873448,16


In [4]:
# Стало
clickstream.head()

Unnamed: 0,user_id,cat_id,timestamp,new_uid,hour
0,1,165,2021-01-30 20:08:12,1873448,20
1,1,165,2021-01-31 20:06:29,1873448,20
2,1,308,2021-01-31 20:12:00,1873448,20
3,1,931,2021-01-31 22:12:00,1873448,22
4,1,931,2021-02-01 16:57:00,1873448,16


### 1.2 Transactions

In [5]:
transactions = pd.read_csv('data/transactions.csv')
transactions['transaction_dttm'] = pd.to_datetime(transactions['transaction_dttm'])
transactions['hour'] = transactions['transaction_dttm'].dt.hour

# Аналогично, чтобы уменьшить объём файла закодируем user_id числами
all_dicts['bank_le'] = LabelEncoder().fit(transactions['user_id'])
transactions['user_id'] = all_dicts['bank_le'].transform(transactions['user_id']) + 1
transactions_dtypes = {'user_id': np.int16, 'mcc_code': np.int16, 'currency_rk': np.int8}
transactions = transactions.astype(transactions_dtypes)

In [6]:
# Было
transactions.head()

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm,hour
0,1,5411,48,-361.0723,2020-08-03 08:05:23,8
1,1,5499,48,-137.31398,2020-08-05 01:27:40,1
2,1,5499,48,-138.84981,2020-08-05 03:28:11,3
3,1,4829,48,-309.47653,2020-08-06 00:36:29,0
4,1,5411,48,-133.4737,2020-08-09 00:30:13,0


In [7]:
# Стало
transactions.head()

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm,hour
0,1,5411,48,-361.0723,2020-08-03 08:05:23,8
1,1,5499,48,-137.31398,2020-08-05 01:27:40,1
2,1,5499,48,-138.84981,2020-08-05 03:28:11,3
3,1,4829,48,-309.47653,2020-08-06 00:36:29,0
4,1,5411,48,-133.4737,2020-08-09 00:30:13,0


### 1.3 Other files

In [8]:
puzzle = pd.read_csv('data/puzzle.csv')
print(puzzle.shape)
puzzle.head(2)

puzzle['bank'] = all_dicts['bank_le'].transform(puzzle['bank']) + 1
puzzle['rtk'] = all_dicts['rtk_le'].transform(puzzle['rtk']) + 1

train = pd.read_csv('data/train_matching.csv')
print(train.shape)
train.head(2)

train['bank'] = all_dicts['bank_le'].transform(train['bank']) + 1
train.loc[train.rtk == '0', 'rtk'] = 0
train.loc[train.rtk != 0, 'rtk'] = all_dicts['rtk_le'].transform(train.loc[train.rtk != 0, 'rtk']) + 1

(4952, 2)
(17581, 2)


In [9]:
puzzle.head()

Unnamed: 0,bank,rtk
0,9720,6550
1,3027,15496
2,20887,957
3,1517,4508
4,12223,14723


In [10]:
train.head()

Unnamed: 0,bank,rtk
0,2091,17319
1,6347,9604
2,21724,13623
3,6456,7502
4,4866,14760


## 2. Делаем эмбеддинги на основе часов проведённой транзакции и часов в кликстриме

### Транзакции:

In [11]:
tr = pd.pivot_table\
    (transactions, index='user_id', columns='hour', values='transaction_dttm', aggfunc='count').fillna(0)
tr.head()

hour,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,83.0,90.0,80.0,93.0,73.0,63.0,38.0,20.0,16.0,7.0,...,1.0,1.0,0.0,0.0,0.0,36.0,51.0,57.0,60.0,69.0
2,4.0,12.0,25.0,29.0,31.0,32.0,34.0,26.0,35.0,46.0,...,77.0,61.0,53.0,22.0,8.0,5.0,0.0,0.0,0.0,1.0
3,9.0,16.0,26.0,49.0,72.0,83.0,85.0,83.0,63.0,84.0,...,51.0,19.0,9.0,5.0,4.0,4.0,1.0,2.0,0.0,2.0
4,2.0,1.0,1.0,4.0,3.0,10.0,9.0,4.0,5.0,3.0,...,12.0,11.0,15.0,10.0,6.0,74.0,225.0,226.0,77.0,0.0
5,39.0,48.0,38.0,26.0,21.0,36.0,67.0,86.0,115.0,85.0,...,59.0,26.0,14.0,7.0,20.0,23.0,23.0,3.0,0.0,5.0


In [12]:
tr_sv = pd.pivot_table\
    (transactions, index='user_id', columns='hour', values='transaction_amt', aggfunc='count').fillna(0)

In [13]:
tr_sv.head()  # к-во транзакций в каждый час

hour,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,83.0,90.0,80.0,93.0,73.0,63.0,38.0,20.0,16.0,7.0,...,1.0,1.0,0.0,0.0,0.0,36.0,51.0,57.0,60.0,69.0
2,4.0,12.0,25.0,29.0,31.0,32.0,34.0,26.0,35.0,46.0,...,77.0,61.0,53.0,22.0,8.0,5.0,0.0,0.0,0.0,1.0
3,9.0,16.0,26.0,49.0,72.0,83.0,85.0,83.0,63.0,84.0,...,51.0,19.0,9.0,5.0,4.0,4.0,1.0,2.0,0.0,2.0
4,2.0,1.0,1.0,4.0,3.0,10.0,9.0,4.0,5.0,3.0,...,12.0,11.0,15.0,10.0,6.0,74.0,225.0,226.0,77.0,0.0
5,39.0,48.0,38.0,26.0,21.0,36.0,67.0,86.0,115.0,85.0,...,59.0,26.0,14.0,7.0,20.0,23.0,23.0,3.0,0.0,5.0


In [14]:
tr_sv['summs'] = tr_sv.sum(axis=1)
for i in tr_sv.columns[:-1]:
    tr_sv[i] /= tr_sv['summs']
tr_sv.columns = ['trans_h_' + str(i) for i in tr_sv.columns]

In [15]:
tr_sv.head()  # какая часть всех транзакций проводилась в каждый отельный час

Unnamed: 0_level_0,trans_h_0,trans_h_1,trans_h_2,trans_h_3,trans_h_4,trans_h_5,trans_h_6,trans_h_7,trans_h_8,trans_h_9,...,trans_h_15,trans_h_16,trans_h_17,trans_h_18,trans_h_19,trans_h_20,trans_h_21,trans_h_22,trans_h_23,trans_h_summs
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.098225,0.106509,0.094675,0.110059,0.086391,0.074556,0.04497,0.023669,0.018935,0.008284,...,0.001183,0.0,0.0,0.0,0.042604,0.060355,0.067456,0.071006,0.081657,845.0
2,0.005348,0.016043,0.033422,0.03877,0.041444,0.042781,0.045455,0.034759,0.046791,0.061497,...,0.081551,0.070856,0.029412,0.010695,0.006684,0.0,0.0,0.0,0.001337,748.0
3,0.009288,0.016512,0.026832,0.050568,0.074303,0.085655,0.087719,0.085655,0.065015,0.086687,...,0.019608,0.009288,0.00516,0.004128,0.004128,0.001032,0.002064,0.0,0.002064,969.0
4,0.002766,0.001383,0.001383,0.005533,0.004149,0.013831,0.012448,0.005533,0.006916,0.004149,...,0.015214,0.020747,0.013831,0.008299,0.102351,0.311203,0.312586,0.106501,0.0,723.0
5,0.037901,0.046647,0.036929,0.025267,0.020408,0.034985,0.065112,0.083576,0.111759,0.082604,...,0.025267,0.013605,0.006803,0.019436,0.022352,0.022352,0.002915,0.0,0.004859,1029.0


### Кликстрим:

In [16]:
cl = pd.pivot_table(clickstream, index='user_id', columns='hour', values='cat_id', aggfunc='count').fillna(0)
cl.head()

hour,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,14.0,10.0,74.0,150.0,168.0,134.0,91.0,140.0,235.0,223.0,...,481.0,594.0,734.0,1163.0,1866.0,2177.0,1941.0,1158.0,431.0,79.0
2,26.0,30.0,63.0,102.0,124.0,194.0,227.0,216.0,202.0,152.0,...,324.0,427.0,550.0,553.0,545.0,490.0,424.0,264.0,106.0,40.0
3,33.0,28.0,64.0,115.0,119.0,138.0,207.0,265.0,326.0,356.0,...,418.0,473.0,477.0,471.0,461.0,488.0,426.0,331.0,188.0,105.0
4,126.0,112.0,118.0,189.0,228.0,209.0,246.0,429.0,555.0,506.0,...,478.0,380.0,288.0,196.0,123.0,63.0,62.0,74.0,83.0,127.0
5,53.0,148.0,289.0,335.0,298.0,277.0,269.0,285.0,311.0,272.0,...,420.0,518.0,496.0,428.0,194.0,64.0,22.0,4.0,7.0,5.0


In [17]:
cl_sv = pd.pivot_table(clickstream, index='user_id', columns='hour', values='timestamp', aggfunc='count').fillna(0)

In [18]:
cl_sv.head()  # количество кликов каждый отдельный час

hour,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,14.0,10.0,74.0,150.0,168.0,134.0,91.0,140.0,235.0,223.0,...,481.0,594.0,734.0,1163.0,1866.0,2177.0,1941.0,1158.0,431.0,79.0
2,26.0,30.0,63.0,102.0,124.0,194.0,227.0,216.0,202.0,152.0,...,324.0,427.0,550.0,553.0,545.0,490.0,424.0,264.0,106.0,40.0
3,33.0,28.0,64.0,115.0,119.0,138.0,207.0,265.0,326.0,356.0,...,418.0,473.0,477.0,471.0,461.0,488.0,426.0,331.0,188.0,105.0
4,126.0,112.0,118.0,189.0,228.0,209.0,246.0,429.0,555.0,506.0,...,478.0,380.0,288.0,196.0,123.0,63.0,62.0,74.0,83.0,127.0
5,53.0,148.0,289.0,335.0,298.0,277.0,269.0,285.0,311.0,272.0,...,420.0,518.0,496.0,428.0,194.0,64.0,22.0,4.0,7.0,5.0


In [19]:
cl_sv['summs'] = cl_sv.sum(axis=1)
for i in cl_sv.columns[:-1]:
    cl_sv[i] /= cl_sv['summs']
cl_sv.columns = ['click_h_' + str(i) for i in cl_sv.columns]

In [20]:
cl_sv.head()  # сколько всего кликов и какая часть кликов была сделана в каждый отдельный час

Unnamed: 0_level_0,click_h_0,click_h_1,click_h_2,click_h_3,click_h_4,click_h_5,click_h_6,click_h_7,click_h_8,click_h_9,...,click_h_15,click_h_16,click_h_17,click_h_18,click_h_19,click_h_20,click_h_21,click_h_22,click_h_23,click_h_summs
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.001061,0.000758,0.005607,0.011365,0.012729,0.010153,0.006895,0.010608,0.017806,0.016896,...,0.045007,0.055614,0.088119,0.141385,0.164949,0.147068,0.087741,0.032656,0.005986,13198.0
2,0.004559,0.00526,0.011047,0.017885,0.021743,0.034017,0.039804,0.037875,0.03542,0.026653,...,0.074873,0.09644,0.096967,0.095564,0.08592,0.074347,0.046291,0.018587,0.007014,5703.0
3,0.004788,0.004063,0.009286,0.016686,0.017266,0.020023,0.030035,0.03845,0.047301,0.051654,...,0.06863,0.069211,0.06834,0.066889,0.070807,0.061811,0.048027,0.027278,0.015235,6892.0
4,0.019833,0.017629,0.018574,0.02975,0.035889,0.032898,0.038722,0.067527,0.08736,0.079647,...,0.059814,0.045333,0.030852,0.019361,0.009917,0.009759,0.011648,0.013065,0.019991,6353.0
5,0.008656,0.024171,0.047199,0.054712,0.048669,0.045239,0.043933,0.046546,0.050792,0.044423,...,0.084599,0.081006,0.0699,0.031684,0.010452,0.003593,0.000653,0.001143,0.000817,6123.0


### 3. Сделаем эмбеддинги как предлагается в базовом решении

In [21]:
# CLICKSTREAM
clickstream_embed = clickstream.pivot_table(index='user_id',
                                            values=['timestamp'],
                                            columns=['cat_id'],
                                            aggfunc=['count']).fillna(
    0)  # считаем количество дат посещений каждой категории сайтов
clickstream_embed.columns = [f'rtk_{str(i[0])}-{str(i[2])}' for i in clickstream_embed.columns]
clickstream_embed.loc[0] = np.empty(len(clickstream_embed.columns))
clickstream_embed.head()  # Сколько посещений каждой категории сайта?

Unnamed: 0_level_0,rtk_count-1,rtk_count-2,rtk_count-3,rtk_count-8,rtk_count-11,rtk_count-12,rtk_count-13,rtk_count-14,rtk_count-15,rtk_count-19,...,rtk_count-1819,rtk_count-1840,rtk_count-1848,rtk_count-1849,rtk_count-1850,rtk_count-1853,rtk_count-1856,rtk_count-1857,rtk_count-1858,rtk_count-1861
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,33.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,52.0,0.0,0.0,0.0,0.0,233.0,0.0,1.0,41.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,278.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,94.0,0.0,11.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# TRANSACTIONS
bankclient_embed = transactions.pivot_table(index='user_id',
                                            values=['transaction_amt'],
                                            columns=['mcc_code'],
                                            aggfunc=['sum', 'mean', 'count']).fillna(0)
bankclient_embed.columns = [f'bank_{str(i[0])}-{str(i[2])}' for i in bankclient_embed.columns]
bankclient_embed.head()  # Сумма транзакций, средняя сумма тразакций по категории, количество транзакций в категории

Unnamed: 0_level_0,bank_sum--1,bank_sum-742,bank_sum-763,bank_sum-780,bank_sum-1520,bank_sum-1711,bank_sum-1731,bank_sum-1740,bank_sum-1750,bank_sum-1761,...,bank_count-8931,bank_count-8999,bank_count-9211,bank_count-9222,bank_count-9223,bank_count-9311,bank_count-9399,bank_count-9402,bank_count-9405,bank_count-9406
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3075.678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
2,1795.567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
3,2329993.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
4,1648119.0,-1455.3394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,0.0,0.0
5,100721.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
bankclient_embed.fillna(0, inplace=True)
clickstream_embed.fillna(0, inplace=True)

print(bankclient_embed.memory_usage().sum() / 1024 / 1024, 'Mb', clickstream_embed.memory_usage().sum() / 1024 / 1024,
      'Mb')
dtype = pd.SparseDtype(np.int32, fill_value=0)
bankclient_embed = bankclient_embed.astype(dtype)
clickstream_embed = clickstream_embed.astype(dtype)
print(bankclient_embed.memory_usage().sum() / 1024 / 1024, 'Mb', clickstream_embed.memory_usage().sum() / 1024 / 1024,
      'Mb')

199.2473373413086 Mb 60.33685302734375 Mb
24.50006103515625 Mb 6.69207763671875 Mb


## 4. Обучение модели

In [31]:
# Соберём обучающую выборку, где на 1 правильный мэтч будет приходиться 15 неправильных
k = 35
cor_dict = train.set_index('bank')['rtk'].to_dict()

train_bank_ids = train[(train.rtk != 0)]['bank']
train_rtk_ids = train[train.bank.isin(train_bank_ids)]['rtk'].drop_duplicates()
df_train = pd.DataFrame(train_bank_ids, columns=['bank'])
df_train['rtk'] = df_train['bank'].apply(
    lambda x: [cor_dict[x]] + train_rtk_ids.sample(k, random_state=x).values.tolist())

df_train = df_train.explode('rtk')

train['bank+rtk'] = train['bank'].astype('str') + '_' + train['rtk'].astype('str')
df_train['bank+rtk'] = df_train['bank'].astype('str') + '_' + df_train['rtk'].astype('str')
df_train['target'] = df_train['bank+rtk'].isin(train['bank+rtk']).astype('int')

df_train.drop_duplicates('bank+rtk', inplace=True)
df_train.reset_index(inplace=True, drop=True)

print(df_train.shape, df_train['target'].mean())
print(df_train.bank.nunique(), df_train.rtk.nunique(), df_train['bank+rtk'].nunique())

(528118, 4) 0.027779776489345185
14671 14671 528118


In [25]:
df_train.head()

Unnamed: 0,bank,rtk,bank+rtk,target
0,2091,17319,2091_17319,1
1,2091,13096,2091_13096,0
2,2091,4368,2091_4368,0
3,2091,13496,2091_13496,0
4,2091,338,2091_338,0


In [26]:
X_train = df_train.merge(bankclient_embed, how='left', left_on='bank', right_index=True
                         ).merge(clickstream_embed, how='left', left_on='rtk', right_index=True
                                 ).merge(cl_sv, how='left', left_on='rtk', right_index=True
                                         ).merge(tr_sv, how='left', left_on='bank', right_index=True
                                                 ).fillna(0)
X_train.head()

Unnamed: 0,bank,rtk,bank+rtk,target,bank_sum--1,bank_sum-742,bank_sum-763,bank_sum-780,bank_sum-1520,bank_sum-1711,...,trans_h_15,trans_h_16,trans_h_17,trans_h_18,trans_h_19,trans_h_20,trans_h_21,trans_h_22,trans_h_23,trans_h_summs
0,2091,17319,2091_17319,1,641825,-1482,0,0,0,0,...,0.011561,0.007948,0.008671,0.007948,0.114884,0.297688,0.284682,0.115607,0.002168,1384.0
1,2091,13096,2091_13096,0,641825,-1482,0,0,0,0,...,0.011561,0.007948,0.008671,0.007948,0.114884,0.297688,0.284682,0.115607,0.002168,1384.0
2,2091,4368,2091_4368,0,641825,-1482,0,0,0,0,...,0.011561,0.007948,0.008671,0.007948,0.114884,0.297688,0.284682,0.115607,0.002168,1384.0
3,2091,13496,2091_13496,0,641825,-1482,0,0,0,0,...,0.011561,0.007948,0.008671,0.007948,0.114884,0.297688,0.284682,0.115607,0.002168,1384.0
4,2091,338,2091_338,0,641825,-1482,0,0,0,0,...,0.011561,0.007948,0.008671,0.007948,0.114884,0.297688,0.284682,0.115607,0.002168,1384.0


In [27]:
X_train.target = X_train.target.astype(float)

In [28]:
cat_features = list(X_train.select_dtypes(["category"]))

In [29]:
categorical_cols = ['bank', 'rtk', 'bank+rtk', 'target']

train_pool = Pool(data=X_train.drop(['bank', 'rtk', 'bank+rtk', 'target'], axis=1),
                  label=X_train['target'],
                  cat_features=cat_features,
                  group_id=X_train['bank']
                  )

### Metrics calculation and graph plotting

In [39]:
def train_cb(train_pool):
    default_parameters = {
        'custom_metric': ["NDCG", "QueryAUC", "PFound", 'AverageGain:top=10'],
        'random_seed': 42,
        "loss_function": "YetiRank",
        "train_dir": "YetiRank",
        "metric_period": 50,
        "bootstrap_type": "Bayesian",
        "bagging_temperature": 0.66,

    }

    parameters = {
        'iterations': 8500,
        'task_type': "CPU",
        **default_parameters
    }

    ranker = CatBoostRanker(**parameters)
    ranker.fit(train_pool)
    return ranker

In [40]:
print("Fitting model...")
model = train_cb(train_pool)

Fitting model...
0:	total: 2.15s	remaining: 3h 6m 8s
50:	total: 1m 47s	remaining: 3h 11s
100:	total: 3m 31s	remaining: 2h 57m 54s
150:	total: 5m 16s	remaining: 2h 56m 12s
200:	total: 6m 59s	remaining: 2h 53m 43s
250:	total: 8m 43s	remaining: 2h 51m 53s
300:	total: 10m 25s	remaining: 2h 49m 35s
350:	total: 12m 8s	remaining: 2h 47m 40s
400:	total: 13m 54s	remaining: 2h 46m 31s
450:	total: 15m 37s	remaining: 2h 44m 30s
500:	total: 17m 19s	remaining: 2h 42m 29s
550:	total: 19m 2s	remaining: 2h 40m 42s
600:	total: 20m 44s	remaining: 2h 38m 42s
650:	total: 22m 23s	remaining: 2h 36m 27s
700:	total: 24m 2s	remaining: 2h 34m 18s
750:	total: 25m 41s	remaining: 2h 32m 10s
800:	total: 27m 22s	remaining: 2h 30m 20s
850:	total: 29m 4s	remaining: 2h 28m 34s
900:	total: 30m 48s	remaining: 2h 26m 58s
950:	total: 32m 31s	remaining: 2h 25m 18s
1000:	total: 34m 10s	remaining: 2h 23m 20s
1050:	total: 35m 49s	remaining: 2h 21m 23s
1100:	total: 37m 27s	remaining: 2h 19m 27s
1150:	total: 39m 6s	remaining: 2h 

In [41]:
model.save_model('models/open_sol_2504.cbm')