In [150]:
import lightgbm as lgb, xgboost as xgb, catboost as cb

for lib in [lgb, xgb, cb]:
    print(lib.__name__, lib.__version__)

lightgbm 4.0.0
xgboost 1.7.6
catboost 1.2.1


In [151]:
%load_ext autoreload
%autoreload 2

import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Посмотрим данные

In [152]:
data = pd.read_csv('data/train.csv')

In [153]:
data.dtypes

order_id                int64
create_time            object
good_id                 int64
price                   int64
utm_medium              int64
utm_source            float64
sessionkey_id           int64
category_id             int64
parent_id               int64
root_id                 int64
model_id                int64
is_moderated            int64
rating_value          float64
rating_count          float64
description_length      int64
goods_qty               int64
pics_qty                int64
model_create_time      object
is_callcenter           int64
dtype: object

In [154]:
for col in data.loc[:, data.dtypes == object].columns:
    data[col] = data[col].astype('category').cat.codes.astype('category') # для catboost

In [155]:
data.dtypes

order_id                 int64
create_time           category
good_id                  int64
price                    int64
utm_medium               int64
utm_source             float64
sessionkey_id            int64
category_id              int64
parent_id                int64
root_id                  int64
model_id                 int64
is_moderated             int64
rating_value           float64
rating_count           float64
description_length       int64
goods_qty                int64
pics_qty                 int64
model_create_time     category
is_callcenter            int64
dtype: object

In [156]:
nan_count = data.isna().sum()

print(nan_count)
print(data.shape[0])

order_id                  0
create_time               0
good_id                   0
price                     0
utm_medium                0
utm_source            10450
sessionkey_id             0
category_id               0
parent_id                 0
root_id                   0
model_id                  0
is_moderated              0
rating_value          70854
rating_count          52982
description_length        0
goods_qty                 0
pics_qty                  0
model_create_time         0
is_callcenter             0
dtype: int64
104595


In [329]:
selected_columns = ['utm_source', 'rating_value', 'rating_count']

In [330]:
min_values = data[selected_columns].min()
max_values = data[selected_columns].max()

for column in selected_columns:
    print(f"Диапазон значений в столбце '{column}': от {min_values[column]} до {max_values[column]}")

Диапазон значений в столбце 'utm_source': от 1.0 до 555.0
Диапазон значений в столбце 'rating_value': от 1.0 до 10.0
Диапазон значений в столбце 'rating_count': от 0.0 до 35.0


In [159]:
data.head(7)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,model_id,is_moderated,rating_value,rating_count,description_length,goods_qty,pics_qty,model_create_time,is_callcenter
0,1269921,61981,9896348,753,5,8.0,123777004,139,133,124,123517,1,5.0,6.0,1204,6,2,0,1
1,1270034,62058,9896348,753,1,2.0,123781654,139,133,124,123517,1,5.0,6.0,1204,6,2,0,0
2,1268272,61012,9896348,753,2,3.0,123591002,139,133,124,123517,1,5.0,6.0,1204,6,2,0,1
3,1270544,62410,9896348,753,1,1.0,123832302,139,133,124,123517,1,5.0,6.0,1204,6,2,0,1
4,1270970,62718,9896348,753,3,56.0,123881603,139,133,124,123517,1,5.0,6.0,1204,6,2,0,0
5,1270999,62740,9896348,753,5,30.0,123883918,139,133,124,123517,1,5.0,6.0,1204,6,2,0,1
6,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,123517,1,5.0,6.0,1204,6,2,0,0


In [160]:
val_size = int(0.3 * (data.shape[0]))
val_idx = np.random.choice(np.arange(data.shape[0]), size=val_size, replace=False)

data_dict = dict()
data_dict['val'] = data.loc[val_idx].reset_index(drop=True)

data_dict['tr'] = data.drop(index=val_idx)
data_dict['tr'].reset_index(drop=True, inplace=True)

for key, df in data_dict.items():
    print(key, 'shape:', df.shape)
        
X_tr, y_tr = data_dict['tr'].iloc[:, :-1], data_dict['tr'].is_callcenter
X_val, y_val = data_dict['val'].iloc[:, :-1], data_dict['val'].is_callcenter

val shape: (31378, 19)
tr shape: (73217, 19)


### Попробуем LightGBM без добавления признаков

In [161]:
lgb_tr = lgb.Dataset(X_tr, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.1,
    'lambda': 1e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(X_val, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=20, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8107
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.622471
[6]	валидация's auc: 0.624613
[9]	валидация's auc: 0.626033
[12]	валидация's auc: 0.628536
[15]	валидация's auc: 0.629658
[18]	валидация's auc: 0.630754
Did not meet early stopping. Best iteration is:
[20]	валидация's auc: 0.631752


Я поперебирал эту и получил, что при 0.3 скор наивысший (хотя в целом несильно отличается - второй знак после запятой)

In [162]:
lgb_tr = lgb.Dataset(X_tr, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.3,
    'lambda': 1e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(X_val, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=20, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8107
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.624907
[6]	валидация's auc: 0.630214
[9]	валидация's auc: 0.632115
Early stopping, best iteration is:
[8]	валидация's auc: 0.632239


Аналогично с лямбдой. Пока дальше перебирать не буду.

In [163]:
lgb_tr = lgb.Dataset(X_tr, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.3,
    'lambda': 5e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(X_val, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=20, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8107
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.624907
[6]	валидация's auc: 0.630214
[9]	валидация's auc: 0.632116
Early stopping, best iteration is:
[8]	валидация's auc: 0.632238


### Добавим признаков из второго датасета

In [164]:
webstat = pd.read_csv('data/t1_webstat.csv')

In [165]:
webstat.dtypes

sessionkey_id              int64
date_time                 object
page_type                  int64
pageview_number            int64
pageview_duration_sec    float64
category_id              float64
model_id                 float64
good_id                  float64
price                    float64
product_in_sale          float64
dtype: object

In [166]:
webstat.head(15)

Unnamed: 0,sessionkey_id,date_time,page_type,pageview_number,pageview_duration_sec,category_id,model_id,good_id,price,product_in_sale
0,122243978,1975-12-18 14:52:38.130,2,28,53.0,1773.0,,,,
1,122243978,1975-12-18 15:02:32.403,5,36,163.0,,,,,
2,122243978,1975-12-18 15:08:11.147,5,40,156.0,,,,,
3,122243992,1975-12-18 14:23:33.177,1,4,565.0,3973.0,26706175.0,45171119.0,8436.0,1.0
4,122243992,1975-12-18 14:32:58.763,1,5,73.0,3973.0,17297250.0,65195750.0,6624.0,1.0
5,122243998,1975-12-18 14:21:46.550,1,2,15.0,1870.0,268894.0,59718865.0,691.0,1.0
6,122244115,1975-12-18 14:20:53.793,7,1,62.0,,,,,
7,122244115,1975-12-18 14:21:55.320,1,2,,3779.0,16507112.0,29666875.0,1051.0,1.0
8,122244260,1975-12-18 14:22:46.657,1,3,77.0,4723.0,522923.0,67362756.0,166.0,1.0
9,122244260,1975-12-18 14:34:44.937,3,18,12.0,,,,,


Первое, что приходит в голову - усреднить продолжительность просмотра страницы, а еще медиану посещенных страниц.

In [167]:
session_agg = webstat.groupby('sessionkey_id', sort=False).agg(
    pageview_duration_sec_mean=('pageview_duration_sec', lambda x: x.dropna().mean() if x.notnull().any() else None),
    page_type_median=('page_type', lambda x: x.median(skipna=True)),
    pageview_number_mean=('pageview_number', lambda x: x.dropna().mean() if x.notnull().any() else None)
)
session_agg

Unnamed: 0_level_0,pageview_duration_sec_mean,page_type_median,pageview_number_mean
sessionkey_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
122243978,75.800000,2.0,21.000
122243992,137.714286,4.5,4.500
122243998,37.500000,8.0,3.000
122244115,62.000000,4.0,1.500
122244260,35.282051,3.0,22.375
...,...,...,...
118720116,192.000000,1.5,1.500
118720198,34.000000,1.5,1.500
118721114,,1.0,1.000
118721222,,1.0,1.000


In [168]:
tr1 = X_tr.merge(session_agg, how='left', on='sessionkey_id', sort=False)
tr1.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,is_moderated,rating_value,rating_count,description_length,goods_qty,pics_qty,model_create_time,pageview_duration_sec_mean,page_type_median,pageview_number_mean
0,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,...,1,5.0,6.0,1204,6,2,0,59.25,3.0,4.8
1,1239674,44451,21892647,742,1,1.0,120616344,139,133,124,...,1,5.0,6.0,1204,6,2,0,170.333333,2.0,4.0
2,1183828,13559,9896348,761,4,2.0,115226312,139,133,124,...,1,5.0,6.0,1204,6,2,0,245.5,3.0,3.0


In [169]:
val1 = X_val.merge(session_agg, how='left', on='sessionkey_id', sort=False)
val1.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,is_moderated,rating_value,rating_count,description_length,goods_qty,pics_qty,model_create_time,pageview_duration_sec_mean,page_type_median,pageview_number_mean
0,1268806,61397,40460833,334,1,2.0,123631342,1200,5674,1183,...,1,5.0,1.0,716,23,4,14225,86.0,3.0,7.454545
1,1319667,91233,44856961,1140,1,1.0,129227524,189,3370,3368,...,1,,,0,8,2,17749,134.5,1.0,4.2
2,1227316,37428,58512612,1154,3,28.0,119373430,5605,5604,1183,...,1,5.0,2.0,1957,4,4,2738,101.653846,2.0,14.0


In [171]:
lgb_tr = lgb.Dataset(tr1, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.3,
    'lambda': 10e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(val1, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=20, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8640
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 21
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.799666
[6]	валидация's auc: 0.80841
[9]	валидация's auc: 0.815104
[12]	валидация's auc: 0.819802
[15]	валидация's auc: 0.820155
[18]	валидация's auc: 0.82421
Did not meet early stopping. Best iteration is:
[20]	валидация's auc: 0.825173


In [172]:
webstat.product_in_sale.unique()

array([nan,  1.])

In [173]:
session_agg2 = webstat.groupby('sessionkey_id', sort=False).agg(
    product_in_sale_prob=('product_in_sale', lambda x: (x.fillna(0).sum() / x.shape[0]) if x.notnull().any() else 0)
)
session_agg2

Unnamed: 0_level_0,product_in_sale_prob
sessionkey_id,Unnamed: 1_level_1
122243978,0.317073
122243992,0.375000
122243998,0.400000
122244115,0.500000
122244260,0.225000
...,...
118720116,0.000000
118720198,0.500000
118721114,1.000000
118721222,1.000000


In [174]:
tr2 = tr1.merge(session_agg2, how='left', on='sessionkey_id', sort=False)
tr2.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,rating_value,rating_count,description_length,goods_qty,pics_qty,model_create_time,pageview_duration_sec_mean,page_type_median,pageview_number_mean,product_in_sale_prob
0,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,...,5.0,6.0,1204,6,2,0,59.25,3.0,4.8,0.4
1,1239674,44451,21892647,742,1,1.0,120616344,139,133,124,...,5.0,6.0,1204,6,2,0,170.333333,2.0,4.0,0.428571
2,1183828,13559,9896348,761,4,2.0,115226312,139,133,124,...,5.0,6.0,1204,6,2,0,245.5,3.0,3.0,0.333333


In [175]:
val2 = val1.merge(session_agg2, how='left', on='sessionkey_id', sort=False)
val2.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,rating_value,rating_count,description_length,goods_qty,pics_qty,model_create_time,pageview_duration_sec_mean,page_type_median,pageview_number_mean,product_in_sale_prob
0,1268806,61397,40460833,334,1,2.0,123631342,1200,5674,1183,...,5.0,1.0,716,23,4,14225,86.0,3.0,7.454545,0.181818
1,1319667,91233,44856961,1140,1,1.0,129227524,189,3370,3368,...,,,0,8,2,17749,134.5,1.0,4.2,0.4
2,1227316,37428,58512612,1154,3,28.0,119373430,5605,5604,1183,...,5.0,2.0,1957,4,4,2738,101.653846,2.0,14.0,0.444444


In [176]:
lgb_tr = lgb.Dataset(tr2, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.3,
    'lambda': 10e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(val2, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=20, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8885
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.808934
[6]	валидация's auc: 0.816931
[9]	валидация's auc: 0.820476
[12]	валидация's auc: 0.826907
[15]	валидация's auc: 0.828207
[18]	валидация's auc: 0.828413
Early stopping, best iteration is:
[16]	валидация's auc: 0.828495


In [177]:
session_agg3 = webstat.groupby('sessionkey_id', sort=False).agg(
    price_mean=('price', lambda x: x.dropna().mean() if x.notnull().any() else None)
)
session_agg3

Unnamed: 0_level_0,price_mean
sessionkey_id,Unnamed: 1_level_1
122243978,746.384615
122243992,7566.666667
122243998,598.500000
122244115,1051.000000
122244260,444.888889
...,...
118720116,
118720198,2239.000000
118721114,60.000000
118721222,717.000000


In [184]:
tr3 = tr2.merge(session_agg3, how='left', on='sessionkey_id', sort=False)
tr3.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,rating_count,description_length,goods_qty,pics_qty,model_create_time,pageview_duration_sec_mean,page_type_median,pageview_number_mean,product_in_sale_prob,price_mean
0,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,...,6.0,1204,6,2,0,59.25,3.0,4.8,0.4,1006.0
1,1239674,44451,21892647,742,1,1.0,120616344,139,133,124,...,6.0,1204,6,2,0,170.333333,2.0,4.0,0.428571,1497.0
2,1183828,13559,9896348,761,4,2.0,115226312,139,133,124,...,6.0,1204,6,2,0,245.5,3.0,3.0,0.333333,761.0


In [185]:
val3 = val2.merge(session_agg3, how='left', on='sessionkey_id', sort=False)
val3.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,rating_count,description_length,goods_qty,pics_qty,model_create_time,pageview_duration_sec_mean,page_type_median,pageview_number_mean,product_in_sale_prob,price_mean
0,1268806,61397,40460833,334,1,2.0,123631342,1200,5674,1183,...,1.0,716,23,4,14225,86.0,3.0,7.454545,0.181818,334.0
1,1319667,91233,44856961,1140,1,1.0,129227524,189,3370,3368,...,,0,8,2,17749,134.5,1.0,4.2,0.4,1140.0
2,1227316,37428,58512612,1154,3,28.0,119373430,5605,5604,1183,...,2.0,1957,4,4,2738,101.653846,2.0,14.0,0.444444,2508.083333


In [198]:
lgb_tr = lgb.Dataset(tr3, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.4,
    'lambda': 10e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(val3, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=20, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9140
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.809791
[6]	валидация's auc: 0.82079
[9]	валидация's auc: 0.824993
[12]	валидация's auc: 0.825843
[15]	валидация's auc: 0.830063
[18]	валидация's auc: 0.834065
Did not meet early stopping. Best iteration is:
[20]	валидация's auc: 0.834286


In [199]:
session_agg4 = webstat.groupby('sessionkey_id', sort=False).agg(
    category_mode=('category_id', lambda x: x.mode().iloc[0] if x.notnull().any() else None),
    model_mode=('model_id', lambda x: x.mode().iloc[0] if x.notnull().any() else None),
    good_mode=('good_id', lambda x: x.mode().iloc[0] if x.notnull().any() else None)
)
session_agg4

Unnamed: 0_level_0,category_mode,model_mode,good_mode
sessionkey_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
122243978,1496.0,2345848.0,57791307.0
122243992,3973.0,17297250.0,45171119.0
122243998,1870.0,268888.0,4087157.0
122244115,3779.0,16507112.0,29666875.0
122244260,4723.0,522923.0,67362756.0
...,...,...,...
118720116,1241.0,22251463.0,
118720198,1200.0,136805.0,34914400.0
118721114,6880.0,3658198.0,62273476.0
118721222,1513.0,2724018.0,64632283.0


In [203]:
tr4 = tr3.merge(session_agg4, how='left', on='sessionkey_id', sort=False)
tr4.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,pics_qty,model_create_time,pageview_duration_sec_mean,page_type_median,pageview_number_mean,product_in_sale_prob,price_mean,category_mode,model_mode,good_mode
0,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,...,2,0,59.25,3.0,4.8,0.4,1006.0,139.0,123517.0,9896348.0
1,1239674,44451,21892647,742,1,1.0,120616344,139,133,124,...,2,0,170.333333,2.0,4.0,0.428571,1497.0,139.0,123517.0,10560054.0
2,1183828,13559,9896348,761,4,2.0,115226312,139,133,124,...,2,0,245.5,3.0,3.0,0.333333,761.0,139.0,123517.0,9896348.0


In [204]:
val4 = val3.merge(session_agg4, how='left', on='sessionkey_id', sort=False)
val4.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,pics_qty,model_create_time,pageview_duration_sec_mean,page_type_median,pageview_number_mean,product_in_sale_prob,price_mean,category_mode,model_mode,good_mode
0,1268806,61397,40460833,334,1,2.0,123631342,1200,5674,1183,...,4,14225,86.0,3.0,7.454545,0.181818,334.0,1200.0,12838934.0,40460831.0
1,1319667,91233,44856961,1140,1,1.0,129227524,189,3370,3368,...,2,17749,134.5,1.0,4.2,0.4,1140.0,189.0,17138869.0,44856961.0
2,1227316,37428,58512612,1154,3,28.0,119373430,5605,5604,1183,...,4,2738,101.653846,2.0,14.0,0.444444,2508.083333,1333.0,7083154.0,20490741.0


In [206]:
lgb_tr = lgb.Dataset(tr4, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.3,
    'lambda': 10e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(val4, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=20, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9905
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.809208
[6]	валидация's auc: 0.816167
[9]	валидация's auc: 0.824707
[12]	валидация's auc: 0.826458
[15]	валидация's auc: 0.829502
[18]	валидация's auc: 0.831283
Did not meet early stopping. Best iteration is:
[20]	валидация's auc: 0.832766


In [207]:
session_agg5 = webstat.groupby('sessionkey_id', sort=False).agg(
    pageview_duration_sec_max=('pageview_duration_sec', lambda x: x.dropna().max() if x.notnull().any() else None),
    pageview_duration_sec_min=('pageview_duration_sec', lambda x: x.dropna().min() if x.notnull().any() else None),
    pageview_number_max=('pageview_number', lambda x: x.dropna().max() if x.notnull().any() else None),
    pageview_number_min=('pageview_number', lambda x: x.dropna().min() if x.notnull().any() else None)
)
session_agg5

Unnamed: 0_level_0,pageview_duration_sec_max,pageview_duration_sec_min,pageview_number_max,pageview_number_min
sessionkey_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
122243978,424.0,2.0,41,1
122243992,565.0,2.0,8,1
122243998,86.0,15.0,5,1
122244115,62.0,62.0,2,1
122244260,173.0,3.0,55,1
...,...,...,...,...
118720116,192.0,192.0,2,1
118720198,34.0,34.0,2,1
118721114,,,1,1
118721222,,,1,1


In [208]:
tr5 = tr4.merge(session_agg5, how='left', on='sessionkey_id', sort=False)
tr5.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,pageview_number_mean,product_in_sale_prob,price_mean,category_mode,model_mode,good_mode,pageview_duration_sec_max,pageview_duration_sec_min,pageview_number_max,pageview_number_min
0,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,...,4.8,0.4,1006.0,139.0,123517.0,9896348.0,65.0,52.0,8.0,1.0
1,1239674,44451,21892647,742,1,1.0,120616344,139,133,124,...,4.0,0.428571,1497.0,139.0,123517.0,10560054.0,269.0,42.0,7.0,1.0
2,1183828,13559,9896348,761,4,2.0,115226312,139,133,124,...,3.0,0.333333,761.0,139.0,123517.0,9896348.0,455.0,36.0,6.0,1.0


In [209]:
val5 = val4.merge(session_agg5, how='left', on='sessionkey_id', sort=False)
val5.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,pageview_number_mean,product_in_sale_prob,price_mean,category_mode,model_mode,good_mode,pageview_duration_sec_max,pageview_duration_sec_min,pageview_number_max,pageview_number_min
0,1268806,61397,40460833,334,1,2.0,123631342,1200,5674,1183,...,7.454545,0.181818,334.0,1200.0,12838934.0,40460831.0,422.0,3.0,15.0,1.0
1,1319667,91233,44856961,1140,1,1.0,129227524,189,3370,3368,...,4.2,0.4,1140.0,189.0,17138869.0,44856961.0,490.0,5.0,8.0,1.0
2,1227316,37428,58512612,1154,3,28.0,119373430,5605,5604,1183,...,14.0,0.444444,2508.083333,1333.0,7083154.0,20490741.0,575.0,2.0,27.0,1.0


In [213]:
lgb_tr = lgb.Dataset(tr5, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.3,
    'lambda': 1e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(val5, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=20, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10629
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.813586
[6]	валидация's auc: 0.82567
[9]	валидация's auc: 0.838497
[12]	валидация's auc: 0.840981
[15]	валидация's auc: 0.842234
[18]	валидация's auc: 0.846237
Did not meet early stopping. Best iteration is:
[20]	валидация's auc: 0.847354


In [214]:
session_agg6 = webstat.groupby('sessionkey_id', sort=False).agg(
    category_nan_sum=('category_id', lambda x: x.isna().sum()),
    model_nan_sum=('model_id', lambda x: x.isna().sum()),
    good_nan_sum=('good_id', lambda x: x.isna().sum())
)
session_agg6

Unnamed: 0_level_0,category_nan_sum,model_nan_sum,good_nan_sum
sessionkey_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
122243978,9,28,28
122243992,4,5,5
122243998,3,3,3
122244115,1,1,1
122244260,21,31,31
...,...,...,...
118720116,0,1,2
118720198,0,1,1
118721114,0,0,0
118721222,0,0,0


In [217]:
tr6 = tr5.merge(session_agg6, how='left', on='sessionkey_id', sort=False)
tr6.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,category_mode,model_mode,good_mode,pageview_duration_sec_max,pageview_duration_sec_min,pageview_number_max,pageview_number_min,category_nan_sum,model_nan_sum,good_nan_sum
0,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,...,139.0,123517.0,9896348.0,65.0,52.0,8.0,1.0,3.0,3.0,3.0
1,1239674,44451,21892647,742,1,1.0,120616344,139,133,124,...,139.0,123517.0,10560054.0,269.0,42.0,7.0,1.0,0.0,4.0,4.0
2,1183828,13559,9896348,761,4,2.0,115226312,139,133,124,...,139.0,123517.0,9896348.0,455.0,36.0,6.0,1.0,2.0,2.0,2.0


In [218]:
val6 = val5.merge(session_agg6, how='left', on='sessionkey_id', sort=False)
val6.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,category_mode,model_mode,good_mode,pageview_duration_sec_max,pageview_duration_sec_min,pageview_number_max,pageview_number_min,category_nan_sum,model_nan_sum,good_nan_sum
0,1268806,61397,40460833,334,1,2.0,123631342,1200,5674,1183,...,1200.0,12838934.0,40460831.0,422.0,3.0,15.0,1.0,7.0,9.0,9.0
1,1319667,91233,44856961,1140,1,1.0,129227524,189,3370,3368,...,189.0,17138869.0,44856961.0,490.0,5.0,8.0,1.0,2.0,2.0,3.0
2,1227316,37428,58512612,1154,3,28.0,119373430,5605,5604,1183,...,1333.0,7083154.0,20490741.0,575.0,2.0,27.0,1.0,3.0,15.0,15.0


In [238]:
lgb_tr = lgb.Dataset(tr6, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.4,
    'lambda': 1e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(val6, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=20, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11013
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.833909
[6]	валидация's auc: 0.846325
[9]	валидация's auc: 0.851611
[12]	валидация's auc: 0.852871
[15]	валидация's auc: 0.85942
[18]	валидация's auc: 0.862727
Did not meet early stopping. Best iteration is:
[20]	валидация's auc: 0.863707


In [227]:
session_agg7 = webstat.groupby('sessionkey_id', sort=False).agg(
    price_max=('price', lambda x: x.dropna().max() if x.notnull().any() else None),
    price_min=('price', lambda x: x.dropna().min() if x.notnull().any() else None)
)
session_agg7

Unnamed: 0_level_0,price_max,price_min
sessionkey_id,Unnamed: 1_level_1,Unnamed: 2_level_1
122243978,1258.0,140.0
122243992,8436.0,6624.0
122243998,691.0,506.0
122244115,1051.0,1051.0
122244260,1497.0,54.0
...,...,...
118720116,,
118720198,2239.0,2239.0
118721114,60.0,60.0
118721222,717.0,717.0


In [239]:
tr7 = tr6.merge(session_agg7, how='left', on='sessionkey_id', sort=False)
tr7.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,good_mode,pageview_duration_sec_max,pageview_duration_sec_min,pageview_number_max,pageview_number_min,category_nan_sum,model_nan_sum,good_nan_sum,price_max,price_min
0,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,...,9896348.0,65.0,52.0,8.0,1.0,3.0,3.0,3.0,1346.0,666.0
1,1239674,44451,21892647,742,1,1.0,120616344,139,133,124,...,10560054.0,269.0,42.0,7.0,1.0,0.0,4.0,4.0,2866.0,742.0
2,1183828,13559,9896348,761,4,2.0,115226312,139,133,124,...,9896348.0,455.0,36.0,6.0,1.0,2.0,2.0,2.0,761.0,761.0


In [240]:
val7 = val6.merge(session_agg7, how='left', on='sessionkey_id', sort=False)
val7.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,good_mode,pageview_duration_sec_max,pageview_duration_sec_min,pageview_number_max,pageview_number_min,category_nan_sum,model_nan_sum,good_nan_sum,price_max,price_min
0,1268806,61397,40460833,334,1,2.0,123631342,1200,5674,1183,...,40460831.0,422.0,3.0,15.0,1.0,7.0,9.0,9.0,334.0,334.0
1,1319667,91233,44856961,1140,1,1.0,129227524,189,3370,3368,...,44856961.0,490.0,5.0,8.0,1.0,2.0,2.0,3.0,1140.0,1140.0
2,1227316,37428,58512612,1154,3,28.0,119373430,5605,5604,1183,...,20490741.0,575.0,2.0,27.0,1.0,3.0,15.0,15.0,3640.0,1154.0


In [241]:
lgb_tr = lgb.Dataset(tr7, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.4,
    'lambda': 0.1e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(val7, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=20, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11523
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.833615
[6]	валидация's auc: 0.846486
[9]	валидация's auc: 0.851364
[12]	валидация's auc: 0.856572
[15]	валидация's auc: 0.860069
[18]	валидация's auc: 0.861596
Did not meet early stopping. Best iteration is:
[20]	валидация's auc: 0.863251


In [242]:
session_agg8 = webstat.groupby('sessionkey_id', sort=False).agg(
    product_nan_sum=('product_in_sale', lambda x: x.isna().sum())
)
session_agg8

Unnamed: 0_level_0,product_nan_sum
sessionkey_id,Unnamed: 1_level_1
122243978,28
122243992,5
122243998,3
122244115,1
122244260,31
...,...
118720116,2
118720198,1
118721114,0
118721222,0


In [243]:
tr8 = tr7.merge(session_agg8, how='left', on='sessionkey_id', sort=False)
tr8.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,pageview_duration_sec_max,pageview_duration_sec_min,pageview_number_max,pageview_number_min,category_nan_sum,model_nan_sum,good_nan_sum,price_max,price_min,product_nan_sum
0,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,...,65.0,52.0,8.0,1.0,3.0,3.0,3.0,1346.0,666.0,3.0
1,1239674,44451,21892647,742,1,1.0,120616344,139,133,124,...,269.0,42.0,7.0,1.0,0.0,4.0,4.0,2866.0,742.0,4.0
2,1183828,13559,9896348,761,4,2.0,115226312,139,133,124,...,455.0,36.0,6.0,1.0,2.0,2.0,2.0,761.0,761.0,2.0


In [244]:
val8 = val7.merge(session_agg8, how='left', on='sessionkey_id', sort=False)
val8.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,pageview_duration_sec_max,pageview_duration_sec_min,pageview_number_max,pageview_number_min,category_nan_sum,model_nan_sum,good_nan_sum,price_max,price_min,product_nan_sum
0,1268806,61397,40460833,334,1,2.0,123631342,1200,5674,1183,...,422.0,3.0,15.0,1.0,7.0,9.0,9.0,334.0,334.0,9.0
1,1319667,91233,44856961,1140,1,1.0,129227524,189,3370,3368,...,490.0,5.0,8.0,1.0,2.0,2.0,3.0,1140.0,1140.0,3.0
2,1227316,37428,58512612,1154,3,28.0,119373430,5605,5604,1183,...,575.0,2.0,27.0,1.0,3.0,15.0,15.0,3640.0,1154.0,15.0


In [245]:
lgb_tr = lgb.Dataset(tr8, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.4,
    'lambda': 0.1e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(val8, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=20, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11671
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.833615
[6]	валидация's auc: 0.846486
[9]	валидация's auc: 0.851364
[12]	валидация's auc: 0.856572
[15]	валидация's auc: 0.860069
[18]	валидация's auc: 0.861596
Did not meet early stopping. Best iteration is:
[20]	валидация's auc: 0.863251


In [259]:
session_agg9 = webstat.groupby('sessionkey_id', sort=False).agg(
    page_type_mode=('page_type', lambda x: x.mode().iloc[0] if x.notnull().any() else None)
)
session_agg9

Unnamed: 0_level_0,page_type_mode
sessionkey_id,Unnamed: 1_level_1
122243978,2
122243992,1
122243998,8
122244115,1
122244260,3
...,...
118720116,1
118720198,1
118721114,1
118721222,1


In [260]:
tr9 = tr8.merge(session_agg9, how='left', on='sessionkey_id', sort=False)
tr9.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,pageview_duration_sec_min,pageview_number_max,pageview_number_min,category_nan_sum,model_nan_sum,good_nan_sum,price_max,price_min,product_nan_sum,page_type_mode
0,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,...,52.0,8.0,1.0,3.0,3.0,3.0,1346.0,666.0,3.0,1.0
1,1239674,44451,21892647,742,1,1.0,120616344,139,133,124,...,42.0,7.0,1.0,0.0,4.0,4.0,2866.0,742.0,4.0,2.0
2,1183828,13559,9896348,761,4,2.0,115226312,139,133,124,...,36.0,6.0,1.0,2.0,2.0,2.0,761.0,761.0,2.0,1.0


In [261]:
val9 = val8.merge(session_agg9, how='left', on='sessionkey_id', sort=False)
val9.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,pageview_duration_sec_min,pageview_number_max,pageview_number_min,category_nan_sum,model_nan_sum,good_nan_sum,price_max,price_min,product_nan_sum,page_type_mode
0,1268806,61397,40460833,334,1,2.0,123631342,1200,5674,1183,...,3.0,15.0,1.0,7.0,9.0,9.0,334.0,334.0,9.0,3.0
1,1319667,91233,44856961,1140,1,1.0,129227524,189,3370,3368,...,5.0,8.0,1.0,2.0,2.0,3.0,1140.0,1140.0,3.0,1.0
2,1227316,37428,58512612,1154,3,28.0,119373430,5605,5604,1183,...,2.0,27.0,1.0,3.0,15.0,15.0,3640.0,1154.0,15.0,1.0


In [280]:
lgb_tr = lgb.Dataset(tr9, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.4,
    'lambda': 100e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(val9, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=50, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11686
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.831625
[6]	валидация's auc: 0.84602
[9]	валидация's auc: 0.851615
[12]	валидация's auc: 0.855114
[15]	валидация's auc: 0.859918
[18]	валидация's auc: 0.862589
[21]	валидация's auc: 0.863008
[24]	валидация's auc: 0.863868
[27]	валидация's auc: 0.865926
[30]	валидация's auc: 0.867169
[33]	валидация's auc: 0.867686
[36]	валидация's auc: 0.868507
[39]	валидация's auc: 0.869045
[42]	валидация's auc: 0.869585
[45]	валидация's auc: 0.869671
[48]	валидация's auc: 0.870083
Did not meet early stopping. Best iteration is:
[49]	вали

In [282]:
tr9.columns

Index(['order_id', 'create_time', 'good_id', 'price', 'utm_medium',
       'utm_source', 'sessionkey_id', 'category_id', 'parent_id', 'root_id',
       'model_id', 'is_moderated', 'rating_value', 'rating_count',
       'description_length', 'goods_qty', 'pics_qty', 'model_create_time',
       'pageview_duration_sec_mean', 'page_type_median',
       'pageview_number_mean', 'product_in_sale_prob', 'price_mean',
       'category_mode', 'model_mode', 'good_mode', 'pageview_duration_sec_max',
       'pageview_duration_sec_min', 'pageview_number_max',
       'pageview_number_min', 'category_nan_sum', 'model_nan_sum',
       'good_nan_sum', 'price_max', 'price_min', 'product_nan_sum',
       'page_type_mode'],
      dtype='object')

In [285]:
session_agg10 = webstat.groupby('sessionkey_id', sort=False).agg(
    page_type_max=('page_type', lambda x: x.dropna().max() if x.notnull().any() else None),
    page_type_min=('page_type', lambda x: x.dropna().min() if x.notnull().any() else None),
    page_type_mean=('page_type', lambda x: x.dropna().mean() if x.notnull().any() else None),
    page_type_nan_sum=('page_type', lambda x: x.isna().sum())
)
session_agg10

Unnamed: 0_level_0,page_type_max,page_type_min,page_type_mean,page_type_nan_sum
sessionkey_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
122243978,9,1,2.682927,0
122243992,8,1,4.375000,0
122243998,8,1,5.200000,0
122244115,7,1,4.000000,0
122244260,10,1,2.750000,0
...,...,...,...,...
118720116,2,1,1.500000,0
118720198,2,1,1.500000,0
118721114,1,1,1.000000,0
118721222,1,1,1.000000,0


In [286]:
tr10 = tr9.merge(session_agg10, how='left', on='sessionkey_id', sort=False)
tr10.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,model_nan_sum,good_nan_sum,price_max,price_min,product_nan_sum,page_type_mode,page_type_max,page_type_min,page_type_mean,page_type_nan_sum
0,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,...,3.0,3.0,1346.0,666.0,3.0,1.0,5.0,1.0,3.0,0.0
1,1239674,44451,21892647,742,1,1.0,120616344,139,133,124,...,4.0,4.0,2866.0,742.0,4.0,2.0,2.0,1.0,1.571429,0.0
2,1183828,13559,9896348,761,4,2.0,115226312,139,133,124,...,2.0,2.0,761.0,761.0,2.0,1.0,4.0,1.0,2.666667,0.0


In [287]:
val10 = val9.merge(session_agg10, how='left', on='sessionkey_id', sort=False)
val10.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,model_nan_sum,good_nan_sum,price_max,price_min,product_nan_sum,page_type_mode,page_type_max,page_type_min,page_type_mean,page_type_nan_sum
0,1268806,61397,40460833,334,1,2.0,123631342,1200,5674,1183,...,9.0,9.0,334.0,334.0,9.0,3.0,8.0,1.0,3.909091,0.0
1,1319667,91233,44856961,1140,1,1.0,129227524,189,3370,3368,...,2.0,3.0,1140.0,1140.0,3.0,1.0,6.0,1.0,2.4,0.0
2,1227316,37428,58512612,1154,3,28.0,119373430,5605,5604,1183,...,15.0,15.0,3640.0,1154.0,15.0,1.0,7.0,1.0,2.111111,0.0


In [298]:
lgb_tr = lgb.Dataset(tr10, label=y_tr, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.4,
    'lambda': 1e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(val10, label=y_val, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=50, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11970
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.863565
[6]	валидация's auc: 0.869282
[9]	валидация's auc: 0.872108
[12]	валидация's auc: 0.875275
[15]	валидация's auc: 0.876053
[18]	валидация's auc: 0.879219
[21]	валидация's auc: 0.879715
[24]	валидация's auc: 0.88069
[27]	валидация's auc: 0.880599
[30]	валидация's auc: 0.881695
[33]	валидация's auc: 0.882032
[36]	валидация's auc: 0.882417
[39]	валидация's auc: 0.882127
Early stopping, best iteration is:
[36]	валидация's auc: 0.882417


In [299]:
tr10.columns

Index(['order_id', 'create_time', 'good_id', 'price', 'utm_medium',
       'utm_source', 'sessionkey_id', 'category_id', 'parent_id', 'root_id',
       'model_id', 'is_moderated', 'rating_value', 'rating_count',
       'description_length', 'goods_qty', 'pics_qty', 'model_create_time',
       'pageview_duration_sec_mean', 'page_type_median',
       'pageview_number_mean', 'product_in_sale_prob', 'price_mean',
       'category_mode', 'model_mode', 'good_mode', 'pageview_duration_sec_max',
       'pageview_duration_sec_min', 'pageview_number_max',
       'pageview_number_min', 'category_nan_sum', 'model_nan_sum',
       'good_nan_sum', 'price_max', 'price_min', 'product_nan_sum',
       'page_type_mode', 'page_type_max', 'page_type_min', 'page_type_mean',
       'page_type_nan_sum'],
      dtype='object')

In [300]:
session_agg11 = webstat.groupby('sessionkey_id', sort=False).agg(
    pageview_nan_sum=('pageview_number', lambda x: x.isna().sum()),
    pageview_duration_sec_nan_sum=('pageview_duration_sec', lambda x: x.isna().sum()),
    price_nan_sum=('price', lambda x: x.isna().sum())
)
session_agg11

Unnamed: 0_level_0,pageview_nan_sum,pageview_duration_sec_nan_sum,price_nan_sum
sessionkey_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
122243978,0,1,28
122243992,0,1,5
122243998,0,1,3
122244115,0,1,1
122244260,0,1,31
...,...,...,...
118720116,0,1,2
118720198,0,1,1
118721114,0,1,0
118721222,0,1,0


In [301]:
tr11 = tr10.merge(session_agg11, how='left', on='sessionkey_id', sort=False)
tr11.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,price_min,product_nan_sum,page_type_mode,page_type_max,page_type_min,page_type_mean,page_type_nan_sum,pageview_nan_sum,pageview_duration_sec_nan_sum,price_nan_sum
0,1233828,41100,9896348,666,5,8.0,120125762,139,133,124,...,666.0,3.0,1.0,5.0,1.0,3.0,0.0,0.0,1.0,3.0
1,1239674,44451,21892647,742,1,1.0,120616344,139,133,124,...,742.0,4.0,2.0,2.0,1.0,1.571429,0.0,0.0,1.0,4.0
2,1183828,13559,9896348,761,4,2.0,115226312,139,133,124,...,761.0,2.0,1.0,4.0,1.0,2.666667,0.0,0.0,1.0,2.0


In [302]:
val11 = val10.merge(session_agg11, how='left', on='sessionkey_id', sort=False)
val11.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,price_min,product_nan_sum,page_type_mode,page_type_max,page_type_min,page_type_mean,page_type_nan_sum,pageview_nan_sum,pageview_duration_sec_nan_sum,price_nan_sum
0,1268806,61397,40460833,334,1,2.0,123631342,1200,5674,1183,...,334.0,9.0,3.0,8.0,1.0,3.909091,0.0,0.0,1.0,9.0
1,1319667,91233,44856961,1140,1,1.0,129227524,189,3370,3368,...,1140.0,3.0,1.0,6.0,1.0,2.4,0.0,0.0,1.0,3.0
2,1227316,37428,58512612,1154,3,28.0,119373430,5605,5604,1183,...,1154.0,15.0,1.0,7.0,1.0,2.111111,0.0,0.0,1.0,15.0


In [345]:
tr11.dtypes.unique()


array([dtype('int64'),
       CategoricalDtype(categories=[     0,      1,      2,      3,      4,      5,      6,
                              7,      8,      9,
                         ...
                         102988, 102989, 102990, 102991, 102992, 102993, 102994,
                         102995, 102996, 102997],
       , ordered=False)                                                                    ,
       dtype('float64'),
       CategoricalDtype(categories=[   -1,     0,     1,     2,     3,     4,     5,     6,
                             7,     8,
                         ...
                         31686, 31687, 31688, 31689, 31690, 31691, 31692, 31693,
                         31694, 31695],
       , ordered=False)                                                                    ],
      dtype=object)

In [357]:
categorical_features = ['create_time', 'model_create_time']

lgb_tr = lgb.Dataset(tr11, label=y_tr, categorical_feature=categorical_features, free_raw_data=False)

params = {
    'objective': 'binary',
    'eta': 0.4,
    'lambda': 10e-3,
    
    'seed': 911,
    'num_threads': 32,
    'verbosity': 1
}

lgb_val = lgb.Dataset(val11, label=y_val, categorical_feature=categorical_features, free_raw_data=False)

params.update({
    'metric': ['auc', ''], # '' = objective; 'None' = off
})

model = lgb.train(
    params, lgb_tr, num_boost_round=50, # basic
    valid_sets=[lgb_val], valid_names=['валидация'],
    feval=None,
    callbacks=[
        lgb.early_stopping(stopping_rounds=3, min_delta=0.),
        lgb.log_evaluation(period=3) # чтобы выводились результаты подсчета метрики для early_stopping
   ] 
)

[LightGBM] [Info] Number of positive: 25989, number of negative: 47228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12128
[LightGBM] [Info] Number of data points in the train set: 73217, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.354959 -> initscore=-0.597314
[LightGBM] [Info] Start training from score -0.597314
Training until validation scores don't improve for 3 rounds
[3]	валидация's auc: 0.885057
[6]	валидация's auc: 0.893007
[9]	валидация's auc: 0.896093
[12]	валидация's auc: 0.897113
[15]	валидация's auc: 0.900021
[18]	валидация's auc: 0.901493
[21]	валидация's auc: 0.902422
[24]	валидация's auc: 0.90276
[27]	валидация's auc: 0.903494
[30]	валидация's auc: 0.903537
[33]	валидация's auc: 0.904187
[36]	валидация's auc: 0.904152
Early stopping, best iteration is:
[35]	валидация's auc: 0.904263


In [348]:
test_df = pd.read_csv('data/test.csv')

In [349]:
test_df.shape[0]

17196

In [350]:
test_df.dtypes

order_id                int64
create_time            object
good_id                 int64
price                   int64
utm_medium              int64
utm_source            float64
sessionkey_id           int64
category_id             int64
parent_id               int64
root_id                 int64
model_id                int64
is_moderated            int64
rating_value          float64
rating_count          float64
description_length      int64
goods_qty               int64
pics_qty                int64
model_create_time      object
dtype: object

In [354]:
for col in test_df.loc[:, test_df.dtypes == object].columns:
    test_df[col] = test_df[col].astype('category').cat.codes.astype('category') # для catboost

In [355]:
test_df.dtypes

order_id                 int64
create_time           category
good_id                  int64
price                    int64
utm_medium               int64
utm_source             float64
sessionkey_id            int64
category_id              int64
parent_id                int64
root_id                  int64
model_id                 int64
is_moderated             int64
rating_value           float64
rating_count           float64
description_length       int64
goods_qty                int64
pics_qty                 int64
model_create_time     category
dtype: object

In [315]:
session_agg_test = webstat.groupby('sessionkey_id', sort=False).agg(
    pageview_duration_sec_mean=('pageview_duration_sec', lambda x: x.dropna().mean() if x.notnull().any() else None),
    page_type_median=('page_type', lambda x: x.median(skipna=True)),
    pageview_number_mean=('pageview_number', lambda x: x.dropna().mean() if x.notnull().any() else None),
    product_in_sale_prob=('product_in_sale', lambda x: (x.fillna(0).sum() / x.shape[0]) if x.notnull().any() else 0),
    price_mean=('price', lambda x: x.dropna().mean() if x.notnull().any() else None),
    category_mode=('category_id', lambda x: x.mode().iloc[0] if x.notnull().any() else None),
    model_mode=('model_id', lambda x: x.mode().iloc[0] if x.notnull().any() else None),
    good_mode=('good_id', lambda x: x.mode().iloc[0] if x.notnull().any() else None),
    pageview_duration_sec_max=('pageview_duration_sec', lambda x: x.dropna().max() if x.notnull().any() else None),
    pageview_duration_sec_min=('pageview_duration_sec', lambda x: x.dropna().min() if x.notnull().any() else None),
    pageview_number_max=('pageview_number', lambda x: x.dropna().max() if x.notnull().any() else None),
    pageview_number_min=('pageview_number', lambda x: x.dropna().min() if x.notnull().any() else None),
    category_nan_sum=('category_id', lambda x: x.isna().sum()),
    model_nan_sum=('model_id', lambda x: x.isna().sum()),
    good_nan_sum=('good_id', lambda x: x.isna().sum()),
    price_max=('price', lambda x: x.dropna().max() if x.notnull().any() else None),
    price_min=('price', lambda x: x.dropna().min() if x.notnull().any() else None),
    product_nan_sum=('product_in_sale', lambda x: x.isna().sum()),
    page_type_mode=('page_type', lambda x: x.mode().iloc[0] if x.notnull().any() else None),
    page_type_max=('page_type', lambda x: x.dropna().max() if x.notnull().any() else None),
    page_type_min=('page_type', lambda x: x.dropna().min() if x.notnull().any() else None),
    page_type_mean=('page_type', lambda x: x.dropna().mean() if x.notnull().any() else None),
    page_type_nan_sum=('page_type', lambda x: x.isna().sum()),
    pageview_nan_sum=('pageview_number', lambda x: x.isna().sum()),
    pageview_duration_sec_nan_sum=('pageview_duration_sec', lambda x: x.isna().sum()),
    price_nan_sum=('price', lambda x: x.isna().sum())
)
session_agg_test

Unnamed: 0_level_0,pageview_duration_sec_mean,page_type_median,pageview_number_mean,product_in_sale_prob,price_mean,category_mode,model_mode,good_mode,pageview_duration_sec_max,pageview_duration_sec_min,...,price_min,product_nan_sum,page_type_mode,page_type_max,page_type_min,page_type_mean,page_type_nan_sum,pageview_nan_sum,pageview_duration_sec_nan_sum,price_nan_sum
sessionkey_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
122243978,75.800000,2.0,21.000,0.317073,746.384615,1496.0,2345848.0,57791307.0,424.0,2.0,...,140.0,28,2,9,1,2.682927,0,0,1,28
122243992,137.714286,4.5,4.500,0.375000,7566.666667,3973.0,17297250.0,45171119.0,565.0,2.0,...,6624.0,5,1,8,1,4.375000,0,0,1,5
122243998,37.500000,8.0,3.000,0.400000,598.500000,1870.0,268888.0,4087157.0,86.0,15.0,...,506.0,3,8,8,1,5.200000,0,0,1,3
122244115,62.000000,4.0,1.500,0.500000,1051.000000,3779.0,16507112.0,29666875.0,62.0,62.0,...,1051.0,1,1,7,1,4.000000,0,0,1,1
122244260,35.282051,3.0,22.375,0.225000,444.888889,4723.0,522923.0,67362756.0,173.0,3.0,...,54.0,31,3,10,1,2.750000,0,0,1,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118720116,192.000000,1.5,1.500,0.000000,,1241.0,22251463.0,,192.0,192.0,...,,2,1,2,1,1.500000,0,0,1,2
118720198,34.000000,1.5,1.500,0.500000,2239.000000,1200.0,136805.0,34914400.0,34.0,34.0,...,2239.0,1,1,2,1,1.500000,0,0,1,1
118721114,,1.0,1.000,1.000000,60.000000,6880.0,3658198.0,62273476.0,,,...,60.0,0,1,1,1,1.000000,0,0,1,0
118721222,,1.0,1.000,1.000000,717.000000,1513.0,2724018.0,64632283.0,,,...,717.0,0,1,1,1,1.000000,0,0,1,0


In [358]:
test = test_df.merge(session_agg_test, how='left', on='sessionkey_id', sort=False)
test.head(3)

Unnamed: 0,order_id,create_time,good_id,price,utm_medium,utm_source,sessionkey_id,category_id,parent_id,root_id,...,price_min,product_nan_sum,page_type_mode,page_type_max,page_type_min,page_type_mean,page_type_nan_sum,pageview_nan_sum,pageview_duration_sec_nan_sum,price_nan_sum
0,1350922,5997,9896348,1143,1,2.0,132744630,139,133,124,...,1143.0,1.0,1.0,3.0,1.0,2.0,0.0,0.0,0.0,1.0
1,1354989,8286,69445048,1707,1,1.0,133161905,136,133,124,...,1707.0,2.0,1.0,6.0,1.0,3.333333,0.0,0.0,1.0,2.0
2,1352637,6894,70607886,576,1,1.0,132792626,136,133,124,...,576.0,2.0,2.0,2.0,1.0,1.666667,0.0,0.0,1.0,2.0


In [360]:
preds = model.predict(test)

In [361]:
predictions_df = pd.DataFrame({'order_id': test.order_id, 'is_callcenter': preds})

predictions_df['is_callcenter'] = predictions_df['is_callcenter'].round().astype(int)

In [362]:
predictions_df

Unnamed: 0,order_id,is_callcenter
0,1350922,0
1,1354989,0
2,1352637,1
3,1350050,1
4,1341733,0
...,...,...
17191,1358397,0
17192,1357968,0
17193,1358835,1
17194,1365692,0


In [363]:
predictions_df.to_csv('predictions.csv', index=False)