In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import lightgbm as lgb
import catboost as cb
from sklearn.metrics import precision_score
from sklearn.decomposition import PCA

from utils.create_train_val import  create_train_val_noagg
from utils.local_manipulations import *
from utils.category_to_num import *
from utils.prepare_data import prepare_data_solo, calc_feat_solo
from utils.custom_loss import MSLE, my_obj

%load_ext autoreload
%autoreload 2

### Пайплайн обучения и валидации

Скачиваем готовые данные

In [2]:
train = pd.read_parquet('./data/train_sort.parquet')
test = pd.read_parquet('./data/test_sort.parquet')

In [3]:
num = 2
tr = pd.read_parquet(f'./data/train{num}.parquet')
vl = pd.read_parquet(f'./data/val{num}.parquet')
tr, vl = prepare_data_solo(tr, vl)

In [4]:
X_train, y_train = tr.drop(['totals_transactionRevenue'], axis=1), (tr['totals_transactionRevenue'].values > 0).astype(int)
X_val, y_val = vl.drop(['totals_transactionRevenue'], axis=1), (vl['totals_transactionRevenue'].values > 0).astype(int)

### Тут решил использовать PCA - не улучшило

In [5]:
# pca = PCA()
# tr_embs = get_embeds1(X_train.reset_index(drop=True))
# val_embs = get_embeds1(X_val.reset_index(drop=True))
# X_train_pca = pca.fit_transform(tr_embs)
# X_train_pca = X_train_pca[:, :5]
# X_val_pca = pca.transform(val_embs)
# X_val_pca = X_val_pca[:, :5]
# X_train_pca_df = pd.DataFrame({f'pca_{i}': v for i, v in enumerate(X_train_pca.T)})
# X_val_pca_df = pd.DataFrame({f'pca_{i}': v for i, v in enumerate(X_val_pca.T)})
# X_train1 = pd.concat([X_train.reset_index(drop=True), X_train_pca_df], axis=1)
# X_val1 = pd.concat([X_val.reset_index(drop=True), X_val_pca_df], axis=1)

###  Сначала помечаем объекты, которые мы считаем точно что-то купили. 
Находим те объекты, у которых высокие логиты.

Чтобы не обжечься об ассиметрию метрики - Мы снова занижаем вес класс 1. Чтобы все объекты класса 0 классифицировались как можно точнее.

In [6]:
lgb_tr = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val)
params = {'objective': 'binary', 'metric': 'binary_logloss', 'eta': 0.01, "num_leaves" : 16,
          'verbosity': -1, 'max_bin': 64, 'bagging_fraction': 0.9,
          'feature_fraction': 0.8, 'min_data': 1, 'bagging_freq': 1,
          'scale_pos_weight': 0.5}
model = lgb.train(
    params, lgb_tr, num_boost_round=1200,
    valid_sets=[lgb_tr], valid_names=['train'], callbacks=[lgb.log_evaluation(period=50)]
)



[50]	train's binary_logloss: 0.080806
[100]	train's binary_logloss: 0.07021
[150]	train's binary_logloss: 0.064687
[200]	train's binary_logloss: 0.0613812
[250]	train's binary_logloss: 0.0592903
[300]	train's binary_logloss: 0.0578583
[350]	train's binary_logloss: 0.0567373
[400]	train's binary_logloss: 0.055916
[450]	train's binary_logloss: 0.0552277
[500]	train's binary_logloss: 0.0547147
[550]	train's binary_logloss: 0.0541735
[600]	train's binary_logloss: 0.0537484
[650]	train's binary_logloss: 0.0533416
[700]	train's binary_logloss: 0.0529774
[750]	train's binary_logloss: 0.0526561
[800]	train's binary_logloss: 0.0523377
[850]	train's binary_logloss: 0.0520585
[900]	train's binary_logloss: 0.0517581
[950]	train's binary_logloss: 0.0514623
[1000]	train's binary_logloss: 0.0511883
[1050]	train's binary_logloss: 0.0509153
[1100]	train's binary_logloss: 0.0506848
[1150]	train's binary_logloss: 0.0504415
[1200]	train's binary_logloss: 0.0502004


### Итак, регрессия

Решил пообучать так - обучить регрессию на всем трейне, или только на тех объектах, где была покупка. Так же в качестве валидации решил взять как всю валидационную выборку, так и ту часть, которую окрестил "точно купят"

In [7]:
preds = model.predict(X_val, raw_score=True)
nonzeros_val, y_nonzeros_val = X_val[preds > 0.4], vl[preds > 0.4]['totals_transactionRevenue'].values

отобрали покупателей с помощью обученного бустинга.

In [8]:
nonzeros_val1, y_nonzeros_val1 = X_val[(vl['totals_transactionRevenue'].values != 0)], \
    vl['totals_transactionRevenue'][(vl['totals_transactionRevenue'].values != 0)].values

отобрали покупателей из валидации

In [9]:
nonzeros_tr, y_nonzeros_tr = X_train[(tr['totals_transactionRevenue'].values != 0)], \
    tr['totals_transactionRevenue'][(tr['totals_transactionRevenue'].values != 0)].values

скомунизил идею с трюком по метрике от товарища из чата. (P.S. - сделала все только хуже)

In [14]:
all_payments = np.sort(train['totals_transactionRevenue'].unique()) / 1e4

In [15]:
print(*(all_payments), sep=', ')

0.0, 1.0, 4.0, 9.0, 16.0, 20.0, 49.0, 77.0, 79.0, 120.0, 147.0, 150.0, 155.0, 156.0, 159.0, 168.0, 184.0, 189.0, 198.0, 199.0, 216.0, 234.0, 239.0, 240.0, 248.0, 249.0, 250.0, 253.0, 269.0, 273.0, 280.0, 283.0, 286.0, 299.0, 318.0, 324.0, 326.0, 349.0, 350.0, 359.0, 360.0, 391.0, 396.0, 398.0, 399.0, 438.0, 450.0, 477.0, 496.0, 498.0, 499.0, 509.0, 515.0, 548.0, 549.0, 550.0, 556.0, 558.0, 559.0, 597.0, 598.0, 599.0, 600.0, 607.0, 615.0, 617.0, 623.0, 638.0, 639.0, 649.0, 669.0, 670.0, 697.0, 698.0, 699.0, 700.0, 718.0, 719.0, 748.0, 759.0, 776.0, 796.0, 797.0, 798.0, 799.0, 802.0, 812.0, 818.0, 836.0, 838.0, 840.0, 848.0, 849.0, 850.0, 856.0, 864.0, 867.0, 868.0, 875.0, 877.0, 878.0, 879.0, 888.0, 889.0, 897.0, 898.0, 899.0, 903.0, 909.0, 916.0, 919.0, 937.0, 946.0, 947.0, 948.0, 949.0, 955.0, 958.0, 959.0, 973.0, 978.0, 989.0, 990.0, 992.0, 995.0, 996.0, 997.0, 998.0, 999.0, 1021.0, 1026.0, 1036.0, 1037.0, 1038.0, 1039.0, 1047.0, 1050.0, 1059.0, 1078.0, 1084.0, 1086.0, 1097.0, 1098.0

In [18]:
def metric_trick(preds):
    preds /= 1e4
    processed_pred = []
    for pred in preds:
        if pred < 4:
            processed_pred.append(0)
        else:
            closest_val = all_payments[all_payments <= pred][-2]
            processed_pred.append(closest_val)

    return np.array(processed_pred) * 1e4

In [17]:
y_train1 = tr['totals_transactionRevenue'].values

### Здесь обучаем на всем датасете трейна
метрика на трейне растет, на валидации падает. Но то что не обучается на трейне - треш, так что обучим по другому.

### в качестве обджектива - assymetric mse

без логарифмов, чтобы градиенты были побольше, как сказали на семинаре.

In [18]:
lgb_tr = lgb.Dataset(X_train, y_train1)
lgb_val = lgb.Dataset(nonzeros_val, y_nonzeros_val)
params = {'eta': 0.01, 'objective':my_obj,
          'verbosity': -1, 'max_bin': 256, 'bagging_fraction': 0.9, "num_leaves" : 9,
          'feature_fraction': 0.8, 'min_data': 1, 'bagging_freq': 1, 'early_stopping_round': 50}
          
model = lgb.train(
    params, lgb_tr, num_boost_round=400, valid_sets=[lgb_tr, lgb_val], 
    valid_names=['train', 'val'], callbacks=[lgb.log_evaluation(period=50)], feval=MSLE,
)

[LightGBM] [Info] Using self-defined objective function
[50]	train's MSLE: 7.25369	val's MSLE: 171.782
[100]	train's MSLE: 8.70366	val's MSLE: 155.797
[150]	train's MSLE: 9.96449	val's MSLE: 146.984
[200]	train's MSLE: 11.1236	val's MSLE: 140.991
[250]	train's MSLE: 12.1164	val's MSLE: 136.498
[300]	train's MSLE: 13.0106	val's MSLE: 132.855
[350]	train's MSLE: 13.8323	val's MSLE: 129.721
[400]	train's MSLE: 14.5249	val's MSLE: 127.217


In [19]:
preds = model.predict(nonzeros_val)
MSLE(preds, y_nonzeros_val), MSLE(metric_trick(preds), y_nonzeros_val)

(('MSLE', 127.21691708682289, False), ('MSLE', 127.42212, False))

### Обучим на покупках. Вроде обучается.

Параметры берем для бустинга такие, чтобы как-то запревентить оверфит

In [21]:
lgb_tr = lgb.Dataset(nonzeros_tr, y_nonzeros_tr)
lgb_val = lgb.Dataset(nonzeros_val, y_nonzeros_val)
params = {'eta': 0.01, 'objective':my_obj,
          'verbosity': -1, 'max_bin': 256, 'bagging_fraction': 0.9, "num_leaves" : 9,
          'feature_fraction': 0.8, 'min_data': 1, 'bagging_freq': 1, 'early_stopping_round': 50}
          
model = lgb.train(
    params, lgb_tr, num_boost_round=400, valid_sets=[lgb_tr, lgb_val], 
    valid_names=['train', 'val'], callbacks=[lgb.log_evaluation(period=50)], feval=MSLE,
)

[LightGBM] [Info] Using self-defined objective function
[50]	train's MSLE: 74.7755	val's MSLE: 90.1471
[100]	train's MSLE: 63.3761	val's MSLE: 80.9809
[150]	train's MSLE: 57.1455	val's MSLE: 76.1191
[200]	train's MSLE: 52.9178	val's MSLE: 72.8204
[250]	train's MSLE: 49.7492	val's MSLE: 70.3515
[300]	train's MSLE: 47.2335	val's MSLE: 68.4205
[350]	train's MSLE: 45.1629	val's MSLE: 66.8075
[400]	train's MSLE: 43.418	val's MSLE: 65.4768


In [22]:
preds = model.predict(nonzeros_val)
MSLE(preds, y_nonzeros_val), MSLE(metric_trick(preds), y_nonzeros_val)

(('MSLE', 65.4768295247076, False), ('MSLE', 65.62482, False))

### Валидация с полной валидационной выборкой -похожий результат что и выше.

In [23]:
lgb_tr = lgb.Dataset(nonzeros_tr, y_nonzeros_tr)
lgb_val = lgb.Dataset(nonzeros_val1, y_nonzeros_val1)
params = {'eta': 0.01, 'objective':my_obj,
          'verbosity': -1, 'max_bin': 256, 'bagging_fraction': 0.9, "num_leaves" : 9,
          'feature_fraction': 0.8, 'min_data': 1, 'bagging_freq': 1, 'early_stopping_round': 50}
          
model = lgb.train(
    params, lgb_tr, num_boost_round=400, valid_sets=[lgb_tr, lgb_val], 
    valid_names=['train', 'val'], callbacks=[lgb.log_evaluation(period=50)], feval=MSLE,
)

[LightGBM] [Info] Using self-defined objective function
[50]	train's MSLE: 74.7755	val's MSLE: 74.6954
[100]	train's MSLE: 63.3761	val's MSLE: 63.2895
[150]	train's MSLE: 57.1455	val's MSLE: 57.0721
[200]	train's MSLE: 52.9178	val's MSLE: 52.8534
[250]	train's MSLE: 49.7492	val's MSLE: 49.6947
[300]	train's MSLE: 47.2335	val's MSLE: 47.1848
[350]	train's MSLE: 45.1629	val's MSLE: 45.1196
[400]	train's MSLE: 43.418	val's MSLE: 43.3795


In [24]:
preds = model.predict(nonzeros_val)
MSLE(preds, y_nonzeros_val), MSLE(metric_trick(preds), y_nonzeros_val)

(('MSLE', 65.4768295247076, False), ('MSLE', 65.62482, False))

###  ФИНАЛ.

Здесь делаем предсказания.

скачали данные, обработали

In [3]:
columns_to_drop = ['date', 'fullVisitorId', 'sessionId', 'visitId', 
                   'visitStartTime', 'geoNetwork_country', 'geoNetwork_region', 'geoNetwork_networkDomain', 
                   'geoNetwork_metro', 'geoNetwork_city', 'trafficSource_keyword', 
                   'trafficSource_referralPath', 'trafficSource_adwordsClickInfo.page',
                   'trafficSource_adwordsClickInfo.slot', 'trafficSource_adwordsClickInfo.gclId',
                   'trafficSource_adwordsClickInfo.adNetworkType', 'trafficSource_adContent', 'trafficSource_campaign']

X_train, X_test = calc_feat_solo(train.copy(deep=True), test.copy(deep=True), columns_to_drop)
X_train, X_test = prepare_data_solo(X_train, X_test)

In [29]:
X_train.reset_index().to_parquet('./data/TRAIN_FINAL_SOLO.parquet')
X_test.reset_index().to_parquet('./data/TEST_FINAL_SOLO.parquet')

Тут тоже пробовал PCA.

In [4]:
X_train_prob, y_train_prob = X_train.drop(['totals_transactionRevenue'], axis=1), (X_train['totals_transactionRevenue'].values > 0).astype(int)
# pca = PCA()
# tr_embs = get_embeds1(X_train_prob.reset_index(drop=True))
# tst_embs = get_embeds1(X_test.reset_index(drop=True))
# X_train_pca = pca.fit_transform(tr_embs)
# X_train_pca = X_train_pca[:, :5]
# X_tst_pca = pca.transform(tst_embs)
# X_tst_pca = X_tst_pca[:, :5]
# X_train_pca_df = pd.DataFrame({f'pca_{i}': v for i, v in enumerate(X_train_pca.T)})
# X_tst_pca_df = pd.DataFrame({f'pca_{i}': v for i, v in enumerate(X_tst_pca.T)})
# X_train1 = pd.concat([X_train_prob.reset_index(drop=True), X_train_pca_df], axis=1)
# X_tst1 = pd.concat([X_test.reset_index(drop=True), X_tst_pca_df], axis=1)

### Обучили классификатор

In [5]:
lgb_tr = lgb.Dataset(X_train_prob, y_train_prob)
params = {'objective': 'binary', 'metric': 'binary_logloss', 'eta': 0.01, "num_leaves" : 16,
          'verbosity': -1, 'max_bin': 64, 'bagging_fraction': 0.9,
          'feature_fraction': 0.8, 'min_data': 1, 'bagging_freq': 1,
          'scale_pos_weight': 0.5}
model = lgb.train(
    params, lgb_tr, num_boost_round=1200,
    valid_sets=[lgb_tr], valid_names=['train'], callbacks=[lgb.log_evaluation(period=50)]
)

[50]	train's binary_logloss: 0.080958
[100]	train's binary_logloss: 0.0702436
[150]	train's binary_logloss: 0.0646434
[200]	train's binary_logloss: 0.0613582
[250]	train's binary_logloss: 0.0592622
[300]	train's binary_logloss: 0.0578908
[350]	train's binary_logloss: 0.0569667
[400]	train's binary_logloss: 0.0562235
[450]	train's binary_logloss: 0.0556796
[500]	train's binary_logloss: 0.0552035
[550]	train's binary_logloss: 0.0547639
[600]	train's binary_logloss: 0.0543653
[650]	train's binary_logloss: 0.0541068
[700]	train's binary_logloss: 0.0537928
[750]	train's binary_logloss: 0.0534667
[800]	train's binary_logloss: 0.0531875
[850]	train's binary_logloss: 0.0529269
[900]	train's binary_logloss: 0.0526876
[950]	train's binary_logloss: 0.0524651
[1000]	train's binary_logloss: 0.0522299
[1050]	train's binary_logloss: 0.0520278
[1100]	train's binary_logloss: 0.0518211
[1150]	train's binary_logloss: 0.0516218
[1200]	train's binary_logloss: 0.0514357


### отобрали покупателей.

In [31]:
preds = model.predict(X_test, raw_score=True)
nonzeros_tst = X_test[preds > 0.45]

In [32]:
nonzeros_tr, y_nonzeros_tr = X_train_prob[y_train_prob != 0], X_train['totals_transactionRevenue'].values[y_train_prob != 0]
wzeros_tr, y_wzeros_tr = X_train_prob, X_train['totals_transactionRevenue'].values

### обучили регрессию

In [33]:
lgb_tr = lgb.Dataset(nonzeros_tr, y_nonzeros_tr)
params = {'eta': 0.01, 'objective':my_obj,
          'verbosity': -1, 'max_bin': 256, 'bagging_fraction': 0.9, "num_leaves" : 9,
          'feature_fraction': 0.8, 'min_data': 1, 'bagging_freq': 1}
          
model_reg = lgb.train(
    params, lgb_tr, num_boost_round=400, valid_sets=[lgb_tr], 
    valid_names=['train'], callbacks=[lgb.log_evaluation(period=50)], feval=MSLE,
)

[LightGBM] [Info] Using self-defined objective function
[50]	train's MSLE: 79.885
[100]	train's MSLE: 68.0855
[150]	train's MSLE: 61.6226
[200]	train's MSLE: 57.2351
[250]	train's MSLE: 53.9397
[300]	train's MSLE: 51.3276
[350]	train's MSLE: 49.158
[400]	train's MSLE: 47.3276


In [34]:
preds = model_reg.predict(nonzeros_tst)
trick_preds = metric_trick(preds)

### Cоздали сабмишен

лучший скор что выдало - 0.4153, и это без трюка. С трюком предказания на регрессию - хуже.

In [35]:
fullids = test.loc[nonzeros_tst.index]['fullVisitorId'].values

In [36]:
submission = pd.read_csv('sample_submission.csv', dtype={'fullVisitorId': 'object'})
my_subm = dict(submission.values)
for a, b in zip(preds, fullids):
    my_subm[b] += a
my_subm_csv = pd.DataFrame({'fullVisitorId': my_subm.keys(), 'target': my_subm.values()})
my_subm_csv.to_csv('submsolo2.csv')

In [37]:
submission = pd.read_csv('sample_submission.csv', dtype={'fullVisitorId': 'object'})
my_subm = dict(submission.values)
for a, b in zip(trick_preds, fullids):
        my_subm[b] += a
my_subm_csv = pd.DataFrame({'fullVisitorId': my_subm.keys(), 'target': my_subm.values()})
my_subm_csv.to_csv('submsolotrick2.csv')