In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.express as px
from datetime import datetime, timedelta
from utils.plot_tool import create_stats_fig
from sklearn import preprocessing
from utils.prepare_data import prepare_data_solo, calc_feat_solo
from utils.custom_loss import MSLE, my_obj
import lightgbm as lgb

%load_ext autoreload
%autoreload 2

# сбор предпосчитанных данных

In [2]:
# Precomputed training feature tables are stored as four parquet parts.
train_paths = (
    './data/sint_train0.parquet',
    './data/sint_train1.parquet',
    './data/sint_train2.parquet',
    './data/sint_train3.parquet',
)
train1, train2, train3, train4 = map(pd.read_parquet, train_paths)

# The test table's index is moved into a regular column by reset_index().
test = pd.read_parquet('./data/sint_test.parquet').reset_index()

In [3]:
# Stack train parts and test rows into one frame so that the dtype
# conversions applied afterwards are shared by both.
frames = [train1, train2, train3, train4, test]
train_all = pd.concat(frames, axis=0, sort=False)
train_all = train_all.reset_index(drop=True)

In [4]:
# Convert every object-dtype column of the combined frame to pandas 'category'.
object_columns = [
    name for name, dtype in train_all.dtypes.items() if dtype == 'object'
]
for name in object_columns:
    train_all[name] = train_all[name].astype('category')

In [5]:
# These columns are cast to 'category' explicitly, whatever dtype they were
# loaded with (presumably numeric codes that the object-dtype pass misses —
# TODO confirm).
freq_columns = [
    'hours_max_freq',
    'weekday_max_freq',
    'weekday_max2_freq',
    'daytime_max_freq',
    'daytime_max2_freq',
]
for name in freq_columns:
    train_all[name] = train_all[name].astype('category')

In [6]:
# Rows with a known revenue form the training set; rows where the revenue is
# missing form the test set (presumably the rows loaded from the test file —
# verify that no training row has a missing revenue).
has_revenue = train_all['totals_transactionRevenue'].notna()
train = train_all[has_revenue]
test = train_all[~has_revenue]

# задаем параметры.

Руками поперебирал некоторые параметры, так как времени уже не оставалось на хорошую валидацию

In [7]:
# Parameters for the first-stage model (binary objective, log-loss metric).
params_lgb1 = {
    "objective": "binary",
    "metric": "binary_logloss",
    "num_leaves": 31,
    "min_child_samples": 1,
    "learning_rate": 0.01,
    "bagging_fraction": 0.9,
    "feature_fraction": 0.8,
    # LightGBM's parameter is `bagging_freq` (not `bagging_frequency`).
    # With the unrecognised key, bagging_freq stayed at its default of 0,
    # which disables bagging and makes bagging_fraction a no-op.
    "bagging_freq": 1,
    # A value below 1 down-weights the positive class.
    "scale_pos_weight": 0.5,
}

# Parameters for the second-stage model, trained with the custom objective
# `my_obj` imported from utils.custom_loss and monitored with RMSE.
params_lgb2 = {
    "objective": my_obj,
    "metric": "rmse",
    "num_leaves": 12,
    "min_child_samples": 1,
    "learning_rate": 0.01,
    "bagging_fraction": 0.9,
    "feature_fraction": 0.8,
    # LightGBM's parameter is `bagging_freq` (not `bagging_frequency`).
    # With the unrecognised key, bagging_freq stayed at its default of 0,
    # which disables bagging and makes bagging_fraction a no-op.
    "bagging_freq": 1,
}

In [8]:
# Columns that must never be fed to the models as features.
target_cols = ['totals_transactionRevenue', 'ret', 'fullVisitorId']

features_train = train.drop(columns=target_cols)
is_returning = train['ret'] == 1

# Stage 1 dataset: every training row, labelled with `ret`.
dtrain_all = lgb.Dataset(features_train, label=train['ret'])

# Stage 2 dataset: only rows with ret == 1, labelled with the revenue.
dtrain_ret = lgb.Dataset(
    features_train[is_returning],
    label=train.loc[is_returning, 'totals_transactionRevenue'],
)

### запускаем предсказание

Предсказываем, вернётся ли покупатель, и если вернётся — сколько денег он принесёт.

In [10]:
# Train `nums` pairs of models and average their combined predictions:
#   stage 1 (lgb_model1) scores every test row with the binary model,
#   stage 2 (lgb_model2) predicts the revenue with the model fitted on ret == 1 rows,
# and the per-row prediction is the product of the two.
nums = 5

# The scoring features are identical on every pass, so build them once.
test_features = test.drop(target_cols, axis=1)

pr_lgb_sum = 0
for i in range(nums):
    print('Iteration number ', i)

    # LightGBM uses fixed default sampling seeds, so retraining with unchanged
    # params reproduces the same model on every pass and the average adds
    # nothing. Vary only the sampling seeds: unlike `seed`/`data_random_seed`
    # they are not Dataset parameters, so the already-constructed
    # dtrain_all / dtrain_ret can be reused across passes.
    seed_overrides = {"bagging_seed": i, "feature_fraction_seed": i}

    lgb_model1 = lgb.train({**params_lgb1, **seed_overrides}, dtrain_all, num_boost_round=1200)
    pr_lgb = lgb_model1.predict(test_features)

    lgb_model2 = lgb.train({**params_lgb2, **seed_overrides}, dtrain_ret, num_boost_round=368)
    pr_lgb_ret = lgb_model2.predict(test_features)

    pr_lgb_sum = pr_lgb_sum + pr_lgb*pr_lgb_ret

pr_final2 = pr_lgb_sum / nums

Interation number  0
[LightGBM] [Info] Number of positive: 0, number of negative: 373937
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.400782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3012
[LightGBM] [Info] Number of data points in the train set: 373937, number of used features: 63
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000000 -> initscore=-34.538776
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.115179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2679
[LightGBM] [Info] Number of data points in the train set: 6991, number of used features: 60
[LightGBM] [I

### Создаем сабмишен.

Результат оказался плохим — 8. Видимо, я не посчитал некоторые киллер-фичи и плохо провалидировал модель.

In [12]:
# Read the sample submission, keeping visitor ids as strings so they are not
# parsed as numbers.
submission = pd.read_csv(
    'sample_submission.csv',
    dtype={'fullVisitorId': 'object'},
)

# Each row of the frame becomes one key -> value entry of the dict
# (this relies on the file having exactly two columns).
my_subm = dict(submission.to_numpy())

In [13]:
# Add every test-row prediction to the running total of its visitor.
# A visitor id absent from the sample submission raises KeyError.
# NOTE: this mutates my_subm in place, so re-running the cell adds the
# predictions a second time.
row_visitor_ids = test['fullVisitorId'].values
for visitor_id, prediction in zip(row_visitor_ids, pr_final2):
    my_subm[visitor_id] = my_subm[visitor_id] + prediction

In [14]:
# Turn the per-visitor totals back into a two-column table.
# NOTE(review): the value column is named 'target' — confirm this matches the
# header expected by the submission format.
subm_ids = list(my_subm.keys())
subm_targets = list(my_subm.values())
my_subm_csv = pd.DataFrame({'fullVisitorId': subm_ids, 'target': subm_targets})

In [15]:
# Write the submission file. index=False keeps the DataFrame's positional
# index out of the CSV; by default to_csv emits it as an extra unnamed first
# column, which is not part of the two-column submission table.
my_subm_csv.to_csv('subm2.csv', index=False)

: 