# Step 0.0. Install LightAutoML

# Step 0.1. Import necessary libraries 

In [1]:
%matplotlib inline

# Standard python libraries
import os
import time
import re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from scipy.signal import argrelmax,peak_prominences, find_peaks_cwt, peak_widths, peak_prominences, find_peaks
from lightautoml.dataset.roles import DatetimeRole

# Step 0.2. Parameters 

In [30]:
N_THREADS = 6 # threads cnt for lgbm and linear models
N_FOLDS = 5 # folds cnt for AutoML
RANDOM_STATE = 777 # fixed random state for various reasons
TEST_SIZE = 0.1 # Test size for metric check
TIMEOUT = 3600*6 # Time in seconds for automl run
TARGET = ['target']

# Step 0.3. Fix torch number of threads and numpy seed 

In [3]:
np.random.seed(RANDOM_STATE)

# Step 0.4. Data load 

In [100]:
%%time

train = pd.read_csv('train_dataset_train.csv', parse_dates=['month_id','carts_created_at'], index_col='id')
print(train.shape)
train.head()

(200000, 58)
Wall time: 814 ms


Unnamed: 0_level_0,age_indicator,month_id,student_id,program_id,carts_created_at,spent_time_total,spent_time_to_complete_hw,completed_hw,failed_hw,reworked_hw,...,p_total_calls,p_was_conversations,p_total_duration,support_feedback_avg,feedback_avg_d1,feedback_avg_d2,feedback_avg_d3,feedback_avg_d4,feedback_avg_d5,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15182,32.0,2020-09-30,6694527,1469,2020-08-26,163.0,0.0,1.0,0.0,17.0,...,,,,4.0,5.0,,,,,0
89385,,2021-06-30,6712877,1392,2020-08-05,,,,,,...,,,,,,,,,,0
47931,,2021-02-28,6659444,376,2020-06-20,,,,,,...,,,,,,,,,,0
279085,1.0,2021-11-30,7151591,1160,2021-04-14,,,,,,...,,,,,,,,,,0
7806,30.0,2020-10-31,6705666,952,2020-07-19,,,,,,...,,,,,5.0,,,,,0


In [101]:
train.target.value_counts()

0    174301
1     13512
5      3678
4      3659
3      3027
2      1823
Name: target, dtype: int64

In [102]:
train.target.unique()

array([0, 3, 1, 4, 5, 2], dtype=int64)

In [103]:
%%time

test= pd.read_csv('test_dataset_test.csv', parse_dates=['month_id','carts_created_at'], index_col='id')
test.head()

Wall time: 363 ms


Unnamed: 0_level_0,age_indicator,month_id,student_id,program_id,carts_created_at,spent_time_total,spent_time_to_complete_hw,completed_hw,failed_hw,reworked_hw,...,p_missed_calls,p_total_calls,p_was_conversations,p_total_duration,support_feedback_avg,feedback_avg_d1,feedback_avg_d2,feedback_avg_d3,feedback_avg_d4,feedback_avg_d5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
77551,,2021-05-31,7064806,1395,2021-02-28,,,,,,...,,,,,,,,,,
227812,27.0,2021-05-31,6982061,245,2021-01-01,81.0,0.0,5.0,0.0,13.0,...,,,,,,,,,,
103035,9.0,2021-06-30,7118790,1233,2021-03-19,49.0,0.0,2.0,0.0,0.0,...,,,,,,,,5.0,,
260943,,2021-09-30,7288419,784,2021-06-29,10.0,,0.0,0.0,0.0,...,0.0,2.0,2.0,61.0,,,,,,
134611,7.0,2021-08-31,6949976,998,2020-12-31,,,,,,...,,,,,,,,,,


In [104]:
sample_submisson = pd.read_csv('sample_solution.csv', index_col='id')
sample_submisson.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
77551,0
227812,0
103035,0
260943,0
134611,0


In [105]:
test = pd.merge(test, sample_submisson, how='left', on='id')
test.head()

Unnamed: 0_level_0,age_indicator,month_id,student_id,program_id,carts_created_at,spent_time_total,spent_time_to_complete_hw,completed_hw,failed_hw,reworked_hw,...,p_total_calls,p_was_conversations,p_total_duration,support_feedback_avg,feedback_avg_d1,feedback_avg_d2,feedback_avg_d3,feedback_avg_d4,feedback_avg_d5,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
77551,,2021-05-31,7064806,1395,2021-02-28,,,,,,...,,,,,,,,,,0
227812,27.0,2021-05-31,6982061,245,2021-01-01,81.0,0.0,5.0,0.0,13.0,...,,,,,,,,,,0
103035,9.0,2021-06-30,7118790,1233,2021-03-19,49.0,0.0,2.0,0.0,0.0,...,,,,,,,5.0,,,0
260943,,2021-09-30,7288419,784,2021-06-29,10.0,,0.0,0.0,0.0,...,2.0,2.0,61.0,,,,,,,0
134611,7.0,2021-08-31,6949976,998,2020-12-31,,,,,,...,,,,,,,,,,0


In [106]:
zarplata = pd.read_excel('zp.xlsx', engine='openpyxl', index_col='index')

In [107]:
def make_features(data, zarplata):
    data['is_weekend'] = (data['carts_created_at'].dt.dayofweek>5).astype(int)
    data['city'] = data['city'].str.replace(r'[^\w\s]', '', regex=True)
    zarplata['city']=zarplata['city'].str.replace(r'[^\w\s]', '', regex=True)
    grouped_features = ['month_id','student_id','program_id','city','ABC','platform',\
                        'payment_type','promo','gender','speed_recall','auto_payment' ,'os','hw_leader']
    tmp = ['avg_hw_mark',
        'bought_d1',
        'bought_d2',
        'bought_d3',
        'bought_d4',
        'bought_d5',
        'webinars',
        'notes',
        'avg_quiz_result',
        'completed_hw',
        'reworked_hw',
        'lessons',
        'p_avg_duration',
        'p_avg_talk_duration',
        'p_missed_calls',
        'p_total_calls',
        'p_was_conversations',
        'p_was_conversations',
        'price',
        'm_avg_duration',
        'm_avg_talk_duration',
        'm_missed_calls',
        'm_total_calls',
        'm_was_conversations',
        'm_total_duration',
        'support_feedback_avg',
        'feedback_avg_d1',
        'feedback_avg_d2',
        'feedback_avg_d3',
        'feedback_avg_d4',
        'feedback_avg_d5',
        'activity']
    for feature in grouped_features:
        
        grouped = data.groupby(feature).agg({
            'avg_hw_mark': ['min','max','median','mean','std'],
            'bought_d1': ['min','max','median','mean','std'],
            'bought_d2': ['min','max','median','mean','std'],
            'bought_d3': ['min','max','median','mean','std'],
            'bought_d4': ['min','max','median','mean','std'],
            'bought_d5': ['min','max','median','mean','std'],
            'webinars': ['min','max','median','mean','std'],
            'notes': ['min','max','median','mean','std'],
            'avg_quiz_result': ['min','max','median','mean','std'],
            'completed_hw': ['min','max','median','mean','std'],
            'reworked_hw': ['min','max','median','mean','std'],
            'lessons': ['min','max','median','mean','std'],
            'p_avg_duration': ['min','max','median','mean','std'],
            'p_avg_talk_duration': ['min','max','median','mean','std'],
            'p_missed_calls': ['min','max','median','mean','std'],
            'p_total_calls': ['min','max','median','mean','std'],
            'p_was_conversations': ['min','max','median','mean','std'],
            'p_was_conversations': ['min','max','median','mean','std'],
            'price': ['min','max','median','mean','std'],
            'm_avg_duration': ['min','max','median','mean','std'],
            'm_avg_talk_duration': ['min','max','median','mean','std'],
            'm_missed_calls': ['min','max','median','mean','std'],
            'm_total_calls': ['min','max','median','mean','std'],
            'm_was_conversations': ['min','max','median','mean','std'],
            'm_total_duration': ['min','max','median','mean','std'],
            'support_feedback_avg':['min','max','median','mean','std'],
            'feedback_avg_d1':['min','max','median','mean','std'],
            'feedback_avg_d2':['min','max','median','mean','std'],
            'feedback_avg_d3':['min','max','median','mean','std'],
            'feedback_avg_d4':['min','max','median','mean','std'],
            'feedback_avg_d5':['min','max','median','mean','std'],
            'activity':['min','max','median','mean','std'],
        })
        grouped.columns = [feature+'_'+'_'.join(col) for col in grouped.columns.values]
        grouped = grouped.reset_index()
        data = pd.merge(data, grouped, how='left', on=feature)
        for t in tmp:
            data[t+'_'+feature+'_center_mean'] = data[t] - data[feature+'_'+t+'_mean']
    data =  pd.merge(data, zarplata, how='left',on='city')
    return data

all_data = make_features(pd.concat([train,test]),  zarplata)
train, test = all_data[:len(train)], all_data[len(train):]

In [108]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 199999
Columns: 2479 entries, age_indicator to feat2
dtypes: datetime64[ns](2), float64(2359), int32(1), int64(109), object(8)
memory usage: 3.7+ GB


# Step 0.5. Add new features

# Step 0.6. Data splitting for train-test 

In [109]:
tr_data, te_data = train_test_split(train, 
                                     test_size=TEST_SIZE,
#                                     shuffle=False,
                                        stratify=train.target,
                                     random_state=RANDOM_STATE)
print('Data splitted. Parts sizes: tr_data = {}, te_data = {}'.format(tr_data.shape, te_data.shape))

Data splitted. Parts sizes: tr_data = (180000, 2479), te_data = (20000, 2479)


# ========= AutoML preset usage =========


## Step 1. Create Task

In [110]:
%%time

def custom_metric(y_true, y_pred, **kwargs):
    y_pred = np.argmax(y_pred, axis=1)
    return 0.2*recall_score(y_true, y_pred, average='micro')+0.8*precision_score(y_true, y_pred, average='micro')

task = Task('multiclass', metric=custom_metric )

Wall time: 108 ms


## Step 2. Setup columns roles

In [111]:
%%time

roles = {
    'target': TARGET,
#     'drop': TARGET
    DatetimeRole(base_date=False, base_feats=True, seasonality=('y', 'm', 'd')): 'carts_created_at',
    DatetimeRole(base_date=False, base_feats=True, seasonality=('y', 'm')): 'month_id'
}

Wall time: 0 ns


## Step 3. Create AutoML from preset and train on 80% of data

In [112]:
params = {
#     'n_estimators': 50000,
#     'learning_rate': 0.03,
#     'reg_alpha': 3e-4,
#     'reg_lambda': 9e-2,
#     'num_leaves': 20,
#     'subsample': 0.9,
#     'subsample_freq': 2,
#     'max_bin': 240,
    'device':'gpu'
}

In [113]:
features = accurate_fi[accurate_fi['Importance']>0].Feature.to_list()+TARGET

In [114]:
automl = TabularAutoML(task = task, 
                   timeout = TIMEOUT,
                   cpu_limit = N_THREADS,
#                 memory_limit=8,
                 general_params = {'use_algos': [['lgb','lgb_tuned']]},
                   reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                        lgb_params = {'default_params': params},
                    )
oof_pred = automl.fit_predict(tr_data, roles = roles, verbose = 4)

[00:18:27] Stdout logging level is DEBUG.
[00:18:27] Task: multiclass

[00:18:27] Start automl preset with listed constraints:
[00:18:27] - time: 21600.00 seconds
[00:18:27] - CPU: 6 cores
[00:18:27] - memory: 16 GB

[00:18:27] [1mTrain data shape: (180000, 2479)[0m

[00:25:46] Feats was rejected during automatic roles guess: []
[00:25:49] Layer [1m1[0m train process start. Time left 21158.50 secs




[00:26:35] Training until validation scores don't improve for 100 rounds
[00:28:55] [100]	valid's multi_logloss: 0.187665	valid's Opt metric: 0.932167
[00:31:11] [200]	valid's multi_logloss: 0.144006	valid's Opt metric: 0.952306
[00:33:13] [300]	valid's multi_logloss: 0.124346	valid's Opt metric: 0.960028
[00:35:05] [400]	valid's multi_logloss: 0.113758	valid's Opt metric: 0.964806
[00:36:45] [500]	valid's multi_logloss: 0.107561	valid's Opt metric: 0.966861
[00:38:17] [600]	valid's multi_logloss: 0.103708	valid's Opt metric: 0.968167
[00:39:39] [700]	valid's multi_logloss: 0.101414	valid's Opt metric: 0.968833
[00:40:54] [800]	valid's multi_logloss: 0.0998686	valid's Opt metric: 0.969083
[00:42:00] [900]	valid's multi_logloss: 0.0989083	valid's Opt metric: 0.969472
[00:42:59] [1000]	valid's multi_logloss: 0.0981311	valid's Opt metric: 0.970111
[00:43:54] [1100]	valid's multi_logloss: 0.0977458	valid's Opt metric: 0.970167
[00:43:59] Early stopping, best iteration is:
[1010]	valid's mu



[00:45:09] Training until validation scores don't improve for 100 rounds
[00:46:52] [100]	valid's multi_logloss: 0.18936	valid's Opt metric: 0.931944
[00:48:38] [200]	valid's multi_logloss: 0.144358	valid's Opt metric: 0.951917
[00:50:19] [300]	valid's multi_logloss: 0.123811	valid's Opt metric: 0.960333
[00:51:53] [400]	valid's multi_logloss: 0.113088	valid's Opt metric: 0.964917
[00:53:19] [500]	valid's multi_logloss: 0.106711	valid's Opt metric: 0.966889
[00:54:38] [600]	valid's multi_logloss: 0.102991	valid's Opt metric: 0.968111
[00:55:48] [700]	valid's multi_logloss: 0.100512	valid's Opt metric: 0.968944
[00:56:50] [800]	valid's multi_logloss: 0.098888	valid's Opt metric: 0.969361
[00:57:45] [900]	valid's multi_logloss: 0.0977296	valid's Opt metric: 0.969861
[00:58:36] [1000]	valid's multi_logloss: 0.0969967	valid's Opt metric: 0.970028
[00:59:21] [1100]	valid's multi_logloss: 0.096488	valid's Opt metric: 0.970417
[01:00:05] [1200]	valid's multi_logloss: 0.0961083	valid's Opt met



[01:00:58] Training until validation scores don't improve for 100 rounds
[01:02:40] [100]	valid's multi_logloss: 0.189826	valid's Opt metric: 0.930917
[01:04:25] [200]	valid's multi_logloss: 0.144697	valid's Opt metric: 0.951389
[01:06:04] [300]	valid's multi_logloss: 0.124723	valid's Opt metric: 0.960389
[01:07:37] [400]	valid's multi_logloss: 0.114362	valid's Opt metric: 0.964722
[01:09:03] [500]	valid's multi_logloss: 0.108321	valid's Opt metric: 0.966556
[01:10:21] [600]	valid's multi_logloss: 0.10502	valid's Opt metric: 0.967417
[01:11:30] [700]	valid's multi_logloss: 0.102814	valid's Opt metric: 0.968056
[01:12:32] [800]	valid's multi_logloss: 0.101283	valid's Opt metric: 0.968778
[01:13:27] [900]	valid's multi_logloss: 0.100154	valid's Opt metric: 0.968806
[01:14:15] [1000]	valid's multi_logloss: 0.0995584	valid's Opt metric: 0.96925
[01:15:02] [1100]	valid's multi_logloss: 0.0990736	valid's Opt metric: 0.969444
[01:15:44] [1200]	valid's multi_logloss: 0.0987721	valid's Opt metr



[01:16:39] Training until validation scores don't improve for 100 rounds
[01:18:19] [100]	valid's multi_logloss: 0.192528	valid's Opt metric: 0.930722
[01:20:04] [200]	valid's multi_logloss: 0.146764	valid's Opt metric: 0.950444
[01:21:41] [300]	valid's multi_logloss: 0.126336	valid's Opt metric: 0.959361
[01:23:12] [400]	valid's multi_logloss: 0.114873	valid's Opt metric: 0.963444
[01:24:37] [500]	valid's multi_logloss: 0.108418	valid's Opt metric: 0.965889
[01:25:54] [600]	valid's multi_logloss: 0.104451	valid's Opt metric: 0.966778
[01:27:03] [700]	valid's multi_logloss: 0.102046	valid's Opt metric: 0.967639
[01:28:03] [800]	valid's multi_logloss: 0.100346	valid's Opt metric: 0.96775
[01:28:58] [900]	valid's multi_logloss: 0.0992028	valid's Opt metric: 0.968528
[01:29:48] [1000]	valid's multi_logloss: 0.0984348	valid's Opt metric: 0.968861
[01:30:05] Early stopping, best iteration is:
[934]	valid's multi_logloss: 0.0988655	valid's Opt metric: 0.968972
[01:30:22] ===== Start working 



[01:30:43] Training until validation scores don't improve for 100 rounds
[01:32:23] [100]	valid's multi_logloss: 0.190605	valid's Opt metric: 0.931333
[01:34:07] [200]	valid's multi_logloss: 0.146545	valid's Opt metric: 0.950833
[01:35:45] [300]	valid's multi_logloss: 0.127037	valid's Opt metric: 0.9595
[01:37:16] [400]	valid's multi_logloss: 0.116458	valid's Opt metric: 0.962861
[01:38:39] [500]	valid's multi_logloss: 0.110404	valid's Opt metric: 0.965111
[01:39:56] [600]	valid's multi_logloss: 0.106955	valid's Opt metric: 0.96625
[01:41:05] [700]	valid's multi_logloss: 0.104537	valid's Opt metric: 0.967639
[01:42:06] [800]	valid's multi_logloss: 0.103054	valid's Opt metric: 0.968111
[01:43:01] [900]	valid's multi_logloss: 0.102049	valid's Opt metric: 0.968611
[01:43:51] [1000]	valid's multi_logloss: 0.101605	valid's Opt metric: 0.968806
[01:44:36] [1100]	valid's multi_logloss: 0.101149	valid's Opt metric: 0.968861
[01:45:18] [1200]	valid's multi_logloss: 0.100747	valid's Opt metric: 

INFO:optuna.storages._in_memory:A new study created in memory with name: no-name-14299d70-ad02-4d71-a684-50dfa31f0f7d


[01:46:28] Training until validation scores don't improve for 100 rounds
[01:49:47] [100]	valid's multi_logloss: 0.140291	valid's Opt metric: 0.954194
[01:52:53] [200]	valid's multi_logloss: 0.107847	valid's Opt metric: 0.967417
[01:55:22] [300]	valid's multi_logloss: 0.0982919	valid's Opt metric: 0.970556
[01:57:13] [400]	valid's multi_logloss: 0.0944067	valid's Opt metric: 0.972444
[01:58:43] [500]	valid's multi_logloss: 0.0932253	valid's Opt metric: 0.973306
[02:00:01] [600]	valid's multi_logloss: 0.0932383	valid's Opt metric: 0.974222
[02:00:37] Early stopping, best iteration is:
[550]	valid's multi_logloss: 0.0930915	valid's Opt metric: 0.973556


INFO:optuna.study.study:Trial 0 finished with value: 0.9735555555555555 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 244, 'bagging_fraction': 0.8659969709057025, 'min_sum_hessian_in_leaf': 0.24810409748678125, 'reg_alpha': 2.5361081166471375e-07, 'reg_lambda': 2.5348407664333426e-07}. Best is trial 0 with value: 0.9735555555555555.


[02:00:49] [1mTrial 1[0m with hyperparameters {'feature_fraction': 0.6872700594236812, 'num_leaves': 244, 'bagging_fraction': 0.8659969709057025, 'min_sum_hessian_in_leaf': 0.24810409748678125, 'reg_alpha': 2.5361081166471375e-07, 'reg_lambda': 2.5348407664333426e-07} scored 0.9735555555555555 in 0:14:43.489459
[02:00:49] Hyperparameters optimization for [1mLvl_0_Pipe_0_Mod_1_Tuned_LightGBM[0m completed
[02:00:49] The set of hyperparameters [1m{'feature_fraction': 0.6872700594236812, 'num_leaves': 244, 'bagging_fraction': 0.8659969709057025, 'min_sum_hessian_in_leaf': 0.24810409748678125, 'reg_alpha': 2.5361081166471375e-07, 'reg_lambda': 2.5348407664333426e-07}[0m
 achieve 0.9736 AutoML Metric
[02:00:49] Start fitting [1mLvl_0_Pipe_0_Mod_1_Tuned_LightGBM[0m ...
[02:00:49] Training params: {'task': 'train', 'learning_rate': 0.05, 'num_leaves': 244, 'feature_fraction': 0.6872700594236812, 'bagging_fraction': 0.8659969709057025, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1



[02:01:13] Training until validation scores don't improve for 100 rounds
[02:04:31] [100]	valid's multi_logloss: 0.128463	valid's Opt metric: 0.958972
[02:07:25] [200]	valid's multi_logloss: 0.103469	valid's Opt metric: 0.96875
[02:09:28] [300]	valid's multi_logloss: 0.0967713	valid's Opt metric: 0.971861
[02:11:01] [400]	valid's multi_logloss: 0.0944303	valid's Opt metric: 0.973194
[02:12:18] [500]	valid's multi_logloss: 0.0945903	valid's Opt metric: 0.973861
[02:12:26] Early stopping, best iteration is:
[411]	valid's multi_logloss: 0.0943264	valid's Opt metric: 0.973111
[02:12:36] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_1_Tuned_LightGBM[0m =====




[02:12:59] Training until validation scores don't improve for 100 rounds
[02:16:20] [100]	valid's multi_logloss: 0.128654	valid's Opt metric: 0.958361
[02:19:15] [200]	valid's multi_logloss: 0.103938	valid's Opt metric: 0.968778
[02:21:18] [300]	valid's multi_logloss: 0.0974752	valid's Opt metric: 0.972194
[02:22:50] [400]	valid's multi_logloss: 0.0962806	valid's Opt metric: 0.973833
[02:24:08] [500]	valid's multi_logloss: 0.0969314	valid's Opt metric: 0.974667
[02:24:33] Early stopping, best iteration is:
[435]	valid's multi_logloss: 0.0962227	valid's Opt metric: 0.974472
[02:24:44] ===== Start working with [1mfold 2[0m for [1mLvl_0_Pipe_0_Mod_1_Tuned_LightGBM[0m =====




[02:25:06] Training until validation scores don't improve for 100 rounds
[02:28:28] [100]	valid's multi_logloss: 0.130601	valid's Opt metric: 0.958889
[02:31:23] [200]	valid's multi_logloss: 0.104578	valid's Opt metric: 0.968611
[02:33:28] [300]	valid's multi_logloss: 0.0973449	valid's Opt metric: 0.971556
[02:35:01] [400]	valid's multi_logloss: 0.0950964	valid's Opt metric: 0.97275
[02:36:20] [500]	valid's multi_logloss: 0.0948919	valid's Opt metric: 0.973583
[02:37:11] Early stopping, best iteration is:
[476]	valid's multi_logloss: 0.0947209	valid's Opt metric: 0.973389
[02:37:22] ===== Start working with [1mfold 3[0m for [1mLvl_0_Pipe_0_Mod_1_Tuned_LightGBM[0m =====




[02:37:45] Training until validation scores don't improve for 100 rounds
[02:41:05] [100]	valid's multi_logloss: 0.128538	valid's Opt metric: 0.959333
[02:44:02] [200]	valid's multi_logloss: 0.103931	valid's Opt metric: 0.9695
[02:46:06] [300]	valid's multi_logloss: 0.0979184	valid's Opt metric: 0.971639
[02:47:38] [400]	valid's multi_logloss: 0.0963116	valid's Opt metric: 0.973083
[02:48:55] Early stopping, best iteration is:
[397]	valid's multi_logloss: 0.0962564	valid's Opt metric: 0.973083
[02:49:05] ===== Start working with [1mfold 4[0m for [1mLvl_0_Pipe_0_Mod_1_Tuned_LightGBM[0m =====




[02:49:28] Training until validation scores don't improve for 100 rounds
[02:52:48] [100]	valid's multi_logloss: 0.129338	valid's Opt metric: 0.957222
[02:55:43] [200]	valid's multi_logloss: 0.101725	valid's Opt metric: 0.968778
[02:57:47] [300]	valid's multi_logloss: 0.0944804	valid's Opt metric: 0.971611
[02:59:19] [400]	valid's multi_logloss: 0.0917191	valid's Opt metric: 0.972694
[03:00:37] [500]	valid's multi_logloss: 0.0914842	valid's Opt metric: 0.973833
[03:01:31] Early stopping, best iteration is:
[483]	valid's multi_logloss: 0.0913203	valid's Opt metric: 0.973639
[03:01:41] Fitting [1mLvl_0_Pipe_0_Mod_1_Tuned_LightGBM[0m finished. score = [1m0.973538888888889[0m
[03:01:41] [1mLvl_0_Pipe_0_Mod_1_Tuned_LightGBM[0m fitting and predicting completed
[03:01:41] Time left 11806.20 secs

[03:01:41] Time limit exceeded in one of the tasks. AutoML will blend level 1 models.

[03:01:41] [1mLayer 1 training completed.[0m

[03:01:41] Blending: optimization starts with equal weight

In [115]:
fast_fi = automl.get_feature_scores('fast')
# fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (30, 10), grid = True)
fast_fi

Unnamed: 0,Feature,Importance
0,carts_created_at,149270.656446
1,student_id_price_std,39578.865121
2,student_id,27965.051557
3,age_indicator,26925.326193
4,student_id_feedback_avg_d3_max,25299.496984
...,...,...
1493,platform_bought_d3_std,0.015247
1494,platform_p_avg_talk_duration_max,0.011550
1495,hw_leader_m_avg_talk_duration_median,0.007848
1496,gender_p_total_calls_std,0.006886


In [58]:
accurate_fi = automl.get_feature_scores('accurate', te_data.sample(5000), silent = False)

[09:24:32] LightAutoML ts master used 1009 feats
[09:24:55] 1/1009 Calculated score for hw_leader_feedback_avg_d5_median: 0.0000000
[09:25:06] 2/1009 Calculated score for hw_leader_bought_d1_mean: 0.0000000
[09:25:17] 3/1009 Calculated score for platform_m_total_duration_median: 0.0000000
[09:25:30] 4/1009 Calculated score for ABC_reworked_hw_mean: 0.0000000
[09:25:43] 5/1009 Calculated score for student_id_p_was_conversations_min: 0.0000000
[09:25:54] 6/1009 Calculated score for feedback_avg_d1: -0.0002000
[09:26:07] 7/1009 Calculated score for city_activity_median: 0.0000000
[09:26:18] 8/1009 Calculated score for hw_leader_bought_d3_max: 0.0000000
[09:26:29] 9/1009 Calculated score for m_was_conversations: 0.0000000
[09:26:41] 10/1009 Calculated score for ABC_m_total_duration_mean: 0.0000000
[09:26:52] 11/1009 Calculated score for promo_m_was_conversations_mean: 0.0000000
[09:27:03] 12/1009 Calculated score for program_id_bought_d1_mean: 0.0000000
[09:27:15] 13/1009 Calculated score 

[09:49:52] 106/1009 Calculated score for hw_leader_lessons_median: 0.0000000
[09:50:04] 107/1009 Calculated score for speed_recall_feedback_avg_d2_min: 0.0000000
[09:50:17] 108/1009 Calculated score for auto_payment_feedback_avg_d1_median: 0.0000000
[09:50:29] 109/1009 Calculated score for platform_m_total_duration_max: 0.0000000
[09:50:42] 110/1009 Calculated score for price: 0.0022000
[09:50:54] 111/1009 Calculated score for gender_m_total_duration_median: 0.0000000
[09:51:07] 112/1009 Calculated score for auto_payment_avg_quiz_result_median: 0.0000000
[09:51:19] 113/1009 Calculated score for promo_p_was_conversations_mean: 0.0000000
[09:51:32] 114/1009 Calculated score for student_id_feedback_avg_d1_min: 0.0008000
[09:51:47] 115/1009 Calculated score for city_m_total_duration_max: 0.0000000
[09:51:59] 116/1009 Calculated score for ABC_feedback_avg_d1_mean: 0.0000000
[09:52:12] 117/1009 Calculated score for platform_m_missed_calls_mean: 0.0000000
[09:52:25] 118/1009 Calculated score 

[10:12:49] 212/1009 Calculated score for city_avg_quiz_result_mean: 0.0000000
[10:13:02] 213/1009 Calculated score for hw_leader_activity_max: 0.0000000
[10:13:15] 214/1009 Calculated score for os_webinars_max: -0.0002000
[10:13:28] 215/1009 Calculated score for platform_m_avg_duration_max: 0.0000000
[10:13:41] 216/1009 Calculated score for speed_recall_feedback_avg_d2_median: 0.0000000
[10:13:54] 217/1009 Calculated score for program_id_p_avg_talk_duration_max: -0.0002000
[10:14:07] 218/1009 Calculated score for platform_bought_d2_max: 0.0000000
[10:14:20] 219/1009 Calculated score for program_id_m_avg_duration_max: -0.0002000
[10:14:33] 220/1009 Calculated score for feedback_avg_d2: -0.0002000
[10:14:46] 221/1009 Calculated score for student_id_support_feedback_avg_min: 0.0006000
[10:15:00] 222/1009 Calculated score for month_id_p_avg_duration_mean: 0.0010000
[10:15:13] 223/1009 Calculated score for hw_leader_p_total_calls_mean: 0.0000000
[10:15:26] 224/1009 Calculated score for plat

[10:35:36] 317/1009 Calculated score for hw_leader_p_avg_duration_median: 0.0000000
[10:35:49] 318/1009 Calculated score for month_id_bought_d5_max: 0.0000000
[10:36:01] 319/1009 Calculated score for hw_leader_m_avg_talk_duration_mean: 0.0000000
[10:36:14] 320/1009 Calculated score for program_id_feedback_avg_d4_min: -0.0002000
[10:36:27] 321/1009 Calculated score for city_feedback_avg_d2_median: 0.0000000
[10:36:40] 322/1009 Calculated score for program_id_m_total_duration_median: -0.0004000
[10:36:53] 323/1009 Calculated score for ABC_lessons_median: 0.0000000
[10:37:06] 324/1009 Calculated score for ABC_feedback_avg_d2_mean: 0.0000000
[10:37:19] 325/1009 Calculated score for os_activity_min: 0.0000000
[10:37:32] 326/1009 Calculated score for month_id_reworked_hw_max: -0.0002000
[10:37:45] 327/1009 Calculated score for gender_avg_quiz_result_median: 0.0000000
[10:37:57] 328/1009 Calculated score for hw_leader_feedback_avg_d3_min: 0.0000000
[10:38:10] 329/1009 Calculated score for os_

[10:58:16] 422/1009 Calculated score for gender: 0.0000000
[10:58:29] 423/1009 Calculated score for promo_bought_d1_max: 0.0000000
[10:58:42] 424/1009 Calculated score for month_id_p_total_calls_median: 0.0002000
[10:58:55] 425/1009 Calculated score for student_id_support_feedback_avg_mean: 0.0004000
[10:59:08] 426/1009 Calculated score for city_price_mean: 0.0000000
[10:59:21] 427/1009 Calculated score for auto_payment_p_was_conversations_mean: 0.0000000
[10:59:34] 428/1009 Calculated score for hw_leader_lessons_max: 0.0000000
[10:59:46] 429/1009 Calculated score for student_id_avg_hw_mark_median: -0.0002000
[10:59:59] 430/1009 Calculated score for os_bought_d2_mean: 0.0000000
[11:00:12] 431/1009 Calculated score for auto_payment_feedback_avg_d5_mean: 0.0000000
[11:00:25] 432/1009 Calculated score for hw_leader_price_median: 0.0000000
[11:00:38] 433/1009 Calculated score for city_m_missed_calls_median: 0.0000000
[11:00:51] 434/1009 Calculated score for hw_leader_feedback_avg_d1_max: 0

[11:20:54] 528/1009 Calculated score for auto_payment_avg_hw_mark_mean: 0.0000000
[11:21:07] 529/1009 Calculated score for city_m_was_conversations_min: 0.0000000
[11:21:19] 530/1009 Calculated score for spent_time_to_complete_hw: 0.0000000
[11:21:31] 531/1009 Calculated score for promo_m_avg_talk_duration_median: 0.0000000
[11:21:44] 532/1009 Calculated score for ABC_support_feedback_avg_mean: 0.0000000
[11:21:56] 533/1009 Calculated score for month_id_completed_hw_max: 0.0000000
[11:22:08] 534/1009 Calculated score for month_id_p_total_calls_max: -0.0002000
[11:22:21] 535/1009 Calculated score for city: 0.0000000
[11:22:33] 536/1009 Calculated score for gender_m_avg_talk_duration_median: 0.0000000
[11:22:46] 537/1009 Calculated score for promo_m_total_duration_median: 0.0000000
[11:22:59] 538/1009 Calculated score for os_m_was_conversations_mean: 0.0000000
[11:23:11] 539/1009 Calculated score for gender_feedback_avg_d4_median: 0.0000000
[11:23:23] 540/1009 Calculated score for platfo

[11:42:33] 634/1009 Calculated score for platform_avg_hw_mark_mean: 0.0000000
[11:42:45] 635/1009 Calculated score for city_p_avg_duration_min: 0.0000000
[11:42:56] 636/1009 Calculated score for student_id_completed_hw_mean: 0.0002000
[11:43:07] 637/1009 Calculated score for ABC_bought_d5_max: 0.0000000
[11:43:18] 638/1009 Calculated score for ABC_m_avg_talk_duration_mean: 0.0000000
[11:43:29] 639/1009 Calculated score for student_id_feedback_avg_d1_max: 0.0006000
[11:43:41] 640/1009 Calculated score for ABC_webinars_median: 0.0000000
[11:43:53] 641/1009 Calculated score for auto_payment_p_avg_duration_median: 0.0000000
[11:44:04] 642/1009 Calculated score for program_id_feedback_avg_d4_max: 0.0000000
[11:44:15] 643/1009 Calculated score for month_id_m_total_duration_median: 0.0004000
[11:44:26] 644/1009 Calculated score for os_activity_median: 0.0000000
[11:44:38] 645/1009 Calculated score for platform_m_total_calls_mean: 0.0000000
[11:44:49] 646/1009 Calculated score for student_id_r

[12:01:54] 739/1009 Calculated score for promo_p_missed_calls_mean: 0.0000000
[12:02:06] 740/1009 Calculated score for os_feedback_avg_d2_mean: 0.0000000
[12:02:17] 741/1009 Calculated score for gender_m_avg_duration_max: 0.0000000
[12:02:27] 742/1009 Calculated score for month_id_feedback_avg_d2_median: 0.0000000
[12:02:39] 743/1009 Calculated score for os_m_avg_duration_median: 0.0000000
[12:02:50] 744/1009 Calculated score for ABC_lessons_max: 0.0000000
[12:03:00] 745/1009 Calculated score for city_bought_d1_mean: 0.0000000
[12:03:11] 746/1009 Calculated score for platform_m_avg_duration_median: 0.0000000
[12:03:22] 747/1009 Calculated score for month_id_m_avg_duration_max: 0.0002000
[12:03:33] 748/1009 Calculated score for os_webinars_median: 0.0000000
[12:03:44] 749/1009 Calculated score for student_id_completed_hw_min: 0.0002000
[12:03:55] 750/1009 Calculated score for ABC_bought_d1_mean: 0.0004000
[12:04:07] 751/1009 Calculated score for month_id_m_total_duration_max: 0.0000000


[12:21:03] 843/1009 Calculated score for os_p_missed_calls_max: 0.0000000
[12:21:14] 844/1009 Calculated score for student_id_feedback_avg_d4_median: 0.0000000
[12:21:25] 845/1009 Calculated score for gender_feedback_avg_d2_mean: 0.0000000
[12:21:36] 846/1009 Calculated score for month_id_reworked_hw_mean: 0.0004000
[12:21:47] 847/1009 Calculated score for city_lessons_max: 0.0000000
[12:21:58] 848/1009 Calculated score for platform_feedback_avg_d2_median: 0.0000000
[12:22:09] 849/1009 Calculated score for student_id_feedback_avg_d1_mean: 0.0002000
[12:22:20] 850/1009 Calculated score for month_id_notes_max: 0.0002000
[12:22:32] 851/1009 Calculated score for browser: 0.0008000
[12:22:43] 852/1009 Calculated score for p_was_conversations: -0.0002000
[12:22:54] 853/1009 Calculated score for month_id_activity_mean: 0.0006000
[12:23:05] 854/1009 Calculated score for bought_d1: 0.0000000
[12:23:16] 855/1009 Calculated score for os_lessons_max: 0.0000000
[12:23:27] 856/1009 Calculated score 

[12:40:21] 947/1009 Calculated score for student_id_notes_max: -0.0002000
[12:40:31] 948/1009 Calculated score for ABC_completed_hw_max: 0.0002000
[12:40:43] 949/1009 Calculated score for hw_leader_price_max: 0.0000000
[12:40:53] 950/1009 Calculated score for gender_p_avg_talk_duration_max: 0.0000000
[12:41:05] 951/1009 Calculated score for month_id_activity_max: 0.0000000
[12:41:16] 952/1009 Calculated score for city_p_avg_talk_duration_mean: 0.0000000
[12:41:26] 953/1009 Calculated score for gender_m_total_duration_mean: 0.0000000
[12:41:38] 954/1009 Calculated score for city_feedback_avg_d2_min: 0.0000000
[12:41:49] 955/1009 Calculated score for ABC_m_avg_duration_mean: 0.0000000
[12:42:00] 956/1009 Calculated score for city_p_avg_talk_duration_min: 0.0000000
[12:42:11] 957/1009 Calculated score for city_p_total_calls_max: 0.0000000
[12:42:23] 958/1009 Calculated score for student_id_m_avg_talk_duration_min: -0.0002000
[12:42:34] 959/1009 Calculated score for gender_p_avg_duration_m

In [87]:
features=accurate_fi[accurate_fi['Importance']>0].Feature.to_list()

In [88]:
automl = TabularUtilizedAutoML(task = task, 
                   timeout = TIMEOUT,
                   cpu_limit = N_THREADS,
#                 memory_limit=8,
                 general_params = {'use_algos': [['lgb','lgb_tuned']]},
                   reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                        lgb_params = {'default_params': params},
                              )
oof_pred = automl.fit_predict(train[list((set(features)))], roles = roles, verbose = 4)

[23:53:01] Start automl [1mutilizator[0m with listed constraints:
[23:53:01] - time: 21600.00 seconds
[23:53:01] - CPU: 6 cores
[23:53:01] - memory: 16 GB

[23:53:01] [1mIf one preset completes earlier, next preset configuration will be started[0m

[23:53:01] Start 0 automl preset configuration:
[23:53:01] [1mD:\Ananconda3\envs\lama\lib\site-packages\lightautoml\automl\presets\tabular_configs\conf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
[23:53:01] Found reader_params in kwargs, need to combine
[23:53:01] Merged variant for reader_params = {'n_jobs': 6, 'cv': 5, 'random_state': 42}
[23:53:01] Found general_params in kwargs, need to combine
[23:53:01] Merged variant for general_params = {'use_algos': [['lgb', 'lgb_tuned']], 'return_all_predictions': False}
[23:53:01] Stdout logging level is DEBUG.
[23:53:01] Task: multiclass

[23:53:01] Start automl preset with listed constraints:
[23:53:01] - t

KeyError: 'target'

In [83]:
class_mapping = automl.outer_pipes[0].ml_algos[0].models[0][0].reader.class_mapping

In [117]:
pred = automl.predict(test)
pred = [class_mapping[i] for i in np.argmax(pred.data[:], axis=1)]

In [85]:
test[TARGET] = pred
test[TARGET].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test[TARGET] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


target
0         75053
1          5498
4          1410
5          1278
3          1192
2           566
dtype: int64

In [None]:
pd.DataFrame(pred, index=sample_submisson.index, columns=[TARGET]).to_csv('sub.csv')