In [4]:

# Стандартные библиотеки python
import os
import timeit
import re

In [5]:
# Установленные библиотеки
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import torch

In [6]:
%pip install -U lightautoml

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


In [7]:
# Импорты из package пакета LAMA
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task

In [8]:
N_THREADS = 4 # количество ядер для lgbm and линейных моделей
N_FOLDS = 5 # количество folds(рекордов) для AutoML
RANDOM_STATE = 42 # зафиксированное кол-во случайных состояний 
TEST_SIZE = 0.2 # Тестовый размер метрик (0.2 - 20% от всех) для проверки 
TIMEOUT = 600 # Ограничение во времени после запуска AutoML

In [9]:
# Фиксируется количество ядер в Torch, а также указываем numpy seed
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [42]:
%time
train_data = pd.read_csv('./fish_train.csv',sep=',', encoding='latin-1')
train_data.replace(np.nan, 0, inplace=True)
print(train_data.head())

test_data = pd.read_csv('./final.csv', sep=',', encoding='latin-1')
test_data.replace(np.nan, 0, inplace=True)
print(test_data.head())

submission = pd.read_csv('./fish_check.csv', sep=',', encoding='latin-1')
submission.replace(np.nan, 0, inplace=True)
print(submission.head())

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.91 µs
        id  ship  record      time  latitude  longitude course  velocity  \
0  1024769    86    7734  20:17:14  0.035325   0.151459     85    3.7797   
1  3641376   187   25463  18:26:07  0.015448  -0.010647    169       0.0   
2   617109    37    3967  01:17:25  0.041851  -0.191819    318  0.539957   
3  3167360   169   21840  10:56:42  0.012567  -0.248763    162   2.69978   
4  2835848   151   18882  15:16:08  0.016242   0.237891      6   4.85961   

   sure_tral  
0        0.0  
1        0.0  
2        0.0  
3        0.0  
4        0.0  
       id  ship  record      time  latitude  longitude course velocity
0  572626   230   33365  17:54:36 -0.026813  -0.058809     99   3.7797
1   18879   206   29181  01:59:44 -0.059285   0.014964    354   7.5594
2   28406   208   29360  07:01:31  0.017806  -0.033742    331      0.0
3  347040   221   31573  06:42:57 -0.015593  -0.122373     61  2.69978
4  492346   229   32804  00:18:12 

In [43]:
# Сплитуем (разъединяем) данные для последующего обучения 
tr_data, te_data = train_test_split(train_data, 
                                     test_size=TEST_SIZE, 
                                     stratify=train_data['sure_tral'], 
                                     random_state=RANDOM_STATE)
print('Data splitted. Parts sizes: tr_data = {}, te_data = {}'.format(tr_data.shape, te_data.shape))

Data splitted. Parts sizes: tr_data = (1210389, 9), te_data = (302598, 9)


In [23]:
#========= AutoML preset usage =========
# Шаг 1. Создаем Task (задачу для AutoML)
%time
def acc_score(y_true, y_pred, **kwargs):
    return accuracy_score(y_true, (y_pred > 0.5).astype(int), **kwargs)

def f1_metric(y_true, y_pred, **kwargs):
    return f1_score(y_true, (y_pred > 0.5).astype(int), **kwargs)

task = Task('binary', metric = f1_metric)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


In [24]:
# Шаг 2. Настроим roles для столбцов
%time

roles = {
    'target': 'sure_tral',
    'drop': ['Id', 'ship','record','time','course'],
}


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.39 µs


In [25]:
# Шаг 3. Создадим AutoML из нашего пресета и обучим его на 80% данныъх
%time
automl = TabularAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'n_jobs': N_THREADS})
oof_pred = automl.fit_predict(tr_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.68 µs
Start automl preset with listed constraints:
- time: 600 seconds
- cpus: 4 cores
- memory: 16 gb

Train data shape: (1210389, 9)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 593.7803699970245 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = 0.0
Linear model: C = 5e-05 score = 0.3678551216751556
Linear model: C = 0.0001 score = 0.4584040747028862
Linear model: C = 0.0005 score = 0.5926551287463065
Linear model: C = 0.001 score = 0.5926551287463065
Linear model: C = 0.005 score = 0.613156146179402
Linear model: C = 0.01 score = 0.613156146179402
Linear model: C = 0.05 score = 0.613156146179402

===== Start working with fold 1 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = 0.0
Linear model: C = 5e-05 score = 0.3601751380163716
Linear model: C = 0.

Time limit exceeded after calculating fold 3


Linear model: C = 0.005 score = 0.5951750104587923
Linear model: C = 0.01 score = 0.5951750104587923
Lvl_0_Pipe_0_Mod_0_LinearL2 fitting and predicting completed
Time left 519.944632768631
Start fitting Selector_LightGBM ...

===== Start working with fold 0 for Selector_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's binary_logloss: 0.0150894	valid's Opt metric: 0.789336
[200]	valid's binary_logloss: 0.01419	valid's Opt metric: 0.79885
[300]	valid's binary_logloss: 0.0140076	valid's Opt metric: 0.80145
[400]	valid's binary_logloss: 0.0140429	valid's Opt metric: 0.803209
Early stopping, best iteration is:
[306]	valid's binary_logloss: 0.0140064	valid's Opt metric: 0.802052
Selector_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_0_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_0_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's binary_logloss: 0.0150166	vali

Time limit exceeded after calculating fold 1


Lvl_0_Pipe_1_Mod_0_LightGBM fitting and predicting completed
Optuna may run 1 secs
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's binary_logloss: 0.0148679	valid's Opt metric: 0.78603
[200]	valid's binary_logloss: 0.0142384	valid's Opt metric: 0.790709
[300]	valid's binary_logloss: 0.0141364	valid's Opt metric: 0.790743
Early stopping, best iteration is:
[229]	valid's binary_logloss: 0.0141817	valid's Opt metric: 0.792917
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Time left 253.15051436424255


Time limit exceeded in one of the tasks. AutoML will blend level 1 models.


Blending: Optimization starts with equal weights and score 0.6819732962263526
Blending, iter 0: score = 0.6983006535947711, weights = [0.20671381 0.7932862 ]
Blending, iter 1: score = 0.6983006535947711, weights = [0.20671381 0.7932862 ]
No score update. Terminated

Automl preset training completed in 360.24 seconds.
oof_pred:
array([[          nan],
       [          nan],
       [2.9050602e-04],
       [8.1035614e-05],
       [2.2604773e-03],
       [          nan],
       [4.6094523e-05],
       [4.7697977e-05],
       [2.6217432e-04],
       [3.9852981e-04]], dtype=float32)
Shape = (1210389, 1)


In [27]:
# Шаг 4. AutoML прогнозирует возможные значения данных и проверяет очки за угаданные значения
%time

test_pred = automl.predict(te_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(acc_score(tr_data['sure_tral'].values, oof_pred.data[:, 0])))
print('TEST score: {}'.format(acc_score(te_data['sure_tral'].values, test_pred.data[:, 0])))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.78 µs
Prediction for test data:
array([[9.6252370e-05],
       [1.5639130e-04],
       [4.9821916e-05],
       [1.7419767e-04],
       [6.8756890e-05],
       [1.5291911e-04],
       [3.1426014e-05],
       [8.4796789e-05],
       [1.7023404e-04],
       [2.8437243e-03]], dtype=float32)
Shape = (302598, 1)
Check scores...
OOF score: 0.989061367874295
TEST score: 0.9936053774314437


In [28]:
# Шаг 5. Создание AutoML с тратой (списанием) времени

# Ниже мы собираемся создать специальную предустановку AutoML для
# TIMEOUT utilization (постарайтесь потратить его как можно больше):
%time

automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'n_jobs': N_THREADS})
oof_pred = automl.fit_predict(tr_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.2 µs
Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found reader_params in kwargs, need to combine
Merged variant for reader_params = {'n_jobs': 4, 'random_state': 42}
Found general_params in kwargs, need to combine
Merged variant for general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']], 'return_all_predictions': False}
Start automl preset with listed constraints:
- time: 599.9962313175201 seconds
- cpus: 4 cores
- memory: 16 gb

Train data shape: (1210389, 9)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 594.2381567955017 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = 0.0
Linear model: C = 5e-05 score = 0.36761653142102285
Linear model: C = 0.0001 score = 0.4563255576366423
Linear model: C =

Time limit exceeded after calculating fold 1


Lvl_0_Pipe_1_Mod_0_LightGBM fitting and predicting completed
Optuna may run 1 secs
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's binary_logloss: 0.0148679	valid's Opt metric: 0.78603
[200]	valid's binary_logloss: 0.0142384	valid's Opt metric: 0.790709
[300]	valid's binary_logloss: 0.0141364	valid's Opt metric: 0.790743
Early stopping, best iteration is:
[229]	valid's binary_logloss: 0.0141817	valid's Opt metric: 0.792917
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 100 rounds
[100]	valid's binary_logloss: 0.0148679	valid's Opt metric: 0.78603
[200]	valid's binary_logloss: 0.0142384	valid's Opt metric: 0.790709
[300]	valid's binary_logloss: 0.0141364	valid's O

Time limit exceeded in one of the tasks. AutoML will blend level 1 models.


Blending: Optimization starts with equal weights and score 0.7704103726723895
Blending, iter 0: score = 0.7925484351713862, weights = [0.         0.622669   0.37733102]
Blending, iter 1: score = 0.7925484351713862, weights = [0.         0.622669   0.37733102]
No score update. Terminated

Automl preset training completed in 619.06 seconds.
oof_pred:
array([[4.7291387e-06],
       [2.8522769e-07],
       [1.5875413e-06],
       [1.1841981e-05],
       [2.1065793e-04],
       [4.5397387e-06],
       [6.4964752e-06],
       [7.5397711e-06],
       [3.3058291e-06],
       [4.0276664e-06]], dtype=float32)
Shape = (1210389, 1)


In [29]:
# Шаг 6. AutoML прогнозирует возможные значения данных и проверяет очки за угаданные значения для утилизированного в AutoML
%time

test_pred = automl.predict(te_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(acc_score(tr_data['sure_tral'].values, oof_pred.data[:, 0])))
print('TEST score: {}'.format(acc_score(te_data['sure_tral'].values, test_pred.data[:, 0])))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.7 µs
Prediction for test data:
array([[1.1432840e-05],
       [9.3784611e-06],
       [4.8733309e-06],
       [7.0416031e-06],
       [1.3425801e-05],
       [7.2741250e-06],
       [1.0002145e-05],
       [3.8803009e-06],
       [4.7802416e-05],
       [2.0702663e-03]], dtype=float32)
Shape = (302598, 1)
Check scores...
OOF score: 0.9930997390095251
TEST score: 0.9939325441675094


In [30]:
# Шаг 7. Обучение на полной выборке данных
%time

automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'n_jobs': N_THREADS})
oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.39 µs
Current random state: {'reader_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
Found reader_params in kwargs, need to combine
Merged variant for reader_params = {'n_jobs': 4, 'random_state': 42}
Found general_params in kwargs, need to combine
Merged variant for general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']], 'return_all_predictions': False}
Start automl preset with listed constraints:
- time: 599.9960525035858 seconds
- cpus: 4 cores
- memory: 16 gb

Train data shape: (1512987, 9)
Feats was rejected during automatic roles guess: []


Layer 1 ...
Train process start. Time left 593.6752281188965 secs
Start fitting Lvl_0_Pipe_0_Mod_0_LinearL2 ...

===== Start working with fold 0 for Lvl_0_Pipe_0_Mod_0_LinearL2 =====

Linear model: C = 1e-05 score = 0.0
Linear model: C = 5e-05 score = 0.3956238911886458
Linear model: C = 0.0001 score = 0.4807280513918629
Linear model: C =

Time limit exceeded after calculating fold 1
Time limit exceeded in one of the tasks. AutoML will blend level 1 models.


Blending: Optimization starts with equal weights and score 0.6770912467291298
Blending, iter 0: score = 0.6898777669033589, weights = [0.09016994 0.90983003]
Blending, iter 1: score = 0.6898777669033589, weights = [0.09016994 0.90983003]
No score update. Terminated

Automl preset training completed in 320.27 seconds.
oof_pred:
array([[3.0727134e-04],
       [2.0401808e-03],
       [1.0030456e-04],
       [1.0422960e-04],
       [3.5085561e-04],
       [7.1551913e-04],
       [1.9997209e-05],
       [3.2554708e-05],
       [2.6142102e-04],
       [2.2357008e-05]], dtype=float32)
Shape = (1512987, 1)


In [44]:
#Step 8. Прогнозирование для данных test.csv и проверка OOF 
%time





test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
print('OOF score: {}'.format(acc_score(train_data['sure_tral'].values, oof_pred.data[:, 0])))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.39 µs
Prediction for test data:
array([[2.3585633e-05],
       [2.6126983e-05],
       [1.0404695e-03],
       [5.1437393e-05],
       [5.6716468e-05],
       [6.4192762e-05],
       [6.3721782e-05],
       [1.1669561e-04],
       [4.5553406e-05],
       [4.9908250e-04]], dtype=float32)
Shape = (696209, 1)
Check scores...
OOF score: 0.9901565578554211


In [45]:
#Step 9. Подготовка к окончанию, результат прогноза в .csv
submission['sure_tral'] = (test_pred.data[:, 0] > 0.5).astype(int)
submission.to_csv('_automl_utilized_600_f1_score.csv', index = False)
print(submission)

ValueError: Length of values (696209) does not match length of index (812804)