In [47]:
import logging
import os
import time
import requests
# Configure the root logger: INFO level, timestamped "[time] (LEVEL): message" lines.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] (%(levelname)s): %(message)s',
)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

In [48]:
# Notebook-wide configuration: everything a reader might want to tune lives here.
N_THREADS = 4  # threads cnt for lgbm and linear models
N_FOLDS = 5  # folds cnt for AutoML
RANDOM_STATE = 42  # seed used for the AutoML reader and for NumPy's global RNG
TEST_SIZE = 0.2  # Test size for metric check
TIMEOUT = 600  # Time in seconds for automl run
TARGET_NAME = 'final_price'  # Target column name

# Seed NumPy's global generator up front so that any NumPy-based randomness in
# later cells starts from the same state on every Restart & Run All.
np.random.seed(RANDOM_STATE)

In [49]:
# Read the train, test and sample-submission CSV files from the working directory.
train_data, test_data, submission = (
    pd.read_csv('train_data.csv'),
    pd.read_csv('test_data.csv'),
    pd.read_csv('sample_submission.csv'),
)

In [50]:
def create_expert_feats(data):
    """Placeholder hook for hand-crafted features; currently does nothing.

    Args:
        data: Dataset the features would be built for. It is neither read
            nor modified by the current implementation.

    Returns:
        None.
    """
    return None

# Run the feature hook on both datasets; the return value is not used here.
create_expert_feats(train_data)
create_expert_feats(test_data)

In [51]:
# 'reg' task with 'mae' used both as the training loss and as the metric.
task = Task(
    'reg',
    loss='mae',
    metric='mae',
)

sklearn doesn't support in general case mae and will not be used.


In [52]:
# Column roles handed to the AutoML fit: the target column and the columns to drop.
roles = dict(
    target=TARGET_NAME,
    drop=['row_ID'],  # to drop or not to drop?
)

In [53]:
# AutoML preset bounded by TIMEOUT seconds and N_THREADS CPUs, restricted to
# the algorithms listed under 'use_algos'; the reader does N_FOLDS-fold CV.
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    general_params={
        'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']],
    },
    reader_params={
        'n_jobs': N_THREADS,
        'cv': N_FOLDS,
        'random_state': RANDOM_STATE,
    },
)

In [54]:
# Fit on the training data and keep the out-of-fold predictions, then log
# them together with their shape.
oof_pred = automl.fit_predict(train_data, roles=roles)
logging.info(
    'oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape)
)

93
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
[2021-05-16 08:07:13,473] (INFO): Trial 48 finished with value: -2542.934914622358 and parameters: {'feature_fraction': 0.8820454371237115, 'num_leaves': 192, 'bagging_fraction': 0.9666452831759725, 'min_sum_hessian_in_leaf': 0.3418221858888605, 'reg_alpha': 1.6222917424639427e-08, 'reg_lambda': 5.648198888942089e-08}. Best is trial 32 with value: -2528.967141716753.
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's l1: 2675.84
[200]	valid's l1: 2573.51
[300]	valid's l1: 2560.46
[400]	valid's l1: 2555.74
[500]	valid's l1: 2552.71
[600]	valid's l1: 2550.72
[700]	valid's l1: 2550.39
[800]	valid's l1: 2549.3
[900]	valid's l1: 2549.38
[1000]	valid's l1: 2548.95
Early stopping, best iteration is:
[853]	valid's l1: 2548.7
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
[2021

In [57]:
# Feature scores from the fitted AutoML, computed with the 'fast' method.
fast_fi = automl.get_feature_scores('fast')
# Optional bar chart of the scores, currently disabled:
# fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (20, 10), grid = True)

In [58]:
# fast_fi

In [59]:
# Predict on the test data and log the predictions with their shape.
test_pred = automl.predict(test_data)
logging.info(
    'Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape)
)

# Report the MAE of the out-of-fold predictions against the training target.
logging.info('Check scores...')
logging.info(
    'OOF score: {}'.format(
        mean_absolute_error(
            train_data[TARGET_NAME].values,
            oof_pred.data[:, 0],
        )
    )
)

[2021-05-16 08:14:58,178] (INFO): Prediction for test data:
array([[ 2781.8127],
       [ 6090.3853],
       [ 3427.1184],
       ...,
       [15289.261 ],
       [ 5421.962 ],
       [ 6472.4697]], dtype=float32)
Shape = (10697, 1)
[2021-05-16 08:14:58,179] (INFO): Check scores...
[2021-05-16 08:14:58,180] (INFO): OOF score: 2379.4363534669264


In [60]:
# Copy the first prediction column into the submission frame's target column.
# NOTE(review): assumes submission rows are in the same order as test_data — confirm.
submission[TARGET_NAME] = test_pred.data[:, 0]
submission.head()  # preview the first rows of the filled-in submission

Unnamed: 0,row_ID,final_price
0,35000,2781.812744
1,35001,6090.385254
2,35002,3427.118408
3,35003,6745.902832
4,35004,4674.939453


In [61]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
os.makedirs('submissions', exist_ok=True)
# Write the submission without the DataFrame index column.
submission.to_csv('submissions/submission_2.csv', index=False)