In [69]:
import logging
import os
import time
import requests
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

In [76]:
# Run configuration shared by the cells below.
N_THREADS = 4 # thread/CPU budget: passed as `cpu_limit` and as the reader's `n_jobs`
N_FOLDS = 5 # number of cross-validation folds, passed to the reader as `cv`
RANDOM_STATE = 42 # seed passed to the reader as `random_state`
TEST_SIZE = 0.2 # test size for a metric check; NOTE(review): not referenced in the cells shown here
TIMEOUT = 600 # time budget in seconds, passed to AutoML as `timeout`
TARGET_NAME = 'final_price' # name of the target column

In [77]:
# Read the three input tables: training rows, rows to predict, and the submission template.
train_data, test_data, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('train_data.csv', 'test_data.csv', 'sample_submission.csv')
)

In [78]:
def create_extra_features(data):
    """Add a per-row missing-value count to ``data`` in place.

    The count is stored in a new column named ``NANs_cnt``. The frame is
    modified directly and nothing is returned.
    """
    missing_mask = data.isna()
    data['NANs_cnt'] = missing_mask.sum(axis=1)
    
def create_col_with_min_freq(data, col, min_freq = 10):
    """Add ``<col>_fixed``: a string copy of ``col`` with rare values collapsed.

    Every value that occurs in fewer than ``min_freq`` rows is replaced by the
    label ``"RARE_VALUE"``. Missing values, which the string conversion may
    render as the text ``'nan'``, are turned back into real NaN afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place. Only the new ``<col>_fixed`` column is written.
    col : str
        Name of the source column.
    min_freq : int, default 10
        Minimum number of rows a value needs in order to keep its own label.

    Returns
    -------
    None
    """
    as_text = data[col].astype(str)

    # Per-row frequency of the row's own value.
    freq = as_text.map(as_text.value_counts())
    as_text = as_text.mask(freq < min_freq, "RARE_VALUE")

    # Restore missing values in the new column only. A frame-wide replace
    # would also rewrite literal 'nan' strings in unrelated columns and would
    # rescan the whole frame on every call.
    data[col + '_fixed'] = as_text.mask(as_text == 'nan', np.nan)

def create_gr_feats(data):
    """Add group-aggregation and value-count features to ``data`` in place.

    For each categorical column a rare-value-collapsed copy ``<cat>_fixed`` is
    created (minimum frequency 15). Each numeric column is then aggregated
    within those groups, producing ``FIXED_<agg>_<num>_by_<cat>`` columns for
    ``agg`` in mean / min / max. Finally, ``<col>_cnt`` columns hold how many
    rows share each row's value, with missing values counted as their own value.

    All statistics are computed over whatever rows ``data`` contains.
    The frame is modified directly and nothing is returned.
    """
    cat_cols = ['vehicle_manufacturer', 'vehicle_model', 'vehicle_category',
                'vehicle_gearbox_type', 'doors_cnt', 'wheels', 'vehicle_color',
                'vehicle_interior_color', 'deal_type']
    num_cols = ['current_mileage', 'vehicle_year', 'car_leather_interior']
    # String names select pandas' built-in group reductions directly. Passing
    # np.mean / np.nanmin / np.nanmax relied on pandas silently swapping them
    # for these same reductions, a substitution that is deprecated.
    agg_names = ['mean', 'min', 'max']

    # create aggregation feats for numeric features based on categorical ones
    for cat_col in cat_cols:
        create_col_with_min_freq(data, cat_col, 15)
        for num_col in num_cols:
            # Build the grouped column once and reuse it for all three reductions.
            grouped_num = data.groupby(cat_col + '_fixed')[num_col]
            for agg in agg_names:
                data['FIXED_' + agg + '_' + num_col + '_by_' + cat_col] = grouped_num.transform(agg)

    # create features with counts
    count_cols = ['vehicle_manufacturer', 'vehicle_model', 'vehicle_category',
                  'current_mileage', 'vehicle_year', 'vehicle_gearbox_type', 'doors_cnt',
                  'wheels', 'vehicle_color', 'vehicle_interior_color', 'car_vin', 'deal_type']
    for col in count_cols:
        data[col + '_cnt'] = data[col].map(data[col].value_counts(dropna = False))
    
        

# Row-level features are added to each frame separately, before the frames are
# stacked, so the missing-value count only sees each frame's own columns.
for frame in (train_data, test_data):
    create_extra_features(frame)

# Group statistics and counts are computed over train and test together,
# then the combined frame is cut back at the original train length.
n_train = len(train_data)
all_df = pd.concat([train_data, test_data]).reset_index(drop = True)
create_gr_feats(all_df)
train_data = all_df[:n_train]
test_data = all_df[n_train:]
print(train_data.shape, test_data.shape)

(35000, 118) (10697, 118)


In [79]:
# Regression task ('reg') with 'mae' given as both the loss and the metric.
task = Task('reg', loss='mae', metric='mae')

sklearn doesn't support in general case mae and will not be used.


In [80]:
# Column roles passed to AutoML: which column is the target and which to discard.
roles = dict(
    target = TARGET_NAME,
    drop = ['row_ID'],  # to drop or not to drop?
)

In [81]:
# Build the AutoML preset from the notebook-level configuration constants.
automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    # NOTE(review): the nested list presumably lists algorithms per level; confirm against the LightAutoML docs.
    general_params={'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
    reader_params={'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
)

In [82]:
# Fit on the training frame and keep the predictions it returns, then log them with their shape.
oof_pred = automl.fit_predict(train_data, roles=roles)
logging.info(
    'oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape)
)

2377.0867245772906 and parameters: {'feature_fraction': 0.9984425384731308, 'num_leaves': 184, 'bagging_fraction': 0.7175028997728397, 'min_sum_hessian_in_leaf': 5.8348881212509305, 'reg_alpha': 6.934704918794148e-07, 'reg_lambda': 2.523795366811073e-07}. Best is trial 0 with value: -2374.662029323527.
Start fitting Lvl_0_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_0_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's l1: 2603.32
[200]	valid's l1: 2467.15
[300]	valid's l1: 2445.63
[400]	valid's l1: 2439.53
[500]	valid's l1: 2433.31
[600]	valid's l1: 2428.98
[700]	valid's l1: 2426.53
[800]	valid's l1: 2422.52
[900]	valid's l1: 2419.61
[1000]	valid's l1: 2416.2
[1100]	valid's l1: 2413.45
[1200]	valid's l1: 2411.47
Did not meet early stopping. Best iteration is:
[1196]	valid's l1: 2411.41
Lvl_0_Pipe_1_Mod_1_LightGBM fitting and predicting completed
[2021-05-16 08:23:30,668] (INFO): Trial 14 finished with value: -2

In [83]:
# Request feature scores from the fitted AutoML using the 'fast' mode.
fast_fi = automl.get_feature_scores('fast')
# Optional bar chart of the scores (currently disabled):
# fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (20, 10), grid = True)

In [86]:
# fast_fi

In [87]:
# Predict on the test frame and log the raw predictions with their shape.
test_pred = automl.predict(test_data)
prediction_report = 'Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape)
logging.info(prediction_report)

# Compare the training target with the first column of the fit-time predictions using MAE.
logging.info('Check scores...')
oof_mae = mean_absolute_error(train_data[TARGET_NAME].values, oof_pred.data[:, 0])
logging.info('OOF score: {}'.format(oof_mae))

[2021-05-16 08:35:28,150] (INFO): Prediction for test data:
array([[ 2787.2134],
       [ 5875.6006],
       [ 2455.132 ],
       ...,
       [16068.514 ],
       [ 5263.795 ],
       [ 6188.896 ]], dtype=float32)
Shape = (10697, 1)
[2021-05-16 08:35:28,151] (INFO): Check scores...
[2021-05-16 08:35:28,152] (INFO): OOF score: 2236.9500419658625


In [88]:
# Copy the first column of the test predictions into the submission's target column.
# The assignment is positional (a plain array, no index alignment).
# NOTE(review): assumes submission rows are in the same order as test_data; confirm.
submission[TARGET_NAME] = test_pred.data[:, 0]
# Show the first rows as a quick check.
submission.head()

Unnamed: 0,row_ID,final_price
0,35000,2787.213379
1,35001,5875.600586
2,35002,2455.13208
3,35003,6830.84668
4,35004,4655.22168


In [89]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise a fresh checkout fails at the very last cell.
os.makedirs('submissions', exist_ok = True)
submission.to_csv('submissions/submission_3.csv', index = False)