In [2]:
import logging
import os
import time
import requests
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import torch

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

In [3]:
# --- Run configuration -------------------------------------------------------
# Thread count for the LightGBM and linear models
N_THREADS = 4
# Number of folds AutoML uses
N_FOLDS = 5
# Fixed random state, reused wherever a seed is needed
RANDOM_STATE = 42
# Test size for a metric check
TEST_SIZE = 0.2
# Time budget for the AutoML run, in seconds (30 minutes)
TIMEOUT = 30 * 60
# Name of the target column
TARGET_NAME = 'final_price'

In [4]:
# Read the three input tables from the working directory, in this order:
# the training table, the test table and the sample-submission table.
train_data, test_data, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv', 'sample_submission.csv')
)

In [7]:
def create_extra_features(data):
    """Add a per-row missing-value counter to ``data`` in place.

    Writes the column ``NANs_cnt``: for every row, the number of cells that
    are null at the moment of the call. Nothing is returned.
    """
    missing_mask = data.isna()
    data['NANs_cnt'] = missing_mask.sum(axis = 'columns')
    
def create_col_with_min_freq(data, col, min_freq = 10):
    """Add ``<col>_fixed``: a string copy of ``col`` with rare levels pooled.

    The column is cast to ``str``; every level that occurs in fewer than
    ``min_freq`` rows is replaced by the literal ``"RARE_VALUE"``; finally the
    text ``'nan'`` (what the string cast produces for missing values) is
    turned back into a real NaN. ``data`` is modified in place and nothing is
    returned.

    Only the new ``<col>_fixed`` column is written. Other columns are never
    touched, so a literal ``'nan'`` string elsewhere in the frame survives.

    Args:
        data: DataFrame to extend.
        col: name of the source column.
        min_freq: minimum row count a level needs to keep its own value.
    """
    fixed_col = col + '_fixed'
    levels = data[col].astype(str)

    # Per-row frequency of the row's level. map() yields NaN for a label it
    # cannot look up, and NaN < min_freq is False, so such rows are left alone
    # instead of raising a KeyError.
    level_freq = levels.map(levels.value_counts())
    levels = levels.mask((level_freq < min_freq).to_numpy(), "RARE_VALUE")

    # Restore missing values, restricted to this one column rather than
    # rewriting the whole frame.
    data[fixed_col] = levels.mask((levels == 'nan').to_numpy(), np.nan)

def create_gr_feats(data):
    """Add group-aggregate and value-count features to ``data`` in place.

    For every categorical column listed below, a rare-level-pooled copy
    ``<cat>_fixed`` is created via ``create_col_with_min_freq`` (threshold 15).
    Each numeric column is then aggregated within the groups of that copy
    (mean, min, max, median) and broadcast back to the rows as
    ``FIXED_<agg>_<num>_by_<cat>``.

    Afterwards a ``<col>_cnt`` column is added for each listed column, holding
    how often the row's value occurs in that column (missing values counted
    as a level of their own). Nothing is returned.
    """
    cat_cols = ['vehicle_manufacturer', 'vehicle_model', 'vehicle_category',
                'vehicle_gearbox_type', 'doors_cnt', 'wheels', 'vehicle_color',
                'vehicle_interior_color', 'deal_type']
    num_cols = ['current_mileage', 'vehicle_year', 'car_leather_interior']
    # Reductions are named as strings on purpose. pandas used to swap NumPy
    # callables (np.mean, np.median, ...) for its own NaN-skipping group
    # reductions; that substitution is deprecated, and calling np.median
    # directly would give NaN for any group containing a missing value.
    agg_names = ['mean', 'min', 'max', 'median']

    # create aggregation feats for numeric features based on categorical ones
    for cat_col in cat_cols:
        create_col_with_min_freq(data, cat_col, 15)
        for num_col in num_cols:
            # One grouping per (cat, num) pair, shared by the four reductions.
            grouped = data.groupby(cat_col + '_fixed')[num_col]
            for agg in agg_names:
                data['FIXED_' + agg + '_' + num_col + '_by_' + cat_col] = grouped.transform(agg)

    # create features with counts
    cnt_cols = ['vehicle_manufacturer', 'vehicle_model', 'vehicle_category',
                'current_mileage', 'vehicle_year', 'vehicle_gearbox_type', 'doors_cnt',
                'wheels', 'vehicle_color', 'vehicle_interior_color', 'car_vin', 'deal_type']
    for col in cnt_cols:
        data[col + '_cnt'] = data[col].map(data[col].value_counts(dropna = False))
    
        

# Count missing cells per row separately for each frame, before they are merged.
for frame in (train_data, test_data):
    create_extra_features(frame)

# Group aggregates and value counts are computed over train and test stacked
# together; the stacked frame is then cut back at the original train length.
n_train = len(train_data)
all_df = pd.concat([train_data, test_data], ignore_index = True)
create_gr_feats(all_df)
train_data = all_df.iloc[:n_train]
test_data = all_df.iloc[n_train:]
print(train_data.shape, test_data.shape)

(35000, 145) (10697, 145)


In [8]:
# Problem definition handed to AutoML: a 'reg' task that uses 'mae' both as
# the training loss and as the reported metric.
task = Task('reg', loss='mae', metric='mae')

sklearn doesn't support in general case mae and will not be used.


In [9]:
# Column roles passed to AutoML: the target column and the columns to leave
# out. Whether dropping the 'row_ID' column actually helps is still undecided.
roles = dict(
    target = TARGET_NAME,
    drop = ['row_ID'],
)

In [13]:
# Configure the AutoML preset with the time budget and CPU limit from the
# configuration cell.
# - general_params['use_algos'] holds two lists of algorithm keys
#   (presumably one list per stacking level; TODO confirm against LightAutoML docs).
# - cb_params requests task_type "GPU" for the 'cb' models.
# - reader_params passes the thread count, fold count and random state.
# NOTE(review): thread_count is hard-coded to 100 while cpu_limit and n_jobs
# use N_THREADS; confirm this mismatch is intentional.
automl = TabularUtilizedAutoML(task = task, 
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned', 'cb', 'cb_tuned'], ['lgb', 'linear_l2', 'lgb_tuned']]},
                       cb_params = {'default_params': {'task_type':"GPU", 'thread_count':100}},
                       reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                      )

In [14]:
# Fit AutoML on the training frame with the roles defined above. The returned
# predictions (named as out-of-fold predictions) are logged with their shape
# and reused later for the score check.
oof_pred = automl.fit_predict(train_data, roles = roles)
logging.info('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

:
[284]	valid's l1: 2360.13
Lvl_1_Pipe_1_Mod_1_LightGBM fitting and predicting completed
[2021-05-16 20:07:46,259] (INFO): Trial 42 finished with value: -2360.132503605417 and parameters: {'feature_fraction': 0.9692519455114422, 'num_leaves': 71, 'bagging_fraction': 0.8774062604230195, 'min_sum_hessian_in_leaf': 0.008401804267250273, 'reg_alpha': 2.5058435618277523e-08, 'reg_lambda': 0.026484441895812262}. Best is trial 26 with value: -2341.751348306656.
Start fitting Lvl_1_Pipe_1_Mod_1_LightGBM ...

===== Start working with fold 0 for Lvl_1_Pipe_1_Mod_1_LightGBM =====

Training until validation scores don't improve for 200 rounds
[100]	valid's l1: 2396.11
[200]	valid's l1: 2354.09
[300]	valid's l1: 2353.18
[400]	valid's l1: 2351.74
[500]	valid's l1: 2351.63
[600]	valid's l1: 2351.81
[700]	valid's l1: 2351.48
[800]	valid's l1: 2350.83
[900]	valid's l1: 2351.24
[1000]	valid's l1: 2351.71
Early stopping, best iteration is:
[805]	valid's l1: 2350.56
Lvl_1_Pipe_1_Mod_1_LightGBM fitting and

In [15]:
# Feature scores from the fitted AutoML, requested in 'fast' mode.
fast_fi = automl.get_feature_scores('fast')
# Optional bar chart of the scores, left disabled:
# fast_fi.set_index('Feature')['Importance'].plot.bar(figsize = (20, 10), grid = True)

In [16]:
# fast_fi

In [21]:
# Predict on the test frame and log the predictions together with their shape.
test_pred = automl.predict(test_data)
prediction_report = 'Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape)
logging.info(prediction_report)

# Score the predictions returned by fit_predict against the training target.
logging.info('Check scores...')
oof_mae = mean_absolute_error(train_data[TARGET_NAME].values, oof_pred.data[:, 0])
logging.info('OOF score: {}'.format(oof_mae))

[2021-05-16 21:09:05,600] (INFO): Prediction for test data:
array([[ 2886.2434],
       [ 5908.5435],
       [ 2444.8494],
       ...,
       [16929.654 ],
       [ 5282.768 ],
       [ 6292.7456]], dtype=float32)
Shape = (10697, 1)
[2021-05-16 21:09:05,601] (INFO): Check scores...
[2021-05-16 21:09:05,602] (INFO): OOF score: 2242.3088047814613


In [24]:
# Write the first column of the test predictions into the submission frame's
# target column, then preview the first rows.
submission[TARGET_NAME] = test_pred.data[:, 0]
submission.head()

Unnamed: 0,row_ID,final_price
0,35000,2886.243408
1,35001,5908.543457
2,35002,2444.849365
3,35003,6914.652832
4,35004,3736.633545


In [25]:
# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a fresh checkout has no 'submissions' directory).
os.makedirs('submissions', exist_ok = True)
submission.to_csv('submissions/submission_5.csv', index = False)

In [None]:
# Baseline probe: overwrite every value of 'final_price' with zero.
# Note: this mutates the same `submission` frame that was filled and saved earlier.
submission['final_price'] = 0

In [None]:
# Save the current `submission` frame as the all-zeros file (no index column).
submission.to_csv('submissions/submission_all_zeros.csv', index = False)

In [None]:
# Load a previously saved submission file and display the mean of its
# 'final_price' column.
submission4 = pd.read_csv('submissions/submission_4.csv')
np.mean(submission4['final_price'])

In [None]:
# Shift all predictions by a constant so that their mean becomes 5454.77075,
# the value recorded in this notebook for the all-zeros submission.
submission4['final_price'] = submission4['final_price'] - np.mean(submission4['final_price']) + 5454.77075

In [None]:
# Save the mean-shifted predictions as a separate submission file (no index column).
submission4.to_csv('submissions/submission_4_mae_hack.csv', index = False)

In [None]:
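# Note: the all-zeros submission scores 5454.77075 (presumably the leaderboard MAE, which for all-zero predictions equals the mean absolute target of the scored rows).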