In [3]:
# Standard python libraries
import os
import time

# Essential DS libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import torch

# # LightAutoML presets, task and report generation
# from lightautoml.automl.presets.tabular_presets import TabularAutoML
# from lightautoml.tasks import Task

In [4]:
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 42
TEST_SIZE = 0.2
TIMEOUT = 1500 # equal to 25 minutes
TARGET_NAME = 'final_price'

In [5]:
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [67]:
INPUT_DIR = './Kaggle_0/'

In [196]:
train_data = pd.read_csv(INPUT_DIR + 'train_data.csv')
print(train_data.shape)
train_data.head(3)

(35000, 15)


Unnamed: 0,row_ID,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_vin,car_leather_interior,deal_type,final_price
0,0,TOYOTA,Aqua s,Sedan,133000,2014,Automatic,4/5,Right-hand drive,Silver,Black,,0,For Sale,3650.0
1,1,MERCEDES-BENZ,C 220,Sedan,24500,2010,Manual,4/5,Left wheel,Silver,Black,,0,For Sale,6800.0
2,2,HYUNDAI,Veloster,Hatchback,31000,2016,Tiptronic,2/3,Left wheel,Silver,Black,KMHTC6AE3GU293912,1,For Sale,6300.0


In [8]:
test_data = pd.read_csv(INPUT_DIR + 'test_data.csv')
print(test_data.shape)
test_data.head(3)

(10697, 14)


Unnamed: 0,row_ID,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_vin,car_leather_interior,deal_type
0,35000,TOYOTA,Prius,Hatchback,323733,2012,Automatic,4/5,Left wheel,Grey,Black,JTDKN3DU6C5439638,1,For Sale
1,35001,HYUNDAI,Elantra,Sedan,112000,2013,Tiptronic,4/5,Left wheel,Grey,Black,SURATSHIA,1,For Sale
2,35002,LEXUS,NX 300,Jeep,16920,2018,Automatic,,Left wheel,Brown,,JTJYARBZ5J2104521,1,For Sale


In [9]:
submission = pd.read_csv(INPUT_DIR + 'sample_submission.csv')
print(submission.shape)
submission.head(3)

(10697, 2)


Unnamed: 0,row_ID,final_price
0,35000,0
1,35001,0
2,35002,0


In [208]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35000 entries, 0 to 34999
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   row_ID                  35000 non-null  int64  
 1   vehicle_manufacturer    34999 non-null  object 
 2   vehicle_model           34993 non-null  object 
 3   vehicle_category        34999 non-null  object 
 4   current_mileage         35000 non-null  int64  
 5   vehicle_year            35000 non-null  int64  
 6   vehicle_gearbox_type    34999 non-null  object 
 7   doors_cnt               34255 non-null  object 
 8   wheels                  34999 non-null  object 
 9   vehicle_color           34599 non-null  object 
 10  vehicle_interior_color  28282 non-null  object 
 11  car_vin                 35000 non-null  object 
 12  car_leather_interior    35000 non-null  int64  
 13  deal_type               35000 non-null  object 
 14  final_price             35000 non-null

In [198]:
train_data['car_vin'].value_counts()

 4T1BD1FK7EU114553     42
 3VW1K7AJ6EM438747     42
 4T1BD1FK6EU137614     41
 4T1BD1FK2CU027785     40
 4T1BD1FK4DU085902     40
                       ..
 5XXGN4A75EG263349      1
 2T3BFREV1JW759334      1
 4T1BF1FK1CU597366      1
 4T1BB46K19U081594      1
 3FA6P0K98DR308742      1
Name: car_vin, Length: 4288, dtype: int64

In [199]:
def append_wmi(wmis, vendor, car_vin):
    """ This function appends new element in dictionary
        if not exist

    Args:
        wmis (dictionary of the lists of the codes) like {'manufacturer": ['JTG', '1T4', ... 'JTD']} 
        vendor (text string): name of the vehicle manufacturer
        car_vin (text string): serial of the numbers and symbols (max 17)
    """
    if car_vin in ['nan', 'NaN']:
        return
    wmi = car_vin[:3]
    #print(wmi)
    if vendor not in wmis.keys():
        wmis.update({vendor: list()})
        wmis[vendor].append(wmi)
        #print(vins)
    elif wmi not in wmis[vendor]:
        wmis[vendor].append(wmi)
        #print(vins)
    return
    

In [200]:
def append_vdc(vdcs, model, car_vin):
    """ This function appends new element in dictionary
        if not exist

    Args:
        vdcs (dictionary of the lists of the codes) like {'model": ['XC', 'SD', ... 'SC']}
        model (text string): name of the vehicle model
        car_vin (text string): serial of the numbers and symbols (max 17)
    """
    if car_vin in ['nan', 'NaN'] or not len(model):
        return
    vdc = car_vin[3:5]
    #print(vin_model)
    if model not in vdcs.keys():
        vdcs.update({model: list()})
        vdcs[model].append(vdc)
        #print(vin_model)
    elif vdc not in vdcs[model]:
        vdcs[model].append(vdc)
        #print(vins)
    return

In [201]:
def get_vin_parts(df):
    """ This function collect existing vin-codes into dictionary
        for future implementations

    Args:
        df (pandas.DataFrame): The source data that we use for collecting
    """
    wmis_dict = dict()
    vdcs_dict = dict()
    
    for row in df.index:
        car_vin = str(df['car_vin'].iloc[row])
        if len(car_vin) > 3:
            car_vin = car_vin[1:-1]
            vendor = str(df['vehicle_manufacturer'].iloc[row])
            append_wmi(wmis_dict, vendor, car_vin)
            
            model = str(df['vehicle_model'].iloc[row])
            append_vdc(vdcs_dict, model, car_vin)
    return wmis_dict, vdcs_dict


In [202]:
wmis, vdcs = get_vin_parts(train_data)
display(wmis, vdcs)

{'HYUNDAI': ['KMH', '5NP', 'KM8', '5NM', '5XY', 'WWW', 'VIN', '123', 'SXY'],
 'TOYOTA': ['JTN',
  '4T1',
  'JTE',
  '3TM',
  'JTD',
  'NMT',
  '5TF',
  '4T4',
  '5TD',
  'JTM',
  '2T3',
  '111',
  '5YF',
  '5TE',
  'THB',
  '2T1',
  'NHP',
  '3MY',
  '000',
  '457',
  '123',
  'EUR',
  '4T3',
  'WDC',
  'AM5',
  'XAC',
  'SCP',
  'JTF',
  'VIN',
  'OV1',
  'JYD',
  '454',
  'TYG',
  'BV7',
  'JN8',
  'VNK'],
 'MERCEDES-BENZ': ['4JG',
  'WDC',
  'WDD',
  'WDB',
  '122',
  'VDD',
  '55S',
  'WDF',
  '111',
  '333',
  'OTA',
  'KPB',
  '222',
  '101',
  'WME',
  'WD3',
  'geo',
  'WDA',
  'FDG',
  '260',
  'IPI',
  'TT',
  '900',
  '123',
  'KAF',
  '171'],
 'NISSAN': ['JN8',
  '123',
  '1N4',
  '3N1',
  'Z8N',
  '111',
  '655',
  '245',
  'ZE0',
  '788',
  '599',
  '564',
  'GUV',
  '12',
  '5N1',
  'DD',
  '899',
  '598',
  '122',
  'VSK',
  'UHH',
  'NIS',
  '1N6'],
 'LEXUS': ['JTJ', 'JTH', '2T2', '58A'],
 'AUDI': ['WA1', 'WAU', 'geo', 'UN', 'WUA'],
 'BMW': ['5UX',
  'WBA',
  'WBX',
  

{'Veloster': ['TC', 'T3', 'TG', ':K', 'TH'],
 'CHR': ['KH'],
 'GLE 350': ['DA', 'FB', 'ED'],
 'Juke': ['AF', 'DF'],
 'GX 470': ['BT', 'JM', 'BM'],
 'Q5': ['VF',
  'AN',
  'D7',
  'LF',
  'DK',
  'WK',
  'L2',
  'C8',
  'DG',
  'BN',
  'C2',
  'CM'],
 'X5': ['ZW',
  'KS',
  'ZV',
  'KR',
  'JU',
  'FA',
  'FF',
  '4',
  'KT',
  'FB',
  'FE',
  '51',
  '43',
  'CR',
  'AT',
  'GY',
  'LS',
  '64',
  'HS'],
 'Camry': ['BD', 'B1', 'BF', 'BK', 'B3', 'B6', 'BB', 'B2', 'BE', 'BZ', 'AF'],
 'Highlander': ['DC',
  'EW',
  'BK',
  'ES',
  'JW',
  'JK',
  'ZK',
  'DK',
  'JZ',
  'BC',
  'YK',
  'DG'],
 'Jetta': ['26',
  'DX',
  'B0',
  '1K',
  '46',
  'LL',
  'DB',
  'DP',
  'L1',
  'D1',
  'LA',
  'C5',
  'SF',
  'LZ',
  '2K',
  '4A',
  'D6',
  'RZ',
  '3L',
  'DZ',
  'RM',
  'D0',
  '63',
  '2B',
  'B1',
  'DK',
  'SA',
  'KE'],
 'Volt': ['RH', 'RC', 'RG', 'RD', 'RB', 'RE', 'RF', 'RA'],
 'GLA 250': ['TG'],
 'CT 200h': ['KD'],
 'E 350': ['HF', 'BB', 'KK', 'KJ', 'UF', '21'],
 'GX 460': ['JM', 'BM'

In [205]:
def set_vin(wmis, vdcs, df):
    """ This function set vin-codes for empty cells
        using vendor and model information

    Args:
        wmis (dictionary of the lists of the codes) like {'manufacturer": ['JTG', '1T4', ... 'JTD']} 
        vdcs (dictionary of the lists of the codes) like {'model": ['XC', 'SD', ... 'SC']}
        df (pandas.DataFrame): The source dataframe

    Returns:
        pandas.DataFrame: output dataframe
    """
    for row in df.index:
        car_vin = str(df['car_vin'].iloc[row])
        
        if car_vin in ['nan', 'NaN']:
            vendor = str(df['vehicle_manufacturer'].iloc[row])
            model = str(df['vehicle_model'].iloc[row])
            
            if vendor in wmis.keys():
                car_vin = wmis[vendor][0]
            else:
                car_vin = 'nan'
            if model in vdcs.keys():
                car_vin += vdcs[model][0]
        else:
            car_vin = car_vin[1:6]
        df['car_vin'].iloc[row] = car_vin
                
    return df

In [206]:
train_data = set_vin(wmis, vdcs, train_data)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [207]:
train_data['car_vin'].value_counts()

KMHEC    1555
JTNKN    1298
KMHJU    1135
KMHD7    1069
JTDKN     909
         ... 
WOL00       1
WMWXS       1
JTDJT       1
ASTRA       1
1N6AD       1
Name: car_vin, Length: 1128, dtype: int64

In [11]:
%%time
import category_encoders as ce

def create_expert_feats(data):
    
    # 1 ++ 
    def modify_car_vin(car_vin):
        if car_vin != np.nan:
            car_vin = str(car_vin)
        else:
            return car_vin
        if len(car_vin) < 11:
            return car_vin
        else:
            return car_vin[:11]
    data['car_vin'] =  data['car_vin'].apply(modify_car_vin)
    
    # 2 +-0
    #создаем словарь имя столбца: число(признак) на который надо заменить пропуски
#     values = {
#         'doors_cnt': data['doors_cnt'].mode()[0],
#         'vehicle_color': data['vehicle_color'].mode()[0],
#         'vehicle_interior_color': data['vehicle_interior_color'].mode()[0]
#     }
#     #заполняем пропуски в соответствии с заявленным словарем
#     data = data.fillna(values)

    # 3 +++
    data['car_wearout_ratio'] = 1e6/((2022 - data['vehicle_year'])*data['current_mileage'])
    
    # 4
#     bin_encoder = ce.BinaryEncoder(cols=['vehicle_category'])
#     type_bin = bin_encoder.fit_transform(data['vehicle_category'])
#     data = pd.concat([data, type_bin], axis=1)
    
#     bin_encoder = ce.BinaryEncoder(cols=['vehicle_gearbox_type'])
#     type_bin = bin_encoder.fit_transform(data['vehicle_gearbox_type'])
#     data = pd.concat([data, type_bin], axis=1)

    display(data.head(3))
    return data
   
train_data = create_expert_feats(train_data)
test_data = create_expert_feats(test_data)

Unnamed: 0,row_ID,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_vin,car_leather_interior,deal_type,final_price,car_wearout_ratio
0,0,TOYOTA,Aqua s,Sedan,133000,2014,Automatic,4/5,Right-hand drive,Silver,Black,,0,For Sale,3650.0,0.93985
1,1,MERCEDES-BENZ,C 220,Sedan,24500,2010,Manual,4/5,Left wheel,Silver,Black,,0,For Sale,6800.0,3.401361
2,2,HYUNDAI,Veloster,Hatchback,31000,2016,Tiptronic,2/3,Left wheel,Silver,Black,KMHTC6AE3G,1,For Sale,6300.0,5.376344


Unnamed: 0,row_ID,vehicle_manufacturer,vehicle_model,vehicle_category,current_mileage,vehicle_year,vehicle_gearbox_type,doors_cnt,wheels,vehicle_color,vehicle_interior_color,car_vin,car_leather_interior,deal_type,car_wearout_ratio
0,35000,TOYOTA,Prius,Hatchback,323733,2012,Automatic,4/5,Left wheel,Grey,Black,JTDKN3DU6C,1,For Sale,0.308897
1,35001,HYUNDAI,Elantra,Sedan,112000,2013,Tiptronic,4/5,Left wheel,Grey,Black,SURATSHIA,1,For Sale,0.992063
2,35002,LEXUS,NX 300,Jeep,16920,2018,Automatic,,Left wheel,Brown,,JTJYARBZ5J,1,For Sale,14.775414


CPU times: user 72.6 ms, sys: 8.06 ms, total: 80.6 ms
Wall time: 113 ms


In [None]:
tr_data, te_data = train_test_split(
    train_data, 
    test_size=TEST_SIZE, 
    random_state=RANDOM_STATE
)

print(f'Data splitted. Parts sizes: tr_data = {tr_data.shape}, te_data = {te_data.shape}')

tr_data.head()

In [None]:
task = Task('reg', loss = 'mae', metric = 'mae')

In [None]:
roles = {
    'target': TARGET_NAME,
    'drop': ['row_ID']
}