In [1]:
import os
import numpy as np
import pandas as pd
from itertools import chain
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from config import preprocessing as preprocessing_cfg

import warnings
warnings.filterwarnings('ignore')

In [2]:
data_path = './data'

In [3]:
train = pd.read_csv(os.path.join(data_path, 'train.csv'), sep=';')
validation = pd.read_csv(os.path.join(data_path, 'validation.csv'), sep=';')
test = pd.read_csv(os.path.join(data_path, 'test.csv'), sep=';')

train.head()

Unnamed: 0,cena,typ_sprzedawcy,marka_pojazdu,model_pojazdu,przebieg,pojemnosc_skokowa,rodzaj_paliwa,moc,skrzynia_biegow,typ_nadwozia,...,hamulce_z_kompozytow_ceramicznych,opony_off-road,nowy_pojazd,gwarancja_dealerska,brak_informacji_o_wyposazeniu,wiek_pojazdu,gwarancja_producenta,wojewodztwo,liczba_generacji_modelu,ktora_generacja_modelu
0,106900.0,Autoryzowany Dealer,Suzuki,Vitara,4.0,1.4,Hybryda,129.0,Manualna,SUV,...,0,0,1,0.0,0,1,0.0,lubelskie,4,0
1,63900.0,Autoryzowany Dealer,Hyundai,I30,60811.0,1.5,Benzyna,110.0,Manualna,Kompakt,...,0,0,0,0.0,0,3,0.0,łódzkie,3,0
2,129900.0,Dealer,BMW,Seria 6,116000.0,4.4,Benzyna,407.0,Automatyczna,Coupe,...,0,0,0,0.0,0,11,0.0,śląskie,4,1
3,9777.0,Osoba prywatna,Citroën,C4,316000.0,1.6,Benzyna+LPG,110.0,Manualna,Kompakt,...,0,0,0,0.0,0,17,0.0,wielkopolskie,3,2
4,45900.0,Osoba prywatna,Opel,Corsa,67466.0,1.4,Benzyna,90.0,Manualna,Kompakt,...,0,0,0,0.0,0,5,0.0,kujawsko-pomorskie,6,1


In [4]:
dfs = [train, validation, test]

for df in dfs:
    for col in chain(preprocessing_cfg.CAR_EQUIPMENT_COLS, preprocessing_cfg.BINARY_COLS):
        df[col] = df[col].astype(np.uint16)

### Data preprocessing

Check for missing values

In [5]:
missing_values = train.isnull().sum()
missing_values[missing_values > 0]

przebieg                953
pojemnosc_skokowa      2299
moc                      89
skrzynia_biegow          93
liczba_drzwi            876
liczba_miejsc          6603
spalanie_w_miescie    57401
rodzaj_koloru         40308
naped                 22450
kraj_pochodzenia      47777
dtype: int64

Check for categorical features

In [6]:
train.select_dtypes(include='object').columns.tolist()

['typ_sprzedawcy',
 'marka_pojazdu',
 'model_pojazdu',
 'rodzaj_paliwa',
 'skrzynia_biegow',
 'typ_nadwozia',
 'kolor',
 'rodzaj_koloru',
 'naped',
 'kraj_pochodzenia',
 'wojewodztwo']

Preprocessing for SPLIT 1 (without data standardization)

In [7]:
most_frequent_imputed_and_encoded_cols = [
    'skrzynia_biegow',
    'naped',
    'rodzaj_koloru'
]

median_imputed_cols = [
    'przebieg',
    'liczba_drzwi',
    'liczba_miejsc',
    'pojemnosc_skokowa',
    'moc',
    'spalanie_w_miescie'
]

other_encoded_cols = [
    'marka_pojazdu',
    'model_pojazdu',
    'typ_sprzedawcy',
    'kolor',
    'wojewodztwo',
    'typ_nadwozia',
    'rodzaj_paliwa'
]

In [8]:
transformer_1 = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=5, sparse_output=False, drop='if_binary', dtype=np.uint16))
])

transformer_2 = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Nieznany')),
    ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=5, sparse_output=False, drop='if_binary', dtype=np.uint16))
])

preprocessor_split_1 = ColumnTransformer([
    ('most_frequent_imputer_and_encoder', transformer_1, most_frequent_imputed_and_encoded_cols),
    ('origin_country_transformer', transformer_2, ['kraj_pochodzenia']),
    ('median_imputer', SimpleImputer(strategy='median'), median_imputed_cols),
    ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=5, sparse_output=False, drop='if_binary', dtype=np.uint16), other_encoded_cols)
], remainder='passthrough', verbose_feature_names_out=False)

train_split_1 = preprocessor_split_1.fit_transform(train)

train_split_1 = pd.DataFrame(data=train_split_1, columns=preprocessor_split_1.get_feature_names_out())
train_split_1.head()

Unnamed: 0,skrzynia_biegow_Manualna,naped_4x4 (dołączany automatycznie),naped_4x4 (dołączany ręcznie),naped_4x4 (stały),naped_Na przednie koła,naped_Na tylne koła,rodzaj_koloru_Matowy,rodzaj_koloru_Metalik,rodzaj_koloru_Perłowy,kraj_pochodzenia_Austria,...,fotele_tylne_z_funkcje_masazu,hamulce_z_kompozytow_ceramicznych,opony_off-road,nowy_pojazd,gwarancja_dealerska,brak_informacji_o_wyposazeniu,wiek_pojazdu,gwarancja_producenta,liczba_generacji_modelu,ktora_generacja_modelu
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,4.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,4.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,3.0,2.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,6.0,1.0


In [9]:
validation_split_1 = preprocessor_split_1.transform(validation)
test_split_1 = preprocessor_split_1.transform(test)

validation_split_1 = pd.DataFrame(data=validation_split_1, columns=preprocessor_split_1.get_feature_names_out())
test_split_1 = pd.DataFrame(data=test_split_1, columns=preprocessor_split_1.get_feature_names_out())

print(f'Wymiary zbioru treningowego: {train_split_1.shape}')
print(f'Wymiary zbioru walidacyjnego: {validation_split_1.shape}')
print(f'Wymiary zbioru testowego: {test_split_1.shape}')

Wymiary zbioru treningowego: (137298, 1168)
Wymiary zbioru walidacyjnego: (29421, 1168)
Wymiary zbioru testowego: (29421, 1168)


In [10]:
train_split_1.to_csv(os.path.join(data_path, 'train_split_1.csv'), index=False, sep=';')
validation_split_1.to_csv(os.path.join(data_path, 'validation_split_1.csv'), index=False, sep=';')
test_split_1.to_csv(os.path.join(data_path, 'test_split_1.csv'), index=False, sep=';')

del train_split_1, validation_split_1, test_split_1

Preprocessing for SPLIT 2 (with data standardization)

In [12]:
most_frequent_imputed_and_encoded_cols = [
    'skrzynia_biegow',
    'naped',
    'rodzaj_koloru'
]

median_imputed_and_scaled_cols = [
    'przebieg',
    'liczba_drzwi',
    'liczba_miejsc',
    'pojemnosc_skokowa',
    'moc',
    'spalanie_w_miescie',
    'wiek_pojazdu',
    'gwarancja_producenta',
    'gwarancja_dealerska'
]

other_encoded_cols = [
    'marka_pojazdu',
    'model_pojazdu',
    'typ_sprzedawcy',
    'kolor',
    'wojewodztwo',
    'typ_nadwozia',
    'rodzaj_paliwa'
]

In [13]:
transformer_1 = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=5, sparse_output=False, drop='if_binary', dtype=np.uint16))
])

transformer_2 = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Nieznany')),
    ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=5, sparse_output=False, drop='if_binary', dtype=np.uint16))
])

transformer_3 = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

preprocessor_split_2 = ColumnTransformer([
    ('most_frequent_imputer_and_encoder', transformer_1, most_frequent_imputed_and_encoded_cols),
    ('origin_country_transformer', transformer_2, ['kraj_pochodzenia']),
    ('median_imputer_and_scaler', transformer_3, median_imputed_and_scaled_cols),
    ('encoder', OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=5, sparse_output=False, drop='if_binary', dtype=np.uint16), other_encoded_cols)
], remainder='passthrough', verbose_feature_names_out=False)

train_split_2 = preprocessor_split_2.fit_transform(train)

train_split_2 = pd.DataFrame(data=train_split_2, columns=preprocessor_split_2.get_feature_names_out())
train_split_2.head()

Unnamed: 0,skrzynia_biegow_Manualna,naped_4x4 (dołączany automatycznie),naped_4x4 (dołączany ręcznie),naped_4x4 (stały),naped_Na przednie koła,naped_Na tylne koła,rodzaj_koloru_Matowy,rodzaj_koloru_Metalik,rodzaj_koloru_Perłowy,kraj_pochodzenia_Austria,...,hardtop,kierownica_po_prawej_anglik,orurowanie_przednie,fotele_tylne_z_funkcje_masazu,hamulce_z_kompozytow_ceramicznych,opony_off-road,nowy_pojazd,brak_informacji_o_wyposazeniu,liczba_generacji_modelu,ktora_generacja_modelu
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0


In [14]:
validation_split_2 = preprocessor_split_2.transform(validation)
test_split_2 = preprocessor_split_2.transform(test)

validation_split_2 = pd.DataFrame(data=validation_split_2, columns=preprocessor_split_2.get_feature_names_out())
test_split_2 = pd.DataFrame(data=test_split_2, columns=preprocessor_split_2.get_feature_names_out())

print(f'Wymiary zbioru treningowego: {train_split_2.shape}')
print(f'Wymiary zbioru walidacyjnego: {validation_split_2.shape}')
print(f'Wymiary zbioru testowego: {test_split_2.shape}')

Wymiary zbioru treningowego: (137298, 1168)
Wymiary zbioru walidacyjnego: (29421, 1168)
Wymiary zbioru testowego: (29421, 1168)


In [15]:
train_split_2.to_csv(os.path.join(data_path, 'train_split_2.csv'), index=False, sep=';')
validation_split_2.to_csv(os.path.join(data_path, 'validation_split_2.csv'), index=False, sep=';')
test_split_2.to_csv(os.path.join(data_path, 'test_split_2.csv'), index=False, sep=';')

del train_split_2, validation_split_2, test_split_2