In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Location of the cleaned dataset: containing directory and CSV file name.
data_path, filename = './data', 'data_cleaned.csv'

In [3]:
# Read the semicolon-separated cleaned dataset and preview its first rows.
csv_path = os.path.join(data_path, filename)
df = pd.read_csv(csv_path, sep=';')
df.head()

Unnamed: 0,cena,typ_sprzedawcy,marka_pojazdu,model_pojazdu,przebieg,pojemnosc_skokowa,rodzaj_paliwa,moc,skrzynia_biegow,typ_nadwozia,...,hamulce_z_kompozytow_ceramicznych,opony_off-road,nowy_pojazd,gwarancja_dealerska,brak_informacji_o_wyposazeniu,wiek_pojazdu,gwarancja_producenta,wojewodztwo,liczba_generacji_modelu,ktora_generacja_modelu
0,23200.0,Dealer,Volvo,V70,304000.0,1.6,Diesel,109.0,Manualna,Kombi,...,0,0,0,0.0,1,13,0.0,zachodniopomorskie,3,0
1,16800.0,Osoba prywatna,Honda,Accord,236000.0,2.0,Benzyna,155.0,Manualna,Sedan,...,0,0,0,0.0,0,18,0.0,kujawsko-pomorskie,8,3
2,249900.0,Dealer,Mercedes-Benz,Klasa X,73000.0,3.0,Diesel,258.0,Automatyczna,SUV,...,0,0,0,0.0,0,4,0.0,małopolskie,1,0
3,16499.0,Dealer,Toyota,Avensis,220000.0,1.8,Benzyna,129.0,Manualna,Kombi,...,0,0,0,0.0,0,18,0.0,kujawsko-pomorskie,3,1
4,29900.0,Dealer,Ford,C-MAX,179058.0,2.0,Diesel,140.0,Manualna,Minivan,...,0,0,0,0.0,0,11,0.0,łódzkie,2,0


### Train / validation / test split

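The dataset is split in a stratified fashion on the combination of two columns: brand (`marka_pojazdu`) and model (`model_pojazdu`). Brand/model combinations with too few rows to stratify on are pooled into a single group.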

In [7]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces exactly the same
# partition (and therefore the same train/validation/test CSV files).
RANDOM_STATE = 42
# Placeholder stratum that pools brand/model combinations too rare to stratify on.
RARE_STRATUM = 'temp'

# Stratification key "<brand>_<model>" with spaces replaced by underscores,
# built with vectorised string operations instead of a row-wise apply.
brand_and_models = (
    df['marka_pojazdu'].str.replace(' ', '_', regex=False)
    + '_'
    + df['model_pojazdu'].str.replace(' ', '_', regex=False)
)
brand_and_models_counts = brand_and_models.value_counts()

# Combinations with fewer than 3 rows are merged into the shared placeholder
# stratum. `mask` returns a new Series, so re-running the cell is idempotent.
rare_combinations = brand_and_models_counts[brand_and_models_counts < 3].index
brand_and_models = brand_and_models.mask(
    brand_and_models.isin(rare_combinations), RARE_STRATUM
)

# First split: 70% train / 30% held out, stratified on the pooled key.
train, test, brand_and_models_train, brand_and_models_test = train_test_split(
    df,
    brand_and_models,
    test_size=0.3,
    stratify=brand_and_models,
    random_state=RANDOM_STATE,
)

# Inside the held-out part some strata may be left with a single row; pool
# those as well before stratifying again (stratified splitting needs at least
# two rows per stratum).
brand_and_models_test_counts = brand_and_models_test.value_counts()
rare_in_holdout = brand_and_models_test_counts[brand_and_models_test_counts < 2].index
brand_and_models_test = brand_and_models_test.mask(
    brand_and_models_test.isin(rare_in_holdout), RARE_STRATUM
)

# Second split: halve the held-out part into validation and test (15% / 15%).
validation, test = train_test_split(
    test,
    test_size=0.5,
    stratify=brand_and_models_test,
    random_state=RANDOM_STATE,
)

# Sanity check: the three parts together cover every row of the dataset.
assert len(train) + len(validation) + len(test) == len(df)

In [19]:
# Row counts of the three splits and their share of the full dataset.
split_sizes = [len(train), len(validation), len(test)]
total_rows = len(df)

splits_statistics = pd.DataFrame(
    {
        'Liczba próbek': split_sizes,
        'Procent udziału': [f'{size/total_rows*100:.2f}%' for size in split_sizes],
    },
    index=['Zbiór treningowy', 'Zbiór walidacyjny', 'Zbiór testowy'],
)
splits_statistics

Unnamed: 0,Liczba próbek,Procent udziału
Zbiór treningowy,137298,70.00%
Zbiór walidacyjny,29421,15.00%
Zbiór testowy,29421,15.00%


In [20]:
def _strata_labels(frame):
    """Return the "<brand>_<model>" label of every row in ``frame``.

    Spaces inside the brand and model names are replaced by underscores.
    Uses vectorised string operations rather than a row-wise ``apply``.
    """
    brand = frame['marka_pojazdu'].str.replace(' ', '_', regex=False)
    model = frame['model_pojazdu'].str.replace(' ', '_', regex=False)
    return brand + '_' + model


def _share_table(labels, column_name):
    """Return a two-column table with each label's share formatted as a percentage.

    The first column is named 'Nazwa modelu'; the second one, named
    ``column_name``, holds the relative frequency of the label as a string
    with three decimal places followed by '%'.
    """
    shares = labels.value_counts(normalize=True).apply(lambda x: f'{(100 * x):.3f}%')
    table = shares.reset_index()
    table.columns = ['Nazwa modelu', column_name]
    return table


# Brand/model label of every row, per split.
train_stratas = _strata_labels(train)
validation_stratas = _strata_labels(validation)
test_stratas = _strata_labels(test)

# Share of each brand/model combination within its split.
train_stratas_counts = _share_table(train_stratas, 'Zbiór treningowy')
validation_stratas_counts = _share_table(validation_stratas, 'Zbiór walidacyjny')
test_stratas_counts = _share_table(test_stratas, 'Zbiór testowy')

# Side-by-side comparison; the left joins keep the row set and order of the
# training table.
sets_comparison = (
    train_stratas_counts
    .merge(validation_stratas_counts, on='Nazwa modelu', how='left')
    .merge(test_stratas_counts, on='Nazwa modelu', how='left')
)

sets_comparison.head(20)

Unnamed: 0,Nazwa modelu,Zbiór treningowy,Zbiór walidacyjny,Zbiór testowy
0,Opel_Astra,2.176%,2.175%,2.179%
1,Audi_A4,1.983%,1.982%,1.982%
2,BMW_Seria_3,1.940%,1.941%,1.941%
3,Skoda_Octavia,1.913%,1.914%,1.914%
4,Volkswagen_Golf,1.890%,1.890%,1.890%
5,BMW_Seria_5,1.722%,1.720%,1.723%
6,Ford_Focus,1.668%,1.669%,1.665%
7,Volkswagen_Passat,1.637%,1.638%,1.638%
8,Audi_A6,1.513%,1.513%,1.513%
9,Ford_Mondeo,1.249%,1.247%,1.251%


In [21]:
# Write each split next to the source data as a semicolon-separated CSV
# without the index column.
for split_file, split_frame in (
    ('train.csv', train),
    ('validation.csv', validation),
    ('test.csv', test),
):
    split_frame.to_csv(os.path.join(data_path, split_file), index=False, sep=';')