In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Both CSV files are read from the "data" folder, addressed relative to this notebook.
path_original = "data/_Original/full_used_car_prices_original.csv"
path_generated = "data/_Generated/full_used_car_prices_generated.csv"

# Read the two datasets in the same order as the paths above.
data_original, data_generated = (
    pd.read_csv(csv_path) for csv_path in (path_original, path_generated)
)

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the original dataset.
data_original.info()

In [None]:
# Print the same summary for the generated dataset, for comparison with the original.
data_generated.info()

Convert the `price` column of the original dataset from a formatted string (with a `$` sign and thousands separators) to an integer.

In [None]:
# Preview the first rows of the original dataset before the price column is converted.
data_original.head()

In [None]:
# Strip the currency formatting ("$" and thousands separators) and cast to int.
# regex=False is passed explicitly: "$" is a regex anchor, and the default of
# the `regex` argument differs between pandas versions, so relying on the
# default makes the result version-dependent.
# The dtype guard keeps the cell idempotent: once the column is numeric the
# `.str` accessor would raise on a second run of this cell.
if not pd.api.types.is_numeric_dtype(data_original['price']):
    data_original['price'] = (
        data_original['price']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(int)
    )
data_original['price'].head()

# Creating the test and train datasets

### Distribution of the Target Variable

Since we would like to apply stratified sampling, we need to know the distribution of the target variable (for binning the continuous target variable). Since the target variable is heavily right-skewed (as seen in the histogram below), we will use log transformation to make it more normally distributed.

In [None]:
# Target column
y_original = data_original["price"]

# Natural log of the target; used below to tame the right skew of the prices
y_original_ln = np.log(y_original)
print(max(y_original))
print(max(y_original_ln))

# Histogram of the raw target column
fig, ax = plt.subplots()
ax.hist(y_original, bins=40)
ax.set_title('Distribution of Prices (skewed)')  # fixed typo: was "skrewed"
ax.set_xlabel('Price in $')
ax.set_ylabel('Number of Cars')
plt.show()

# Histogram of the log-transformed target column
fig, ax = plt.subplots()
ax.hist(y_original_ln, bins=40)
ax.set_title('Distribution of Log Prices using log. transformation')
ax.set_xlabel('Log of Price in $')
ax.set_ylabel('Number of Cars')
plt.show()

We can see that there are some outliers in the target variable. We will take a look at the 20 most expensive cars in the dataset.

In [None]:
# Show the 20 highest values of the target column
top_prices = y_original.nlargest(20)
print(top_prices)

We agreed to remove the three samples priced above 1 million USD, since they are outliers in the target variable and would greatly distort its distribution.

In [None]:
# Keep only prices strictly below 1,000,000 and recompute the log on that subset.
# NOTE(review): this filters only the price Series; data_original itself is not
# modified in this cell - confirm whether the outlier rows should also be dropped
# from the frame that is split further down.
below_one_million = y_original.lt(1_000_000)
y_original_cleaned = y_original[below_one_million]
y_original_ln_cleaned = np.log(y_original_cleaned)

# One histogram per series: raw prices first, then log prices.
for price_series in (y_original_cleaned, y_original_ln_cleaned):
    plt.hist(price_series, bins=40)
    plt.show()

We will split the target variable into 5 bins and combine each bin with the brand of the car, creating pseudo-classes for stratified sampling. This ensures that the test set is representative of the whole dataset.

In [None]:
# Cut the log-price into 5 equal-width bins; labels=False yields bin codes
# instead of interval labels.
# NOTE(review): the bins are computed on y_original_ln, which still contains the
# prices above 1,000,000 - confirm this is intended.
price_bin = pd.cut(y_original_ln, bins=5, labels=False)

# Pseudo-class = "<bin code>_<brand>"
brand = data_original["brand"]
y_binned = price_bin.astype(str) + "_" + brand

# Check for classes with very few samples (counted once, reused below)
class_counts = y_binned.value_counts()
singleton_classes = class_counts[class_counts == 1]
print("Number of classes for sampling", len(class_counts))
print(len(singleton_classes))
singleton_classes

Since there are 24 pseudo-classes with only one sample (which stratified sampling cannot handle, as every class needs at least two members), we temporarily set them aside and distribute them randomly between the train and test sets after the stratified sampling.

In [None]:
from sklearn.model_selection import train_test_split

# Count the pseudo-classes once and derive both row masks from that count.
class_counts = y_binned.value_counts()
is_singleton = y_binned.isin(class_counts[class_counts == 1].index)
is_stratifiable = y_binned.isin(class_counts[class_counts > 1].index)

# Rows whose pseudo-class has exactly one sample cannot be stratified;
# they are kept aside and split randomly at the end.
X_single = data_original[is_singleton]

# Rows whose pseudo-class has more than one sample are split with stratification.
# Filter first, then copy, so only the selected rows are duplicated.
X = data_original[is_stratifiable].copy()
y_stratified = y_binned[is_stratifiable]

# Stratified sampling on the pseudo-classes: 80% train, 20% test
X_train, X_test = train_test_split(X, test_size=0.2, stratify=y_stratified, random_state=42)

# Distribute the single samples among the train and test set.
# train_test_split raises when it would produce an empty part, so with fewer
# than two singleton rows everything goes to the train set instead.
if len(X_single) > 1:
    X_single_1, X_single_2 = train_test_split(X_single, test_size=0.2, random_state=42)
else:
    X_single_1, X_single_2 = X_single, X_single.iloc[0:0]
X_train = pd.concat([X_train, X_single_1])
X_test = pd.concat([X_test, X_single_2])

# Compare the log-price distribution of both splits as normalized densities
y_train = X_train["price"]
y_test = X_test["price"]

y_train_ln = np.log(y_train)
y_test_ln = np.log(y_test)

# Use the same bin edges for both histograms; otherwise each call derives its
# own edges from its own data range and the overlaid bars are not comparable.
bin_edges = np.histogram_bin_edges(pd.concat([y_train_ln, y_test_ln]), bins=50)

plt.hist(y_train_ln, bins=bin_edges, density=True, alpha=0.5, label="train")
plt.hist(y_test_ln, bins=bin_edges, density=True, alpha=0.5, label="test")
plt.title("Distribution of log(price): train vs. test")
plt.xlabel("log(price) in $")
plt.ylabel("Density")
plt.legend()
plt.show()

The plot above shows that the distributions of the target variable in the train and test sets are very similar.

In [None]:
# Compare the brand distribution in the train and test set
brand_train = X_train["brand"]
brand_test = X_test["brand"]

# Relative brand frequencies per split, aligned on the union of all brands.
# A brand that is absent from one split has a share of 0 there; without the
# fillna it would show up as NaN and make the difference NaN as well.
df_brand = pd.DataFrame({
    "train": brand_train.value_counts(normalize=True),
    "test": brand_test.value_counts(normalize=True),
}).fillna(0)
# Multiply by 100 to get percentages
df_brand = df_brand * 100

# Difference in percentage points (train minus test)
df_brand["diff"] = df_brand["train"] - df_brand["test"]
df_brand

In [None]:
# Report how many rows ended up in each split
n_train, n_test = len(X_train), len(X_test)
print("Length of train set:", n_train)
print("Length of test set:", n_test)

The distribution of the brands should be sufficiently similar between the train and test set.

In [None]:
from pathlib import Path

# Save the train and test set.
# Create the output folder first so to_csv does not fail when it is missing.
split_dir = Path("data/0_Data_Split")
split_dir.mkdir(parents=True, exist_ok=True)
X_train.to_csv(split_dir / "train.csv", index=False)
X_test.to_csv(split_dir / "test.csv", index=False)

# Preview of the saved train set; it must be the last expression of the cell
# to be displayed.
X_train.head()