In [None]:
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
dataset = pd.read_csv('./data/bg-car-offers.csv')

In [None]:
dataset.sample(15)

In [None]:
dataset.info()

In [None]:
dataset.shape

In [None]:
dataset.describe(include='all')

In [None]:
# Number of missing entries in every column, largest first.
missing_mask = dataset.isna()
null_counts_per_column = missing_mask.sum().sort_values(ascending=False)
null_counts_per_column

In [None]:
fig = sns.barplot(x=null_counts_per_column, y=null_counts_per_column.index)
fig.set_title('Count of missing values per column')
fig.set_xlabel('Count of missing values')
fig.set_ylabel('Column')

In [None]:
# Nullity matrix on a random subset of rows (the full frame is too dense to read).
# The sample size is capped at the number of rows, because DataFrame.sample raises
# when asked for more rows than exist, and the draw is seeded so that the figure
# is the same after every Restart & Run All.
msno.matrix(dataset.sample(n=min(500, len(dataset)), random_state=42))

In [None]:
msno.heatmap(dataset)

# **Observations:**
* If engine size is present then EV features are not present and vice versa.
* EV features are always present together.
* If manufacturing dates are present then mileage is also present.
* Most of the time, when any one of the additional-feature columns (Other, Exterior, Interior) is present, the others are present as well (perhaps the author of the offer either fills in all of them or none?).
* City and Region are almost always present together.

In [None]:
msno.dendrogram(dataset)

In [None]:
categorical_features = dataset.select_dtypes(include=[object])
categorical_features.columns

In [None]:
numerical_features = dataset.select_dtypes(include=[np.number])
numerical_features.columns

## Target feature - Price

In [None]:
# Inspect the raw Price column before any cleaning. Nothing is converted here:
# `y` is just a reference to the original column, and value_counts() shows how
# often each raw value occurs (including any non-numeric entries).
y = dataset['Price']

y.value_counts()

In [None]:
# Rows whose Price is not a plain string of digits.
# `.str.isdigit()` yields NaN for missing prices, and applying `~` to such an
# object-dtype result raises a TypeError. Comparing with True first produces a
# strict boolean mask, so missing prices are simply treated as non-numeric.
price_is_numeric = dataset['Price'].str.isdigit().eq(True)
price_non_numeric_rows = dataset[~price_is_numeric]

In [None]:
dataset[dataset['Price'].str.contains('При запитване', case=False)]

In [None]:
# Frequency table of the non-numeric Price values as a two-column frame
# ('Price' = raw value, 'Count' = number of offers carrying it).
non_numeric_price_counts = (
    price_non_numeric_rows['Price']
    .value_counts()
    .rename_axis('Price')
    .reset_index(name='Count')
)

non_numeric_price_counts

In [None]:
sns.barplot(x='Price', y='Count', data=non_numeric_price_counts.head(10))

In [None]:
# Build a cleaned copy of the raw data with a numeric Price column.
prepared_dataset = dataset.copy()

# Remove the currency marker and surrounding whitespace so the remaining text
# is either a number or one of the known non-numeric markers.
price_text = prepared_dataset['Price'].str.replace('EUR', '', regex=False).str.strip()

# Prices quoted in USD and 'При запитване' ("on request") offers have no usable
# value here, so they become NaN. `na=False` keeps the mask strictly boolean
# even if some prices are missing.
unusable_price = price_text.str.contains('USD', na=False) | (price_text == 'При запитване')

# Convert to float so that NaN can be represented and later filled.
prepared_dataset['Price'] = price_text.mask(unusable_price).astype('float64').round()

# Fill NaN prices with the mean price of the same Model.
# The group means are computed first and applied through fillna, so only the
# missing prices are touched. Calling transform(lambda ...) on the grouped
# column instead returns NaN for every row whose Model is itself NaN, which
# would erase valid prices.
# NOTE(review): this imputes the prediction target; consider dropping such rows
# instead of training on synthetic prices.
model_mean_price = prepared_dataset.groupby('Model')['Price'].transform('mean')
prepared_dataset['Price'] = prepared_dataset['Price'].fillna(model_mean_price)

prepared_dataset['Price'].value_counts(dropna=False)

In [None]:
prepared_dataset['Price'].isna().sum()

In [None]:
prepared_dataset[prepared_dataset['Price'].isna()]

In [None]:
# Drop rows still with nan Price
prepared_dataset = prepared_dataset.dropna(subset=['Price'])

prepared_dataset['Price'].notna().sum()

In [None]:
prepared_dataset['Price'].isna().sum()

In [None]:
prepared_dataset[prepared_dataset['Price'] == 'При запитване']

In [None]:
prepared_dataset['Price'].value_counts(dropna=False)

In [None]:
prepared_dataset['Price'].describe()

In [None]:
prepared_dataset['Price'].dtype

## Decision Tree Regressor with no preprocessing

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Target and raw (un-encoded) feature frame.
Y = prepared_dataset['Price']
raw_features = prepared_dataset.drop(columns=['Price'])

categorical_features = raw_features.select_dtypes(include=[object])
numerical_features = raw_features.select_dtypes(include=[np.number])

# Categories that only occur in the test rows are unknown to an encoder fitted
# on the training rows; map them to -1 instead of raising.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Output column order is: encoded categorical columns first, then every
# remaining column passed through unchanged.
column_transformer = ColumnTransformer([
    ('categorical', ordinal_encoder, categorical_features.columns)
], remainder='passthrough')

# Split BEFORE fitting the encoder so that no information from the test rows
# leaks into the preprocessing step.
raw_train, raw_test, y_train, y_test = train_test_split(raw_features, Y, test_size=0.2, random_state=42)

X_train = column_transformer.fit_transform(raw_train)
X_test = column_transformer.transform(raw_test)

# Encoded version of the full feature set, kept under the name `X` for any
# later cell that expects the transformed matrix.
X = column_transformer.transform(raw_features)

In [None]:
def plot_tree_depth_scores(X_train, y_train, X_test, y_test, depth_range, random_state=42):
    """Plot train and test scores of decision trees over a range of depths.

    For every depth in ``depth_range`` a ``DecisionTreeRegressor`` is fitted on
    the training data, and its ``score`` (R² for scikit-learn regressors) is
    recorded on both the training and the test data. Both curves are drawn on
    the current matplotlib figure and shown.

    Parameters
    ----------
    X_train, y_train : training features and targets used to fit each tree.
    X_test, y_test : held-out features and targets used for the test curve.
    depth_range : iterable of ``max_depth`` values to evaluate. It is
        materialised once, so one-shot iterators and generators work too.
    random_state : seed passed to every tree so that the curves are
        reproducible between runs.

    Returns
    -------
    None
    """
    # Materialise the depths: the values are needed for the loop and again as
    # x-coordinates of both curves, which would exhaust a one-shot iterator.
    depths = list(depth_range)

    train_scores = []
    test_scores = []

    for depth in depths:
        model = DecisionTreeRegressor(max_depth=depth, random_state=random_state)
        model.fit(X_train, y_train)
        train_scores.append(model.score(X_train, y_train))
        test_scores.append(model.score(X_test, y_test))

    plt.plot(depths, train_scores, label='Train')
    plt.plot(depths, test_scores, label='Test')
    plt.xlabel('Max Depth')
    plt.ylabel('Score')
    plt.legend()
    plt.show()

In [None]:
plot_tree_depth_scores(X_train, y_train, X_test, y_test, range(1, 11))

In [None]:
# Depth-limited regression tree used for the evaluation cells below.
# The seed makes tie-breaking between equally good splits deterministic, so
# the fitted tree (and every metric derived from it) is reproducible.
tree_model = DecisionTreeRegressor(max_depth=7, random_state=42)

tree_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error

# Predictions of the fitted tree on the held-out rows.
y_pred = tree_model.predict(X_test)

# Report each error metric on the test split, one line per metric.
for metric_label, metric_fn in (
    ('Root Mean Squared Error: ', root_mean_squared_error),
    ('Mean Absolute Error: ', mean_absolute_error),
):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
from sklearn.tree import plot_tree

tree_fig = plt.figure(figsize=(300, 50))

# The ColumnTransformer emits the encoded categorical columns first and the
# passthrough columns after them, so the model's feature order differs from the
# original DataFrame column order. Take the names from the fitted transformer
# and strip its '<transformer>__' prefix so that every split is labelled with
# the column it actually uses.
encoded_feature_names = [
    name.split('__', 1)[-1] for name in column_transformer.get_feature_names_out()
]

plot_tree(tree_model, filled=True, feature_names=encoded_feature_names, fontsize=10)
plt.savefig('./decision_tree_no_prep.png')

plt.show()

In [None]:
from pandas import DataFrame
from sklearn.inspection import permutation_importance

# Feature names in the order the model sees them: the ColumnTransformer puts
# the encoded categorical columns first and the passthrough columns last, which
# is not the original DataFrame column order. The '<transformer>__' prefix is
# stripped for readability.
model_feature_names = [
    name.split('__', 1)[-1] for name in column_transformer.get_feature_names_out()
]

# Permutation importance is measured on the held-out rows; seeded for
# reproducibility and averaged over 20 shuffles per feature.
permutation_result = permutation_importance(tree_model, X_test, y_test, random_state=42, n_repeats=20)

feature_importances = DataFrame(
    columns=['Feature', 'Importance', 'Permutation Importance'],
    data=zip(model_feature_names, tree_model.feature_importances_,
             permutation_result.importances_mean))

feature_importances.sort_values(by='Importance', ascending=False)

In [None]:
# Long format: one row per (feature, importance kind) pair, so both kinds of
# importance can be drawn side by side with `hue`.
data = feature_importances.melt(id_vars='Feature', var_name='Importance',
                                value_vars=['Importance', 'Permutation Importance'])

ax = sns.barplot(x='value', data=data, hue='Importance', y='Feature', orient='h')

# Set the title and label on the returned Axes AFTER plotting: depending on the
# seaborn version, barplot writes its own axis labels and would replace a label
# that was set beforehand through the pyplot state machine.
ax.set_title('Feature Importances')
ax.set_xlabel('Value')

plt.show()

In [None]:
prepared_dataset.iloc[prepared_dataset['Price'].argmax()]

WOW! A Camaro for 20 Million Leva! Well that's an outlier if I've ever seen one!

In [None]:
prepared_dataset['Price'] = prepared_dataset['Price'].round()

prepared_dataset['Price'].describe()

In [None]:
prepared_dataset.sample(20)

In [None]:
numerical_columns_skewness = dataset[numerical_features.columns].skew().sort_values(ascending=False)
numerical_columns_skewness

In [None]:
# Histogram (with KDE overlay) of every numerical column, on a grid 3 plots wide.
histogram_columns = list(numerical_features.columns)

n_cols = 3  # plots per grid row
n_rows = -(-len(histogram_columns) // n_cols)  # ceiling division

plt.figure(figsize=(20, 15))

# Subplot positions are 1-based, hence start=1.
for position, column in enumerate(histogram_columns, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.histplot(dataset[column], kde=True, stat="count", linewidth=0)
    plt.title(column)

plt.tight_layout()
plt.show()

In [None]:
numerical_columns_skewness

In [None]:
numerical_columns_kurtosis = dataset[numerical_features.columns].kurt().sort_values(ascending=False)
numerical_columns_kurtosis