In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Notebook-wide pandas display settings: floats are rendered with two
# decimals, and the column/row limits are raised so wide or long frames are
# not truncated when displayed.
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_columns = 200
pd.options.display.max_rows = 100000

# Зареждане на данните

In [None]:
# Load the prepared car-offers table; the path is relative to the notebook's
# working directory.
dataset = pd.read_csv(filepath_or_buffer='../data/prepared-car-offers.csv')

In [None]:
# Peek at 15 randomly chosen rows (unseeded, so the rows differ between runs).
dataset.sample(n=15)

In [None]:
# Print column names, non-null counts and dtypes of the loaded frame.
dataset.info()

In [None]:
# (rows, columns) of the frame right after loading.
dataset.shape

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns (count / unique / top / freq).
dataset.describe(include='all')

# Кодиране на данните

## Нормализиране на числовите стойности

Не е достатъчно само да премахнем аутлайърите в числовите колони. Трябва също така да ги нормализираме, за да са с еднакъв мащаб и да гарантираме, че колоните с по-голям числов обхват няма да натежат повече при обучението на модела.

### Standard Scaling

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

# Baseline experiment: standard-scaled numeric features plus one-hot encoded
# categorical features feeding a decision tree regressor.
# 'Price' is the target; every other column is a candidate feature.
X = dataset.drop(columns=['Price'])
y = dataset['Price']

# Numeric branch: missing values are replaced by 0, then StandardScaler runs.
standard_scaling_num_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# Categorical branch: missing values become the literal 'Unknown'; categories
# not seen during fit are ignored by the encoder at transform time.
ohe_cat_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route columns by dtype; anything that is neither numeric nor object is
# passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', standard_scaling_num_preprocessor, make_column_selector(dtype_include=np.number)),
        ('categorical', ohe_cat_preprocessor, make_column_selector(dtype_include=object)),
    ],
    remainder='passthrough',
)

default_dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=42)),
])

# Fixed 80/20 split so every experiment in the notebook is scored on the
# same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
y_pred = default_dt_pipeline.fit(X_train, y_train).predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

### MaxAbs Scaling

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Same experiment as the StandardScaler baseline, but the numeric branch uses
# MaxAbsScaler; the categorical branch is reused unchanged.
X = dataset.drop(columns=['Price'])
y = dataset['Price']

maxabs_scaling_num_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MaxAbsScaler()),
])

maxabs_preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', maxabs_scaling_num_preprocessor, make_column_selector(dtype_include=np.number)),
        ('categorical', ohe_cat_preprocessor, make_column_selector(dtype_include=object)),
    ],
    remainder='passthrough',
)

maxabs_scaling_dt_pipeline = Pipeline(steps=[
    ('preprocessor', maxabs_preprocessor),
    ('model', DecisionTreeRegressor(random_state=42)),
])

# Same seed and ratio as the other experiments, so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_pred = maxabs_scaling_dt_pipeline.fit(X_train, y_train).predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

### MinMax Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Same experiment again with MinMaxScaler on the numeric branch; the
# categorical branch is reused unchanged.
X = dataset.drop(columns=['Price'])
y = dataset['Price']

minmax_scaling_num_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', MinMaxScaler()),
])

minmax_preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', minmax_scaling_num_preprocessor, make_column_selector(dtype_include=np.number)),
        ('categorical', ohe_cat_preprocessor, make_column_selector(dtype_include=object)),
    ],
    remainder='passthrough',
)

minmax_scaling_dt_pipeline = Pipeline(steps=[
    ('preprocessor', minmax_preprocessor),
    ('model', DecisionTreeRegressor(random_state=42)),
])

# Same seed and ratio as the other experiments, so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_pred = minmax_scaling_dt_pipeline.fit(X_train, y_train).predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

Ще използваме `MaxAbsScaler`, тъй като той дава най-добри резултати.

In [None]:
# Promote the MaxAbs variant to the notebook-wide defaults used by the
# following sections.
default_num_preprocessor, default_preprocessor, default_dt_pipeline = (
    maxabs_scaling_num_preprocessor,
    maxabs_preprocessor,
    maxabs_scaling_dt_pipeline,
)

In [None]:
# Re-score the current default pipeline on the standard 80/20 split.
X = dataset.drop(columns=['Price'])
y = dataset['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
y_pred = default_dt_pipeline.fit(X_train, y_train).predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

## Кодиране на категорийните стойности

Категорийните стойности трябва да бъдат преобразувани в числа, за да могат да бъдат използвани от модела. Ще разгледаме няколко опции за кодиране на категорийни стойности. 

In [None]:
# Number of distinct values per object-dtype column, computed once and used
# for both axes of the horizontal bar chart.
categorical_cardinality = dataset.select_dtypes(include=[object]).nunique()

sns.barplot(x=categorical_cardinality, y=categorical_cardinality.index)
plt.title('Брой уникални стойности по колона (кардиналност)')
plt.xlabel('Брой уникални стойности')
plt.ylabel('Колона')

### One-hot encoding на всички категорийни колони

In [None]:
# One-hot encoding of every categorical column is what the current default
# pipeline already does, so this cell simply refits and scores it.
X = dataset.drop(columns=['Price'])
y = dataset['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_pred = default_dt_pipeline.fit(X_train, y_train).predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

Нека разгледаме кои са колоните с най-голямо значение според модела:

In [None]:
# Feature importances of the fitted default pipeline.
# Importances are positional: entry i belongs to column i of the matrix the
# fitted preprocessor hands to the tree.
importances = default_dt_pipeline.named_steps['model'].feature_importances_

# Take the output column names from the fitted preprocessor itself instead of
# rebuilding them from `dataset`. The numeric columns of `dataset` include the
# target 'Price', which is not part of X, so a hand-built list shifts every
# following name by one position.
all_feature_names = default_dt_pipeline.named_steps['preprocessor'].get_feature_names_out()

# zip() would silently truncate on a length mismatch, so check explicitly.
assert len(all_feature_names) == len(importances), 'feature names and importances are misaligned'

feature_importances_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': importances})
feature_importances_df[feature_importances_df['Importance'] > 0].sort_values(by='Importance', ascending=False).head(30)

In [None]:
# The 20 columns with the most missing values, largest first.
missing_per_column = dataset.isna().sum()
null_counts_per_column = missing_per_column.sort_values(ascending=False).head(20)

sns.barplot(x=null_counts_per_column, y=null_counts_per_column.index)
plt.title('Липсващи стойности по колона')
plt.xlabel('Брой липсващи стойности')
plt.ylabel('Колона')

In [None]:
# (rows, columns) of the frame at this point of the notebook.
dataset.shape

### Target encoding за колоните с голяма кардиналност

In [None]:
from sklearn.preprocessing import TargetEncoder

# Object-dtype columns with more than 6 distinct values are the candidates
# for target encoding. The cardinality is computed once and filtered
# directly, so the boolean mask and the filtered Series share one index.
categorical_cardinality = dataset.select_dtypes(include=object).nunique()
columns_for_target_encoding = categorical_cardinality[categorical_cardinality > 6].index
columns_for_target_encoding

In [None]:
from sklearn.metrics import PredictionErrorDisplay

# Target encoding for the high-cardinality categorical columns.
# TargetEncoder cross-fits with shuffled folds during fit_transform, so it is
# seeded like every other stochastic component in this notebook; without a
# seed the reported metrics change from run to run.
target_encoding_preprocessor = Pipeline([
    ('target_encoder', TargetEncoder(target_type='continuous', smooth=0.2, random_state=42)),
])

# Categorical branch: target-encode `columns_for_target_encoding` and one-hot
# encode the object columns.
# NOTE(review): the one-hot selector matches *all* object columns, so the
# high-cardinality columns are both target-encoded and one-hot encoded.
# Later cells (feature-name lookup, grid-search parameter paths) rely on this
# layout, so it is kept as is; confirm whether the duplication is intended.
target_enc_cat_preprocessor = ColumnTransformer([
    ('target_encoding', target_encoding_preprocessor, columns_for_target_encoding),
    ('one_hot_encoding', ohe_cat_preprocessor, make_column_selector(dtype_include=object))
], remainder='passthrough')

# Full preprocessor: the categorical branch comes first, the numeric branch
# second, so the output matrix is ordered the same way.
target_oh_encoding_preprocessor = ColumnTransformer([
    ('categorical', target_enc_cat_preprocessor, make_column_selector(dtype_include=object)),
    ('numerical', default_num_preprocessor, make_column_selector(dtype_include=np.number)),
], remainder='passthrough')

target_enc_dt_pipeline = Pipeline([
    ('preprocessor', target_oh_encoding_preprocessor),
    ('model', DecisionTreeRegressor(random_state=42))
])

y = dataset['Price']
X = dataset.drop(columns=['Price'])

# Same seeded 80/20 split as the earlier experiments, for comparable scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

target_enc_dt_pipeline.fit(X_train, y_train)
y_pred = target_enc_dt_pipeline.predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

In [None]:
# Walk down the fitted pipeline by name: outer preprocessor -> 'categorical'
# branch -> 'target_encoding' sub-pipeline -> the fitted TargetEncoder.
fitted_categorical_branch = target_enc_dt_pipeline.named_steps['preprocessor'].named_transformers_['categorical']
target_encoder = fitted_categorical_branch.named_transformers_['target_encoding'].named_steps['target_encoder']

In [None]:
# One row per (category, learned encoding) pair across all target-encoded
# columns. `categories_` and `encodings_` are parallel per-feature lists, so
# they are zipped feature by feature and then value by value.
# All records are collected first and the frame is built once; appending rows
# one at a time with .loc is quadratic and yields object-dtype columns.
category_means_df = pd.DataFrame(
    [
        (category, mean)
        for categories, means in zip(target_encoder.categories_, target_encoder.encodings_)
        for category, mean in zip(categories, means)
    ],
    columns=['Category', 'Mean'],
)

category_means_df

In [None]:
# Residuals vs. predicted price for the target-encoding pipeline;
# subsample=None plots every test row.
PredictionErrorDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    kind='residual_vs_predicted',
    subsample=None,
    scatter_kwargs={'alpha': 0.5, 'marker': 'x'},
)

Да видим отново важността на колоните, след като сме приложили Target Encoding:

In [None]:
# Feature importances of the fitted target-encoding pipeline.
importances = target_enc_dt_pipeline.named_steps['model'].feature_importances_

# The preprocessor emits the categorical branch (target-encoded columns, then
# one-hot columns) *before* the numeric branch. A hand-built
# "numeric + one-hot" list therefore attaches the wrong name to nearly every
# importance and drops the target-encoded columns entirely. Asking the fitted
# preprocessor for its output names keeps names and importances aligned.
all_feature_names = target_enc_dt_pipeline.named_steps['preprocessor'].get_feature_names_out()

# zip() would silently truncate on a length mismatch, so check explicitly.
assert len(all_feature_names) == len(importances), 'feature names and importances are misaligned'

feature_importances_df = pd.DataFrame({'Feature': all_feature_names, 'Importance': importances})
feature_importances_df[feature_importances_df['Importance'] > 0].sort_values(by='Importance', ascending=False)

In [None]:
# Promote the target-encoding variant to the notebook-wide defaults.
default_cat_preprocessor, default_preprocessor, default_dt_pipeline = (
    target_enc_cat_preprocessor,
    target_oh_encoding_preprocessor,
    target_enc_dt_pipeline,
)

# Редуциране на пространството на характеристиките

## Премахване на колинеарни характеристики

Колинеарността е проблем, който възниква, когато две или повече характеристики са силно корелирани помежду си. Това може да доведе до преоценяване на важността на характеристиките. Нека проверим за колинеарност в нашия набор от данни.

In [None]:
# Pairwise correlation between all numeric columns, drawn as a large heatmap.
numeric_columns_df = dataset.select_dtypes(include=np.number)
corr_matrix = numeric_columns_df.corr()

plt.figure(figsize=(20, 20))
sns.heatmap(data=corr_matrix, cmap='coolwarm')

In [None]:
# Columns removed as collinear with other features.
excluded_columns = ['Extras_TV', 'Extras_Tiptronic', 'Extras_USB', 'Extras_IN\\AUX изводи', 'Extras_Климатик']

# Index.difference returns the remaining labels in sorted order and ignores
# labels that are not present, so the reduced frame has sorted columns.
kept_columns = dataset.columns.difference(excluded_columns)
no_colinearity_df = dataset.loc[:, kept_columns].copy()

X = no_colinearity_df.drop(columns=['Price'])
y = no_colinearity_df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
y_pred = default_dt_pipeline.fit(X_train, y_train).predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

In [None]:
# From here on `dataset` refers to the frame without the collinear columns.
dataset = no_colinearity_df

## Премахване на колона: `Make`, `Model`, `EuroStandard`, `Color`, `Region`

In [None]:
excluded_columns = ['Make', 'Model', 'EuroStandard', 'Color', 'Region']

# Ablation study: drop one categorical column at a time, rebuild the whole
# pipeline for the reduced frame and report the test error without it.
for column in excluded_columns:
    df_excluded_column = dataset.drop(columns=column)

    # High-cardinality (> 6 distinct values) object columns of the reduced
    # frame; recomputed because the dropped column may have been one of them.
    remaining_cardinality = df_excluded_column.select_dtypes(include=object).nunique()
    columns_for_target_encoding = remaining_cardinality[remaining_cardinality > 6].index

    cat_preprocessor = ColumnTransformer(
        transformers=[
            ('target_encoding', target_encoding_preprocessor, columns_for_target_encoding),
            ('one_hot_encoding', ohe_cat_preprocessor, make_column_selector(dtype_include=object)),
        ],
        remainder='passthrough',
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ('categorical', cat_preprocessor, make_column_selector(dtype_include=object)),
            ('numerical', default_num_preprocessor, make_column_selector(dtype_include=np.number)),
        ],
        remainder='passthrough',
    )

    dt_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', DecisionTreeRegressor(random_state=42)),
    ])

    X = df_excluded_column.drop(columns=['Price'])
    y = df_excluded_column['Price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    y_pred = dt_pipeline.fit(X_train, y_train).predict(X_test)

    print('-' * 50)
    print('Резултати с премахната колона: ' + column)
    print(f'Средноквадратична грешка (RMSE) без {column}: ', root_mean_squared_error(y_test, y_pred))
    print(f'Средна абсолютна грешка (MAE) без {column}: ', mean_absolute_error(y_test, y_pred))

In [None]:
# Drop 'Region' for good. errors='ignore' keeps the cell idempotent: running
# it a second time (when the column is already gone) no longer raises KeyError.
dataset = dataset.drop(columns='Region', errors='ignore')

# Recompute the high-cardinality (> 6 distinct values) object columns for the
# reduced frame; cardinality is computed once and filtered directly.
categorical_cardinality = dataset.select_dtypes(include=object).nunique()
columns_for_target_encoding = categorical_cardinality[categorical_cardinality > 6].index

# Rebuild the default preprocessing stack so it references the updated column
# list instead of the one captured before 'Region' was removed.
default_cat_preprocessor = ColumnTransformer([
    ('target_encoding', target_encoding_preprocessor, columns_for_target_encoding),
    ('one_hot_encoding', ohe_cat_preprocessor, make_column_selector(dtype_include=object))
], remainder='passthrough')

default_preprocessor = ColumnTransformer([
    ('categorical', default_cat_preprocessor, make_column_selector(dtype_include=object)),
    ('numerical', default_num_preprocessor, make_column_selector(dtype_include=np.number)),
], remainder='passthrough')

default_dt_pipeline = Pipeline([
    ('preprocessor', default_preprocessor),
    ('model', DecisionTreeRegressor(random_state=42))
])

## Премахване на характеристики с ниска важност

In [None]:
from sklearn.feature_selection import SelectFromModel

# Model-based feature selection: an auxiliary decision tree ranks the
# preprocessed features and only those with importance >= 0.01 reach the
# final regressor.
importance_selector = SelectFromModel(DecisionTreeRegressor(random_state=42), threshold=0.01)

most_important_features_pipeline = Pipeline(steps=[
    ('preprocessor', default_preprocessor),
    ('feature_selection', importance_selector),
    ('model', DecisionTreeRegressor(random_state=42)),
])

X = dataset.drop(columns=['Price'])
y = dataset['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
y_pred = most_important_features_pipeline.fit(X_train, y_train).predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

## 6500, 16100

## Премахване на характеристики с ниска взаимна информация

In [None]:
from sklearn.feature_selection import mutual_info_regression

# Mutual information between each numeric feature and the price.
y = dataset['Price']
X = dataset.drop(columns=['Price'])

# Only numeric columns are scored here; missing values are replaced by 0.
numerical_features = X.select_dtypes(include=np.number)
X = X[numerical_features.columns]
X = X.fillna(0)

# X has no object columns at this point, so only the numeric branch of the
# default preprocessor has columns to work on.
X = default_preprocessor.fit_transform(X)

# The estimator is randomised; seed it so the scores, and the threshold-based
# selection built on them in the next cells, are reproducible between runs.
mutual_info = mutual_info_regression(X, y, random_state=42)

mutual_info_df = pd.DataFrame(data=zip(numerical_features.columns, mutual_info),
                              columns=['Feature', 'Mutual Information'])

In [None]:
# Rank the numeric features from most to least informative about the price.
mutual_info_df.sort_values(by='Mutual Information', ascending=False)

In [None]:
# Keep the numeric features whose mutual information with the price is at
# least 0.0001; the result is a NumPy array of column names.
is_informative = mutual_info_df['Mutual Information'] >= 0.0001
selected_features = mutual_info_df.loc[is_informative, 'Feature'].values

selected_features

In [None]:
# Score the default pipeline on the informative numeric features plus all
# categorical (object-dtype) columns.
selected_num_features = dataset[selected_features]
cat_features = dataset.select_dtypes(include=object)

X = pd.concat([selected_num_features, cat_features], axis='columns')
y = dataset['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
y_pred = default_dt_pipeline.fit(X_train, y_train).predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

## Премахване на характеристики с ниска корелация с целевата променлива

In [None]:
# Correlation of every numeric feature (target excluded) with the price.
numeric_feature_frame = dataset.drop(columns='Price').select_dtypes(include=np.number)
correlation_with_price = numeric_feature_frame.corrwith(dataset['Price'])

# Only the strength of the relationship matters here, so the sign is dropped.
correlation_with_price_df = correlation_with_price.abs().to_frame('Correlation with Price')
correlation_with_price_df.sort_values(by='Correlation with Price', ascending=False)

In [None]:
# Numeric features with |correlation| >= 0.5, combined with every categorical
# column (Index.union also sorts the combined labels).
is_strongly_correlated = correlation_with_price_df['Correlation with Price'] >= 0.5
correlated_features = correlation_with_price_df.loc[is_strongly_correlated].index
cat_features = dataset.select_dtypes(include=object).columns
selected_features = correlated_features.union(cat_features)
selected_features

In [None]:
# Score the default pipeline on the correlation-selected feature set.
X = dataset[selected_features]
y = dataset['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
y_pred = default_dt_pipeline.fit(X_train, y_train).predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

In [None]:
# Keep only the selected features plus the target for the remaining steps.
dataset = dataset[selected_features.append(pd.Index(['Price']))]

# Preview only the first rows: display.max_rows is raised to 100000 at the top
# of the notebook, so echoing the bare frame would render every row.
dataset.head()

# Откриване на най-добрите хиперпараметри за модела

**ВНИМАНИЕ: Следващата клетка използва всички процесори и може да доведе до забавяне на машината докато се изпълнява!**
**Изпълнението може да отнеме над 30 мин!**

In [None]:
from sklearn.model_selection import GridSearchCV

import os
# NOTE(review): presumably meant to raise joblib's worker timeout for the long
# parallel search — confirm that joblib actually reads this variable.
os.environ['JOBLIB_TIMEOUT'] = '300'

y = dataset['Price']
X = dataset.drop(columns=['Price'])

# Tune on the training split only. The evaluation cells below score the tuned
# model on the test split produced by this same seed; searching on all of X
# would let those rows influence the chosen hyperparameters and make the
# reported test error optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4 * 7 * 4 * 6 * 5 = 3360 parameter combinations, each evaluated with 5-fold
# cross-validation.
param_grid = {
    'preprocessor__categorical__target_encoding__target_encoder__smooth': ['auto', 0.1, 0.2, 0.5],
    'model__max_depth': [None, 10, 15, 20, 30, 50, 100],
    'model__min_samples_split': [2, 5, 10, 25],
    'model__min_samples_leaf': [1, 2, 4, 10, 20, 50],
    'model__max_leaf_nodes': [None, 10, 20, 50, 100]
}

# No `scoring` is given, so candidates are ranked by the pipeline's own
# score(); n_jobs=-1 uses every available core.
grid_search = GridSearchCV(default_dt_pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)

# Best configuration, refitted by GridSearchCV on the data passed to fit().
best_dt_pipeline = grid_search.best_estimator_

In [None]:
# Score the tuned pipeline on the standard held-out split.
X = dataset.drop(columns=['Price'])
y = dataset['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
y_pred = best_dt_pipeline.fit(X_train, y_train).predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

# Анализиране на грешките

In [None]:
# Refit the tuned pipeline and keep its test predictions for error analysis.
X = dataset.drop(columns=['Price'])
y = dataset['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_pred = best_dt_pipeline.fit(X_train, y_train).predict(X_test)

print('Средноквадратична грешка (RMSE): ', root_mean_squared_error(y_test, y_pred))
print('Средна абсолютна грешка (MAE): ', mean_absolute_error(y_test, y_pred))

# Residuals vs. predicted price; subsample=None plots every test row.
PredictionErrorDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    kind='residual_vs_predicted',
    subsample=None,
    scatter_kwargs={'alpha': 0.5, 'marker': 'x'},
)

In [None]:
# Absolute error per test row; the Series keeps y_test's index, so it aligns
# with the rows of X_test when attached as a column.
errors = (y_test - y_pred).abs()

errors_df = X_test.assign(Error=errors).sort_values(by='Error', ascending=False)

# Summary of the rows the model did not predict exactly.
has_error = errors_df['Error'] > 0
errors_df[has_error].describe(include='all')

In [None]:
# Restrict the analysis to rows with a non-zero error.
errors_df = errors_df.loc[errors_df['Error'] > 0]

# Mean absolute error per make, as a two-column frame (Make, Error).
mean_error_by_make = errors_df.groupby('Make', as_index=False)['Error'].mean()
makes_by_error = mean_error_by_make.sort_values(by='Error', ascending=False)

plt.figure(figsize=(10, 15))
ax = sns.barplot(data=makes_by_error, x='Error', y='Make', orient='h')
# Dashed red line: overall mean error across the rows kept above.
ax.axvline(x=errors_df['Error'].mean(), color='red', linestyle='--')
plt.title('Средна абсолютна грешка по марка')
plt.ylabel('Марка')
plt.xlabel('Грешка')
plt.show()

In [None]:
# Number of erroneous offers per make next to the mean error for that make,
# to tell rare makes apart from makes that are predicted badly.
count_by_make = errors_df['Make'].value_counts()
mean_error_indexed_by_make = mean_error_by_make.set_index('Make')
make_count_error_df = count_by_make.to_frame().join(mean_error_indexed_by_make)
make_count_error_df.sort_values(by='Error', ascending=False)

# Експортиране на модела

In [None]:
import pickle
from pathlib import Path

# Persist the tuned pipeline (preprocessing + model) as a single pickle file.
model_path = Path('../models/d_tree_car_price_model.pkl')

# Create the target directory on a fresh checkout; otherwise open() fails
# with FileNotFoundError when '../models' does not exist yet.
model_path.parent.mkdir(parents=True, exist_ok=True)

# Plain binary write mode is sufficient: the file is written, never read back.
with model_path.open('wb') as model_file:
    pickle.dump(best_dt_pipeline, model_file)