In [21]:
import os
import pickle
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from itertools import chain
from datetime import timedelta
from timeit import default_timer
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from config import preprocessing as preprocessing_cfg

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Single seed for the notebook: it seeds NumPy's global RNG here and is passed
# as `random_state` to the XGBoost models defined in later cells.
random_state = 42
np.random.seed(random_state)

In [3]:
# Directories, relative to the notebook's working directory:
# `data_path` holds the CSV splits, `models_path` receives the pickled model.
data_path = './data'
models_path = './models'

In [4]:
# Read the three pre-split datasets; the files use ';' as the field separator.
train, validation, test = (
    pd.read_csv(os.path.join(data_path, f'{split}.csv'), sep=';')
    for split in ('train', 'validation', 'test')
)

train.head()

Unnamed: 0,cena,typ_sprzedawcy,marka_pojazdu,model_pojazdu,przebieg,pojemnosc_skokowa,rodzaj_paliwa,moc,skrzynia_biegow,typ_nadwozia,...,hamulce_z_kompozytow_ceramicznych,opony_off-road,nowy_pojazd,gwarancja_dealerska,brak_informacji_o_wyposazeniu,wiek_pojazdu,gwarancja_producenta,wojewodztwo,liczba_generacji_modelu,ktora_generacja_modelu
0,106900.0,Autoryzowany Dealer,Suzuki,Vitara,4.0,1.4,Hybryda,129.0,Manualna,SUV,...,0,0,1,0.0,0,1,0.0,lubelskie,4,0
1,63900.0,Autoryzowany Dealer,Hyundai,I30,60811.0,1.5,Benzyna,110.0,Manualna,Kompakt,...,0,0,0,0.0,0,3,0.0,łódzkie,3,0
2,129900.0,Dealer,BMW,Seria 6,116000.0,4.4,Benzyna,407.0,Automatyczna,Coupe,...,0,0,0,0.0,0,11,0.0,śląskie,4,1
3,9777.0,Osoba prywatna,Citroën,C4,316000.0,1.6,Benzyna+LPG,110.0,Manualna,Kompakt,...,0,0,0,0.0,0,17,0.0,wielkopolskie,3,2
4,45900.0,Osoba prywatna,Opel,Corsa,67466.0,1.4,Benzyna,90.0,Manualna,Kompakt,...,0,0,0,0.0,0,5,0.0,kujawsko-pomorskie,6,1


In [5]:
dfs = [train, validation, test]

# Equipment and binary indicator columns (as listed in the preprocessing config)
# are cast to unsigned 16-bit integers in every split.
flag_cols = [*preprocessing_cfg.CAR_EQUIPMENT_COLS, *preprocessing_cfg.BINARY_COLS]

for df in dfs:
    for col in flag_cols:
        df[col] = df[col].astype(np.uint16)

### Preprocessing

In [6]:
# Derive both column groups from the TRAINING split only, so the feature schema
# never depends on held-out data (the numerical list was previously read from `test`).
categorical_cols = train.select_dtypes(include='object').columns.tolist()
numerical_cols = train.select_dtypes(include='float').columns.tolist()

# 'cena' is the regression target, not an input feature. `remove` raises
# ValueError if the column is missing, which surfaces schema problems early.
numerical_cols.remove('cena')

print(f'Categorical columns: {categorical_cols}')
print(f'Numerical columns: {numerical_cols}')

Categorical columns: ['typ_sprzedawcy', 'marka_pojazdu', 'model_pojazdu', 'rodzaj_paliwa', 'skrzynia_biegow', 'typ_nadwozia', 'kolor', 'rodzaj_koloru', 'naped', 'kraj_pochodzenia', 'wojewodztwo']
Numerical columns: ['przebieg', 'pojemnosc_skokowa', 'moc', 'liczba_drzwi', 'liczba_miejsc', 'spalanie_w_miescie', 'gwarancja_dealerska', 'gwarancja_producenta']


In [7]:
# Categorical features: fill gaps with an explicit 'Nieznany' category, then one-hot encode.
# Categories occurring fewer than 5 times are grouped as infrequent; two-category
# features keep a single indicator column (drop='if_binary').
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='Nieznany')),
    ('encoder', OneHotEncoder(
        handle_unknown='infrequent_if_exist',
        min_frequency=5,
        sparse_output=False,
        drop='if_binary',
        dtype=np.uint16,
    )),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Numerical features: median imputation (with missing-value indicator columns), then standardisation.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Columns not listed in either group are passed through unchanged.
preprocessor_improved = ColumnTransformer(
    transformers=[
        ('categorical_transformer', categorical_transformer, categorical_cols),
        ('numerical_transformer', numerical_transformer, numerical_cols),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Fit on the training split only, then wrap the resulting array with the generated feature names.
train_improved = pd.DataFrame(
    data=preprocessor_improved.fit_transform(train),
    columns=preprocessor_improved.get_feature_names_out(),
)
train_improved.head()

Unnamed: 0,typ_sprzedawcy_Autoryzowany Dealer,typ_sprzedawcy_Dealer,typ_sprzedawcy_Osoba prywatna,marka_pojazdu_Abarth,marka_pojazdu_Acura,marka_pojazdu_Aixam,marka_pojazdu_Alfa Romeo,marka_pojazdu_Alpine,marka_pojazdu_Aston Martin,marka_pojazdu_Audi,...,kierownica_po_prawej_anglik,orurowanie_przednie,fotele_tylne_z_funkcje_masazu,hamulce_z_kompozytow_ceramicznych,opony_off-road,nowy_pojazd,brak_informacji_o_wyposazeniu,wiek_pojazdu,liczba_generacji_modelu,ktora_generacja_modelu
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,4.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,4.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,3.0,2.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,6.0,1.0


In [8]:
# Apply the already-fitted preprocessor (no refitting) to the held-out splits.
improved_columns = preprocessor_improved.get_feature_names_out()

validation_improved = pd.DataFrame(
    data=preprocessor_improved.transform(validation),
    columns=improved_columns,
)
test_improved = pd.DataFrame(
    data=preprocessor_improved.transform(test),
    columns=improved_columns,
)

print(f'Wymiary zbioru treningowego: {train_improved.shape}')
print(f'Wymiary zbioru walidacyjnego: {validation_improved.shape}')
print(f'Wymiary zbioru testowego: {test_improved.shape}')

Wymiary zbioru treningowego: (137298, 1178)
Wymiary zbioru walidacyjnego: (29421, 1178)
Wymiary zbioru testowego: (29421, 1178)


In [9]:
# Persist the preprocessed splits next to the raw ones, using the same ';' separator.
for split_name, split_frame in (
    ('train', train_improved),
    ('validation', validation_improved),
    ('test', test_improved),
):
    split_frame.to_csv(os.path.join(data_path, f'{split_name}_improved.csv'), index=False, sep=';')

### Model training

In [10]:
def _split_features_target(frame, target='cena'):
    """Return ``(X, y)``: ``frame`` without the ``target`` column, and that column on its own."""
    return frame.drop(columns=[target]), frame[target]


X_train, y_train = _split_features_target(train_improved)
X_val, y_val = _split_features_target(validation_improved)
X_test, y_test = _split_features_target(test_improved)

# The combined frames are no longer needed once features and target are separated.
del train_improved, validation_improved, test_improved

Functions for model evaluation

In [11]:
def calculate_metrics(y_true, y_pred):
    """Compute the regression metrics reported in this notebook.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, in the same order as ``y_true``.

    Returns:
        dict with keys 'MAE', 'RMSE', 'MAPE' and 'R2'. 'MAPE' is returned as
        produced by ``mean_absolute_percentage_error`` (a fraction, not percent).
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    return dict(zip(('MAE', 'RMSE', 'MAPE', 'R2'), (mae, rmse, mape, r2)))


def plot_actual_vs_predicted(y_true, y_pred, metrics, title):
    """Show a scatter plot of predicted vs. actual prices with a red identity line.

    Args:
        y_true: Actual target values.
        y_pred: Predicted target values, in the same order as ``y_true``.
        metrics: Mapping with 'MAE', 'RMSE', 'MAPE' and 'R2' entries, rendered in the subtitle
            ('MAPE' is multiplied by 100 for display).
        title: HTML snippet placed at the start of the figure title.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    results = pd.DataFrame(data={'y_true': y_true, 'y_pred': y_pred})

    subtitle = '<span style="font-size: 15px;">Porównanie cen rzeczywistych z predykcjami</span><br>'
    metrics_line = (
        '<span style="font-size: 11px;">'
        f'MAE = {metrics["MAE"]:.2f},   RMSE = {metrics["RMSE"]:.2f}'
        f',   MAPE = {metrics["MAPE"] * 100:.2f}%,   R2 = {metrics["R2"]:.2f}</span>'
    )
    final_title = f'{title}{subtitle}{metrics_line}'

    # Endpoints of the y = x reference line: from the origin up to the largest actual value.
    diagonal = [0, results['y_true'].max()]

    fig = px.scatter(
        results,
        x='y_true',
        y='y_pred',
        opacity=0.3,
        height=650,
        range_x=[0, 1e6],
        range_y=[0, 1e6],
        labels={'y_true': 'Cena rzeczywista [PLN]', 'y_pred': 'Cena estymowana [PLN]'},
    )
    reference_line = go.Scatter(
        x=diagonal,
        y=diagonal,
        mode='lines',
        line=go.scatter.Line(color='red', width=3),
        showlegend=False,
    )
    fig.add_trace(reference_line)
    fig.update_layout(title=dict(text=final_title, x=0.5), margin=dict(t=100))

    fig.show()


def plot_error_histogram(y_true, y_pred, title):
    """Show a histogram of prediction errors, computed as ``y_true - y_pred``.

    Args:
        y_true: Actual target values.
        y_pred: Predicted target values, in the same order as ``y_true``.
        title: HTML snippet placed at the start of the figure title.

    Returns:
        None. The figure is displayed via ``fig.show()``; the x axis is limited to
        the range [-50000, 50000].
    """
    results = pd.DataFrame(data={'y_true': y_true, 'y_pred': y_pred}).assign(
        error=lambda frame: frame['y_true'] - frame['y_pred']
    )

    subtitle = '<span style="font-size: 15px;">Histogram błędów predykcji</span><br>'
    final_title = f'{title}{subtitle}'

    fig = px.histogram(results, x='error', range_x=[-50000, 50000], height=650)
    fig.update_layout(
        title=dict(text=final_title, x=0.5),
        margin=dict(t=100),
        xaxis_title='Błąd predykcji [PLN]',
        yaxis_title='Liczebność',
    )

    fig.show()
            

def plot_learning_curve(eval_results, best_n_estimators, title):
    """Plot per-round RMSE curves and mark the chosen number of estimators.

    Args:
        eval_results: Nested mapping read as ``eval_results[<set>]['rmse']`` for the keys
            'validation_0' (plotted with the training-set label) and 'validation_1'
            (plotted with the validation-set label).
        best_n_estimators: x position of the vertical marker line.
        title: First line of the plot title.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    # Boosting rounds are numbered from 1 on the x axis.
    rounds = list(range(1, len(eval_results['validation_0']['rmse']) + 1))

    sns.set_style('whitegrid')
    plt.figure(figsize=(11, 6))

    curves = (
        ('validation_0', 'Zbiór treningowy'),
        ('validation_1', 'Zbiór walidacyjny'),
    )
    for result_key, legend_label in curves:
        plt.plot(rounds, eval_results[result_key]['rmse'], label=legend_label)

    plt.axvline(best_n_estimators, color='black', label=f'Optymalna liczba estymatorów = {best_n_estimators}')
    plt.xlabel('Liczba estymatorów (n_estimators)')
    plt.ylabel('Wartość funkcji kosztu (RMSE)')
    plt.legend()
    plt.title(f'{title}\nKrzywa uczenia się', fontsize=20)
    plt.show()

Function for hyperparameter optimization

In [16]:
def objective(trial: optuna.Trial):
    """Optuna objective: train an XGBoost regressor and score it on the validation split.

    Args:
        trial: Optuna trial used to sample ``n_estimators``, ``max_depth``,
            ``learning_rate``, ``subsample``, ``colsample_bytree`` and ``gamma``.

    Returns:
        Mean absolute error of the fitted model on ``(X_val, y_val)``.
    """
    # Settings shared by every trial.
    fixed_params = dict(
        objective='reg:squarederror',
        eval_metric='rmse',
        random_state=random_state,
        tree_method='gpu_hist',
        gpu_id=0,
    )
    # Search space; the suggest calls are kept in a fixed order.
    sampled_params = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 1000),
        max_depth=trial.suggest_int('max_depth', 5, 17),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.6),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        gamma=trial.suggest_float('gamma', 0.0, 5.0),
    )

    regressor = XGBRegressor(**fixed_params, **sampled_params)
    regressor.fit(X_train, y_train, verbose=False)

    return mean_absolute_error(y_val, regressor.predict(X_val))

In [17]:
# Seed the sampler so repeated runs propose the same sequence of trials.
# TPESampler is Optuna's default sampler, so only the seeding changes here.
# NOTE(review): GPU training may still introduce small run-to-run differences
# in the objective values - confirm if exact reproducibility is required.
study = optuna.create_study(
    study_name='XGBoost tuning',
    direction='minimize',  # the objective returns validation MAE, lower is better
    sampler=optuna.samplers.TPESampler(seed=random_state),
)
study.optimize(objective, n_trials=250)

[32m[I 2023-05-28 17:29:07,903][0m A new study created in memory with name: XGBoost tuning[0m
[32m[I 2023-05-28 17:29:28,564][0m Trial 0 finished with value: 10311.36670036319 and parameters: {'n_estimators': 512, 'max_depth': 10, 'learning_rate': 0.450529445031649, 'subsample': 0.8946200001738192, 'colsample_bytree': 0.8097876598211103, 'gamma': 0.3037594660271675}. Best is trial 0 with value: 10311.36670036319.[0m
[32m[I 2023-05-28 17:30:26,639][0m Trial 1 finished with value: 9771.78193358353 and parameters: {'n_estimators': 657, 'max_depth': 15, 'learning_rate': 0.2357690672085827, 'subsample': 0.7805901776847257, 'colsample_bytree': 0.7308314573868364, 'gamma': 2.9303076847536746}. Best is trial 1 with value: 9771.78193358353.[0m
[32m[I 2023-05-28 17:31:23,252][0m Trial 2 finished with value: 10253.880589178403 and parameters: {'n_estimators': 716, 'max_depth': 14, 'learning_rate': 0.33977824653792066, 'subsample': 0.7080092333133463, 'colsample_bytree': 0.8617389164457

[32m[I 2023-05-28 17:58:06,511][0m Trial 25 finished with value: 8728.024082318592 and parameters: {'n_estimators': 932, 'max_depth': 16, 'learning_rate': 0.0633198363523451, 'subsample': 0.7417269427820228, 'colsample_bytree': 0.6433024247294993, 'gamma': 3.026186905235239}. Best is trial 23 with value: 8672.462807807866.[0m
[32m[I 2023-05-28 17:59:44,055][0m Trial 26 finished with value: 8829.131207447843 and parameters: {'n_estimators': 921, 'max_depth': 16, 'learning_rate': 0.06352049024786541, 'subsample': 0.7288126557863148, 'colsample_bytree': 0.6000336365978606, 'gamma': 3.1247812514006106}. Best is trial 23 with value: 8672.462807807866.[0m
[32m[I 2023-05-28 18:01:22,458][0m Trial 27 finished with value: 8734.204906653049 and parameters: {'n_estimators': 935, 'max_depth': 16, 'learning_rate': 0.06006307204294442, 'subsample': 0.6953119142521921, 'colsample_bytree': 0.6394393545275742, 'gamma': 3.409111383913012}. Best is trial 23 with value: 8672.462807807866.[0m
[32

[32m[I 2023-05-28 18:29:00,409][0m Trial 50 finished with value: 8801.657434078821 and parameters: {'n_estimators': 907, 'max_depth': 13, 'learning_rate': 0.09420698996282308, 'subsample': 0.8118270749778881, 'colsample_bytree': 0.72262164394629, 'gamma': 4.0377673369916245}. Best is trial 44 with value: 8658.243998285468.[0m
[32m[I 2023-05-28 18:30:40,839][0m Trial 51 finished with value: 8697.364906566347 and parameters: {'n_estimators': 962, 'max_depth': 15, 'learning_rate': 0.030125983641755113, 'subsample': 0.711842857378066, 'colsample_bytree': 0.7083018352744291, 'gamma': 3.6326116031020725}. Best is trial 44 with value: 8658.243998285468.[0m
[32m[I 2023-05-28 18:33:10,281][0m Trial 52 finished with value: 8661.032680103577 and parameters: {'n_estimators': 966, 'max_depth': 17, 'learning_rate': 0.034552296302717445, 'subsample': 0.7371622450548971, 'colsample_bytree': 0.7025452434141434, 'gamma': 4.435012382058245}. Best is trial 44 with value: 8658.243998285468.[0m
[3

[32m[I 2023-05-28 19:20:25,333][0m Trial 75 finished with value: 8785.173583903026 and parameters: {'n_estimators': 933, 'max_depth': 16, 'learning_rate': 0.06682750256962282, 'subsample': 0.7298655370942266, 'colsample_bytree': 0.6602830344725855, 'gamma': 3.2427854951817725}. Best is trial 72 with value: 8630.534917388979.[0m
[32m[I 2023-05-28 19:21:09,139][0m Trial 76 finished with value: 8953.16321417488 and parameters: {'n_estimators': 977, 'max_depth': 11, 'learning_rate': 0.04623171573539393, 'subsample': 0.7406909941827223, 'colsample_bytree': 0.7270558953185496, 'gamma': 3.181637764497327}. Best is trial 72 with value: 8630.534917388979.[0m
[32m[I 2023-05-28 19:22:42,307][0m Trial 77 finished with value: 8852.493961143931 and parameters: {'n_estimators': 869, 'max_depth': 16, 'learning_rate': 0.08027303142308405, 'subsample': 0.7633090590930972, 'colsample_bytree': 0.6465044426966889, 'gamma': 3.779063083486425}. Best is trial 72 with value: 8630.534917388979.[0m
[32

[32m[I 2023-05-28 20:08:26,467][0m Trial 100 finished with value: 8969.811257086216 and parameters: {'n_estimators': 978, 'max_depth': 17, 'learning_rate': 0.09105235444633221, 'subsample': 0.7029423180494884, 'colsample_bytree': 0.6215723632429864, 'gamma': 3.994748520621817}. Best is trial 72 with value: 8630.534917388979.[0m
[32m[I 2023-05-28 20:10:39,843][0m Trial 101 finished with value: 8718.484304488184 and parameters: {'n_estimators': 964, 'max_depth': 17, 'learning_rate': 0.03655763847226545, 'subsample': 0.7116975310709676, 'colsample_bytree': 0.6008352606382906, 'gamma': 3.81047412115473}. Best is trial 72 with value: 8630.534917388979.[0m
[32m[I 2023-05-28 20:13:19,497][0m Trial 102 finished with value: 8709.9868341879 and parameters: {'n_estimators': 945, 'max_depth': 17, 'learning_rate': 0.01839095101762333, 'subsample': 0.6821813267277647, 'colsample_bytree': 0.637229468207852, 'gamma': 4.201876057556929}. Best is trial 72 with value: 8630.534917388979.[0m
[32m

[32m[I 2023-05-28 20:56:41,426][0m Trial 125 finished with value: 8711.270102964168 and parameters: {'n_estimators': 997, 'max_depth': 17, 'learning_rate': 0.047747534483661314, 'subsample': 0.7173338880326615, 'colsample_bytree': 0.6397925200575081, 'gamma': 4.0662434032100965}. Best is trial 103 with value: 8613.917725341347.[0m
[32m[I 2023-05-28 20:58:20,504][0m Trial 126 finished with value: 8681.219249441372 and parameters: {'n_estimators': 941, 'max_depth': 16, 'learning_rate': 0.059036596972373144, 'subsample': 0.6766635342536096, 'colsample_bytree': 0.6669433597364434, 'gamma': 4.168914246427671}. Best is trial 103 with value: 8613.917725341347.[0m
[32m[I 2023-05-28 21:02:09,022][0m Trial 127 finished with value: 8853.060670401284 and parameters: {'n_estimators': 979, 'max_depth': 17, 'learning_rate': 0.010043549285807682, 'subsample': 0.7516779406427908, 'colsample_bytree': 0.6168508351575034, 'gamma': 4.7426906388198145}. Best is trial 103 with value: 8613.91772534134

[32m[I 2023-05-28 21:50:44,921][0m Trial 150 finished with value: 8648.06147094243 and parameters: {'n_estimators': 962, 'max_depth': 17, 'learning_rate': 0.03902886028788954, 'subsample': 0.7060618795675384, 'colsample_bytree': 0.6623747436303401, 'gamma': 3.6107605716781057}. Best is trial 103 with value: 8613.917725341347.[0m
[32m[I 2023-05-28 21:52:33,693][0m Trial 151 finished with value: 8708.641654176272 and parameters: {'n_estimators': 679, 'max_depth': 17, 'learning_rate': 0.03536558855118514, 'subsample': 0.7029682973076717, 'colsample_bytree': 0.6657808831207204, 'gamma': 3.9558527492797904}. Best is trial 103 with value: 8613.917725341347.[0m
[32m[I 2023-05-28 21:54:48,791][0m Trial 152 finished with value: 8682.105013990962 and parameters: {'n_estimators': 963, 'max_depth': 17, 'learning_rate': 0.04079436926457657, 'subsample': 0.6951357548734711, 'colsample_bytree': 0.7000997563575492, 'gamma': 3.6155363964126788}. Best is trial 103 with value: 8613.917725341347.

[32m[I 2023-05-28 22:47:09,110][0m Trial 175 finished with value: 8694.271982390315 and parameters: {'n_estimators': 964, 'max_depth': 17, 'learning_rate': 0.01970522761062501, 'subsample': 0.7095216060597954, 'colsample_bytree': 0.6811188259081734, 'gamma': 4.359081502757504}. Best is trial 103 with value: 8613.917725341347.[0m
[32m[I 2023-05-28 22:49:29,563][0m Trial 176 finished with value: 8662.966354658654 and parameters: {'n_estimators': 984, 'max_depth': 17, 'learning_rate': 0.03072576471203374, 'subsample': 0.6872035561893857, 'colsample_bytree': 0.6598429944439681, 'gamma': 4.06333233546613}. Best is trial 103 with value: 8613.917725341347.[0m
[32m[I 2023-05-28 22:50:33,992][0m Trial 177 finished with value: 9199.40312180934 and parameters: {'n_estimators': 486, 'max_depth': 17, 'learning_rate': 0.12746950878275526, 'subsample': 0.736489503794877, 'colsample_bytree': 0.6954521663149321, 'gamma': 3.7232324586546475}. Best is trial 103 with value: 8613.917725341347.[0m


[32m[I 2023-05-28 23:42:59,473][0m Trial 200 finished with value: 8658.60700275176 and parameters: {'n_estimators': 982, 'max_depth': 17, 'learning_rate': 0.03145573403967134, 'subsample': 0.6755705398625267, 'colsample_bytree': 0.6564347532996243, 'gamma': 3.375749949267737}. Best is trial 103 with value: 8613.917725341347.[0m
[32m[I 2023-05-28 23:45:39,219][0m Trial 201 finished with value: 8656.058283942817 and parameters: {'n_estimators': 998, 'max_depth': 17, 'learning_rate': 0.021289655343907653, 'subsample': 0.6779922922907203, 'colsample_bytree': 0.6729503964776302, 'gamma': 3.1246218789920537}. Best is trial 103 with value: 8613.917725341347.[0m
[32m[I 2023-05-28 23:47:58,779][0m Trial 202 finished with value: 8584.631689728147 and parameters: {'n_estimators': 999, 'max_depth': 17, 'learning_rate': 0.032615987565158135, 'subsample': 0.6799538235573898, 'colsample_bytree': 0.6639186932157412, 'gamma': 3.552061094118031}. Best is trial 202 with value: 8584.631689728147.

[32m[I 2023-05-29 00:34:12,613][0m Trial 225 finished with value: 8610.590770356142 and parameters: {'n_estimators': 1000, 'max_depth': 17, 'learning_rate': 0.03424651802912876, 'subsample': 0.6668911774554702, 'colsample_bytree': 0.6699524473203846, 'gamma': 3.7344634062017743}. Best is trial 202 with value: 8584.631689728147.[0m
[32m[I 2023-05-29 00:36:14,193][0m Trial 226 finished with value: 8707.742171211738 and parameters: {'n_estimators': 996, 'max_depth': 17, 'learning_rate': 0.05920617271407771, 'subsample': 0.6691658289451421, 'colsample_bytree': 0.6798754382830061, 'gamma': 3.7419971513926122}. Best is trial 202 with value: 8584.631689728147.[0m
[32m[I 2023-05-29 00:38:08,781][0m Trial 227 finished with value: 9330.972438890833 and parameters: {'n_estimators': 1000, 'max_depth': 17, 'learning_rate': 0.132686310971474, 'subsample': 0.6565398510234746, 'colsample_bytree': 0.6409118087428171, 'gamma': 3.8990482788094334}. Best is trial 202 with value: 8584.631689728147.

In [18]:
# Hyperparameters of the best trial found by the study (the one with the lowest objective value).
best_params = study.best_params
best_params

{'n_estimators': 999,
 'max_depth': 17,
 'learning_rate': 0.032615987565158135,
 'subsample': 0.6799538235573898,
 'colsample_bytree': 0.6639186932157412,
 'gamma': 3.552061094118031}

In [19]:
# Fixed settings, combined below with the tuned hyperparameters from the study.
base_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=random_state,
    tree_method='gpu_hist',
    gpu_id=0,
)

model_improved = XGBRegressor(**base_params, **best_params)

# Time only the fit call itself.
start_time = default_timer()
model_improved.fit(X_train, y_train, verbose=False)
stop_time = default_timer()

elapsed = timedelta(seconds=stop_time - start_time)
print(f'Training time: {elapsed}')

Training time: 0:02:17.943242


In [22]:
# Make sure the target directory exists so a finished training run is not lost
# to a missing folder on a fresh checkout.
os.makedirs(models_path, exist_ok=True)

# Use a context manager so the file handle is flushed and closed deterministically
# (the bare `open(...)` passed to `pickle.dump` was never closed).
# NOTE(review): only the model is saved here; the fitted `preprocessor_improved`
# is not persisted in the visible cells and is needed to transform new data.
with open(os.path.join(models_path, 'model_improved.pickle'), 'wb') as model_file:
    pickle.dump(model_improved, model_file)

Feature importances

In [27]:
# Indices of the 25 largest importances, in ascending order of importance.
importances = model_improved.feature_importances_
top_25_feature_idx = importances.argsort()[-25:]
feature_importances = importances[top_25_feature_idx]
feature_names = X_train.columns[top_25_feature_idx]

title = '<span style="font-size: 20px;">Najbardziej istotne cechy wejściowe w finalnym modelu</span>'

fig = px.bar(
    x=feature_importances,
    y=feature_names,
    orientation='h',
    height=650,
    labels={'x': 'Istotność', 'y': 'Nazwa cechy wejściowej'},
)
fig.update_layout(title=dict(text=title, x=0.5), margin=dict(t=100))
fig.show()

### Model testing

In [28]:
# Score the final model on the held-out test split.
y_pred = model_improved.predict(X_test)
final_metrics = calculate_metrics(y_test, y_pred)

# Shared HTML heading for both diagnostic figures.
title = '<span style="font-size: 20px;"><b>Finalny model (zbiór testowy)</b></span><br>'

plot_actual_vs_predicted(y_true=y_test, y_pred=y_pred, metrics=final_metrics, title=title)
plot_error_histogram(y_true=y_test, y_pred=y_pred, title=title)