Импорт библиотек и загрузка датасета

In [126]:
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

warnings.filterwarnings("ignore")

In [127]:
data = pd.read_csv(r'../data/car-evaluation_preprocessed.csv', sep=',')

In [128]:
data

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,4,4,2,2,1,1,1
1,4,4,2,2,1,2,1
2,4,4,2,2,1,3,1
3,4,4,2,2,2,1,1
4,4,4,2,2,2,2,1
...,...,...,...,...,...,...,...
1723,1,1,5,5,2,2,3
1724,1,1,5,5,2,3,4
1725,1,1,5,5,3,1,1
1726,1,1,5,5,3,2,3


In [129]:
X = data.iloc[:, :-1].values
y = data.iloc[:, 6].values

In [130]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [131]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [132]:
def print_classification_model_metrics(estimator, y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(estimator.score(X_test, y_test))

# DTC

In [133]:
%%time
parameters_dtc = {'max_depth': np.arange(1,20,1)}
dtc = DecisionTreeClassifier()
dtc_base = GridSearchCV(dtc, parameters_dtc).fit(X_train, y_train)
dtc_base.best_params_

Wall time: 300 ms


{'max_depth': 11}

In [134]:
print_classification_model_metrics(dtc_base, y_test, dtc_base.predict(X_test))

[[236   1   0   0]
 [  5  73   2   2]
 [  0   0  14   0]
 [  0   1   0  12]]
              precision    recall  f1-score   support

           1       0.98      1.00      0.99       237
           2       0.97      0.89      0.93        82
           3       0.88      1.00      0.93        14
           4       0.86      0.92      0.89        13

    accuracy                           0.97       346
   macro avg       0.92      0.95      0.93       346
weighted avg       0.97      0.97      0.97       346

0.9682080924855492


ensemble params

In [135]:
parameters_ensemble = {'n_estimators': np.arange(0,100,5),
                       'max_features': np.arange(1,24,3)}

# Бэггинг

In [136]:
%%time
bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=dtc_base.best_params_['max_depth']))
model = GridSearchCV(bag, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 25.3 s


{'max_features': 4, 'n_estimators': 25}

In [137]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[237   0   0   0]
 [ 26  56   0   0]
 [  1   9   2   2]
 [  1   4   0   8]]
              precision    recall  f1-score   support

           1       0.89      1.00      0.94       237
           2       0.81      0.68      0.74        82
           3       1.00      0.14      0.25        14
           4       0.80      0.62      0.70        13

    accuracy                           0.88       346
   macro avg       0.88      0.61      0.66       346
weighted avg       0.88      0.88      0.86       346

0.8757225433526011
