Импорт библиотек и загрузка датасета

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier

warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv(r'../data/car-evaluation_preprocessed.csv', sep=',')

In [3]:
data

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,4,4,2,2,1,1,1
1,4,4,2,2,1,2,1
2,4,4,2,2,1,3,1
3,4,4,2,2,2,1,1
4,4,4,2,2,2,2,1
...,...,...,...,...,...,...,...
1723,1,1,5,5,2,2,3
1724,1,1,5,5,2,3,4
1725,1,1,5,5,3,1,1
1726,1,1,5,5,3,2,3


In [4]:
X = data.iloc[:, :-1].values
y = data.iloc[:, 6].values

In [5]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [6]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
def print_classification_model_metrics(estimator, y_test, y_pred):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print(estimator.score(X_test, y_test))

# DTC

In [8]:
%%time
parameters_dtc = {'max_depth': np.arange(1,20,1)}
dtc = DecisionTreeClassifier()
dtc_base = GridSearchCV(dtc, parameters_dtc).fit(X_train, y_train)
dtc_base.best_params_

Wall time: 236 ms


{'max_depth': 17}

In [9]:
print_classification_model_metrics(dtc_base, y_test, dtc_base.predict(X_test))

[[235   3   0   0]
 [  4  66   3   1]
 [  1   0  16   1]
 [  0   0   0  16]]
              precision    recall  f1-score   support

           1       0.98      0.99      0.98       238
           2       0.96      0.89      0.92        74
           3       0.84      0.89      0.86        18
           4       0.89      1.00      0.94        16

    accuracy                           0.96       346
   macro avg       0.92      0.94      0.93       346
weighted avg       0.96      0.96      0.96       346

0.9624277456647399


ensemble params

In [10]:
parameters_ensemble = {'n_estimators': np.arange(0,100,5),
                       'max_features': np.arange(1,24,3)}

# Бэггинг

In [11]:
%%time
bag = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=dtc_base.best_params_['max_depth']))
model = GridSearchCV(bag, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 25.5 s


{'max_features': 4, 'n_estimators': 45}

In [12]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[238   0   0   0]
 [ 22  52   0   0]
 [  1  17   0   0]
 [  0   9   0   7]]
              precision    recall  f1-score   support

           1       0.91      1.00      0.95       238
           2       0.67      0.70      0.68        74
           3       0.00      0.00      0.00        18
           4       1.00      0.44      0.61        16

    accuracy                           0.86       346
   macro avg       0.64      0.54      0.56       346
weighted avg       0.82      0.86      0.83       346

0.8583815028901735


# Бустинг

In [13]:
%%time
gbc = GradientBoostingClassifier()
model = GridSearchCV(gbc, parameters_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 1min


{'max_features': 4, 'n_estimators': 95}

In [14]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[236   2   0   0]
 [  0  70   3   1]
 [  0   0  18   0]
 [  0   0   0  16]]
              precision    recall  f1-score   support

           1       1.00      0.99      1.00       238
           2       0.97      0.95      0.96        74
           3       0.86      1.00      0.92        18
           4       0.94      1.00      0.97        16

    accuracy                           0.98       346
   macro avg       0.94      0.98      0.96       346
weighted avg       0.98      0.98      0.98       346

0.9826589595375722


# Стекинг

In [15]:
%%time
model = StackingClassifier(estimators=[('bag',bag), ('gbc',gbc)],
                           final_estimator=dtc_base).fit(X_train, y_train)

Wall time: 4.71 s


In [16]:
print_classification_model_metrics(model, y_test, model.predict(X_test))

[[238   0   0   0]
 [  1  72   1   0]
 [  0   1  17   0]
 [  0   1   1  14]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       238
           2       0.97      0.97      0.97        74
           3       0.89      0.94      0.92        18
           4       1.00      0.88      0.93        16

    accuracy                           0.99       346
   macro avg       0.97      0.95      0.96       346
weighted avg       0.99      0.99      0.99       346

0.9855491329479769
