In [8]:
import warnings
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier, BaggingRegressor, \
    GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
warnings.filterwarnings('ignore')

In [15]:
def print_metrics(estimator, y_test, y_pred, X_eval=None):
    """Print a hold-out evaluation summary for a fitted classifier.

    Three things are printed, in this order: the classification report,
    the value of ``estimator.score`` on the evaluation features, and the
    confusion matrix.

    Parameters
    ----------
    estimator : fitted object exposing ``score(X, y)``
    y_test : true labels of the evaluation set
    y_pred : labels predicted for the evaluation set
    X_eval : features of the evaluation set, optional
        When omitted, the notebook-level ``X_test`` is used so that existing
        three-argument calls keep working. Pass it explicitly to avoid
        depending on whatever ``X_test`` currently holds in the kernel.
    """
    if X_eval is None:
        # Backward-compatible fallback to the global defined by the split cell.
        X_eval = X_test
    print(classification_report(y_test, y_pred))
    print(estimator.score(X_eval, y_test))
    print(confusion_matrix(y_test, y_pred))

# Классификация

In [4]:
# Read the preprocessed diabetes table; the first CSV column becomes the index.
diabetes_data = pd.read_csv(
    "../data/diabetes/diabetes_preprocessed.csv",
    index_col=0,
)
# Preview the first rows.
diabetes_data.head()

Unnamed: 0,Age,Polyuria,Polydipsia,Sudden_Weight_Loss,Weakness,Polyphagia,Genital_thrush,Visual_Blurring,Itching,Irritability,Delayed_Healing,Partial_Paresis,Muscle_Stiffness,Alopecia,Obesity,Class,Gender_Female,Gender_Male
0,40,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1,0,1
1,58,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1,0,1
2,41,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1,0,1
3,45,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1,0,1
4,60,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1


In [5]:
# Separate the target column from the features.
# `drop(columns=...)` returns a new frame and leaves `diabetes_data` intact,
# so this cell can be re-run safely; an in-place drop would raise KeyError on
# the second run because 'Class' would already be gone.
y = diabetes_data["Class"]
X = diabetes_data.drop(columns="Class")
X

Unnamed: 0,Age,Polyuria,Polydipsia,Sudden_Weight_Loss,Weakness,Polyphagia,Genital_thrush,Visual_Blurring,Itching,Irritability,Delayed_Healing,Partial_Paresis,Muscle_Stiffness,Alopecia,Obesity,Gender_Female,Gender_Male
0,40,0,1,0,1,0,0,0,1,0,1,0,1,1,1,0,1
1,58,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,1
2,41,1,0,0,1,1,0,0,1,0,1,0,1,1,0,0,1
3,45,0,0,1,1,1,1,0,1,0,1,0,0,0,0,0,1
4,60,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1,0
516,48,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1,0
517,58,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1,0
518,32,0,0,0,1,0,0,1,1,0,1,0,0,1,0,1,0


In [6]:
# Stratified hold-out split (no test_size given, so the library default applies).
# The fixed random_state makes the split identical on every Restart & Run All,
# which keeps the metrics of the models below comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [9]:
# Scale 'Age' with statistics learned from the training rows only, then apply
# the same transformation to the test rows. Fitting the scaler a second time on
# X_test would discard the training fit and leak test-set information.
scaler = MinMaxScaler()
# `assign` builds new frames instead of writing into the split results;
# np.ravel flattens the single-column result into a 1-D column of values.
X_train = X_train.assign(Age=np.ravel(scaler.fit_transform(X_train[['Age']])))
X_test = X_test.assign(Age=np.ravel(scaler.transform(X_test[['Age']])))
# Display the fitted scaler as the cell output.
scaler

MinMaxScaler()

## DTC

In [17]:
%%time
params_dtc = {'max_depth': np.arange(1,20,1)}
dtc = DecisionTreeClassifier()
dtc_base = GridSearchCV(dtc, params_dtc).fit(X_train, y_train)
dtc_base.best_params_

Wall time: 362 ms


{'max_depth': 7}

In [18]:
# Hold-out report for the tuned decision tree.
print_metrics(
    estimator=dtc_base,
    y_test=y_test,
    y_pred=dtc_base.predict(X_test),
)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95        50
           1       0.97      0.96      0.97        80

    accuracy                           0.96       130
   macro avg       0.96      0.96      0.96       130
weighted avg       0.96      0.96      0.96       130

0.9615384615384616
[[48  2]
 [ 3 77]]


## BaggingClassifier

In [22]:
%%time
params_ensemble = {'n_estimators': np.arange(10,101,20),
                    'max_features': np.arange(1,24,10)}
bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=dtc_base.best_params_['max_depth']))
model = GridSearchCV(bagging, params_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 4.06 s


{'max_features': 11, 'n_estimators': 50}

In [21]:
# Hold-out report for the search currently bound to `model`
# (the bagging grid search when the cells are run top to bottom).
print_metrics(
    estimator=model,
    y_test=y_test,
    y_pred=model.predict(X_test),
)

              precision    recall  f1-score   support

           0       0.91      0.98      0.94        50
           1       0.99      0.94      0.96        80

    accuracy                           0.95       130
   macro avg       0.95      0.96      0.95       130
weighted avg       0.96      0.95      0.95       130

0.9538461538461539
[[49  1]
 [ 5 75]]


## GradientBoostingClassifier

In [25]:
%%time
grad = GradientBoostingClassifier()
model = GridSearchCV(grad, params_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 1.62 s


{'max_features': 11, 'n_estimators': 50}

In [26]:
 # Hold-out report for the search currently bound to `model`
 # (the gradient-boosting grid search when the cells are run top to bottom).
 print_metrics(
     estimator=model,
     y_test=y_test,
     y_pred=model.predict(X_test),
 )

              precision    recall  f1-score   support

           0       0.87      0.96      0.91        50
           1       0.97      0.91      0.94        80

    accuracy                           0.93       130
   macro avg       0.92      0.94      0.93       130
weighted avg       0.93      0.93      0.93       130

0.9307692307692308
[[48  2]
 [ 7 73]]


## StackingClassifier

In [27]:
%%time
model = StackingClassifier(estimators=[('bag',bagging), ('gbc',grad)],
                           final_estimator=dtc_base).fit(X_train, y_train)

Wall time: 593 ms


In [29]:
# Hold-out report for the estimator currently bound to `model`
# (the stacking classifier when the cells are run top to bottom).
print_metrics(
    estimator=model,
    y_test=y_test,
    y_pred=model.predict(X_test),
)

              precision    recall  f1-score   support

           0       0.86      0.98      0.92        50
           1       0.99      0.90      0.94        80

    accuracy                           0.93       130
   macro avg       0.92      0.94      0.93       130
weighted avg       0.94      0.93      0.93       130

0.9307692307692308
[[49  1]
 [ 8 72]]


## Вывод
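Композиции могут значительно увеличить время обучения моделей. На представленных выше данных композиции не дали прироста качества: accuracy одиночного дерева решений (0.96) оказалась не ниже, чем у бэггинга (0.95), градиентного бустинга (0.93) и стекинга (0.93).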