In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier, BaggingRegressor, \
    GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
warnings.filterwarnings('ignore')

In [7]:
 def print_metrics(estimator, y_test, y_pred):
    print(f"Коэффициент детерминации: {estimator.score(X,y)}")
    print(f'MSE: {mean_squared_error(y_test, y_pred)}')
    print(f'RMSE: {mean_squared_error(y_test, y_pred, squared=False)}')
    print(f'MAE: {mean_absolute_error(y_test, y_pred)}')

# Регрессия

In [4]:
diamonds_data = pd.read_csv("../data/diamonds/diamonds_preprocessed.csv", index_col = 0)
diamonds_data.head()

Unnamed: 0,carat,depth,table,price,x,y,z,cut_Fair,cut_Good,cut_Ideal,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
1,0.23,61.5,55.0,326,3.95,3.98,2.43,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0.21,59.8,61.0,326,3.89,3.84,2.31,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0.23,56.9,65.0,327,4.05,4.07,2.31,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0.29,62.4,58.0,334,4.2,4.23,2.63,0,0,0,...,1,0,0,0,0,0,0,1,0,0
5,0.31,63.3,58.0,335,4.34,4.35,2.75,0,1,0,...,0,1,0,0,0,1,0,0,0,0


In [5]:
y = diamonds_data["price"]
diamonds_data.drop('price', axis =1, inplace = True)
X = diamonds_data
X

Unnamed: 0,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
1,0.23,61.5,55.0,3.95,3.98,2.43,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,0.21,59.8,61.0,3.89,3.84,2.31,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,0.23,56.9,65.0,4.05,4.07,2.31,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.29,62.4,58.0,4.20,4.23,2.63,0,0,0,1,...,1,0,0,0,0,0,0,1,0,0
5,0.31,63.3,58.0,4.34,4.35,2.75,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53936,0.72,60.8,57.0,5.75,5.76,3.50,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
53937,0.72,63.1,55.0,5.69,5.75,3.61,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
53938,0.70,62.8,60.0,5.66,5.68,3.56,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
53939,0.86,61.0,58.0,6.15,6.12,3.74,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## DTR

In [9]:
%%time
params_dtr = {'max_depth': np.arange(5,16,1)}
dtr = DecisionTreeRegressor().fit(X_train, y_train)
dtr_base = GridSearchCV(dtr, params_dtr).fit(X_train, y_train)
print(dtr_base.best_params_)

{'max_depth': 13}
Wall time: 6.49 s


In [10]:
print_metrics(dtr_base, y_test, dtr_base.predict(X_test))

Коэффициент детерминации: 0.9830197224494254
MSE: 445973.79198051215
RMSE: 667.8126922876744
MAE: 340.69410830034536


## BaggingRegressor

In [11]:
params_ensemble = {'n_estimators': np.arange(10,101,20),
                    'max_features': np.arange(1,24,10)}

In [12]:
%%time
br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=dtr_base.best_params_['max_depth']))
model = GridSearchCV(br, params_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 2min 32s


{'max_features': 21, 'n_estimators': 90}

In [13]:
print_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 0.9851505225014893
MSE: 359151.70290012046
RMSE: 599.292668818934
MAE: 308.3713351145448


## GradientBoostingRegressor

In [14]:
%%time
gbr = GradientBoostingRegressor()
model = GridSearchCV(gbr, params_ensemble).fit(X_train, y_train)
model.best_params_

Wall time: 1min 3s


{'max_features': 21, 'n_estimators': 90}

In [15]:
print_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 0.9658753956251825
MSE: 571958.4224508983
RMSE: 756.2793283244613
MAE: 414.51648420283607


## StackingRegressor

In [16]:
 %%time
model = StackingRegressor(estimators=[('br',br), ('gbr',gbr)],
                          final_estimator=dtr_base).fit(X_train, y_train)
print_metrics(model, y_test, model.predict(X_test))

Коэффициент детерминации: 0.9835715058813513
MSE: 372592.40499590983
RMSE: 610.4034772147926
MAE: 312.3384708445437
Wall time: 31.1 s


## Вывод

Композиции могут значительно повлиять на время обучения моделей. Их применение может повысить качество моделей для задачи регрессии