# 4.9 Testowanie możliwości stackingu

In [19]:
import pandas as pd

In [20]:
# Load the Boston housing data set from the project's data directory.
boston_df = pd.read_parquet("../data/boston_df.parquet")
# Seeded preview so the displayed rows are identical on every run.
boston_df.sample(5, random_state=253)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
23,0.98843,0.0,8.14,0.0,0.538,5.813,100.0,4.0952,4.0,307.0,21.0,394.54,19.88,14.5
245,0.19133,22.0,5.86,0.0,0.431,5.605,70.2,7.9549,7.0,330.0,19.1,389.13,18.46,18.5
18,0.80271,0.0,8.14,0.0,0.538,5.456,36.6,3.7965,4.0,307.0,21.0,288.99,11.69,20.2
230,0.537,0.0,6.2,0.0,0.504,5.981,68.1,3.6715,8.0,307.0,17.4,378.35,11.65,24.3
494,0.27957,0.0,9.69,0.0,0.585,5.926,42.6,2.3817,6.0,391.0,19.2,396.9,13.59,24.5


In [21]:
# Separate the regression target (MEDV) from the remaining feature columns.
target_column = "MEDV"
y = boston_df[target_column]
X = boston_df.drop(columns=[target_column])

## Tworzenie bazowych regresorów

In [22]:
from sklearn.tree import DecisionTreeRegressor

In [23]:
# Hyperparameters for the single decision tree (presumably the winners of
# earlier tuning — confirm against the previous chapters). The same dict is
# reused for the Extra Trees model, so it deliberately holds no seed.
dt_best_params = {
    "ccp_alpha": 0.0,
    "criterion": "absolute_error",
    "max_depth": 7,
    "max_features": None,
    "min_samples_leaf": 8,
    "min_samples_split": 2,
}
# scikit-learn permutes features at every split, so equally good splits can be
# resolved differently between runs; a fixed seed keeps the fit reproducible.
dt = DecisionTreeRegressor(random_state=253, **dt_best_params)

In [24]:
from sklearn.ensemble import ExtraTreesRegressor

In [25]:
# Extra Trees ensemble of 10 trees reusing the decision-tree hyperparameters.
# Split thresholds are drawn at random, so the seed is fixed to make the
# stacking results reproducible after a kernel restart.
et = ExtraTreesRegressor(n_estimators=10, random_state=253, **dt_best_params)

In [26]:
from xgboost import XGBRegressor

In [27]:
# XGBoost regressor with library defaults; only the random seed is fixed.
xgb = XGBRegressor(random_state=253)

Mamy już podstawowe drzewo decyzyjne, model Extra Trees oraz XGBoost, ponieważ to te trzy metody dały nam jak do tej pory najlepsze rezultaty. Biblioteka scikit-learn ma w zanadrzu jeszcze kilka modeli regresji, które potraktujemy trochę narzędziowo i dołączymy do listy estymatorów w stackingu.

In [28]:
from sklearn.linear_model import LinearRegression

In [29]:
# Linear regression with default settings.
lr = LinearRegression()

In [30]:
from sklearn.svm import SVR

In [31]:
# Support vector regressor with default settings.
# NOTE(review): no feature scaling is applied in the visible cells, and SVR is
# usually sensitive to feature scale — confirm whether a scaler is needed.
svm = SVR()

## Tworzenie modelu stackingu
Jak wspomnieliśmy, stacking pozwala nam nauczyć pewne bazowe modele, a następnie wykorzystuje ich wyniki do finalnej predykcji (w naszym przypadku regresji). Możemy więc przetestować różne konfiguracje, co też uczynimy.

In [32]:
# Label each base model once; the labels become the step names inside the
# StackingRegressor.
named_estimators = [
    ("decision_tree", dt),
    ("extra_trees", et),
    ("xgboost", xgb),
    ("linear_regression", lr),
    ("support_vector_machine", svm),
]
# Plain list of the same model objects, in the same order, used as the
# candidates for the final estimator.
estimators = [model for _, model in named_estimators]

In [33]:
from sklearn.ensemble import StackingRegressor

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
# The only searched dimension is the meta-learner: every base model is tried
# in turn as the stacking final estimator.
param_grid = dict(final_estimator=estimators)

In [36]:
# Grid search over the stacking meta-learner with 5-fold cross-validation.
# Both MSE and MAE are recorded; the best candidate is chosen and refitted
# according to the (negated) mean squared error.
stacking_model = StackingRegressor(estimators=named_estimators)
scoring_metrics = ["neg_mean_squared_error", "neg_mean_absolute_error"]
cv = GridSearchCV(
    stacking_model,
    param_grid=param_grid,
    scoring=scoring_metrics,
    refit="neg_mean_squared_error",
    cv=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)
cv.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [38]:
# Show the refitted best stacking model together with its mean CV score
# (negative MSE, since the search refits on "neg_mean_squared_error").
cv.best_estimator_, cv.best_score_

(StackingRegressor(estimators=[('decision_tree',
                                DecisionTreeRegressor(criterion='absolute_error',
                                                      max_depth=7,
                                                      min_samples_leaf=8)),
                               ('extra_trees',
                                ExtraTreesRegressor(criterion='absolute_error',
                                                    max_depth=7,
                                                    max_features=None,
                                                    min_samples_leaf=8,
                                                    n_estimators=10)),
                               ('xgboost',
                                XGBRegressor(base_score=None, booster=None,
                                             callbacks=None,
                                             colsample_bylevel=None,
                                             c...
                     

In [39]:
# Tabulate the grid-search results: one row per final-estimator candidate.
results_df = pd.DataFrame(cv.cv_results_)
results_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_final_estimator,params,split0_test_neg_mean_squared_error,split1_test_neg_mean_squared_error,split2_test_neg_mean_squared_error,split3_test_neg_mean_squared_error,...,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error,rank_test_neg_mean_absolute_error,split0_train_neg_mean_absolute_error,split1_train_neg_mean_absolute_error,split2_train_neg_mean_absolute_error,split3_train_neg_mean_absolute_error,split4_train_neg_mean_absolute_error,mean_train_neg_mean_absolute_error,std_train_neg_mean_absolute_error
0,2.539831,0.068764,0.019078,0.000906,DecisionTreeRegressor(criterion='absolute_erro...,{'final_estimator': DecisionTreeRegressor(crit...,-12.206691,-22.255074,-26.982921,-58.925495,...,-3.580583,0.603024,3,-2.215842,-2.051852,-2.162469,-1.992099,-2.261111,-2.136674,0.100513
1,2.538151,0.072013,0.019607,0.000727,ExtraTreesRegressor(criterion='absolute_error'...,{'final_estimator': ExtraTreesRegressor(criter...,-8.591727,-18.427336,-26.033492,-57.059759,...,-3.238065,0.671394,2,-2.061015,-1.750383,-1.839185,-1.653889,-1.536988,-1.768292,0.177552
2,2.415246,0.185675,0.018886,0.002674,"XGBRegressor(base_score=None, booster=None, ca...",{'final_estimator': XGBRegressor(base_score=No...,-10.887093,-40.318305,-22.584728,-64.108974,...,-4.346879,1.523846,5,-2.823397,-3.203102,-2.806305,-2.277071,-2.645488,-2.751073,0.299479
3,2.162863,0.067644,0.016551,0.003014,LinearRegression(),{'final_estimator': LinearRegression()},-9.551364,-12.216339,-16.638173,-51.576794,...,-3.061745,0.574403,1,-2.165991,-1.263037,-1.238091,-1.260587,-1.247118,-1.434965,0.365626
4,1.974725,0.267652,0.01862,0.003809,SVR(),{'final_estimator': SVR()},-7.809763,-38.443284,-65.849931,-55.1116,...,-3.867136,0.954353,4,-2.362217,-2.345062,-2.342889,-1.990355,-2.496124,-2.307329,0.168414


In [40]:
# Full result row of the winning candidate (selected by the refit metric).
results_df.iloc[cv.best_index_]

mean_fit_time                                                          2.162863
std_fit_time                                                           0.067644
mean_score_time                                                        0.016551
std_score_time                                                         0.003014
param_final_estimator                                        LinearRegression()
params                                  {'final_estimator': LinearRegression()}
split0_test_neg_mean_squared_error                                    -9.551364
split1_test_neg_mean_squared_error                                   -12.216339
split2_test_neg_mean_squared_error                                   -16.638173
split3_test_neg_mean_squared_error                                   -51.576794
split4_test_neg_mean_squared_error                                   -19.398966
mean_test_neg_mean_squared_error                                     -21.876327
std_test_neg_mean_squared_error         

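Okazuje się, że mimo wszystko model oparty wyłącznie na XGBoost najlepiej sprawdza się w przewidywaniu na tym zbiorze danych: najlepszy wariant stackingu (z regresją liniową jako finalnym estymatorem) osiągnął średni błąd MSE w walidacji krzyżowej równy ok. 21,88.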