# 3.8 Poszukiwanie najlepszej konfiguracji drzewa regresji.

In [1]:
import pandas as pd

In [2]:
boston_df = pd.read_parquet("../data/boston_df.parquet")
boston_df.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [4]:
X = boston_df.drop(columns="MEDV")
y = boston_df["MEDV"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=421)

In [12]:
dt = DecisionTreeRegressor(criterion="squared_error")
dt.fit(X_train, y_train)

## Dobór progu dla mechanizmu przycinania drzewa

In [13]:
path = dt.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, _ = path.ccp_alphas, path.impurities
ccp_alphas.min(), ccp_alphas.max()

(0.0, 37.658152473190086)

In [14]:
path

{'ccp_alphas': array([0.00000000e+00, 2.81403064e-16, 2.81403064e-16, 8.44209191e-16,
        8.44209191e-16, 1.40701532e-15, 1.23762376e-05, 1.23762376e-05,
        1.23762376e-05, 1.23762376e-05, 1.23762376e-05, 1.23762376e-05,
        1.23762376e-05, 1.23762376e-05, 1.23762376e-05, 1.23762376e-05,
        1.23762376e-05, 1.23762376e-05, 1.23762376e-05, 1.23762376e-05,
        1.23762376e-05, 1.23762376e-05, 1.23762376e-05, 1.23762376e-05,
        1.23762376e-05, 1.23762376e-05, 1.23762376e-05, 1.23762376e-05,
        1.23762376e-05, 1.23762376e-05, 1.23762376e-05, 1.23762376e-05,
        1.23762376e-05, 1.23762376e-05, 1.23762376e-05, 1.65016502e-05,
        1.65016502e-05, 1.65016502e-05, 3.71287129e-05, 3.71287129e-05,
        3.71287129e-05, 4.95049505e-05, 4.95049505e-05, 4.95049505e-05,
        4.95049505e-05, 4.95049505e-05, 4.95049505e-05, 4.95049505e-05,
        4.95049505e-05, 4.95049505e-05, 4.95049505e-05, 4.95049505e-05,
        4.95049505e-05, 4.95049505e-05, 4.95049505

In [15]:
_

array([1.50814454e-14, 1.53628485e-14, 1.56442516e-14, 1.64884608e-14,
       1.73326700e-14, 1.87396853e-14, 1.23762376e-05, 2.47524753e-05,
       3.71287129e-05, 4.95049505e-05, 6.18811881e-05, 7.42574258e-05,
       8.66336634e-05, 9.90099010e-05, 1.11386139e-04, 1.23762376e-04,
       1.36138614e-04, 1.48514851e-04, 1.60891089e-04, 1.73267327e-04,
       1.85643564e-04, 1.98019802e-04, 2.10396040e-04, 2.22772277e-04,
       2.35148515e-04, 2.47524752e-04, 2.59900990e-04, 2.72277228e-04,
       2.84653465e-04, 2.97029703e-04, 3.09405941e-04, 3.21782178e-04,
       3.34158416e-04, 3.46534653e-04, 3.58910891e-04, 3.75412541e-04,
       3.91914191e-04, 4.08415842e-04, 4.45544554e-04, 4.82673267e-04,
       5.19801980e-04, 5.69306931e-04, 6.18811881e-04, 6.68316832e-04,
       7.17821782e-04, 7.67326733e-04, 8.16831683e-04, 8.66336634e-04,
       9.15841584e-04, 9.65346535e-04, 1.01485149e-03, 1.06435644e-03,
       1.11386139e-03, 1.16336634e-03, 1.21287129e-03, 1.26237624e-03,
      

### Poszukiwanie najlepszej konfiguracji drzewa

In [16]:
import numpy as np

In [17]:
from sklearn.model_selection import GridSearchCV

In [21]:
param_grid = {
    "criterion": ["squared_error", "absolute_error"],
    "splitter": ["best", "random"],
    "max_depth": range(2, 21),
    "min_samples_split": [2, 4, 8, 16],
    "min_samples_leaf": [1, 2, 4, 8],
    "max_features": [None, "sqrt", "log2"],
    "ccp_alpha": np.linspace(ccp_alphas.min(), ccp_alphas.max(), 30)
}

In [22]:
cv = GridSearchCV(DecisionTreeRegressor(random_state=462),
                  param_grid=param_grid, n_jobs=-1, 
                  scoring=["neg_mean_squared_error", "neg_mean_absolute_error"],
                  verbose=1, cv=5, return_train_score=True,
                  refit="neg_mean_squared_error")

In [23]:
cv.fit(X, y)

Fitting 5 folds for each of 109440 candidates, totalling 547200 fits


In [24]:
cv

In [25]:
cv.best_estimator_

In [26]:
cv.best_index_

2328

In [27]:
cv.best_params_

{'ccp_alpha': 0.0,
 'criterion': 'absolute_error',
 'max_depth': 7,
 'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'splitter': 'best'}

In [28]:
cv.best_score_

-25.44205066977286

In [29]:
results_df = pd.DataFrame(cv.cv_results_)

In [30]:
results_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,...,mean_test_neg_mean_absolute_error,std_test_neg_mean_absolute_error,rank_test_neg_mean_absolute_error,split0_train_neg_mean_absolute_error,split1_train_neg_mean_absolute_error,split2_train_neg_mean_absolute_error,split3_train_neg_mean_absolute_error,split4_train_neg_mean_absolute_error,mean_train_neg_mean_absolute_error,std_train_neg_mean_absolute_error
0,0.003646,0.000272,0.002804,0.000358,0.0,squared_error,2,,1,2,...,-4.556823,0.800674,4514,-3.893596,-3.658659,-3.53288,-3.183223,-3.426479,-3.538967,0.236361
1,0.003255,0.000895,0.002421,0.000257,0.0,squared_error,2,,1,2,...,-4.660974,1.123492,5593,-4.305616,-4.088143,-4.14213,-4.279994,-4.188513,-4.200879,0.081906
2,0.003549,0.000127,0.002362,0.000117,0.0,squared_error,2,,1,4,...,-4.556823,0.800674,4514,-3.893596,-3.658659,-3.53288,-3.183223,-3.426479,-3.538967,0.236361
3,0.002729,0.000163,0.002228,0.000149,0.0,squared_error,2,,1,4,...,-4.660974,1.123492,5593,-4.305616,-4.088143,-4.14213,-4.279994,-4.188513,-4.200879,0.081906
4,0.003252,0.000205,0.002234,9.9e-05,0.0,squared_error,2,,1,8,...,-4.556823,0.800674,4514,-3.893596,-3.658659,-3.53288,-3.183223,-3.426479,-3.538967,0.236361


In [31]:
results_df.iloc[cv.best_index_]

mean_fit_time                                                                     0.02246
std_fit_time                                                                     0.001856
mean_score_time                                                                    0.0021
std_score_time                                                                   0.000124
param_ccp_alpha                                                                       0.0
param_criterion                                                            absolute_error
param_max_depth                                                                         7
param_max_features                                                                   None
param_min_samples_leaf                                                                  8
param_min_samples_split                                                                 2
param_splitter                                                                       best
params    

### Najlepsza konfiguracja drzewa

In [32]:
results_df.iloc[cv.best_index_].params

{'ccp_alpha': 0.0,
 'criterion': 'absolute_error',
 'max_depth': 7,
 'max_features': None,
 'min_samples_leaf': 8,
 'min_samples_split': 2,
 'splitter': 'best'}

In [33]:
cv.best_estimator_.get_depth()

7

In [34]:
import joblib

In [35]:
joblib.dump(cv.best_estimator_, "../model/house_prices_dtree.joblib")

['../model/house_prices_dtree.joblib']