In [1]:
import pandas as pd

In [2]:
# Load the dataset from the parquet file stored in the sibling data directory.
boston_df = pd.read_parquet("../data/boston_df.parquet")

# Preview seven random rows. The sample is seeded so that the preview is the
# same on every "Restart & Run All" instead of changing from run to run.
boston_df.sample(7, random_state=0)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
59,0.10328,25.0,5.13,0.0,0.453,5.927,47.2,6.932,8.0,284.0,19.7,396.9,9.22,19.6
27,0.95577,0.0,8.14,0.0,0.538,6.047,88.8,4.4534,4.0,307.0,21.0,306.38,17.28,14.8
455,4.75237,0.0,18.1,0.0,0.713,6.525,86.5,2.4358,24.0,666.0,20.2,50.92,18.13,14.1
134,0.97617,0.0,21.89,0.0,0.624,5.757,98.4,2.346,4.0,437.0,21.2,262.76,17.31,15.6
111,0.10084,0.0,10.01,0.0,0.547,6.715,81.6,2.6775,6.0,432.0,17.8,395.59,10.16,22.8
93,0.02875,28.0,15.04,0.0,0.464,6.211,28.9,3.6659,4.0,270.0,18.2,396.33,6.21,25.0
349,0.02899,40.0,1.25,0.0,0.429,6.939,34.5,8.7921,1.0,335.0,19.7,389.85,5.89,26.6


In [3]:
from sklearn.tree import DecisionTreeRegressor

In [4]:
# Regression tree with a pinned random_state so the fit is repeatable.
# No depth limit or other size constraint is passed to the constructor here.
dt = DecisionTreeRegressor(random_state=462)

In [5]:
# The MEDV column is the regression target; every other column is a feature.
y = boston_df["MEDV"]
X = boston_df.drop(columns=["MEDV"])

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows as a test set; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=421,
)

In [8]:
# Fit the tree on the training split only; the fitted estimator is the cell output.
dt.fit(X=X_train, y=y_train)

In [9]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [10]:
# Mean squared error of the fitted tree on the held-out test split.
mean_squared_error(y_true=y_test, y_pred=dt.predict(X_test))

17.026862745098036

In [11]:
# Mean absolute error of the fitted tree on the held-out test split.
mean_absolute_error(y_true=y_test, y_pred=dt.predict(X_test))

2.9509803921568616

Same metryki *MSE* i *MAE* nie mówią nam zbyt wiele o jakości modelu. Zobaczmy, czy wartości tych metryk na zbiorze treningowym bardzo różnią się od tych na zbiorze testowym.

In [12]:
# The same two metrics evaluated on the training split, shown as an (MSE, MAE) pair
# for comparison with the test-split values.
tuple(
    metric(y_train, dt.predict(X_train))
    for metric in (mean_squared_error, mean_absolute_error)
)


(0.0, 0.0)

Mamy problem. Model przeuczył się na dostarczonych danych treningowych i rozpoznaje je idealnie, a kompletnie nie radzi sobie z nowymi przykładami.

In [13]:
from sklearn.tree import export_text

In [14]:
# Print the fitted tree's decision rules as text, labelled with the training column names.
print(
    export_text(
        dt,
        feature_names=X_train.columns.tolist(),
    )
)

|--- RM <= 6.94
|   |--- LSTAT <= 14.40
|   |   |--- DIS <= 1.38
|   |   |   |--- CRIM <= 10.59
|   |   |   |   |--- value: [50.00]
|   |   |   |--- CRIM >  10.59
|   |   |   |   |--- value: [27.90]
|   |   |--- DIS >  1.38
|   |   |   |--- RM <= 6.54
|   |   |   |   |--- LSTAT <= 7.57
|   |   |   |   |   |--- AGE <= 55.75
|   |   |   |   |   |   |--- PTRATIO <= 19.95
|   |   |   |   |   |   |   |--- TAX <= 275.50
|   |   |   |   |   |   |   |   |--- PTRATIO <= 17.75
|   |   |   |   |   |   |   |   |   |--- value: [29.10]
|   |   |   |   |   |   |   |   |--- PTRATIO >  17.75
|   |   |   |   |   |   |   |   |   |--- LSTAT <= 7.02
|   |   |   |   |   |   |   |   |   |   |--- CRIM <= 0.09
|   |   |   |   |   |   |   |   |   |   |   |--- value: [25.00]
|   |   |   |   |   |   |   |   |   |   |--- CRIM >  0.09
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |   |   |   |--- LSTAT >  7.02
|   |   |   |   |   |   |   |   |   |   |--- value:

In [15]:
# Depth of the fitted tree, shown to illustrate how far it grew on the training data.
dt.get_depth()

20

### Walidacja krzyżowa

In [16]:
import numpy as np

In [17]:
from sklearn.model_selection import cross_validate

In [18]:
# Seed NumPy's global RNG before cross-validating.
# NOTE(review): dt already carries its own random_state and cv is a plain integer,
# so this global seed presumably does not influence the folds or the fits — confirm.
np.random.seed(214)

# 5-fold cross-validation scored with negated MSE (higher is better, so values are <= 0).
# Train scores are requested as well so they can be compared with the test scores.
cv_results = cross_validate(
    dt,
    X,
    y,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)
cv_results

{'fit_time': array([0.01165104, 0.01082206, 0.01157451, 0.00491786, 0.00432396]),
 'score_time': array([0.00328326, 0.00281501, 0.00139785, 0.00133514, 0.00122547]),
 'test_score': array([-12.5427451 , -54.01980198, -28.79574257, -55.67831683,
        -65.29059406]),
 'train_score': array([-0., -0., -0., -0., -0.])}

In [19]:
# Average of the per-fold negated-MSE test scores.
np.mean(cv_results["test_score"])

-43.26544010871676

In [20]:
# Same seeding as in the MSE cross-validation cell.
np.random.seed(214)

# 5-fold cross-validation of the same estimator, this time scored with negated MAE.
cv_mae_results = cross_validate(
    dt,
    X,
    y,
    cv=5,
    scoring="neg_mean_absolute_error",
    return_train_score=True,
)

# Average of the per-fold negated-MAE test scores.
np.mean(cv_mae_results["test_score"])

-4.044010871675402

Stworzony przez nas model jest całkowicie bezużyteczny. Jest bardzo głęboki i na widocznych poziomach rozpoznaje pojedyncze obserwacje, przez co idealnie dopasowuje się do danych treningowych.