Завдання
===
Вам необхідно використати Decision Tree Regressor для прогнозування ціни будинку (House_Price) на основі наданого датасету. Перед побудовою моделі не забудьте виконати перетворення категоріальних ознак у числові.

Після побудови моделі вам також потрібно знайти оптимальний набір гіперпараметрів моделі, використовуючи техніку пошуку Grid Search.

Метрику для оцінки якості моделі можете обрати за власним бажанням, наприклад:

* Mean Squared Error (MSE)
* Mean Absolute Error (MAE)
* R²

Ваше завдання – побудувати модель з найкращою якістю прогнозування.

In [240]:
import pandas as pd
df = pd.read_csv('data/house_price.csv')
df.head()

Unnamed: 0,Number_of_Rooms,Size_SqFt,Building_Age,Location,Has_Garden,House_Price
0,7,955,28,Chicago,No,344674.96
1,4,1775,44,Boston,No,323732.71
2,5,1516,84,Miami,Yes,315425.11
3,7,2843,30,Chicago,Yes,358211.3
4,3,3267,62,Los Angeles,No,322604.63


In [241]:
columns_target = ['House_Price']
columns_features_categorical = ['Location', 'Has_Garden']
columns_features_numeric = list(df.columns.difference(columns_target).difference(columns_features_categorical))

print(f'Target = {columns_target}, Features: categorical colums = {columns_features_categorical}, numeric = {columns_features_numeric}')

Target = ['House_Price'], Features: categorical colums = ['Location', 'Has_Garden'], numeric = ['Building_Age', 'Number_of_Rooms', 'Size_SqFt']


In [242]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df)

print(f'Train({len(train)}), Test({len(test)})')

Train(75), Test(25)


In [243]:
from sklearn.preprocessing import OrdinalEncoder
feature_encoder = OrdinalEncoder()
feature_encoder

In [244]:
train_categorical_encoded = feature_encoder.fit_transform(train[columns_features_categorical])
test_categorical_encoded = feature_encoder.transform(test[columns_features_categorical])

train_categorical_encoded = pd.DataFrame(train_categorical_encoded, columns=columns_features_categorical, index=train.index)
test_categorical_encoded = pd.DataFrame(test_categorical_encoded, columns=columns_features_categorical, index=test.index)

train_features = pd.concat([train[columns_features_numeric], train_categorical_encoded], axis=1)
test_features = pd.concat([test[columns_features_numeric], test_categorical_encoded], axis=1)

train_features.head()


Unnamed: 0,Building_Age,Number_of_Rooms,Size_SqFt,Location,Has_Garden
47,97,4,2251,1.0,0.0
49,19,3,2646,1.0,0.0
13,77,5,992,3.0,0.0
15,70,3,1680,0.0,1.0
3,30,7,2843,1.0,1.0


In [245]:
train_target = train[columns_target]
test_target = test[columns_target]

train_target

Unnamed: 0,House_Price
47,306036.37
49,273645.00
13,328730.01
15,290059.25
3,358211.30
...,...
2,315425.11
20,385781.24
65,368710.26
75,372243.93


In [246]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor.fit(train_features, train_target)
regressor

In [247]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
test_prediction = regressor.predict(test_features)
metric_mse = mean_squared_error(test_target, test_prediction)
metric_mae = mean_absolute_error(test_target, test_prediction)
metric_r2 = r2_score(test_target, test_prediction)

print(f'MSE = {metric_mse}, MAE = {metric_mae}, R2 = {metric_r2}')

MSE = 1196239729.7322872, MAE = 28005.68959999999, R2 = -0.2771350287294059


Using Pipeline

In [248]:
from sklearn.compose import ColumnTransformer
pipeline__column_transformer = ColumnTransformer(
    transformers=[('feature_encoder', OrdinalEncoder(), columns_features_categorical)], 
    remainder='passthrough', 
    force_int_remainder_cols=False
)
pipeline__column_transformer

In [249]:
from sklearn.tree import DecisionTreeRegressor
pipeline__regressor = DecisionTreeRegressor()
pipeline__regressor

In [250]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('column_transformer', pipeline__column_transformer), 
    ('estimator', pipeline__regressor)
])

pipe

In [251]:
from sklearn.metrics import make_scorer

pipeline__scoring_metrics_mse = make_scorer(
    mean_squared_error, 
    greater_is_better=False
)

pipeline__scoring_metrics_mae = make_scorer(
    mean_absolute_error, 
    greater_is_better=False
)

pipeline__scoring_metrics_r2 = make_scorer(
    r2_score, 
    greater_is_better=True
)


In [252]:
from sklearn.model_selection import cross_val_score

pipeline__scores_mse = cross_val_score(
    pipe, train_features, train_target, cv = 5, scoring=pipeline__scoring_metrics_mse
)

pipeline__scores_mae = cross_val_score(
    pipe, train_features, train_target, cv = 5, scoring=pipeline__scoring_metrics_mae
)

pipeline__scores_r2 = cross_val_score(
    pipe, train_features, train_target, cv = 5, scoring=pipeline__scoring_metrics_r2
)

print(f'MSE\n{pipeline__scores_mse}\n\nMAE\n{pipeline__scores_mae}\n\nR2\n{pipeline__scores_r2}\n')

MSE
[-2.08066457e+09 -1.58616372e+09 -1.40386252e+09 -1.27007112e+09
 -1.78557074e+09]

MAE
[-30943.47933333 -37537.91266667 -31150.13333333 -28727.31666667
 -38770.59466667]

R2
[-0.17433932 -0.41795674  0.1347995  -0.43679163 -0.92426566]



In [253]:
from sklearn.model_selection import GridSearchCV

grid_parameters = {
    'estimator__max_depth': range(2, 10)
}

grid_search = GridSearchCV(pipe, grid_parameters, cv=5, verbose=True, refit=True)
grid_search

In [254]:
grid_search.fit(train_features, train_target)
grid_search

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [255]:
best_regressor = grid_search.best_estimator_;
best_prediction = best_regressor.predict(test_features); 

best_metric_mse = mean_squared_error(test_target, best_prediction)
best_metric_mae = mean_absolute_error(test_target, best_prediction)
best_metric_r2 = r2_score(test_target, best_prediction)

print(f'MSE = {best_metric_mse}, MAE = {best_metric_mae}, R2 = {best_metric_r2}')

MSE = 1020617064.9128194, MAE = 26217.459809523803, R2 = -0.08963594179476475
