In [1]:
%load_ext lab_black

# Models

Following models will be created:
* random forest
* neural network
* gradient boosting

In [2]:
import pandas as pd
import pickle as pkl
import dalex as dx

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import warnings

warnings.filterwarnings("ignore")

import os

## Data preparation

In [3]:
os.chdir("..")

In [4]:
df = pd.read_pickle("resources/data/housing_preproc.pkl")

In [5]:
X, y = df.drop(columns=["median_house_value"]), df[["median_house_value"]]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## Random forest

Estimated time: 1h

In [11]:
# rf = RandomForestRegressor()
# rf_tuned = RandomizedSearchCV(
#     rf,
#     {
#         "criterion": ["squared_error", "absolute_error"],
#         "max_features": ["sqrt", "log2"],
#         "min_samples_split": [i for i in range(2, 10)],
#         "min_samples_leaf": [i for i in range(1, 3)],
#         "max_depth": [i for i in range(3, 10, 2)],
#         "n_estimators": [i for i in range(100, 301, 100)],
#     },
#     n_iter=10,
#     random_state=2137,
# )

# rf_tuned.fit(X_train, y_train)
# print(rf_tuned.best_estimator_)

RandomForestRegressor(criterion='absolute_error', max_depth=9,
                      max_features='sqrt', min_samples_leaf=2,
                      n_estimators=300)


In [12]:
# with open("resources/models/random_forest.pkl", "wb") as file:
#     pkl.dump(file=file, obj=rf_tuned)

## Neural network

Estimated time: 5 min

In [7]:
# mlp = MLPRegressor(random_state=2137)

# mlp_tuned = GridSearchCV(
#     mlp, {"hidden_layer_sizes": [(10, 100, 20), (5, 50, 50, 10), (25, 100, 20)]}
# )

# mlp_tuned.fit(X_train, y_train)
# print(mlp_tuned.best_estimator_)

MLPRegressor(hidden_layer_sizes=(10, 100, 20), random_state=2137)


In [8]:
# with open("resources/models/neural_network.pkl", "wb") as file:
#     pkl.dump(file=file, obj=mlp_tuned)

## Gradiendt Boosting

Estimated time: 0.5h

In [13]:
# gb = GradientBoostingRegressor()

# gb_tuned = RandomizedSearchCV(
#     gb,
#     {
#         "loss": ["squared_error", "absolute_error", "huber", "quantile"],
#         "criterion": ["friedman_mse", "squared_error", "mse"],
#         "max_features": ["auto", "sqrt", "log2"],
#     },
#     random_state=2137,
#     n_iter=15,
# )

# gb_tuned.fit(X_train, y_train)
# print(gb_tuned.best_estimator_)

GradientBoostingRegressor(criterion='mse', max_features='auto')


In [14]:
# with open("resources/models/gradient_boosting.pkl", "wb") as file:
#     pkl.dump(file=file, obj=gb_tuned)

## Decision tree

Estimated time: 1min

In [9]:
# dt = DecisionTreeRegressor()

# dt_tunned = RandomizedSearchCV(
#     dt,
#     {
#         "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
#         "max_features": ["auto", "sqrt", "log2"],
#         "min_samples_split": [i for i in range(2, 10, 2)],
#     },
# )

# dt_tunned.fit(X_train, y_train)
# print(dt_tunned.best_estimator_)

DecisionTreeRegressor(criterion='absolute_error', max_features='auto',
                      min_samples_split=8)


In [10]:
# with open("resources/models/decision_tree.pkl", "wb") as file:
#     pkl.dump(file=file, obj=dt_tunned)

# Models evaluaiton

In [16]:
with open("resources/models/random_forest.pkl", "rb") as file:
    rf = pkl.load(file)

with open("resources/models/neural_network.pkl", "rb") as file:
    mlp = pkl.load(file)

with open("resources/models/gradient_boosting.pkl", "rb") as file:
    gb = pkl.load(file)

In [23]:
rf_exp = dx.Explainer(rf, X, y, label="random forest")
mlp_exp = dx.Explainer(mlp, X, y, label="neural network")
gb_exp = dx.Explainer(gb, X, y, label="gradient boosting")

Preparation of a new explainer is initiated

  -> data              : 20640 rows 13 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 20640 values
  -> model_class       : sklearn.model_selection._search.RandomizedSearchCV (default)
  -> label             : random forest
  -> predict function  : <function yhat_default at 0x7fdcfd5d6d30> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = -1.27, mean = -0.0772, max = 2.5
  -> model type        : regression will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -2.22, mean = 0.0772, max = 3.35
  -> model_info        : package sklearn

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 20640 rows 13 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a n

In [26]:
rf_performance = rf_exp.model_performance(model_type="regression")
mlp_performance = mlp_exp.model_performance(model_type="regression")
gb_performance = gb_exp.model_performance(model_type="regression")

performance = pd.concat(
    [rf_performance.result, mlp_performance.result, gb_performance.result]
)

In [30]:
performance.sort_values(by="r2", ascending=False)

Unnamed: 0,mse,rmse,r2,mae,mad
neural network,0.18493,0.430035,0.81507,0.287642,0.187932
gradient boosting,0.216974,0.465804,0.783026,0.324272,0.230234
random forest,0.231571,0.481218,0.768429,0.320396,0.211325


## Results

Neural network perform best on all metrics and will be used as primary models throughout rest of the project.