In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import time

In [2]:
USE_RAY=False
NUM_WORKERS=20

In [3]:
start = time.time()

In [4]:
housing_data = fetch_california_housing()
print(housing_data.DESCR)


.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [5]:
X_train, X_test, y_train, y_test = train_test_split(housing_data.data, housing_data.target,
                                                    test_size=0.2, random_state=42)

In [6]:
param_grid = [
    {'n_estimators':[100, 200], 'learning_rate':[0.5, 0.1, 2], 'max_depth':[1,2,3,4], 'loss':['ls', 'lad']}
]

In [7]:
# Setup ray
if USE_RAY:
    import ray
    ray.init(address='auto')

In [8]:
gs = GridSearchCV(GradientBoostingRegressor(random_state=0), param_grid=param_grid,
                  scoring='neg_root_mean_squared_error',
                  cv=5)

In [9]:
grid_search_start = time.time()
if USE_RAY:
    import joblib
    from ray.util.joblib import register_ray
    register_ray()
    with joblib.parallel_backend('ray', n_jobs=NUM_WORKERS):
        gs.fit(X_train, y_train)
else:
    gs.fit(X_train, y_train)
grid_search_end = time.time()
gs.best_params_

{'learning_rate': 0.1, 'loss': 'ls', 'max_depth': 4, 'n_estimators': 200}

In [10]:
gbr = GradientBoostingRegressor()
gbr.set_params(**gs.best_params_)
gbr.fit(X_train, y_train)
gbr.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [11]:
y_pred = gbr.predict(X_test)

In [12]:

rmse = sqrt(mean_squared_error(y_test, y_pred))
rmse

0.48779067606236703

In [13]:
elapsed = time.time() - start
grid_search_time = grid_search_end - grid_search_start
print("Completed in %.f seconds, grid search time was %.f seconds" % (elapsed, grid_search_time))

Completed in 652 seconds, grid search time was 646 seconds
