In [2]:
# load our dependencies
import time
import numpy as np
import warnings
import matplotlib.pyplot as plt

from operator import itemgetter
from hyperopt import hp, fmin, rand, tpe, space_eval, Trials, STATUS_OK
from scipy.stats import randint, uniform, lognorm
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.datasets import load_boston
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import train_test_split, cross_val_score

# render matplotlib figures directly below the cell that creates them
%matplotlib inline
# use the ggplot look for every figure drawn in this notebook
plt.style.use("ggplot")
# silence every warning category for the rest of the session
# NOTE(review): this also hides deprecation notices from the libraries imported above
warnings.filterwarnings("ignore")

## Optimizing Hyper-parameters for Machine Learning (part 2)

Part 1 of this series (grid search and random search): https://github.com/Eric-Xu/nycdssg_talks/blob/master/08_03_2015_hyperparams/grid_and_random_search_demo.ipynb

**Objective:** Compare hyper-parameter results found through grid search, random search, and hyperopt search in predicting Boston housing prices.

In [19]:
# Load the Boston housing dataset bundled with scikit-learn.
# A description of the dataset can be found at:
# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/datasets/descr/boston_house_prices.rst
boston = load_boston()

# Feature matrix (one row per sample) and the target value for each row.
X, y = boston.data, boston.target

# Peek at the dataset.
# Each print call takes a single pre-formatted string, so the cell emits the
# same text under both Python 2 and Python 3 (a bare `print "..."` statement
# is a SyntaxError on Python 3). Values are wrapped in 1-tuples so the
# %-operator treats an array as one value rather than as a sequence of arguments.
print("Total rows: %d" % X.shape[0])
print("Feature names: %s" % (boston["feature_names"],))
print("First row features: %s" % (X[0],))
print("First row target: %s" % (y[0],))

Total rows: 506
Feature names: ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT' 'MEDV']
First row features: [  6.32000000e-03   1.80000000e+01   2.31000000e+00   0.00000000e+00
   5.38000000e-01   6.57500000e+00   6.52000000e+01   4.09000000e+00
   1.00000000e+00   2.96000000e+02   1.53000000e+01   3.96900000e+02
   4.98000000e+00]
First row target: 24.0


### Regressor with default parameters

In [15]:
# Baseline: a gradient boosting *regressor* left at its default parameters.
# (The variable keeps its historical `clf` name so later cells that refer to
# it continue to work, even though the model is not a classifier.)
default_clf = GradientBoostingRegressor()

# Inspect the default parameters.
# print(...) with a single argument behaves identically on Python 2 and 3.
print(default_clf)

# Run a 10-fold cross-validation with the baseline regressor.
mse_scores = cross_val_score(default_clf, X, y, cv=10, scoring="mean_squared_error")

# Our goal is to find the parameter configuration that minimizes the mean
# squared error. The "mean_squared_error" scorer reports negated values
# (scikit-learn scorers follow a higher-is-better convention), so flip the
# sign back before averaging to report the MSE itself.
print(np.mean(-mse_scores))

GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss=ls,
             max_depth=3, max_features=None, min_samples_leaf=1,
             min_samples_split=2, n_estimators=100, random_state=None,
             subsample=1.0, verbose=0)
18.4938526518
