In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import xgboost as xgb

## Setup data

In [2]:
fires = pd.read_csv("./forestfires.csv")

FEATURE_NAMES = [
    "DC",
    "temp",
    "RH",
    "wind",
]

fires = fires[np.log(fires["area"]+1) < 4]

X, y = fires[FEATURE_NAMES], np.log(fires["area"]+1)

In [3]:
custom_scorer = metrics.make_scorer(
    lambda y, y_pred: np.sqrt(metrics.mean_squared_error(y, y_pred)), greater_is_better=False
)

## Parameter Tuning

In [4]:
learning_rate_range = [0.001, 0.003, 0.01, 0.03]
max_depth_range = [1, 2, 4, 5, 10]
n_estimators_range = [1, 2, 3, 4, 5, 10, 20, 50]
booster_range = ['gbtree', 'gblinear', 'dart']

In [5]:
grid_search = GridSearchCV(
    xgb.XGBRegressor(),
    param_grid={"learning_rate": learning_rate_range,
                "max_depth": max_depth_range,
                "n_estimators": n_estimators_range,
                "booster": booster_range},
    cv=10,
    n_jobs=-1,
    scoring="neg_mean_absolute_error",
)

In [6]:
grid_search.fit(X, y)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'learning_rate': [0.001, 0.003, 0.01, 0.03], 'max_depth': [1, 2, 4, 5, 10], 'n_estimators': [1, 2, 3, 4, 5, 10, 20, 50], 'booster': ['gbtree', 'gblinear', 'dart']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=0)

In [7]:
grid_search.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.001, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=1,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [8]:
grid_search.best_score_

-0.9483718568025007

In [9]:
grid_search.best_params_

{'booster': 'gbtree',
 'learning_rate': 0.001,
 'max_depth': 10,
 'n_estimators': 1}

In [14]:
y_raw = fires["area"]

In [15]:
y_pred = grid_search.predict(X)

a = np.argmin(y_pred)
b = np.argmax(y_pred)

In [16]:
np.exp(y_pred[a])

1.6479658

In [17]:
np.exp(y_pred[b])

1.6522148

In [18]:
y_raw[a]

0.0

In [19]:
y_raw[b]

37.71

In [20]:
y_pred

array([0.49954545, 0.5003014 , 0.5003014 , 0.49957144, 0.49957144,
       0.5003014 , 0.5003014 , 0.4995625 , 0.50126857, 0.5003014 ,
       0.49954545, 0.5003014 , 0.50031084, 0.5003014 , 0.4995625 ,
       0.5003014 , 0.5003014 , 0.5003014 , 0.49964878, 0.5003014 ,
       0.5003014 , 0.5003014 , 0.5003014 , 0.5003014 , 0.5003014 ,
       0.50031084, 0.5003014 , 0.5003014 , 0.4995625 , 0.5003014 ,
       0.4995625 , 0.5001529 , 0.5003014 , 0.5003014 , 0.5003014 ,
       0.50031084, 0.49954167, 0.4995625 , 0.5003014 , 0.49964878,
       0.5003014 , 0.5003014 , 0.50031084, 0.5003014 , 0.49966666,
       0.4995625 , 0.50031084, 0.5003014 , 0.49964878, 0.5003014 ,
       0.49975   , 0.5003014 , 0.5003014 , 0.5003014 , 0.5003014 ,
       0.4995625 , 0.50031084, 0.5003014 , 0.49964878, 0.49954545,
       0.5003014 , 0.4995625 , 0.5003014 , 0.5003014 , 0.5003014 ,
       0.5003014 , 0.5003014 , 0.5003014 , 0.5003014 , 0.5003014 ,
       0.5003014 , 0.5003014 , 0.5003014 , 0.5003014 , 0.49975