In [24]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, VotingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from mlxtend.classifier import StackingClassifier

In [25]:
def SMAPE(estimator, X_test, y_test):
    forecast = estimator.predict(X_test)
    non_zero = lambda x: x if x > 0 else 0.0
    func = np.vectorize(non_zero)
    forecast = func(forecast)
    actual = y_test
    #return mean_absolute_error(forecast, actual)
    return -100*np.average((2*abs(forecast-actual)/((abs(forecast)+abs(actual)))))

In [26]:
train = pd.DataFrame.from_csv('train.tsv')
train = train.sample(frac=0.1, random_state=77)
time_split = TimeSeriesSplit(n_splits=4)
target = train['y']
train = train.drop(['y'], axis=1)

In [5]:
param_test1 = {
 'n_estimators': np.linspace(100, 200, 11).astype(int)
}

In [6]:
grid = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, max_depth=5),
                    param_grid=param_test1,
                    scoring=SMAPE,
                    cv=time_split, error_score='')

In [7]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x0000000009AA4978>, verbose=0)

In [8]:
grid.best_params_, grid.best_score_

({'n_estimators': 150}, -37.887815193078964)

Теперь более точный поиск

In [9]:
param_test2 = {
 'n_estimators': np.linspace(145, 155, 11).astype(int)
}

In [10]:
grid = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, max_depth=5),
                    param_grid=param_test2,
                    scoring=SMAPE,
                    cv=time_split, error_score='')

In [11]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x0000000009AA4978>, verbose=0)

In [12]:
grid.best_params_, grid.best_score_

({'n_estimators': 150}, -37.887815193078964)

In [18]:
l_rate = 0.1
n_estimators = 150

Теперь нужно определить max_depth и min_child_weight

In [37]:
param_test3 = {
 'max_depth':range(3,10,1)
}

In [38]:
grid = GridSearchCV(estimator=XGBRegressor(learning_rate=l_rate, n_estimators=n_estimators),
                    param_grid=param_test3,
                    scoring=SMAPE,
                    cv=time_split, error_score='')

In [39]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=4), error_score='',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=150, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x0000000009AA4978>, verbose=0)

In [40]:
grid.best_params_, grid.best_score_

({'max_depth': 9}, -33.406847390078021)

In [27]:
max_depth = 9
min_child_weight = 1

Теперь gamma

In [31]:
param_test4 = {
 'gamma':[i/10.0 for i in range(0,5)]
}

In [32]:
grid = GridSearchCV(estimator=XGBRegressor(learning_rate=l_rate, n_estimators=n_estimators, max_depth=max_depth,
                                          min_child_weight=min_child_weight),
                    param_grid=param_test4,
                    scoring=SMAPE,
                    cv=time_split, error_score='')

In [33]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=1, missing=None, n_estimators=150, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x0000000009AA4978>, verbose=0)

In [34]:
grid.best_params_, grid.best_score_

({'gamma': 0.0}, -34.69556811939357)

In [50]:
gamma = 0

Далее subsample и colsample_bytree

In [51]:
param_test5 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}

In [52]:
grid = GridSearchCV(estimator=XGBRegressor(learning_rate=l_rate, n_estimators=n_estimators, max_depth=max_depth,
                                          min_child_weight=min_child_weight, gamma=gamma),
                    param_grid=param_test5,
                    scoring=SMAPE,
                    cv=time_split, error_score='')

In [53]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=1, missing=None, n_estimators=116, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x00000000094ED978>, verbose=0)

In [54]:
grid.best_params_, grid.best_score_

({'colsample_bytree': 0.9, 'subsample': 0.9}, -32.822759352346644)

Попробуем увеличить точность

In [76]:
param_test6 = {
 'subsample':[i/100.0 for i in range(85,100,1)],
 'colsample_bytree':[i/100.0 for i in range(85,100,1)]
}


In [73]:
grid = GridSearchCV(estimator=XGBRegressor(learning_rate=l_rate, n_estimators=n_estimators, max_depth=max_depth,
                                          min_child_weight=min_child_weight, gamma=gamma),
                    param_grid=param_test6,
                    scoring=SMAPE,
                    cv=time_split)

In [74]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=1, missing=None, n_estimators=116, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'subsample': [0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99], 'colsample_bytree': [0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x00000000094ED978>, verbose=0)

In [75]:
grid.best_params_, grid.best_score_

({'colsample_bytree': 0.9, 'subsample': 0.98}, -32.28829996010743)

In [77]:
colsample_bytree = 0.9
subsample = 0.98

Осталось найти параметры регуляризации

In [80]:
param_test7 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}

In [102]:
grid = GridSearchCV(estimator=XGBRegressor(learning_rate=l_rate, n_estimators=n_estimators, max_depth=9,
                                          min_child_weight=min_child_weight, gamma=gamma,
                                          colsample_bytree=colsample_bytree, subsample=subsample),
                    param_grid=param_test7,
                    scoring=SMAPE,
                    cv=time_split)

In [103]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=1, missing=None, n_estimators=116, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.98),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'reg_alpha': [1e-05, 0.01, 0.1, 1, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x00000000094ED978>, verbose=0)

In [104]:
grid.best_params_, grid.best_score_

({'reg_alpha': 0.1}, -32.285540858435212)

Увеличим точность

In [109]:
param_test8 = {
 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]}

In [110]:
grid = GridSearchCV(estimator=XGBRegressor(learning_rate=l_rate, n_estimators=n_estimators, max_depth=9,
                                          min_child_weight=min_child_weight, gamma=gamma,
                                          colsample_bytree=colsample_bytree, subsample=subsample),
                    param_grid=param_test8,
                    scoring=SMAPE,
                    cv=time_split)

In [111]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=1, missing=None, n_estimators=116, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.98),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x00000000094ED978>, verbose=0)

In [112]:
grid.best_params_, grid.best_score_

({'reg_alpha': 0}, -32.28829996010743)

Теперь уменьшим learning_rate и увеличим количество деревьев

In [128]:
param_test9 = {
 'n_estimators': np.linspace(350, 450, 11).astype(int)}

In [129]:
grid = GridSearchCV(estimator=XGBRegressor(learning_rate=0.01, n_estimators=n_estimators, max_depth=9,
                                          min_child_weight=min_child_weight, gamma=gamma,
                                          colsample_bytree=colsample_bytree, subsample=subsample),
                    param_grid=param_test9,
                    scoring=SMAPE,
                    cv=time_split)

In [130]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=9,
       min_child_weight=1, missing=None, n_estimators=116, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.98),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x00000000094ED978>, verbose=0)

In [131]:
grid.best_params_, grid.best_score_

({'n_estimators': 370}, -32.631035215725326)

In [132]:
param_test10 = {
 'n_estimators': np.linspace(360, 380, 21).astype(int)}

In [135]:
grid = GridSearchCV(estimator=XGBRegressor(learning_rate=0.01, n_estimators=n_estimators, max_depth=max_depth,
                                          min_child_weight=min_child_weight, gamma=gamma,
                                          colsample_bytree=colsample_bytree, subsample=subsample),
                    param_grid=param_test10,
                    scoring=SMAPE,
                    cv=time_split)

In [136]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.9,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=9,
       min_child_weight=1, missing=None, n_estimators=116, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.98),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372,
       373, 374, 375, 376, 377, 378, 379, 380])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x00000000094ED978>, verbose=0)

In [137]:
grid.best_params_, grid.best_score_

({'n_estimators': 369}, -32.629051469576879)

In [143]:
n_estimators = 369
l_rate = 0.01

Настало время тюнить random_forest

In [162]:
param_test11 = {
 'max_features': np.linspace(0.1, 0.5, 50)}

In [163]:
grid = GridSearchCV(estimator=RandomForestRegressor(),
                    param_grid=param_test12,
                    scoring=SMAPE,
                    cv=time_split)

In [164]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': array([ 0.1    ,  0.10816,  0.11633,  0.12449,  0.13265,  0.14082,
        0.14898,  0.15714,  0.16531,  0.17347,  0.18163,  0.1898 ,
        0.19796,  0.20612,  0.21429,  0.22245,  0.23061,  0.23878,
        0.24694,  0.2551 ,  0.26327,  0.27143,  0.27959,  0.28776,
    ...469,
        0.44286,  0.45102,  0.45918,  0.46735,  0.47551,  0.48367,
        0.49184,  0.5    ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x

In [165]:
grid.best_params_, grid.best_score_

({'max_features': 0.26326530612244903}, -32.237985238282434)

In [179]:
param_test12 = {
 'n_estimators': np.linspace(100, 1000, 11).astype(int)}

In [180]:
grid = GridSearchCV(estimator=RandomForestRegressor(max_features = 0.26326530612244903),
                    param_grid=param_test12,
                    scoring=SMAPE,
                    cv=time_split)

In [181]:
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.263265306122, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([ 100,  190,  280,  370,  460,  550,  640,  730,  820,  910, 1000])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x00000000094ED978>, verbose=0)

In [182]:
grid.best_params_, grid.best_score_

({'n_estimators': 1000}, -32.490673036360761)

In [184]:
param_test12 = {
 'n_estimators': np.linspace(950, 1050, 5).astype(int)}

In [185]:
grid = GridSearchCV(estimator=RandomForestRegressor(max_features = 0.26326530612244903),
                    param_grid=param_test12,
                    scoring=SMAPE,
                    cv=time_split)
grid.fit(X=train.as_matrix(), y=target.as_matrix())

GridSearchCV(cv=TimeSeriesSplit(n_splits=3), error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.263265306122, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([ 950,  975, 1000, 1025, 1050])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function SMAPE at 0x00000000094ED978>, verbose=0)

In [186]:
grid.best_params_, grid.best_score_

({'n_estimators': 1000}, -32.679608523675249)

In [187]:
grid.cv_results_

{'mean_fit_time': array([ 5.77166669,  5.49199994,  5.47033334,  5.5436666 ,  6.75266655]),
 'mean_score_time': array([ 0.67633335,  0.69933335,  0.79333337,  0.67400002,  0.88233336]),
 'mean_test_score': array([-32.76770491, -32.72492407, -32.67960852, -32.92212255, -32.7345143 ]),
 'mean_train_score': array([-18.87497975, -18.89115808, -18.88061883, -19.14270717, -18.76427811]),
 'param_n_estimators': masked_array(data = [950 975 1000 1025 1050],
              mask = [False False False False False],
        fill_value = ?),
 'params': ({'n_estimators': 950},
  {'n_estimators': 975},
  {'n_estimators': 1000},
  {'n_estimators': 1025},
  {'n_estimators': 1050}),
 'rank_test_score': array([4, 2, 1, 5, 3]),
 'split0_test_score': array([-32.12673934, -31.91415827, -31.81152904, -32.32263888, -31.87591537]),
 'split0_train_score': array([-21.76839071, -21.37111321, -21.40905558, -22.24807333, -21.17566249]),
 'split1_test_score': array([-32.94675084, -33.23119216, -33.22464867, -33.320822

In [21]:
from hpsklearn import HyperoptEstimator, any_classifier
from hyperopt import tpe

In [22]:
estim = HyperoptEstimator( classifier=any_classifier('clf'),  
                            algo=tpe.suggest, trial_timeout=3, max_evals=2)

In [27]:
from skdata.base import SklearnClassifier
from hpsklearn.estimator import HyperoptEstimatorFactory

view = skdata.iris.view.KfoldClassification(5)
algo = SklearnClassifier(
    HyperoptEstimatorFactory(
        max_iter=25,  # -- consider also a time-based budget
    ))
mean_test_error = view.protocol(algo)
print 'mean test error:', mean_test_error

AttributeError: 'hyperopt_estimator' object has no attribute '_best_preprocs'