In [None]:
import itertools
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval
from scipy.stats import skew, boxcox
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [None]:
# Constant subtracted after exponentiating labels/predictions (exp(v) - shift)
# when scores are computed on the original scale, e.g. in xg_eval_mae.
# NOTE(review): presumably the same offset was added before the log transform
# that produced the training target -- confirm against the preprocessing code.
shift = 200
def encode(charcode):
    """Convert a letter code to an integer using bijective base-26.

    ``str(charcode)`` is read left to right as a numeral whose digits are
    ``ord(ch) - ord('A') + 1`` (so 'A' -> 1, 'Z' -> 26, 'AA' -> 27).
    Characters outside 'A'..'Z' are not rejected; they simply contribute
    whatever that expression evaluates to. An empty string yields 0.
    """
    text = str(charcode)
    base_offset = ord('A') - 1
    total = 0
    # Horner's scheme: shift the running value one base-26 place, add the digit.
    for ch in text:
        total = total * 26 + (ord(ch) - base_offset)
    return total

def fair_obj(preds, dtrain):
    """Custom xgboost objective: first and second derivative of the Fair loss.

    With residual ``r = preds - labels`` and constant ``c = 2``:
    ``grad = c * r / (|r| + c)`` and ``hess = c**2 / (|r| + c)**2``.

    Parameters
    ----------
    preds : array-like
        Current raw predictions.
    dtrain : object exposing ``get_label()``
        Training matrix holding the true labels.

    Returns
    -------
    tuple
        ``(grad, hess)``, element-wise over the predictions.
    """
    c = 2
    residual = preds - dtrain.get_label()
    # Shared denominator of both derivatives.
    scale = abs(residual) + c
    return c * residual / scale, c * c / (scale * scale)

def xg_eval_mae(yhat, dtrain):
    """Custom xgboost evaluation metric: MAE on the back-transformed scale.

    Both the labels from ``dtrain.get_label()`` and the predictions ``yhat``
    are mapped through ``exp(v) - shift`` before the mean absolute error is
    taken.

    Returns
    -------
    tuple
        ``('mae', value)`` -- the (name, score) pair xgboost expects from a
        ``feval`` callable.
    """
    actual = np.exp(dtrain.get_label()) - shift
    predicted = np.exp(yhat) - shift
    return 'mae', mean_absolute_error(actual, predicted)


In [None]:
# Load the training features, training target, id table and test features from CSV.
# NOTE(review): train_x is read from "../input/" while every other file in this
# cell comes from "./Allstate/input/" -- confirm this path is intentional.
train_x = pd.read_csv("../input/train_x_power2.csv")
# header=None: the first line of the target file is treated as data, not as column names.
train_y = pd.read_csv("./Allstate/input/train_y_power2.csv",header=None)
# NOTE(review): ids is not referenced anywhere else in the visible notebook.
ids = pd.read_csv("./Allstate/input/ids.csv")
test_x = pd.read_csv("./Allstate/input/test_power2.csv")
# Rebinds test_x from a DataFrame to an xgboost DMatrix; the DataFrame is no
# longer reachable under this name after this line.
test_x = xgb.DMatrix(test_x)

In [None]:

# Log of all hyperopt trials: objective() adds one row per call (parameters,
# per-fold scores/rounds and the averaged score); starts out empty.
Df_results = pd.DataFrame() 
# 1-based counter of objective() calls; used as the row id in Df_results and
# in the progress banner that objective() prints.
iter_count = 1

def objective(space):
    """Score one hyperparameter sample with 5-fold cross-validated xgboost.

    Intended as the objective passed to hyperopt's ``fmin``. For every fold a
    booster is trained with the custom ``fair_obj`` objective, evaluated with
    ``xg_eval_mae`` and stopped early after 50 rounds without improvement.
    The per-fold MAE and best iteration, together with the parameters used,
    are added as one new row to the module-level ``Df_results`` frame.

    Parameters
    ----------
    space : dict
        Sampled values for ``min_child_weight``, ``colsample_bytree``,
        ``max_depth``, ``subsample``, ``gamma`` and ``alpha``.

    Returns
    -------
    dict
        ``{'loss': <mean of the fold MAEs>, 'status': STATUS_OK}``.

    Side effects
    ------------
    Reads the module-level ``train_x`` / ``train_y``, increments the global
    ``iter_count``, rebinds the global ``Df_results`` and prints a one-line
    progress banner.
    """
    global iter_count, Df_results

    num_rounds = 1500
    n_folds = 5
    early_stop = 50
    random_state = 2016

    # Fixed settings first, then the values sampled for this trial.
    params = {
        'eta': 0.1,
        'silent': 1,
        'seed': random_state,
        'nthread': 20
    }
    for name in ('min_child_weight', 'colsample_bytree', 'max_depth',
                 'subsample', 'gamma', 'alpha'):
        params[name] = space[name]

    # Everything that ends up in this trial's row of Df_results.
    trial_log = params.copy()

    cv_scores = []
    best_rounds = []

    # NOTE(review): the legacy sklearn.cross_validation.KFold only uses
    # random_state when shuffle=True, so these folds are presumably contiguous
    # row ranges -- confirm that is intended before adding shuffle=True.
    kf = KFold(train_x.shape[0], n_folds=n_folds, random_state=random_state)
    for fold, (train_index, val_index) in enumerate(kf):
        y_val = train_y.iloc[val_index]
        d_train = xgb.DMatrix(train_x.iloc[train_index],
                              label=train_y.iloc[train_index])
        d_valid = xgb.DMatrix(train_x.iloc[val_index], label=y_val)

        clf = xgb.train(params,
                        d_train,
                        num_boost_round=num_rounds,
                        evals=[(d_train, 'train'), (d_valid, 'eval')],
                        early_stopping_rounds=early_stop,
                        obj=fair_obj,
                        verbose_eval=None,
                        feval=xg_eval_mae)

        scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
        # The `shift` offset is not subtracted here: it would cancel in the
        # absolute difference, so this equals the MAE on the shifted scale.
        cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val))

        trial_log[str(fold) + '_cv_score'] = cv_score
        trial_log[str(fold) + '_n_rounds'] = clf.best_iteration
        cv_scores.append(cv_score)
        best_rounds.append(clf.best_iteration)

    score = sum(cv_scores) / n_folds
    trial_log['score'] = score
    trial_log['n_rounds'] = int(np.mean(best_rounds))

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('##################' + '   iteration ' + str(iter_count) +
          '    with' + ' ' + str(score) + ' ##############')

    # Long (params, value) table -> one wide row indexed by the trial id.
    # list(...) keeps the constructor input a plain list on Python 3 as well.
    trial_row = pd.DataFrame(list(trial_log.items()),
                             columns=['params', 'value'])
    trial_row['id'] = iter_count
    trial_row = trial_row.pivot(index='id', columns='params', values='value')
    iter_count = iter_count + 1

    # pd.concat instead of DataFrame.append, which pandas 2.0 removed.
    Df_results = pd.concat([Df_results, trial_row])

    return {'loss': score, 'status': STATUS_OK}

In [None]:
# Hyperopt search space consumed by objective().
# max_depth is a categorical choice over the integers 7..13; every other
# parameter is drawn with hp.quniform(label, low, high, step).
space = {'max_depth': hp.choice('max_depth', np.arange(7, 14, dtype=int))}
space.update(
    (label, hp.quniform(label, low, high, step))
    for label, low, high, step in (
        ('min_child_weight', 80, 140, 1),
        ('subsample', 0.5, 0.8, 0.05),
        ('colsample_bytree', 0.3, 0.8, 0.05),
        ('gamma', 0.5, 1.25, 0.05),
        ('alpha', 5, 10, 0.25),
    )
)
    



In [None]:
# Run 50 TPE-guided evaluations of objective() over `space`; `trials` keeps
# hyperopt's own record of every evaluation.
trials = Trials()

best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=50)

# Raw fmin result. For hp.choice parameters (max_depth here) this holds the
# index of the selected option, not the option's value.
print(best)
# Same result mapped back through the search space, so max_depth shows the
# actual tree depth.
print(space_eval(space, best))

In [None]:
# Write the per-trial log accumulated in Df_results to a CSV file in the
# current working directory.
Df_results.to_csv('hyper_opt_power2.csv')