In [None]:
import itertools
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import skew, boxcox
from sklearn.cross_validation import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt import space_eval

In [None]:
# Load the pre-built inputs from ../input.
# train_y is read with header=None, so its single column is addressed by the
# integer label 0 further down (see `y_train[0]` in `objective`).
train_x = pd.read_csv("../input/train_x_power2.csv")
train_y = pd.read_csv("../input/train_y_power2.csv", header=None)
# `ids` and `test_x` are loaded here but not referenced by the visible cells.
ids = pd.read_csv("../input/ids.csv")
test_x = pd.read_csv("../input/test_power2.csv")


In [None]:
# Seed constant; the visible cells hard-code 2016 instead of referencing it.
SEED = 2016
# Constant subtracted after exponentiating labels/predictions in xg_eval_mae.
shift = 200


def xg_eval_mae(yhat, dtrain):
    """Return ('mae', value) computed after exponentiating and un-shifting.

    Parameters
    ----------
    yhat : array-like
        Predictions on the same (exponent-invertible) scale as the labels.
    dtrain : object exposing ``get_label()``
        Presumably an xgboost DMatrix -- the function only calls
        ``get_label()`` on it.

    Returns
    -------
    tuple
        The metric name ``'mae'`` and the mean absolute error between
        ``exp(label) - shift`` and ``exp(yhat) - shift``.
    """
    labels = dtrain.get_label()
    actual = np.exp(labels) - shift
    predicted = np.exp(yhat) - shift
    return 'mae', mean_absolute_error(actual, predicted)


In [None]:

# Module-level state shared with `objective` across hyperopt evaluations.

# 1-based counter of completed evaluations; `objective` increments it.
iter_count = 1
# Accumulates one row per evaluation (sampled parameters + fold scores).
Df_results = pd.DataFrame()
# Plain-text progress log, opened in append mode so earlier runs are kept.
# The handle stays open for the lifetime of the kernel; `objective` flushes
# after every write.
partial_evalutaion = open('extra_trees_bootstrap2.txt', 'a')

def objective(space):
    """Hyperopt objective: mean 5-fold MAE of an ExtraTreesRegressor.

    Parameters
    ----------
    space : dict
        One sampled hyperparameter assignment. Must provide the keys
        'n_estimators', 'max_features', 'max_depth' and 'min_samples_leaf'.

    Returns
    -------
    dict
        ``{'loss': <mean of the fold MAEs>, 'status': STATUS_OK}``.

    Side effects
    ------------
    Reads the module-level ``train_x`` / ``train_y``, writes one line to the
    open ``partial_evalutaion`` log, adds one row to the global
    ``Df_results`` frame and increments the global ``iter_count``.
    """
    global iter_count, Df_results

    n_folds = 5
    # Start the per-evaluation record from the sampled parameters; the fold
    # scores and the final score are added to it below.
    log_files = space.copy()
    fold_scores = []

    # NOTE(review): shuffle is not enabled here, so random_state presumably
    # has no effect on the split -- confirm against the installed sklearn.
    kf = KFold(train_x.shape[0], n_folds=n_folds, random_state=2016)
    for i, (train_index, test_index) in enumerate(kf):
        print(i)

        X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index]
        y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index]

        # train_y has a single column labelled 0.
        d_train, d_train_y = np.array(X_train), np.array(y_train[0])
        d_valid, d_valid_y = np.array(X_val), np.array(y_val[0])

        clf = ExtraTreesRegressor(n_estimators=space['n_estimators'],
                                  max_features=space['max_features'],
                                  max_depth=space['max_depth'],
                                  min_samples_leaf=space['min_samples_leaf'],
                                  n_jobs=18,
                                  criterion='mse',
                                  random_state=2016,
                                  bootstrap=True,
                                  verbose=1)
        clf.fit(d_train, d_train_y)

        scores_val = clf.predict(d_valid)

        # Score after exponentiating targets and predictions. A constant
        # offset applied to both sides would cancel in an absolute error.
        cv_score = mean_absolute_error(np.exp(d_valid_y), np.exp(scores_val))
        print('eval-MAE: %.6f' % cv_score)

        log_files[str(i) + '_cv_score'] = cv_score
        fold_scores.append(cv_score)

    score = sum(fold_scores) / n_folds
    log_files['score'] = score

    # list(...) keeps this working where dict.items() is a view, not a list.
    log_files_df = pd.DataFrame(list(log_files.items()),
                                columns=['params', 'value'])

    # Call-form print matches the other prints in this function and is valid
    # on both Python 2 and Python 3.
    print('##################' + '   iteration ' + str(iter_count) + '    with' + ' ' + str(score) + ' ##############')

    # Log line format is kept unchanged: the file is opened in append mode
    # and may already hold lines from earlier runs.
    partial_evalutaion.write('iteration ' + str(space) +str(iter_count) + 'with' + ' ' + str(score) + '\n')
    partial_evalutaion.flush()

    # Reshape the (param, value) pairs into a single row keyed by iteration.
    log_files_df['id'] = iter_count
    iter_count = iter_count + 1
    log_files_df = log_files_df.pivot(index='id', columns='params', values='value')

    # pd.concat is available on old and new pandas; DataFrame.append is not.
    Df_results = pd.concat([Df_results, log_files_df])

    return {'loss': score, 'status': STATUS_OK}

In [None]:
# Hyperopt search space; `objective` reads exactly these four keys.
#   max_depth        : one of the integers 15..24
#   n_estimators     : one of the integers 100..299
#   max_features     : 0.2..1 quantised in steps of 0.05
#   min_samples_leaf : one of the integers 6..19
space = dict(
    max_depth=hp.choice('max_depth', np.arange(15, 25, dtype=int)),
    n_estimators=hp.choice('n_estimators', np.arange(100, 300, dtype=int)),
    max_features=hp.quniform('max_features', 0.2, 1, 0.05),
    min_samples_leaf=hp.choice('min_samples_leaf', np.arange(6, 20, dtype=int))
)
    

In [None]:
# Run 100 TPE-guided evaluations of `objective` over `space`, recording the
# per-evaluation history in `trials`.
trials = Trials()

best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=100)

# `best` reports hp.choice parameters as indices into their option lists,
# not as the chosen values, so print the decoded assignment as well.
# Call-form print is valid on both Python 2 and Python 3.
print(best)
print(space_eval(space, best))

In [None]:
# Persist the per-evaluation results collected by `objective`.
# The 'id' index is not written because index=None.
Df_results.to_csv("hyperopt_ef.csv", index=None)