In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import KFold
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor , RandomForestRegressor
import itertools

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials



In [3]:
# Load the pre-built feature matrices and the training target.
# train_y has no header row (header=None), so its single column is labelled 0.
# NOTE(review): the target appears to be log(loss + 200), judging by the
# np.exp(...) - 200 back-transform used in the modelling cells -- confirm
# against the script that produced the *_power2.csv files.
train_x = pd.read_csv("../input/train_x_power2.csv")
train_y = pd.read_csv("../input/train_y_power2.csv", header=None)
test_x = pd.read_csv("../input/test_power2.csv")

# Only the id column of the raw test file is needed (for the submission
# index), so restrict parsing to that column instead of loading every feature.
ids = pd.read_csv("../input/test.csv", usecols=['id'])['id']


In [9]:
# Global seed for the models and the offset used when mapping
# log-scale values back to the original scale.
SEED = 2016
shift = 200


def xg_eval_mae(yhat, dtrain):
    """Custom evaluation metric: MAE measured on the back-transformed scale.

    Parameters
    ----------
    yhat : array-like
        Predictions on the log scale.
    dtrain : object exposing ``get_label()``
        Container whose labels are on the same log scale as ``yhat``.

    Returns
    -------
    tuple
        ``('mae', value)`` where ``value`` is the mean absolute error between
        ``exp(label) - shift`` and ``exp(yhat) - shift``.
    """
    labels = dtrain.get_label()
    # Undo the log transform and the additive shift on both sides.
    actual = np.exp(labels) - shift
    predicted = np.exp(yhat) - shift
    return 'mae', mean_absolute_error(actual, predicted)


In [12]:
# K-fold cross-validation of an ExtraTreesRegressor.
#
# Produces:
#   pred_oob  - out-of-fold predictions for every training row (original scale)
#   pred_test - SUM over folds of the test predictions (original scale);
#               divide by n_folds to obtain the fold average
#   cv_sum    - sum of the per-fold MAE scores
#   score     - mean per-fold MAE
n_folds = 5
cv_sum = 0

pred_oob = np.zeros(train_x.shape[0])
pred_test = np.zeros(test_x.shape[0])

# The test matrix is identical for every fold, so convert it only once
# instead of rebuilding the array inside the loop.
test_matrix = np.array(test_x)

# NOTE(review): this is the legacy sklearn.cross_validation.KFold API (takes
# the sample count and is iterated directly). Shuffling is not requested, so
# random_state presumably has no effect on the split -- confirm.
kf = KFold(train_x.shape[0], n_folds=n_folds, random_state=SEED)
for i, (train_index, test_index) in enumerate(kf):
    print(i)

    X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index]

    # train_y was read with header=None, so the target lives in column 0.
    d_train, d_train_y = np.array(X_train), np.array(y_train[0])
    d_valid, d_valid_y = np.array(X_val), np.array(y_val[0])

    clf = ExtraTreesRegressor(n_estimators=1000,
                              max_features=1.0,
                              max_depth=22,
                              min_samples_leaf=6,
                              n_jobs=18,
                              criterion='mse',
                              random_state=SEED,
                              bootstrap=True,
                              verbose=1)

    clf.fit(d_train, d_train_y)

    # Map log-scale predictions back to the original scale using the shared
    # `shift` constant rather than a repeated literal.
    pred = np.exp(clf.predict(d_valid)) - shift
    pred_oob[test_index] = pred

    # Accumulate test predictions; they are averaged when results are written.
    pred_test += np.exp(clf.predict(test_matrix)) - shift

    # Both sides are compared with the shift still applied; the constant offset
    # cancels in the absolute difference.
    cv_score = mean_absolute_error(np.exp(d_valid_y), pred + shift)
    print('eval-MAE: %.6f' % cv_score)

    cv_sum = cv_sum + cv_score

score = cv_sum / n_folds

0


[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:   27.5s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:  3.0min
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:  7.3min
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed: 13.4min
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed: 17.5min finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.4s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    0.8s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    1.3s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    1.6s finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.7s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    1.4s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    2.6s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    3.3s finished


eval-MAE: 1188.059541
1


[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:   19.2s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:  3.0min
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:  7.3min
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed: 13.3min
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed: 17.4min finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.3s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    0.6s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    1.1s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    1.3s finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.7s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    1.4s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    2.6s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    3.3s finished


eval-MAE: 1188.304485
2


[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:   25.4s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:  3.0min
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:  7.3min
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed: 13.3min
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed: 17.2min finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.3s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    0.7s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    1.1s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    1.4s finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.6s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    1.4s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    2.5s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    3.2s finished


eval-MAE: 1197.314234
3


[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:   24.5s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:  2.9min
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:  7.2min
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed: 13.2min
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed: 17.2min finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.0s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.3s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    0.5s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    1.0s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    1.2s finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.7s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    1.5s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    2.6s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    3.4s finished


eval-MAE: 1196.225361
4


[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:   26.5s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:  3.0min
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:  7.3min
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed: 13.2min
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed: 17.2min finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.3s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    0.6s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    1.0s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    1.3s finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.6s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    1.4s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    2.5s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    3.3s finished


eval-MAE: 1181.564218


In [17]:
# pred_test holds the SUM of the per-fold test predictions. Average it into a
# new array: the submission must contain the fold mean, and leaving pred_test
# untouched means re-running this cell cannot divide the predictions twice.
pred_test_avg = pred_test / n_folds

print("Writing results")
result = pd.DataFrame(pred_test_avg, columns=['loss'])
result["id"] = ids
result = result.set_index("id")
print("%d-fold average prediction:" % n_folds)


# One timestamp is shared by both output files so they can be paired later.
now = datetime.now()
timestamp = now.strftime("%Y-%m-%d-%H-%M")
# Mean CV MAE, embedded in the file names.
score = str(round((cv_sum / n_folds), 6))
sub_file = 'test_ef_fairobj_' + str(score) + '_' + timestamp + '.csv'
print("Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=True, index_label='id')

# Out-of-fold predictions for the training rows, written in training-row order.
print("writing out of bag results")
oob_df = pd.DataFrame(pred_oob, columns=['loss'])
sub_file = 'oob_ef_fairobj_' + str(score) + '_' + timestamp + '.csv'
print("Writing submission: %s" % sub_file)
oob_df.to_csv(sub_file, index=False)

Writing results
5-fold average prediction:
Writing submission: test_ef_fairobj_1190.293568_2016-12-12-00-24.csv
writing out of bag results
Writing submission: oob_ef_fairobj_1190.293568_2016-12-12-00-24.csv


In [18]:
# K-fold cross-validation of a RandomForestRegressor.
#
# Produces:
#   pred_oob  - out-of-fold predictions for every training row (original scale)
#   pred_test - SUM over folds of the test predictions (original scale);
#               divide by n_folds to obtain the fold average
#   cv_sum    - sum of the per-fold MAE scores
#   score     - mean per-fold MAE
n_folds = 5
cv_sum = 0

pred_oob = np.zeros(train_x.shape[0])
pred_test = np.zeros(test_x.shape[0])

# The test matrix is identical for every fold, so convert it only once
# instead of rebuilding the array inside the loop.
test_matrix = np.array(test_x)

# NOTE(review): this is the legacy sklearn.cross_validation.KFold API (takes
# the sample count and is iterated directly). Shuffling is not requested, so
# random_state presumably has no effect on the split -- confirm.
kf = KFold(train_x.shape[0], n_folds=n_folds, random_state=SEED)
for i, (train_index, test_index) in enumerate(kf):
    print(i)

    X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index]

    # train_y was read with header=None, so the target lives in column 0.
    d_train, d_train_y = np.array(X_train), np.array(y_train[0])
    d_valid, d_valid_y = np.array(X_val), np.array(y_val[0])

    clf = RandomForestRegressor(n_estimators=1000,
                                max_features=0.75,
                                max_depth=17,
                                min_samples_leaf=7,
                                n_jobs=18,
                                criterion='mse',
                                random_state=SEED,
                                bootstrap=True,
                                verbose=1)

    clf.fit(d_train, d_train_y)

    # Map log-scale predictions back to the original scale using the shared
    # `shift` constant rather than a repeated literal.
    pred = np.exp(clf.predict(d_valid)) - shift
    pred_oob[test_index] = pred

    # Accumulate test predictions; they are averaged when results are written.
    pred_test += np.exp(clf.predict(test_matrix)) - shift

    # Both sides are compared with the shift still applied; the constant offset
    # cancels in the absolute difference.
    cv_score = mean_absolute_error(np.exp(d_valid_y), pred + shift)
    print('eval-MAE: %.6f' % cv_score)

    cv_sum = cv_sum + cv_score

score = cv_sum / n_folds

0


[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:   18.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:  2.4min
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:  5.7min
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed: 10.5min
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed: 13.6min finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.0s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.3s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    0.6s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    1.0s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    1.2s finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.6s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    1.2s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    2.0s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    2.6s finished


eval-MAE: 1184.855491
1


[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:   18.6s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:  2.4min
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:  5.8min
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed: 10.5min
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed: 13.7min finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.3s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    0.7s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    1.1s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    1.3s finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.5s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    1.1s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    2.0s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    2.5s finished


eval-MAE: 1183.892082
2


[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:   18.4s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:  2.4min
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:  5.7min
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed: 10.5min
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed: 13.6min finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.0s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.3s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    0.6s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    1.0s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    1.2s finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.5s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    1.1s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    1.9s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    2.5s finished


eval-MAE: 1193.620465
3


[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:   18.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:  2.4min
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:  5.7min
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed: 10.4min
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed: 13.5min finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.0s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.2s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    0.6s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    1.0s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    1.2s finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.5s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    1.1s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    2.0s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    2.5s finished


eval-MAE: 1191.481591
4


[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:   19.7s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:  2.3min
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:  5.7min
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed: 10.4min
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed: 13.6min finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.2s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    0.4s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    0.8s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    1.0s finished
[Parallel(n_jobs=18)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=18)]: Done 164 tasks      | elapsed:    0.5s
[Parallel(n_jobs=18)]: Done 414 tasks      | elapsed:    1.1s
[Parallel(n_jobs=18)]: Done 764 tasks      | elapsed:    1.9s
[Parallel(n_jobs=18)]: Done 1000 out of 1000 | elapsed:    2.4s finished


eval-MAE: 1178.018159


In [19]:
# pred_test holds the SUM of the per-fold test predictions. Average it into a
# new array instead of dividing in place, so that re-running this cell cannot
# divide the predictions a second time.
pred_test_avg = pred_test / n_folds

print("Writing results")
result = pd.DataFrame(pred_test_avg, columns=['loss'])
result["id"] = ids
result = result.set_index("id")
print("%d-fold average prediction:" % n_folds)


# One timestamp is shared by both output files so they can be paired later.
now = datetime.now()
timestamp = now.strftime("%Y-%m-%d-%H-%M")
# Mean CV MAE, embedded in the file names.
score = str(round((cv_sum / n_folds), 6))
sub_file = 'test_rf_fairobj_' + str(score) + '_' + timestamp + '.csv'
print("Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=True, index_label='id')

# Out-of-fold predictions for the training rows, written in training-row order.
print("writing out of bag results")
oob_df = pd.DataFrame(pred_oob, columns=['loss'])
sub_file = 'oob_rf_fairobj_' + str(score) + '_' + timestamp + '.csv'
print("Writing submission: %s" % sub_file)
oob_df.to_csv(sub_file, index=False)

Writing results
5-fold average prediction:
Writing submission: test_rf_fairobj_1186.373558_2016-12-12-01-33.csv
writing out of bag results
Writing submission: oob_rf_fairobj_1186.373558_2016-12-12-01-33.csv
