In [1]:
# library
import pandas as pd
import os 
import feather
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from models import xgbm
from utils import *

In [2]:
# Load the lagged water-use table and derive the train / dev / test arrays.
WU = feather.read_dataframe('Data/WU_lag.feather')

# Pre-processing: drop the columns that are not fed to the models.
# ('year' appeared twice in the original drop list; listing it once is enough.)
WU = WU.drop(columns=['Index', 'year', 'ParcelID', 'Days', 'month'])

_TARGET_COL = 'TotalWaterUse'


def _split_xy(frame):
    """Split a modelling frame into a predictor matrix and a target vector.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the ``TotalWaterUse`` column plus the predictor columns.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``.values`` of every column except the
        target and ``y`` is ``.values`` of the target column.
    """
    return frame.drop(columns=[_TARGET_COL]).values, frame.loc[:, _TARGET_COL].values


# Row subsets selected through the 'Set' label; the label column is removed
# afterwards so that it does not end up among the predictors.
WU_train = WU[WU['Set'] == 'train'].drop(columns=['Set'])
WU_dev = WU[WU['Set'] == 'dev'].drop(columns=['Set'])
WU_test = WU[WU['Set'] == 'test'].drop(columns=['Set'])

# Every row that is not labelled 'test'.
WU_train_all = WU[WU['Set'] != 'test'].drop(columns=['Set'])

# Predictor column names, in the same order as the columns of the X matrices.
predictors = list(WU_train.drop(columns=[_TARGET_COL]).columns)

X_train, Y_train = _split_xy(WU_train)
X_dev, Y_dev = _split_xy(WU_dev)
X_test, Y_test = _split_xy(WU_test)
X_train_all, Y_train_all = _split_xy(WU_train_all)


In [3]:

from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from hyperopt import fmin, tpe, space_eval, hp, rand, Trials, STATUS_OK
from hyperopt.pyll.stochastic import sample


# XGB

# Hyper-parameter search for the first XGBoost model (xgbm.xgbBayesVal,
# max_eval=30). Whatever it returns is overwritten by the pinned
# `best_par, ntree` assignment that follows, so the search is skipped by
# default to keep "Restart & Run All" cheap. Set the flag to True to re-tune.
RUN_XGB_SEARCH = False
if RUN_XGB_SEARCH:
    best_par, ntree = xgbm.xgbBayesVal(X_train, Y_train, X_dev, Y_dev, max_eval=30)

In [5]:
# Pinned hyper-parameters and tree count for the first XGBoost model.
# These replace any `best_par` / `ntree` assigned earlier in the notebook
# (presumably they were copied from a previous search run — TODO confirm).
best_par = {
    'colsample_bytree': 1,
    'eta': 0.05,
    'gamma': 0.6407480451361361,
    'max_depth': 3.0,  # stored as a float, exactly as in the original
    'min_child_weight': 84.40381275397019,
    'subsample': 0.8,
}
ntree = 304

In [6]:
%%capture
# Dev run of the first model with the pinned `best_par`.
# xgbm.XGB_run receives the train arrays followed by the dev arrays and its
# result is unpacked as (model, ntree, score); `ntree` is rebound here and is
# reused by the test-set run further down.
# NOTE(review): presumably fits on train and monitors on dev — confirm in xgbm.
xgb_model1_dev, ntree, score = xgbm.XGB_run(
    X_train,
    Y_train,
    X_dev,
    Y_dev,
    params=best_par,
    verbose=False,
)


In [7]:
%%capture
# Test run of the first model: called with the non-test rows (X_train_all)
# followed by the test arrays, using the `ntree` returned by the dev run both
# as the round budget and as the early-stopping patience.
# NOTE(review): patience equal to the round budget presumably means early
# stopping never triggers, so the test set is not used for model selection —
# confirm in xgbm.XGB_run.
xgb_model1, ntree, score = xgbm.XGB_run(
    X_train_all,
    Y_train_all,
    X_test,
    Y_test,
    params=best_par,
    num_boost_round=ntree,
    early_stopping_rounds=ntree,
    verbose=False,
)

# Point predictions of the first model on the test predictors.
xgb_y_hat = xgb_model1.predict(xgb.DMatrix(X_test))

# XGB calibration

# Hyper-parameter search for the second ("calibration") XGBoost model, which is
# tuned on top of the dev-fitted first model (mu_model=xgb_model1_dev).
# Its result is overwritten by the pinned `best_par, ntree` assignment that
# follows, so the search is skipped by default to keep "Restart & Run All"
# cheap. Set the flag to True to re-tune.
RUN_XGB_CALIB_SEARCH = False
if RUN_XGB_CALIB_SEARCH:
    best_par, ntree = xgbm.xgbBayesVal2(
        X_train, Y_train, X_dev, Y_dev,
        mu_model=xgb_model1_dev, max_eval=20, seed=2,
    )

In [8]:
# Pinned hyper-parameters and tree count for the second ("calibration") model.
# From here on `best_par` / `ntree` no longer describe the first model
# (presumably copied from a previous xgbBayesVal2 run — TODO confirm).
best_par = {
    'colsample_bytree': 0.8,
    'eta': 0.05,
    'gamma': 0.6854846423792548,
    'max_depth': 5.0,  # stored as a float, exactly as in the original
    'min_child_weight': 7.838012336950674,
    'subsample': 1,
}
ntree = 348

In [9]:
%%capture
# Dev run of the second model, built on top of the dev-fitted first model
# (mu_model=xgb_model1_dev) with the pinned calibration `best_par`.
# The result is unpacked as (model, ntree, score); `ntree` is rebound here and
# is reused by the test-set run of the second model.
xgb_model_dev, ntree, score = xgbm.XGB_run2(
    X_train,
    Y_train,
    X_dev,
    Y_dev,
    mu_model=xgb_model1_dev,
    params=best_par,
    verbose=False,
)

In [10]:
# Dev-set predictive distribution: the spread is exp() of the second model's
# output and the location is the first model's prediction.
sig_hat = np.exp(xgb_model_dev.predict(xgb.DMatrix(X_dev)))
y_hat = xgb_model1_dev.predict(xgb.DMatrix(X_dev))
# NOTE: despite its name, `all_test_MF` holds the *dev* frame at this point;
# the name is kept so nothing that reads it changes.
all_test_MF = outframe('dev', Y_dev, y_hat, sig_hat)

# Build the path with os.path.join: the previous hard-coded backslashes only
# formed a nested path on Windows (elsewhere they became part of the file name).
path = os.path.join('Out_dev_MF', 'month1', 'XGB_dev_1m_dist.feather')
feather.write_dataframe(all_test_MF, path)

In [11]:
%%capture
# Test run of the second model: called with the non-test rows followed by the
# test arrays, on top of the test-time first model (mu_model=xgb_model1).
# The `ntree` returned by the dev run of the second model is used both as the
# round budget and as the early-stopping patience.
xgb_model2, ntree, score = xgbm.XGB_run2(
    X_train_all,
    Y_train_all,
    X_test,
    Y_test,
    mu_model=xgb_model1,
    params=best_par,
    num_boost_round=ntree,
    early_stopping_rounds=ntree,
    verbose=False,
)

# Point predictions of the first model on the test predictors.
xgb_y_hat = xgb_model1.predict(xgb.DMatrix(X_test))


In [12]:
# Test-set predictive distribution: location from the first model, spread from
# exp() of the second model's output.
y_hat = xgb_model1.predict(xgb.DMatrix(X_test))
sig_hat = np.exp(xgb_model2.predict(xgb.DMatrix(X_test)))

# 0.975 quantile of the standard normal -> two-sided 95% interval bounds.
zp95 = 1.959963984540
left2, right = y_hat - sig_hat * zp95, y_hat + sig_hat * zp95

# The five metrics are later written out labelled RMSE, NLL, NOIS, AWPI, ECPI
# (in that order).
r1, r2, r3, r4, r5 = get_RMSE_NLL_NOIS_AWPI_ECPI(
    Y_test, y_hat, left2, right, alpha=0.05
)

 & 1132.54 & 8.33 & 5819.65 & 3642.68 & 0.92 & 95\% \\


In [14]:
# Append the XGB test metrics to the shared one-month results file:
# a model tag line followed by one '&'-separated metrics line.
with open("Results/Results_1m.txt", "a") as results_file:
    results_file.write("XGB \n")
    results_file.write(
        'RMSE %f & NLL %f & NOIS %f & AWPI %f & ECPI %f \n'
        % (r1, r2, r3, r4, r5)
    )

In [15]:
# Persist the test-set predictive distribution (observed values, predicted
# location and predicted spread) assembled by `outframe`.
all_test_MF = outframe('test', Y_test, y_hat, sig_hat)

# Build the path with os.path.join: the previous hard-coded backslashes only
# formed a nested path on Windows (elsewhere they became part of the file name).
path = os.path.join('Out_test_MF', 'month1', 'XGB_1m_dist.feather')
feather.write_dataframe(all_test_MF, path)

# BMA

In [16]:
# Gaussian negative log-likelihood of the non-test rows under the fitted
# location (first model) and spread (exp of the second model's output); the
# value is appended to the BMA file tagged "XGB".
# `norm` is not imported explicitly in this notebook; it is presumably provided
# by `from utils import *` and expected to be scipy.stats.norm — TODO confirm.
y_hat_tr = xgb_model1.predict(xgb.DMatrix(X_train_all))
sig_hat_tr = np.exp(xgb_model2.predict(xgb.DMatrix(X_train_all)))

# Work in log space: log(pdf(...)) underflows to -inf for observations far in
# the tails (pdf == 0.0), which would turn the mean NLL into inf.
log_pt_y_tr = norm.logpdf(Y_train_all, loc=y_hat_tr, scale=sig_hat_tr)
pt_y_tr = np.exp(log_pt_y_tr)  # densities kept under their original name
nll = -np.mean(log_pt_y_tr)


with open("Ensemble/BMA_short.txt", "a") as myfile:
    myfile.write("XGB, %f \n" % (nll))