In [1]:
# library
import os

import feather
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.stats import norm
from sklearn.metrics import mean_squared_error

from models import lgbm
from utils import *


In [2]:
# Read the full table; the 'Set' column tags each row as 'train', 'dev' or 'test'.
WU = feather.read_dataframe('Data/WU.feather')

# Pre-processing: discard the columns that must not end up among the predictors.
# (The original list named 'year' twice; each label only needs to appear once.)
WU = WU.drop(columns=['Index', 'year', 'ParcelID', 'Days', 'month'])

# Name of the response column, shared by every split below.
TARGET = 'TotalWaterUse'


def _split_xy(frame):
    """Return (X, y) as numpy arrays: all columns except TARGET, and TARGET itself."""
    return frame.drop(columns=[TARGET]).values, frame.loc[:, TARGET].values


# Partition the rows on the 'Set' label, then drop the label itself.
WU_train = WU[WU['Set'] == 'train'].drop(columns=['Set'])
WU_dev = WU[WU['Set'] == 'dev'].drop(columns=['Set'])
WU_test = WU[WU['Set'] == 'test'].drop(columns=['Set'])

# Every non-test row (train + dev), used for the final refit before test scoring.
WU_train_all = WU[WU['Set'] != 'test'].drop(columns=['Set'])

# Predictor names, in the same column order as the X matrices.
predictors = list(WU_train.drop(columns=[TARGET]).columns)

X_train, Y_train = _split_xy(WU_train)
X_dev, Y_dev = _split_xy(WU_dev)
X_test, Y_test = _split_xy(WU_test)
X_train_all, Y_train_all = _split_xy(WU_train_all)


# LGB

# Parameter search on the train/dev split (presumably a Bayesian hyper-parameter
# search limited to 20 evaluations -- confirm in models/lgbm).
# NOTE(review): both returned values are replaced by the hard-coded assignment in the next cell.
best_par, ntree = lgbm.lgbBayesVal(X_train, Y_train, X_dev, Y_dev,predictors =predictors,  max_eval=20)

In [3]:
# Frozen parameter set and tree count for the first LightGBM model, so the
# notebook can be re-run without repeating the search.
best_par = dict(
    bagging_fraction=0.9727997015067352,
    feature_fraction=0.5088766499267137,
    max_depth=20,
    min_data_in_leaf=64.0,
    num_leaves=22505.0,
)
ntree = 92

In [4]:
%%capture
# dev
lgb_model1_dev, ntree, score  = lgbm.LGB_run(X_train, Y_train, X_dev, Y_dev, params = best_par, predictors =predictors, verbose = False)

In [5]:
%%capture
# test
lgb_model1 , ntree, score  = lgbm.LGB_run(X_train_all, Y_train_all, X_test, Y_test, params = best_par, num_boost_round = ntree, early_stopping_rounds = ntree, predictors =predictors, verbose = False)

lgb_y_hat = lgb_model1.predict(X_test)



# LGB calibration

# Second parameter search, this time handing the dev-fitted first model in as `mu_model`
# (presumably a Bayesian search limited to 20 evaluations -- confirm in models/lgbm).
# NOTE(review): both returned values are replaced by the hard-coded assignment in the next cell.
best_par, ntree = lgbm.lgbBayesVal2(X_train, Y_train, X_dev, Y_dev, mu_model = lgb_model1_dev, predictors =predictors,  max_eval=20)

In [6]:
# Frozen parameter set and tree count for the second (calibration) LightGBM model,
# so the notebook can be re-run without repeating the search.
best_par = dict(
    bagging_fraction=0.5287061838187463,
    feature_fraction=0.5016077663494096,
    max_depth=50,
    min_data_in_leaf=1348.0,
    num_leaves=11587.0,
)
ntree = 204

In [7]:
%%capture
# dev
lgb_model_dev, ntree, score  = lgbm.LGB_run2(X_train, Y_train, X_dev, Y_dev, mu_model = lgb_model1_dev, params = best_par, predictors =predictors, verbose = False)

In [8]:
# Dev-set predictions: centre from the first-stage model, spread from the
# second-stage model (its raw prediction is exponentiated before use).
sig_hat = np.exp(lgb_model_dev.predict(X_dev))
y_hat = lgb_model1_dev.predict(X_dev)
# Variable name kept as-is for compatibility; at this point it holds the DEV rows.
all_test_MF = outframe('dev', Y_dev, y_hat, sig_hat)

# Assemble the path with os.path.join rather than literal backslashes: the result is
# the same string on Windows, and a real nested path (not one oddly named file) on POSIX.
path = os.path.join('Out_dev_MF', 'month12', 'LGB_dev_12m_dist.feather')
feather.write_dataframe(all_test_MF, path)

In [9]:
%%capture
# test
lgb_model2 , ntree, score  = lgbm.LGB_run2(X_train_all, Y_train_all, X_test, Y_test,mu_model = lgb_model1, params = best_par, num_boost_round = ntree, early_stopping_rounds = ntree, predictors =predictors, verbose = False)

lgb_y_hat = lgb_model1.predict(X_test)



In [10]:
# Test-set predictions: spread from the exponentiated second-stage output,
# centre from the first-stage model.
sig_hat = np.exp(lgb_model2.predict(X_test))
y_hat = lgb_model1.predict(X_test)

# Symmetric interval y_hat +/- z * sigma, with z the two-sided 95% standard-normal quantile.
zp95 = 1.959963984540
half_width = sig_hat*zp95
left2 = y_hat - half_width
right = y_hat + half_width

# Five summary metrics (RMSE, NLL, NOIS, AWPI, ECPI going by the helper's name).
r1, r2, r3, r4, r5 = get_RMSE_NLL_NOIS_AWPI_ECPI(
    Y_test, y_hat, left2, right, alpha=0.05
)

 & 2505.39 & 9.09 & 11349.61 & 6961.2 & 0.89 & 95\% \\


In [11]:
# results: append a model tag line and one line with the five test metrics
with open("Results/Results_12m.txt", "a") as results_file:
    results_file.write("LGB \n")
    metrics = (r1, r2, r3, r4, r5)
    results_file.write('RMSE %f & NLL %f & NOIS %f & AWPI %f & ECPI %f \n' % metrics)

In [12]:
# Package the test-set targets with the predicted centre and spread, then persist them.
all_test_MF = outframe('test', Y_test, y_hat, sig_hat)
# Assemble the path with os.path.join rather than literal backslashes: the result is
# the same string on Windows, and a real nested path (not one oddly named file) on POSIX.
path = os.path.join('Out_test_MF', 'month12', 'LGB_12m_dist.feather')
feather.write_dataframe(all_test_MF, path)

# BMA

In [13]:
# Gaussian negative log-likelihood of the training targets under the fitted
# (centre, spread) pair; the value is appended to the BMA log file.
y_hat_tr = lgb_model1.predict(X_train_all)
sig_hat_tr = np.exp(lgb_model2.predict(X_train_all, num_iteration=ntree))

# Per-point densities, kept under the original name for any later cell that reads them.
pt_y_tr = norm.pdf(Y_train_all, loc=y_hat_tr, scale=sig_hat_tr)

# Average the log-density computed directly with logpdf. Taking np.log of the pdf
# breaks down when a target lies far from its predicted centre: the density
# underflows to 0.0, log(0) is -inf, and the whole mean becomes inf.
log_pt_y_tr = norm.logpdf(Y_train_all, loc=y_hat_tr, scale=sig_hat_tr)
nll = -np.mean(log_pt_y_tr)


with open("Ensemble/BMA_long.txt", "a") as myfile:
    myfile.write("LGB, %f \n" % (nll))