In [None]:
# Import libraries
import pandas as pd
import numpy as np

In [None]:
# Load the raw CSV; low_memory=False parses the file in one pass instead of in chunks
df = pd.read_csv(
    filepath_or_buffer='../Data/HomeC.csv',
    low_memory=False,
)
df.head()

In [None]:
df.tail() # Note: the trailing row needs to be removed (it is dropped in preprocessing step 2)

## Basic EDA

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame
df.info() # author's note: 503911 rows by 32 columns

In [None]:
# Count the missing values in each column
df.isnull().sum() # author's note: the NAs come from the last row

In [None]:
# Show how often each distinct value occurs in the categorical columns
for column_name in ('summary', 'icon', 'cloudCover'):
    print(df[column_name].value_counts())

## Begin Preprocessing
Steps:
1. Cleaning column names
2. Remove trailing row (source of NAs)
3. Encode categorical variables (icon, summary) and clean cloudCover (numeric, with a placeholder string that is back-filled)
    LightGBM is good at handling integer-encoded categorical variables
4. Heat map
5. Averaging some columns (Check correlation)
6. Remove highly correlated data (use, house overall)
7. Convert datetime (From seconds to minutes)
8. Set index to time
9. Resampling (Optional / hourly)
10. Visualizations 

In [None]:
# 1. Strip the ' [kW]' unit suffix from the column names.
# 2. Drop the trailing row (source of the NAs). The explicit .copy() makes the
#    result an independent frame, so column assignments in later cells do not
#    write into a slice of the raw frame (SettingWithCopyWarning).
# NOTE: this cell is not idempotent - every re-run drops one more row.
df.columns = [name.replace(' [kW]', '') for name in df.columns]
df = df.iloc[:-1].copy()

In [None]:
# 3. Encode 'icon' and 'summary' using sklearn label encoder
# Encode cloudCover with next valid value
from sklearn import preprocessing
# A single encoder instance is refit for every column, so afterwards LE only
# holds the classes of the column it was fitted on most recently.
LE = preprocessing.LabelEncoder()
df['icon'] = LE.fit_transform(df['icon']).astype('float')
df['summary'] = LE.fit_transform(df['summary']).astype('float')

# Some cloudCover entries hold the literal string 'cloudCover' instead of a
# reading. Blank those entries, cast to float, then back-fill each gap from the
# next valid reading. This replaces Series.replace(..., method='bfill',
# inplace=True), which relied on the deprecated `method` argument and on an
# in-place write through attribute access.
# NOTE(review): bfill also fills any pre-existing NaN in this column - assumed
# to be none once the trailing row has been dropped; confirm.
cloud_cover = df['cloudCover']
df['cloudCover'] = cloud_cover.mask(cloud_cover == 'cloudCover').astype('float').bfill()

# Confirm changes
df.info()

In [None]:
# 4. Generate heatmap to see feature correlations
from matplotlib import pyplot as plt
import seaborn as sns
# plt.subplots returns (figure, axes); unpack both so `fig` is the Figure itself.
fig, ax = plt.subplots(figsize=(10, 8))
# Correlate numeric columns only: DataFrame.corr raises on non-numeric columns
# in pandas >= 2.0, whereas older versions silently skipped them.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, vmax=1, vmin=-1, center=0, cmap="viridis", ax=ax)
ax.set_title('Feature correlation')
plt.show()

- Remove House overall and Solar, as they are highly correlated with use and gen respectively
- The kitchen and furnace variables are not highly correlated, so they can be kept


In [None]:
# Step 5 (averaging columns) is no longer needed.
# Step 6: remove the columns that duplicate other features.
redundant_columns = ['House overall', 'Solar']
df.drop(columns=redundant_columns, inplace=True)

In [None]:
# 7. Set 'time' to a generated minute-resolution timeline: one timestamp per
#    row, starting at 2016-01-01 05:00. Whatever 'time' held before is replaced.
#    NOTE(review): this assumes the rows are consecutive 1-minute readings - confirm.
# 8. Derive calendar features (year not needed). The vectorised .dt accessor
#    replaces the per-row .apply(lambda ...) calls.
df['time'] = pd.date_range('2016-01-01 05:00', periods=len(df), freq='min')
timestamps = df['time'].dt
df['month'] = timestamps.month
df['day'] = timestamps.day
# Day names are label-encoded; LabelEncoder sorts its classes, so the codes
# follow alphabetical order of the names rather than calendar order.
df['weekday'] = LE.fit_transform(timestamps.day_name()).astype('float')
# ISO week number. isocalendar() returns a nullable UInt32 column; cast it to a
# plain int64 so the column stays a regular NumPy-backed integer.
df['weekofyear'] = timestamps.isocalendar().week.astype('int64')
df['hour'] = timestamps.hour
df['minute'] = timestamps.minute
df.head(5)

In [None]:
# 9. Resampling (Optional / hourly)
# 10. Visualizations

In [None]:
# Modeling

In [None]:
# Index the frame by timestamp and aggregate it to hourly means.
# 'h' is the current hourly alias; the upper-case 'H' is deprecated since pandas 2.2.
# NOTE(review): the mean is also taken over the label-encoded columns
# (icon, summary, weekday), which yields fractional codes - confirm this is intended.
resampled_df = df.set_index('time').resample('h').mean()
resampled_df.shape

In [None]:
# Chronological 80 / 10 / 10 split into train, validation and test partitions
target = 'use'

# Row positions at which the resampled frame is cut
n_rows = len(resampled_df)
train_end = int(n_rows * 0.8)
valid_end = int(n_rows * 0.9)

train = resampled_df.iloc[:train_end]
valid = resampled_df.iloc[train_end:valid_end]
test = resampled_df.iloc[valid_end:]

# Separate the feature columns (X) from the target column (Y) in each partition
X_train, Y_train = train.drop(columns=[target]), train[target]
X_valid, Y_valid = valid.drop(columns=[target]), valid[target]
X_test, Y_test = test.drop(columns=[target]), test[target]

In [None]:
# Import light gbm and build model
import lightgbm as lgb
# Wrap the splits as LightGBM datasets; the validation set is created with the
# training set as its reference.
lgb_train = lgb.Dataset(data=X_train, label=Y_train)
lgb_valid = lgb.Dataset(data=X_valid, label=Y_valid, reference=lgb_train)

In [None]:
# DART-boosted LightGBM regressor, scored on the validation set with MSE.
# The previous version passed callbacks=[save_model()], but no `save_model` is
# defined or imported in the visible notebook, so the cell failed with a
# NameError before training started. The callback is removed; to persist the
# booster, call lgbm_dart.save_model(<path>) after training.
# A fixed seed is set, matching the GBDT model, so runs are repeatable.
lgbm_dart = lgb.train(
    params={
        'task': 'train',
        'objective': 'regression',
        'boosting': 'dart',
        'learning_rate': 0.005,
        'metric': {'mse'},
        'seed': 42,
    },
    train_set=lgb_train,
    num_boost_round=600,
    valid_sets=[lgb_valid],
)

In [None]:
# Gradient-boosted decision tree (GBDT) LightGBM model.
# Training stops once the validation MSE has not improved for 15 rounds.
# Early stopping is requested through the lgb.early_stopping callback: the
# `early_stopping_rounds` keyword of lgb.train was removed in LightGBM 4.
lgbm_gd = lgb.train(
    params={
        'task': 'train',
        'objective': 'regression',
        'boosting': 'gbdt',
        'learning_rate': 0.05,
        'metric': {'mse'},
        'num_leaves': 200,
        'seed': 42,
    },
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_valid],
    callbacks=[lgb.early_stopping(stopping_rounds=15)],
)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from matplotlib.ticker import FormatStrFormatter

def evaluator(y_pred, y_valid, y_test, pred_frame, plot, save):
    """Print forecast errors for the first `pred_frame` rows and optionally plot them.

    Prints the MAE and MSE between the first `pred_frame` entries of `y_test`
    and of `y_pred`. When `plot` is truthy, draws the last ``2 * pred_frame``
    entries of `y_valid` followed by that test window as the actual curve, with
    the forecast window drawn on top.

    Parameters
    ----------
    y_pred : pandas.Series
        Forecast values; its index provides the x positions of the forecast line.
    y_valid : pandas.Series
        Validation-period actuals, used only as plotted history.
    y_test : pandas.Series
        Test-period actuals that the forecast is scored against.
    pred_frame : int
        Number of leading rows to score and plot; must be positive.
        NOTE(review): the window is a row count, while the title and file name
        label it "-day" - that is only accurate if each row is one day; confirm.
    plot : bool
        Whether to draw the figure.
    save : bool
        Whether to write the figure to '../Figures/<pred_frame>-day.png'.
        Only honoured when `plot` is truthy.

    Raises
    ------
    ValueError
        If `pred_frame` is not positive: a zero-length window cannot be scored
        and a negative one would silently slice from the wrong end.
    """
    if pred_frame <= 0:
        raise ValueError('pred_frame must be a positive number of rows, got %r' % (pred_frame,))

    actual_window = y_test[:pred_frame]
    forecast_window = y_pred[:pred_frame]

    print('MAE:', mean_absolute_error(actual_window, forecast_window))
    print('MSE:', mean_squared_error(actual_window, forecast_window))

    if plot:
        # History (twice the forecast length) followed by the scored window,
        # built once and reused for both the x and y values.
        actual_curve = pd.concat([y_valid[-pred_frame*2:], actual_window])

        fig, ax = plt.subplots(figsize=(8, 4))
        ax.plot(actual_curve.index, actual_curve.values, label='Actual Usage')
        ax.plot(forecast_window.index, forecast_window.values, label='Forecast Usage')
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.2fkw'))
        ax.legend(['Actual', 'Forecast'])
        ax.set_xlabel('Date')
        ax.set_ylabel("Eletrical Usage")
        ax.set_title(str(pred_frame) + "-day forecast energy usage using LightGBM")
        fig.autofmt_xdate()
        if save:
            fig.savefig('../Figures/'+str(pred_frame)+'-day.png')
    

In [None]:
# Collect the test-set forecasts in a frame that shares X_test's index
test_forecast = lgbm_gd.predict(X_test, num_iteration=lgbm_gd.best_iteration)
gd_pred_df = pd.DataFrame({target: test_forecast}, index=X_test.index)

In [None]:
# Score, plot and save the forecast for each forecast length
# (alternative lengths kept from the author: [24,48,7*24,14*24,30*24])
forecast_lengths = (7, 14, 30, 60, 120, 360)
for forecast_length in forecast_lengths:
    evaluator(gd_pred_df['use'], Y_valid, Y_test, forecast_length, plot=True, save=True)

In [None]:
# Report MAE and MSE as the forecast length grows in steps of 14 rows.
# The previous version read an undefined `out_df` and called evaluator with
# three arguments although it requires six. The forecasts live in gd_pred_df
# and the matching actuals are Y_test. The range starts at 14 because a
# zero-length window cannot be scored.
for period in range(14, len(gd_pred_df), 14):
    print('Forecast length:', period)
    evaluator(gd_pred_df['use'], Y_valid, Y_test, period, plot=False, save=False)

In [None]:
# Grid of actual-vs-forecast panels, one per forecast length.
# The forecast series is gd_pred_df['use']; the previous version referenced an
# undefined `out_df`.
forecast = gd_pred_df['use']
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(18, 12))
for ax, pred_frame in zip(axs.flat, [7,14,30,60,120,365]):
    # History (twice the forecast length) followed by the matching test window
    actual = pd.concat([Y_valid[-pred_frame*2:], Y_test[0:pred_frame]])
    predicted = forecast[0:pred_frame]
    ax.plot(actual.index, actual.values, label='Actual Usage')
    ax.plot(predicted.index, predicted.values, label='Forecast Usage')
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2fkw'))
    ax.legend(['Actual', 'Forecast'])
    ax.set_title(str(pred_frame) + "-day forecast energy usage using LightGBM")

plt.show()


In [None]:
# Plot multi-period forecasts, ending with the whole test period.
# The previous version read an undefined `out_df` and called evaluator with
# three of its six required arguments. Figures are shown but not saved here.
for period in [7,14,30,60,120,365,len(gd_pred_df)]:
    evaluator(gd_pred_df['use'], Y_valid, Y_test, period, plot=True, save=False)

In [None]:
# Feature importances of the GBDT model, the one used for the forecasts.
# The previous version passed `lgbm`, a name that is never defined in the
# visible notebook; the trained boosters are lgbm_gd and lgbm_dart.
lgb.plot_importance(lgbm_gd, figsize=(10, 9))