In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, KFold, RepeatedKFold, train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
from sklearn.metrics import precision_score, mean_squared_error, r2_score, make_scorer
from time import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import quantile_transform
import scipy.stats as st
from sklearn.feature_selection import RFE, RFECV, SelectFromModel, mutual_info_regression, SelectKBest
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import pprint as pp
import datetime
%matplotlib inline

In [None]:
# ---------------------------------------------------------------------------
# Build the per-day modelling table `Energy_Data`.
#   1. Read the climate workbook and group its rows by (Year, Month, Day of Month).
#   2. Compute mean / sum / max / std / min per group and place them side by side.
#   3. Attach the energy-consumption target and its one-row lag.
#   4. Drop incomplete rows and add a calendar date column.
# ---------------------------------------------------------------------------
Climate_Data = pd.read_excel('Climate_Data.xls')

_day_keys = ['Year', 'Month', 'Day of Month']
_weather_cols = ['DHI', 'DNI', 'Dew Point', 'Temperature', 'Relative Humidity']
_calendar_cols = ['Day of Week', 'Is Holiday', 'Daylight Savings']
_per_day = Climate_Data.groupby(_day_keys)


def _daily_weather_stat(method_name, suffix):
    """Aggregate every column per day with `method_name`, keep the weather
    columns only, and rename them to '<column>_<suffix>'."""
    table = getattr(_per_day, method_name)()[_weather_cols]
    table.columns = ['%s_%s' % (col, suffix) for col in _weather_cols]
    return table


# The mean table additionally carries the calendar flags (renamed with underscores).
Energy_Data_mean = _per_day.mean()[_calendar_cols + _weather_cols]
Energy_Data_mean.columns = (['Day_of_Week', 'Is_Holiday', 'Daylight_Savings']
                            + ['%s_AVG' % col for col in _weather_cols])

Energy_Data_sum = _daily_weather_stat('sum', 'SUM')
Energy_Data_max = _daily_weather_stat('max', 'MAX')
Energy_Data_std = _daily_weather_stat('std', 'STD')
Energy_Data_min = _daily_weather_stat('min', 'MIN')

# Column order matters downstream only through the names, but is kept as
# mean, sum, max, std, min.
Energy_Data = pd.concat(
    [Energy_Data_mean, Energy_Data_sum, Energy_Data_max, Energy_Data_std, Energy_Data_min],
    axis=1,
)
# Turn the (Year, Month, Day of Month) index back into ordinary columns.
Energy_Data.reset_index(inplace=True)

# The whole frame read from the workbook is assigned as one column.
# NOTE(review): assumes 'EnergyData_D1.xlsx' holds a single data column with
# one row per day in the same order as the grouped climate data - confirm.
Energy_Data['Energy_Consumption'] = pd.read_excel('EnergyData_D1.xlsx')

# Previous row's consumption as a feature; the first row has no predecessor.
Energy_Data['Lag1'] = Energy_Data['Energy_Consumption'].shift(1)
# Removes every row with a NaN in any column (at least the first row, via Lag1).
Energy_Data.dropna(axis=0, inplace=True)

# NOTE(review): Month is incremented by one before the date is assembled -
# presumably the source stores months as 0-11; confirm against the workbook.
_date_parts = pd.DataFrame({
    'year': Energy_Data['Year'],
    'month': Energy_Data['Month'] + 1,
    'day': Energy_Data['Day of Month'],
})
Energy_Data['Date_Time'] = pd.to_datetime(_date_parts)

In [None]:
# Predictor columns passed to the model: calendar fields, the per-day weather
# aggregates (AVG / SUM / MAX / STD / MIN) and the previous row's consumption.
Feature_Names = ['Month','Day_of_Week', 'Is_Holiday', 'Daylight_Savings', 'DHI_AVG', 'DNI_AVG', 'Dew Point_AVG', 
                 'Temperature_AVG', 'Relative Humidity_AVG', 'DHI_SUM', 'DNI_SUM', 'Dew Point_SUM', 'Temperature_SUM', 
                 'Relative Humidity_SUM', 'DHI_MAX', 'DNI_MAX', 'Dew Point_MAX', 'Temperature_MAX', 
                 'Relative Humidity_MAX', 'DHI_STD', 'DNI_STD', 'Dew Point_STD', 'Temperature_STD', 
                 'Relative Humidity_STD', 'DHI_MIN', 'DNI_MIN', 'Dew Point_MIN', 'Temperature_MIN', 
                 'Relative Humidity_MIN', 'Lag1']

# `.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0, where it
# raises AttributeError. `.values` returns the same NumPy array and is
# available on both old and current pandas releases.
X = Energy_Data[Feature_Names].values
y = Energy_Data['Energy_Consumption'].values
# Kept as a pandas Series so the `.dt` accessor is available when plotting.
date_time = Energy_Data['Date_Time']

In [None]:
#################################################################################################
# To test anomaly detector
# One call splits features, target and dates together. With shuffle=False the
# rows keep their original order: the leading half becomes the training set
# and the trailing half the test set.
(X_train, X_test,
 y_train, y_test,
 DT_train, DT_Test) = train_test_split(X, y, date_time, test_size=0.5, shuffle=False)
#################################################################################################

In [None]:
def _new_result_axes(xlabel, ylabel):
    """Create a 30x20 single-axes figure with the label and tick sizes shared by both result plots."""
    fig, ax = plt.subplots(figsize=(30, 20))
    ax.set_xlabel(xlabel, fontsize=40)
    ax.set_ylabel(ylabel, fontsize=40)
    ax.tick_params(axis='both', labelsize=25)
    return fig, ax


def energymodel_SVM():
    """Tune, fit and evaluate an SVR pipeline on the notebook's train/test split.

    Reads the module-level ``X_train``, ``X_test``, ``y_train``, ``y_test``
    and ``DT_Test``. The pipeline is MinMaxScaler ->
    SelectKBest(mutual_info_regression) -> SVR; a 5-fold grid search over the
    number of kept features, ``gamma`` and ``C`` picks the combination with
    the lowest mean squared error. The target is min-max scaled with the
    training-set range for fitting, and predictions are mapped back to the
    original units before scoring.

    Side effects
    ------------
    Prints the best hyper-parameters, the normalized RMSE (percent of the
    test-target range), R-squared and the elapsed time, and saves two figures:
    'Scatter-Target-vs-Pred-SVM-D1' and 'Plot-Target-vs-Pred-SVM-D1'.

    Returns
    -------
    tuple
        ``(model, y_test, Y_Test_Pred)``: the fitted ``GridSearchCV``, the
        test targets, and the test predictions in original units.

    Raises
    ------
    ValueError
        If ``y_train`` is constant, because min-max scaling is undefined then.
    """
    # Local import keeps this cell self-contained.
    from functools import partial

    t0 = time()
    seed = 7
    np.random.seed(seed)

    ########################################################################################
    # Regression
    # Seeds are passed explicitly: the grid search runs with n_jobs=-1, and
    # worker processes do not reliably inherit the global NumPy seed, so an
    # unseeded mutual_info_regression could select different features on
    # different runs.
    # NOTE(review): folds are shuffled although the outer split is
    # chronological and a lagged target is among the features; a
    # TimeSeriesSplit may be more appropriate - confirm intent.
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    scoring_param = make_scorer(mean_squared_error, greater_is_better=False)

    pipe = Pipeline([
        ('standardize', MinMaxScaler()),
        ('FS', SelectKBest(partial(mutual_info_regression, random_state=seed))),
        ('SVM', SVR()),
    ])

    # Scale the target to [0, 1] using the training range only.
    y_min = y_train.min()
    y_span = y_train.max() - y_min
    if y_span == 0:
        raise ValueError("y_train is constant; cannot min-max scale the target")
    y_train_scaled = (y_train - y_min) / y_span

    # k ranges over every possible feature count of the training matrix.
    p_grid = dict(FS__k=list(range(1, X_train.shape[1] + 1)),
                  SVM__gamma=np.logspace(-3, 0, 4),
                  SVM__C=np.logspace(0, 3, 4))

    model = GridSearchCV(estimator=pipe, param_grid=p_grid, scoring=scoring_param, cv=kf, n_jobs=-1)
    model.fit(X_train, y_train_scaled)

    params = model.best_params_
    print("Best best k: %s Best gamma: %f Best C: %s" % (params['FS__k'], params['SVM__gamma'], params['SVM__C']))

    # Undo the target scaling so the metrics are in the original units.
    Y_Test_Pred_scaled = model.predict(X_test)
    Y_Test_Pred = (Y_Test_Pred_scaled * y_span) + y_min

    rmse = np.sqrt(mean_squared_error(y_test, Y_Test_Pred))
    data_range = y_test.max() - y_test.min()
    NRMSE = (rmse / data_range) * 100.0
    RSQ = r2_score(y_test, Y_Test_Pred)
    print("Normalized RMSE: %0.3f" % NRMSE)
    print("R-squared: %0.3f" % RSQ)

    # Figure 1: predicted against observed values.
    fig, ax = _new_result_axes("Target [J]", "Predictions [J]")
    ax.scatter(y_test, Y_Test_Pred, c="g", s=200, alpha=0.5)
    fig.savefig('Scatter-Target-vs-Pred-SVM-D1')

    # Figure 2: observations and predictions over the test dates.
    fig, ax = _new_result_axes('Date Time', 'Energy Consumption - Facility [J]')
    test_dates = DT_Test.dt.to_pydatetime()
    ax.plot(test_dates, y_test, 'k.', lw=5, markersize=20, label=u'Observations')
    ax.plot(test_dates, Y_Test_Pred, 'r-', lw=5, label=u'Prediction')
    ax.legend(loc='best', fontsize=30)
    fig.savefig('Plot-Target-vs-Pred-SVM-D1')

    t1 = time()
    print('Time taken for this trial %f' %(t1-t0))

    return model, y_test, Y_Test_Pred

In [None]:
# Keep the fitted search object and the test-set results for later cells;
# binding them to names also stops the notebook from echoing the full
# (model, targets, predictions) tuple as cell output.
svm_model, svm_y_test, svm_y_pred = energymodel_SVM()