In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit, KFold, RepeatedKFold, \
                                    train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
from sklearn.metrics import precision_score, mean_squared_error, r2_score, make_scorer, adjusted_rand_score, \
                    accuracy_score, f1_score, confusion_matrix, classification_report, roc_auc_score, recall_score
from time import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import quantile_transform
import scipy.stats as st
from sklearn.feature_selection import RFE, RFECV, SelectFromModel
from xgboost import XGBRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import pprint as pp
import datetime
%matplotlib inline

In [None]:
# Build one feature row per calendar day from the raw climate readings.
Climate_Data = pd.read_excel('Climate_Data.xls')

# Grouping keys and the column families that get aggregated per day.
_DAY_KEYS = ['Year', 'Month', 'Day of Month']
_CALENDAR_COLS = ['Day of Week', 'Is Holiday', 'Daylight Savings']
_WEATHER_COLS = ['DHI', 'DNI', 'Dew Point', 'Temperature', 'Relative Humidity']


def _daily_stat(stat, columns, new_names):
    """Aggregate `columns` of Climate_Data per (Year, Month, Day of Month).

    Parameters
    ----------
    stat : str
        Name of the groupby reducer to apply ('mean', 'sum', 'max', 'std', 'min').
    columns : list of str
        Source columns to aggregate.
    new_names : list of str
        Column names of the returned frame, in the same order as `columns`.

    Returns
    -------
    pandas.DataFrame
        One row per day, indexed by the three grouping keys.
    """
    # Select the columns *before* reducing: the aggregate is only computed for
    # the columns that are kept, and unrelated (possibly non-numeric) columns
    # in the sheet can no longer break the reduction.
    daily = Climate_Data.groupby(_DAY_KEYS)[columns].agg(stat)
    daily.columns = new_names
    return daily


# Daily mean: calendar flags plus the weather variables.
Energy_Data_mean = _daily_stat(
    'mean',
    _CALENDAR_COLS + _WEATHER_COLS,
    ['Day_of_Week', 'Is_Holiday', 'Daylight_Savings'] + [col + '_AVG' for col in _WEATHER_COLS],
)
# Daily sum / max / standard deviation / min of the weather variables.
Energy_Data_sum = _daily_stat('sum', _WEATHER_COLS, [col + '_SUM' for col in _WEATHER_COLS])
Energy_Data_max = _daily_stat('max', _WEATHER_COLS, [col + '_MAX' for col in _WEATHER_COLS])
Energy_Data_std = _daily_stat('std', _WEATHER_COLS, [col + '_STD' for col in _WEATHER_COLS])
Energy_Data_min = _daily_stat('min', _WEATHER_COLS, [col + '_MIN' for col in _WEATHER_COLS])

# Join the per-day statistics side by side and turn Year/Month/Day of Month
# back into ordinary columns.
Energy_Data = pd.concat(
    [Energy_Data_mean, Energy_Data_sum, Energy_Data_max, Energy_Data_std, Energy_Data_min],
    axis=1,
).reset_index()

# Attach the target and the ground-truth anomaly labels from the energy workbook.
Energy_Data[['Energy_Consumption', 'True_Labels']] = pd.read_excel('EnergyData_D3.xlsx')

# Previous row's consumption as an autoregressive feature; the first row has no
# lag, so it is removed by the dropna below (together with any other row that
# contains a missing value).
Energy_Data['Lag1'] = Energy_Data['Energy_Consumption'].shift(1)
Energy_Data = Energy_Data.dropna(axis=0)

# NOTE(review): Month is shifted by +1 when building the date - presumably the
# source sheet stores months as 0-11; confirm against Climate_Data.xls.
Energy_Data['Date_Time'] = pd.to_datetime(pd.DataFrame({
    'year': Energy_Data['Year'],
    'month': Energy_Data['Month'] + 1,
    'day': Energy_Data['Day of Month'],
}))

In [None]:
# Model inputs, in the column order used for the feature matrix X.
Feature_Names = [
    'Month', 'Day_of_Week', 'Is_Holiday', 'Daylight_Savings',
    'DHI_AVG', 'DNI_AVG', 'Dew Point_AVG', 'Temperature_AVG', 'Relative Humidity_AVG',
    'DHI_SUM', 'DNI_SUM', 'Dew Point_SUM', 'Temperature_SUM', 'Relative Humidity_SUM',
    'DHI_MAX', 'DNI_MAX', 'Dew Point_MAX', 'Temperature_MAX', 'Relative Humidity_MAX',
    'DHI_STD', 'DNI_STD', 'Dew Point_STD', 'Temperature_STD', 'Relative Humidity_STD',
    'DHI_MIN', 'DNI_MIN', 'Dew Point_MIN', 'Temperature_MIN', 'Relative Humidity_MIN',
    'Lag1',
]

# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0; both return the same NumPy array.
X = Energy_Data[Feature_Names].values
y = Energy_Data['Energy_Consumption'].values
True_Labels = Energy_Data['True_Labels'].values
# Kept as a pandas Series (not an array) so it retains Energy_Data's row index.
date_time = Energy_Data['Date_Time']

In [None]:
# Hold-out split used to test the anomaly detector: first half for training,
# second half for testing, in the original row order (shuffle=False).
# Features, target, ground-truth labels and dates are split in a single call
# so the four train/test pairs are cut at the same row.
(X_train, X_test,
 y_train, y_test,
 TL_train, TL_Test,
 DT_train, DT_Test) = train_test_split(X, y, True_Labels, date_time, test_size=0.5, shuffle=False)

In [None]:
# Descriptive statistics of the training target, printed one per line.
_y_train_summary = (
    ("Range: %0.3f", np.max(y_train) - np.min(y_train)),
    ("Mean: %0.3f", np.mean(y_train)),
    ("Median: %0.3f", np.median(y_train)),
    ("Std. Deviation: %0.3f", np.std(y_train)),
)
for _fmt, _stat in _y_train_summary:
    print(_fmt % _stat)

In [None]:
def anomalyDetector():
    """Fit an XGBoost regressor on the training half and flag test-half anomalies.

    Works on the notebook-level variables ``X_train``, ``X_test``, ``y_train``,
    ``y_test``, ``TL_Test``, ``DT_Test`` and ``Feature_Names``.

    Steps
    -----
    1. Recursive feature elimination with 5-fold CV (RFECV) on the training half.
    2. Grid search over ``max_depth``, ``learning_rate`` and ``n_estimators``
       using only the selected features.
    3. Regression quality on the test half: RMSE as a percentage of the test
       target range (NRMSE) and R-squared.
    4. Fixed threshold = mean + 2 * std of the model's predictions on the
       training half; a test row whose *actual* consumption exceeds the
       threshold is labelled 1, otherwise 0.
    5. Confusion matrix, classification report, accuracy and ROC AUC against
       the true labels for rows dated within 2016, plus a scatter plot saved
       as ``M1-D3-XGB-Fixed-Threshold-Predicted-Labels``.

    Returns
    -------
    dict
        ``selected_features``, ``best_params``, ``nrmse``, ``r2``,
        ``threshold``, ``confusion_matrix``, ``accuracy`` and ``roc_auc``.
        Everything in the dict is also printed.
    """
    t0 = time()
    # Seed NumPy's global RNG so the shuffled CV folds are repeatable.
    np.random.seed(7)
    ########################################################################################
    # Regression
    kf = KFold(n_splits=5, shuffle=True)
    # greater_is_better=False makes the scorer return negative MSE, so that
    # "higher score" still means "better model" for RFECV / GridSearchCV.
    scoring_param = make_scorer(mean_squared_error, greater_is_better=False)

    rfecv = RFECV(estimator=XGBRegressor(n_jobs=-1), step=1, cv=kf, scoring=scoring_param)
    FS_model = rfecv.fit(X_train, y_train)

    # Rank 1 marks the features RFECV kept.
    selected_mask = np.asarray(FS_model.ranking_) == 1
    FN = [name for name, keep in zip(Feature_Names, selected_mask) if keep]
    print(FN)

    # Restrict the existing train/test matrices to the selected columns
    # (column order follows Feature_Names, i.e. the order of FN).
    X_train_transformed = X_train[:, selected_mask]
    X_test_transformed = X_test[:, selected_mask]

    MD = [int(i) for i in np.linspace(1, 20, num=5)]
    LR = np.linspace(0.001, 0.1, num=5)
    NE = [int(i) for i in np.linspace(100, 1000, num=10)]
    p_grid = dict(max_depth=MD,
                  learning_rate=LR,
                  n_estimators=NE)

    model = GridSearchCV(estimator=XGBRegressor(n_jobs=-1), param_grid=p_grid, scoring=scoring_param, cv=kf)
    model.fit(X_train_transformed, y_train)

    params = model.best_params_
    print("Best max depth: %s Best LR: %f Best Est: %s" % (params['max_depth'], params['learning_rate'], params['n_estimators']))

    # Training-half predictions define the threshold; test-half predictions
    # are what gets evaluated and plotted.
    Y_Train_Pred = model.predict(X_train_transformed)
    Y_Test_Pred = model.predict(X_test_transformed)

    rmse = np.sqrt(mean_squared_error(y_test, Y_Test_Pred))
    data_range = y_test.max() - y_test.min()
    NRMSE = (rmse / data_range) * 100.0
    RSQ = r2_score(y_test, Y_Test_Pred)
    print("Normalized RMSE: %0.3f" % NRMSE)
    print("R-squared: %0.3f" % RSQ)
    ########################################################################################
    ########################## Set fixed threshold ################################################
    ############################################################################################
    threshold = np.mean(Y_Train_Pred) + 2 * np.std(Y_Train_Pred)
    # Vectorised labelling: 1.0 where the actual value exceeds the threshold.
    pred_labels = (np.asarray(y_test) > threshold).astype(float)

    Temp = pd.DataFrame(data={'Date_Time': DT_Test, 'Actual': y_test, 'Predicted': Y_Test_Pred,
                              'Labels': TL_Test, 'Threshold': threshold, 'Pred_Labels': pred_labels})
    Temp = Temp.sort_values(by=['Date_Time'])
    # Only rows dated within 2016 are scored and plotted.
    Temp = Temp[Temp['Date_Time'].between('2016-01-01', '2016-12-31')]

    conf_mat = confusion_matrix(Temp['Labels'], Temp['Pred_Labels'])
    accuracy = accuracy_score(Temp['Labels'], Temp['Pred_Labels'])
    roc_auc = roc_auc_score(Temp['Labels'], Temp['Pred_Labels'])

    print("########################################################################################")
    print("Confusion Matrix - testing:")
    print(conf_mat)
    tn, fp, fn, tp = conf_mat.ravel()
    print("True Negative, False Positive, False Negative, True Positive {}.".format([tn, fp, fn, tp]))
    print("False positive means false alarms")
    print("False Negative means missed faults")
    print("########################################################################################")
    print("Classification Report - testing:")
    print(classification_report(Temp['Labels'], Temp['Pred_Labels'], target_names=['class 0', 'class 1']))
    print("########################################################################################")
    print("Accuracy - testing: %0.3f" % accuracy)
    print("########################################################################################")
    print("ROC AUC score - testing: %0.3f" % roc_auc)
    print("########################################################################################")
    ############################################################################################

    fig, ax = plt.subplots(figsize=(25, 20))
    Data_0 = Temp[Temp['Pred_Labels'] == 0]
    Data_1 = Temp[Temp['Pred_Labels'] == 1]
    ax.scatter(Data_0['Date_Time'].dt.to_pydatetime(), Data_0['Actual'], c=plt.cm.coolwarm(0.), s=200,
               edgecolors='y', marker='o', label=u'Predicted normal data')
    ax.scatter(Data_1['Date_Time'].dt.to_pydatetime(), Data_1['Actual'], c=plt.cm.coolwarm(1.), s=200,
               edgecolors='y', marker='^', label=u'Predicted fault data')
    ax.plot(Temp['Date_Time'].dt.to_pydatetime(), Temp['Predicted'], 'c-*', lw=4, ms=5, label=u'XGBoost Prediction')
    # The threshold is a single constant, so it is labelled as fixed.
    ax.plot(Temp['Date_Time'].dt.to_pydatetime(), Temp['Threshold'], 'k--', lw=4, label=u'Fixed threshold')
    ax.set_xlabel('Date Time', fontsize=30)
    ax.set_ylabel('Energy Consumption [J]', fontsize=30)
    ax.tick_params(axis='both', labelsize=30)
    ax.legend(loc='best', fontsize=30)
    fig.savefig('M1-D3-XGB-Fixed-Threshold-Predicted-Labels')

    t1 = time()
    print("Time taken: %0.3f" % (t1 - t0))
    #########################################################################################
    return {
        'selected_features': FN,
        'best_params': params,
        'nrmse': NRMSE,
        'r2': RSQ,
        'threshold': threshold,
        'confusion_matrix': conf_mat,
        'accuracy': accuracy,
        'roc_auc': roc_auc,
    }

In [None]:
# Run the whole pipeline: feature selection, grid search, evaluation and plotting.
# The function prints its metrics and writes the figure to disk itself.
scores = anomalyDetector()