In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit, KFold, RepeatedKFold, \
                                    train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
from sklearn.metrics import precision_score, mean_squared_error, r2_score, make_scorer, adjusted_rand_score, \
                    accuracy_score, f1_score, confusion_matrix, classification_report, roc_auc_score, recall_score
from time import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import quantile_transform
import scipy.stats as st
from sklearn.feature_selection import RFE, RFECV, SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import pprint as pp
import datetime
%matplotlib inline

In [None]:
# Load the chiller measurements from the Excel workbook that sits next to the notebook.
# The path is relative, so the notebook must be run from the directory containing the file.
DATA_FILE = 'Reduced_Condenser_Water_Flow_Fault_Data.xlsx'
Chiller_Data = pd.read_excel(DATA_FILE)

In [None]:
# Remove every row whose 'kW' reading equals the sentinel value below.
# NOTE(review): 1.682e-45 looks like a denormal float32 placeholder for an invalid or
# missing reading - confirm against the data source.
KW_SENTINEL = 1.682000e-45
# .copy() detaches the filtered rows from the original frame so that the column
# assignments made on Chiller_Data afterwards do not raise SettingWithCopyWarning.
Chiller_Data = Chiller_Data.loc[Chiller_Data['kW'] != KW_SENTINEL].copy()

In [None]:
# Regression target: difference between the 'TCO' and 'TCI' columns.
Chiller_Data = Chiller_Data.assign(Target=Chiller_Data['TCO'] - Chiller_Data['TCI'])
# Lag1 holds the target value of the previous row (one-step lag feature).
Chiller_Data = Chiller_Data.assign(Lag1=Chiller_Data['Target'].shift(1))
# shift(1) leaves NaN in the first row; dropna removes it together with any other
# row that has a missing value in any column.
Chiller_Data = Chiller_Data.dropna(axis=0)
#Time_data = Chiller_Data['Time (minutes)']

In [None]:
# Pull the regression target and the ground-truth fault labels out as NumPy arrays.
# DataFrame/Series.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
y = Chiller_Data['Target'].to_numpy()
True_Labels = Chiller_Data['Label'].to_numpy()
# Whatever remains after dropping the target, the label and the time column is a predictor.
# The drop is in place, so this cell cannot be re-run without re-running the cells above it.
Chiller_Data.drop(['Target','Label','Time (minutes)'], axis=1, inplace=True)
X = Chiller_Data.to_numpy()
# Column names in the same order as the columns of X.
Feature_Names = list(Chiller_Data)

In [None]:
#################################################################################################
# Ordered hold-out split: with shuffle=False the row order is kept, so the leading 45% of the
# rows become the training set and the trailing 55% the test set. Features, target and
# ground-truth labels are split in a single call so they stay aligned row by row.
(X_train, X_test,
 y_train, y_test,
 TL_train, TL_Test) = train_test_split(X, y, True_Labels, test_size=0.55, shuffle=False)
#DT_train, DT_Test = train_test_split(Time_data, test_size=0.55, shuffle=False)
#################################################################################################

In [None]:
def calc_dyn_threshold(A, P, I, N):
    """Flag samples whose actual value exceeds a rolling threshold built from predictions.

    For every position ``k-1`` with ``k >= I`` the threshold is
    ``mean(P[k-I:k]) + N * std(P[k-I:k])``, i.e. it is computed over the ``I``
    most recent predictions, the current one included. The first ``I-1``
    positions have no complete window: their threshold is the prediction
    itself and their label is always 0.

    Control false alarm rates by tuning I and N, e.g. increase I or N to
    reduce false alarms.

    Parameters
    ----------
    A : array-like, 1-D
        Actual (observed) values; must be at least as long as ``P``.
    P : array-like, 1-D
        Predicted values.
    I : int
        Window length, must be >= 1.
    N : float
        Number of standard deviations added to the window mean.

    Returns
    -------
    labels : numpy.ndarray of float, length ``len(P)``
        1.0 where ``A`` is strictly greater than the threshold, else 0.0.
    threshold : numpy.ndarray of float, length ``len(P)``
        Threshold value at every position.

    Raises
    ------
    ValueError
        If ``I`` is smaller than 1 or ``A`` is shorter than ``P``.
    """
    if I < 1:
        raise ValueError("I (window length) must be >= 1, got %r" % (I,))

    actual = np.asarray(A, dtype=float)
    predicted = np.asarray(P, dtype=float)
    n_samples = predicted.shape[0]
    if actual.shape[0] < n_samples:
        raise ValueError("A must contain at least as many samples as P")

    # Start from a copy of the predictions: positions without a full window keep
    # the prediction as their threshold. Both outputs are preallocated instead
    # of being grown with np.append, which re-copies the arrays on every step.
    threshold = predicted.copy()
    labels = np.zeros(n_samples)

    for k in range(I, n_samples + 1):
        window = predicted[(k - I):k]
        threshold[k - 1] = np.mean(window) + N * np.std(window)

    # Only positions that own a complete window can raise an alarm.
    if n_samples >= I:
        first = I - 1
        labels[first:] = actual[first:n_samples] > threshold[first:]

    return labels, threshold

In [None]:
t0 = time()
np.random.seed(7)
########################################################################################
# Regression
# 10-fold shuffled cross-validation, scored by negated mean squared error.
# NOTE(review): shuffled folds on what looks like ordered data with a lagged-target
# feature may leak information between folds; TimeSeriesSplit is imported but unused -
# confirm that KFold is intended here.
kf = KFold(n_splits=10, shuffle=True, random_state=7)
scoring_param = make_scorer(mean_squared_error,greater_is_better=False)

# Recursive feature elimination with cross-validation, removing one feature per step.
# The forest gets its own random_state so the selection does not depend on the global
# NumPy RNG state, which is not shared with parallel worker processes.
rfecv = RFECV(estimator=RandomForestRegressor(n_jobs=-1, random_state=7), step=1, cv=kf,
              scoring=scoring_param, n_jobs=-1)
FS_model = rfecv.fit(X_train, y_train)

# Rank 1 marks the features RFECV decided to keep.
ranks = FS_model.ranking_
FN = [name for name, rank in zip(Feature_Names, ranks) if rank == 1]
print(FN)

# Rebuild the design matrix from the selected columns only and repeat the ordered split.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
X = Chiller_Data[FN].to_numpy()
X_train, X_test = train_test_split(X, test_size=0.55, shuffle=False)

# Candidate forest sizes: 100, 200, ..., 1000 trees.
NE = list(range(100, 1001, 100))
p_grid = dict(n_estimators=NE)

# Exhaustive search over the number of trees with the same folds and scorer as above.
# A fixed random_state makes every candidate comparable and the search repeatable.
model = GridSearchCV(estimator=RandomForestRegressor(n_jobs=-1, random_state=7), param_grid=p_grid,
                     scoring=scoring_param, cv=kf, n_jobs=-1)
model.fit(X_train, y_train)

params = model.best_params_
print("Best Est: %s" % (params['n_estimators']))

# GridSearchCV predicts with the best estimator refitted on the whole training set.
Y_Test_Predicted = model.predict(X_test)

# RMSE expressed as a percentage of the observed range of the test target.
rmse = np.sqrt(mean_squared_error(y_test, Y_Test_Predicted))
data_range = y_test.max() - y_test.min()
NRMSE = (rmse / data_range) * 100.0
RSQ = r2_score(y_test, Y_Test_Predicted)
print("Normalized RMSE: %0.3f" % NRMSE)
print("R-squared: %0.3f" % RSQ)

# Turn the regression residual into fault labels: a sample is flagged when the actual value
# exceeds mean + 2*std of the last 2 predictions (arguments I=2, N=2).
Labels, Threshold = calc_dyn_threshold(y_test, Y_Test_Predicted, 2, 2)
Temp = pd.DataFrame(data={'Actual': y_test, 'Predicted':Y_Test_Predicted, 'Labels':TL_Test,
                          'Threshold':Threshold, 'Pred_Labels': Labels})

# labels=[0, 1] pins the matrix to 2x2 even when one class never occurs, so the four-way
# unpacking below and the two target names in the report cannot fail on a one-class result.
# The matrix is computed once and reused.
cm = confusion_matrix(Temp['Labels'], Temp['Pred_Labels'], labels=[0, 1])

print("########################################################################################")
print("Confusion Matrix - testing:")
print(cm)
tn, fp, fn, tp = cm.ravel()
print("True Negative, False Positive, False Negative, True Positive {}.".format([tn, fp, fn, tp]))
print("False positive means false alarms")
print("False Negative means missed faults")
print("########################################################################################")
print("Classification Report - testing:")
print(classification_report(Temp['Labels'], Temp['Pred_Labels'], labels=[0, 1],
                            target_names=['Normal', 'Fault']))
print("########################################################################################")
print("Accuracy - testing: %0.3f" % accuracy_score(Temp['Labels'], Temp['Pred_Labels']))
print("########################################################################################")
# The AUC here is computed from hard 0/1 predictions, not from scores.
# roc_auc_score raises ValueError when the true labels contain a single class.
print("ROC AUC score - testing: %0.3f" % roc_auc_score(Temp['Labels'], Temp['Pred_Labels']))
print("########################################################################################")
########################################################################################

#fig = plt.figure(figsize=(30,10))
#ax = fig.add_subplot(1, 1, 1)
#plt.plot(y_test, 'gd', markersize=30, label=u'Observations')
#plt.plot(Y_Test_Predicted, 'r-*', lw=10, label=u'Prediction')
#plt.xlabel('Data index',fontsize=40)
#plt.ylabel('Compressor Power [kW]',fontsize=40)
#plt.xticks(fontsize=25)
#plt.yticks(fontsize=25)
#plt.legend(loc='best',fontsize=30)

# Figure 1: actual target values coloured by the ground-truth label, with the model prediction.
fig = plt.figure(figsize=(25,20))
ax = fig.add_subplot(1, 1, 1)
Data_0 = Temp[Temp['Labels'] == 0]
Data_1 = Temp[Temp['Labels'] == 1]
# The colour is wrapped in a one-row list: a bare RGBA tuple passed to `c` is ambiguous and
# would be value-mapped if a group happened to contain exactly four points.
ax.scatter(Data_0.index, Data_0['Actual'], c=[plt.cm.coolwarm(0.)], s=200,
           edgecolors='y', marker='o', label=u'Actual normal data')
ax.scatter(Data_1.index, Data_1['Actual'], c=[plt.cm.coolwarm(1.)], s=200,
           edgecolors='y', marker='^', label=u'Actual fault data')
# The regressor fitted above is a random forest, so the legend says so.
ax.plot(Temp.index, Temp['Predicted'], 'c-*', lw=4, ms=5, label=u'Random Forest prediction')
ax.set_xlabel('Data index', fontsize=30)
ax.set_ylabel('Condenser water temperature difference [F]', fontsize=30)
ax.tick_params(axis='both', labelsize=30)
ax.legend(loc='best', fontsize=30)
fig.savefig('M2-Red-Cond-Flow-Actual-Labels-Predictions')

# Figure 2: actual target values coloured by the predicted label, with the model prediction
# and the dynamic threshold that produced those labels.
fig = plt.figure(figsize=(25,20))
ax = fig.add_subplot(1, 1, 1)
Data_0 = Temp[Temp['Pred_Labels'] == 0]
Data_1 = Temp[Temp['Pred_Labels'] == 1]
# The colour is wrapped in a one-row list: a bare RGBA tuple passed to `c` is ambiguous and
# would be value-mapped if a group happened to contain exactly four points.
ax.scatter(Data_0.index, Data_0['Actual'], c=[plt.cm.coolwarm(0.)], s=200,
           edgecolors='y', marker='o', label=u'Predicted normal data')
ax.scatter(Data_1.index, Data_1['Actual'], c=[plt.cm.coolwarm(1.)], s=200,
           edgecolors='y', marker='^', label=u'Predicted fault data')
# The regressor fitted above is a random forest, so the legend says so.
ax.plot(Temp.index, Temp['Predicted'], 'c-*', lw=4, ms=5, label=u'Random Forest prediction')
ax.plot(Temp.index, Temp['Threshold'], 'k--', lw=4, label=u'Dynamic threshold')
ax.set_xlabel('Data index', fontsize=30)
ax.set_ylabel('Condenser water temperature difference [F]', fontsize=30)
ax.tick_params(axis='both', labelsize=30)
ax.legend(loc='best', fontsize=30)
fig.savefig('M2-Red-Cond-Flow-RF-Dynamic-Threshold-Predicted-Labels')
    
# Wall-clock duration of the whole cell, measured from t0 taken at its first line.
t1 = time()
elapsed_seconds = t1 - t0
print('Time taken for this trial %f' % elapsed_seconds)