In [21]:
# libraries importing
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn import metrics
import numpy as np
import os

colors = ['#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd','#8c564b','#e377c2','#7f7f7f','#bcbd22','#17becf']

from algorithms.Conv_AE import Conv_AE
from data_processing.process_data import process_data, get_single_df

## Data loading

In [22]:
datasets = process_data()

valve1_X =  datasets["valve1_X"]
valve1_y = datasets["valve1_y"]
valve2_X = datasets["valve2_X"]
valve2_y = datasets["valve2_y"]
other_anomaly_X = datasets["other_anomaly_X"]
other_anomaly_y = datasets["other_anomaly_y"]

In [23]:
X, y = get_single_df()

In [24]:
# hyperparameters selection
N_STEPS = 120
Qs = np.arange(0.25, 0.90, 0.05) # quantile for upper control limit (UCL) selection
model = Conv_AE()

In [25]:
def test_train_split(df_X, df_y):
    size_train = int(df_X.shape[0]*0.8)
    size_test = df_X.shape[0] - size_train
    x_train = df_X[:size_train]
    y_train = df_y[:size_train].anomaly
    x_test = df_X[-size_test:]
    y_test = df_y[-size_test:].anomaly
    return x_train, y_train, x_test, y_test

In [26]:
x_train_valve1, y_train_valve1, x_test_valve1, y_test_valve1 = test_train_split(valve1_X, valve1_y)
x_train_valve2, y_train_valve2, x_test_valve2, y_test_valve2 = test_train_split(valve2_X, valve2_y)
x_train_other_anomaly, y_train_other_anomaly, x_test_other_anomaly, y_test_other_anomaly = test_train_split(other_anomaly_X, other_anomaly_y)

In [27]:
x_train, y_train, x_test, y_test = test_train_split(X, y)

In [28]:
# Generated training sequences for use in the model.
def create_sequences(values, time_steps=N_STEPS):
    output = []
    for i in range(len(values) - time_steps + 1):
        output.append(values[i : (i + time_steps)])
    return np.stack(output)

In [29]:
x_train_steps_valve1 = create_sequences(np.array([row.values for i, row in x_train_valve1.iterrows()]), N_STEPS)
x_test_steps_valve1 = create_sequences(np.array([row.values for i, row in x_test_valve1.iterrows()]), N_STEPS)

x_train_steps_valve2 = create_sequences(np.array([row.values for i, row in x_train_valve2.iterrows()]), N_STEPS)
x_test_steps_valve2 = create_sequences(np.array([row.values for i, row in x_test_valve2.iterrows()]), N_STEPS)

x_train_steps_other_anomaly = create_sequences(np.array([row.values for i, row in x_train_other_anomaly.iterrows()]), N_STEPS)
x_test_steps_other_anomaly = create_sequences(np.array([row.values for i, row in x_test_other_anomaly.iterrows()]), N_STEPS)

In [30]:
x_train_steps = create_sequences(np.array([row.values for i, row in x_train.iterrows()]), N_STEPS)
x_test_steps = create_sequences(np.array([row.values for i, row in x_test.iterrows()]), N_STEPS)

#### Test model for combined dataset

In [31]:
results_combined = pd.DataFrame(columns = ['Dataset', 'Q', 'TPR', 'TNR', 'PPV', 'NPV', 'FPR', 'FNR', 'FDR', 'ACC'])

model.fit(x_train_steps)

for Q in Qs:

    # results predicting
    residuals = pd.Series(np.sum(np.mean(np.abs(x_train_steps - model.predict(x_train_steps)), axis=1), axis=1))
    UCL = residuals.quantile(Q)

    # train prediction
    cnn_residuals = pd.Series(np.sum(np.mean(np.abs(x_train_steps - model.predict(x_train_steps)), axis=1), axis=1))

    # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
    anomalous_data = cnn_residuals > UCL
    anomalous_data_indices = []
    for data_idx in range(N_STEPS - 1, len(x_train_steps) - N_STEPS + 1):
        if np.all(anomalous_data[data_idx - N_STEPS + 1 : data_idx]):
            anomalous_data_indices.append(data_idx)

    yhat_train = pd.Series(data=0, index=x_train.index)
    yhat_train.iloc[anomalous_data_indices] = 1

    # test prediction
    cnn_residuals = pd.Series(np.sum(np.mean(np.abs(x_test_steps - model.predict(x_test_steps)), axis=1), axis=1))

    # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
    anomalous_data = cnn_residuals > UCL
    anomalous_data_indices = []
    for data_idx in range(N_STEPS - 1, len(x_test_steps) - N_STEPS + 1):
        if np.all(anomalous_data[data_idx - N_STEPS + 1 : data_idx]):
            anomalous_data_indices.append(data_idx)

    yhat_test = pd.Series(data=0, index=x_test.index)
    yhat_test.iloc[anomalous_data_indices] = 1

    conf_matrix = metrics.confusion_matrix(y_test, yhat_test)

    TN, FP, FN, TP = conf_matrix.ravel()

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    # Specificity or true negative rate
    TNR = TN/(TN+FP)
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    # Negative predictive value
    NPV = TN/(TN+FN)
    # Fall out or false positive rate FAR false alarm rate
    FPR = FP/(FP+TN)
    # False negative rate MAR missing alarm rate
    FNR = FN/(TP+FN)
    # False discovery rate
    FDR = FP/(TP+FP)
    # Overall accuracy
    ACC = (TP+TN)/(TP+FP+FN+TN)

    row = dict(Dataset = 'Combined',
            Q = Q,
            TPR = TPR,
            TNR = TNR,
            PPV = PPV,
            NPV = NPV,
            FPR = FPR,
            FNR = FNR,
            FDR = FDR,
            ACC = ACC)

    results_combined = pd.concat([results_combined, pd.DataFrame(row, index = [0])], ignore_index = True)



 68/865 [=>............................] - ETA: 1s


invalid value encountered in scalar divide


invalid value encountered in scalar divide


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



 48/865 [>.............................] - ETA: 1s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 56/865 [>.............................] - ETA: 1s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 48/865 [>.............................] - ETA: 1s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 60/865 [=>............................] - ETA: 1s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 76/865 [=>............................] - ETA: 1s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 74/865 [=>............................] - ETA: 1s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 78/865 [=>............................] - ETA: 1s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 75/865 [=>............................] - ETA: 1s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 82/865 [=>............................] - ETA: 0s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 75/865 [=>............................] - ETA: 1s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 76/865 [=>............................] - ETA: 1s


invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide



In [32]:
for i, col in enumerate(results_combined.columns[2:]):
    if col in ['FPR', 'FNR']:
        fig = go.Figure()

        fig.add_trace(go.Scatter(mode='lines+text', x=results_combined.Q, y=results_combined[f'{col}'],
                                marker=dict(color=colors[i]),
                                texttemplate='%{y:.2f}', textposition='top center',
                                textfont=dict(color=colors[i], size=12),
                                name=f'{col}',
                                showlegend=True)
                        )

        fig.update_layout(height=400,width=900, template='plotly_white',
                        title=dict(text=f'{col} with different Q values', font=dict(size=18), x=.5, y=.95),
                        yaxis=dict(title=f'{col}', side='left', showgrid=True,),
                        xaxis=dict(title='Q', showgrid=False),
                        legend=dict(orientation="h", yanchor="bottom", y=1, x=0.5, xanchor="center"),
                        )

        fig.show()

#### Test model for valve 1

In [33]:
results_valve1 = pd.DataFrame(columns = ['Dataset', 'Q', 'TPR', 'TNR', 'PPV', 'NPV', 'FPR', 'FNR', 'FDR', 'ACC'])

model.fit(x_train_steps_valve1)

for Q in Qs:

    # results predicting
    residuals = pd.Series(np.sum(np.mean(np.abs(x_train_steps_valve1 - model.predict(x_train_steps_valve1)), axis=1), axis=1))
    UCL = residuals.quantile(Q)

    # train prediction
    cnn_residuals = pd.Series(np.sum(np.mean(np.abs(x_train_steps_valve1 - model.predict(x_train_steps_valve1)), axis=1), axis=1))

    # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
    anomalous_data = cnn_residuals > UCL
    anomalous_data_indices = []
    for data_idx in range(N_STEPS - 1, len(x_train_steps_valve1) - N_STEPS + 1):
        if np.all(anomalous_data[data_idx - N_STEPS + 1 : data_idx]):
            anomalous_data_indices.append(data_idx)

    yhat_train = pd.Series(data=0, index=x_train_valve1.index)
    yhat_train.iloc[anomalous_data_indices] = 1

    # test prediction
    cnn_residuals = pd.Series(np.sum(np.mean(np.abs(x_test_steps_valve1 - model.predict(x_test_steps_valve1)), axis=1), axis=1))

    # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
    anomalous_data = cnn_residuals > UCL
    anomalous_data_indices = []
    for data_idx in range(N_STEPS - 1, len(x_test_steps_valve1) - N_STEPS + 1):
        if np.all(anomalous_data[data_idx - N_STEPS + 1 : data_idx]):
            anomalous_data_indices.append(data_idx)

    yhat_test = pd.Series(data=0, index=x_test_valve1.index)
    yhat_test.iloc[anomalous_data_indices] = 1

    conf_matrix = metrics.confusion_matrix(y_test_valve1, yhat_test)

    TN, FP, FN, TP = conf_matrix.ravel()

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    # Specificity or true negative rate
    TNR = TN/(TN+FP)
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    # Negative predictive value
    NPV = TN/(TN+FN)
    # Fall out or false positive rate FAR false alarm rate
    FPR = FP/(FP+TN)
    # False negative rate MAR missing alarm rate
    FNR = FN/(TP+FN)
    # False discovery rate
    FDR = FP/(TP+FP)
    # Overall accuracy
    ACC = (TP+TN)/(TP+FP+FN+TN)

    row = dict(Dataset = 'Valve 1',
            Q = Q,
            TPR = TPR,
            TNR = TNR,
            PPV = PPV,
            NPV = NPV,
            FPR = FPR,
            FNR = FNR,
            FDR = FDR,
            ACC = ACC)

    results_valve1 = pd.concat([results_valve1, pd.DataFrame(row, index = [0])], ignore_index = True)






The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.






invalid value encountered in scalar divide


invalid value encountered in scalar divide



 70/451 [===>..........................] - ETA: 0s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 62/451 [===>..........................] - ETA: 0s


invalid value encountered in scalar divide


invalid value encountered in scalar divide



 98/451 [=====>........................] - ETA: 0s


invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide



 69/451 [===>..........................] - ETA: 0s


invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide



In [34]:
for i, col in enumerate(results_valve1.columns[2:]):
    if col in ['FPR', 'FNR']:
        fig = go.Figure()

        fig.add_trace(go.Scatter(mode='lines+text', x=results_valve1.Q, y=results_valve1[f'{col}'],
                                marker=dict(color=colors[i]),
                                texttemplate='%{y:.2f}', textposition='top center',
                                textfont=dict(color=colors[i], size=12),
                                name=f'{col}',
                                showlegend=True)
                        )

        fig.update_layout(height=400,width=900, template='plotly_white',
                        title=dict(text=f'{col} with different Q values', font=dict(size=18), x=.5, y=.95),
                        yaxis=dict(title=f'{col}', side='left', showgrid=True,),
                        xaxis=dict(title='Q', showgrid=False),
                        legend=dict(orientation="h", yanchor="bottom", y=1, x=0.5, xanchor="center"),
                        )

        fig.show()

#### Test model for valve 2

In [35]:
results_valve2 = pd.DataFrame(columns = ['Dataset', 'Q', 'TPR', 'TNR', 'PPV', 'NPV', 'FPR', 'FNR', 'FDR', 'ACC'])

model.fit(x_train_steps_valve2)

for Q in Qs:

    # results predicting
    residuals = pd.Series(np.sum(np.mean(np.abs(x_train_steps_valve2 - model.predict(x_train_steps_valve2)), axis=1), axis=1))
    UCL = residuals.quantile(Q)

    # train prediction
    cnn_residuals = pd.Series(np.sum(np.mean(np.abs(x_train_steps_valve2 - model.predict(x_train_steps_valve2)), axis=1), axis=1))

    # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
    anomalous_data = cnn_residuals > UCL
    anomalous_data_indices = []
    for data_idx in range(N_STEPS - 1, len(x_train_steps_valve2) - N_STEPS + 1):
        if np.all(anomalous_data[data_idx - N_STEPS + 1 : data_idx]):
            anomalous_data_indices.append(data_idx)

    yhat_train = pd.Series(data=0, index=x_train_valve2.index)
    yhat_train.iloc[anomalous_data_indices] = 1

    # test prediction
    cnn_residuals = pd.Series(np.sum(np.mean(np.abs(x_test_steps_valve2 - model.predict(x_test_steps_valve2)), axis=1), axis=1))

    # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
    anomalous_data = cnn_residuals > UCL
    anomalous_data_indices = []
    for data_idx in range(N_STEPS - 1, len(x_test_steps_valve2) - N_STEPS + 1):
        if np.all(anomalous_data[data_idx - N_STEPS + 1 : data_idx]):
            anomalous_data_indices.append(data_idx)

    yhat_test = pd.Series(data=0, index=x_test_valve2.index)
    yhat_test.iloc[anomalous_data_indices] = 1

    conf_matrix = metrics.confusion_matrix(y_test_valve2, yhat_test)

    TN, FP, FN, TP = conf_matrix.ravel()

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    # Specificity or true negative rate
    TNR = TN/(TN+FP)
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    # Negative predictive value
    NPV = TN/(TN+FN)
    # Fall out or false positive rate FAR false alarm rate
    FPR = FP/(FP+TN)
    # False negative rate MAR missing alarm rate
    FNR = FN/(TP+FN)
    # False discovery rate
    FDR = FP/(TP+FP)
    # Overall accuracy
    ACC = (TP+TN)/(TP+FP+FN+TN)

    row = dict(Dataset = 'Valve 2',
            Q = Q,
            TPR = TPR,
            TNR = TNR,
            PPV = PPV,
            NPV = NPV,
            FPR = FPR,
            FNR = FNR,
            FDR = FDR,
            ACC = ACC)

    results_valve2 = pd.concat([results_valve2, pd.DataFrame(row, index = [0])], ignore_index = True)






The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.






invalid value encountered in scalar divide


invalid value encountered in scalar divide



In [36]:
for i, col in enumerate(results_valve2.columns[2:]):
    if col in ['FPR', 'FNR']:

        fig = go.Figure()

        fig.add_trace(go.Scatter(mode='lines+text', x=results_valve2.Q, y=results_valve2[f'{col}'],
                                marker=dict(color=colors[i]),
                                texttemplate='%{y:.2f}', textposition='top center',
                                textfont=dict(color=colors[i], size=12),
                                name=f'{col}',
                                showlegend=True)
                        )

        fig.update_layout(height=400,width=900, template='plotly_white',
                        title=dict(text=f'{col} with different Q values', font=dict(size=18), x=.5, y=.95),
                        yaxis=dict(title=f'{col}', side='left', showgrid=True,),
                        xaxis=dict(title='Q', showgrid=False),
                        legend=dict(orientation="h", yanchor="bottom", y=1, x=0.5, xanchor="center"),
                        )

        fig.show()

#### Test model for other anomalies

In [37]:
results_other_anomaly = pd.DataFrame(columns = ['Dataset', 'Q', 'TPR', 'TNR', 'PPV', 'NPV', 'FPR', 'FNR', 'FDR', 'ACC'])

model.fit(x_train_steps_other_anomaly)

for Q in Qs:

    # results predicting
    residuals = pd.Series(np.sum(np.mean(np.abs(x_train_steps_other_anomaly - model.predict(x_train_steps_other_anomaly)), axis=1), axis=1))
    UCL = residuals.quantile(Q)

    # train prediction
    cnn_residuals = pd.Series(np.sum(np.mean(np.abs(x_train_steps_other_anomaly - model.predict(x_train_steps_other_anomaly)), axis=1), axis=1))

    # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
    anomalous_data = cnn_residuals > UCL
    anomalous_data_indices = []
    for data_idx in range(N_STEPS - 1, len(x_train_steps_other_anomaly) - N_STEPS + 1):
        if np.all(anomalous_data[data_idx - N_STEPS + 1 : data_idx]):
            anomalous_data_indices.append(data_idx)

    yhat_train = pd.Series(data=0, index=x_train_other_anomaly.index)
    yhat_train.iloc[anomalous_data_indices] = 1

    # test prediction
    cnn_residuals = pd.Series(np.sum(np.mean(np.abs(x_test_steps_other_anomaly - model.predict(x_test_steps_other_anomaly)), axis=1), axis=1))

    # data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies
    anomalous_data = cnn_residuals > UCL
    anomalous_data_indices = []
    for data_idx in range(N_STEPS - 1, len(x_test_steps_other_anomaly) - N_STEPS + 1):
        if np.all(anomalous_data[data_idx - N_STEPS + 1 : data_idx]):
            anomalous_data_indices.append(data_idx)

    yhat_test = pd.Series(data=0, index=x_test_other_anomaly.index)
    yhat_test.iloc[anomalous_data_indices] = 1

    conf_matrix = metrics.confusion_matrix(y_test_other_anomaly, yhat_test)

    TN, FP, FN, TP = conf_matrix.ravel()

    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    # Specificity or true negative rate
    TNR = TN/(TN+FP)
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    # Negative predictive value
    NPV = TN/(TN+FN)
    # Fall out or false positive rate FAR false alarm rate
    FPR = FP/(FP+TN)
    # False negative rate MAR missing alarm rate
    FNR = FN/(TP+FN)
    # False discovery rate
    FDR = FP/(TP+FP)
    # Overall accuracy
    ACC = (TP+TN)/(TP+FP+FN+TN)

    row = dict(Dataset = 'Other anomalies',
            Q = Q,
            TPR = TPR,
            TNR = TNR,
            PPV = PPV,
            NPV = NPV,
            FPR = FPR,
            FNR = FNR,
            FDR = FDR,
            ACC = ACC)

    results_other_anomaly = pd.concat([results_other_anomaly, pd.DataFrame(row, index = [0])], ignore_index = True)






invalid value encountered in scalar divide


invalid value encountered in scalar divide


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.






invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide






invalid value encountered in scalar divide


invalid value encountered in scalar divide



In [38]:
for i, col in enumerate(results_other_anomaly.columns[2:]):
    if col in['FPR', 'FNR']:
        fig = go.Figure()

        fig.add_trace(go.Scatter(mode='lines+text', x=results_other_anomaly.Q, y=results_other_anomaly[f'{col}'],
                                marker=dict(color=colors[i]),
                                texttemplate='%{y:.2f}', textposition='top center',
                                textfont=dict(color=colors[i], size=12),
                                name=f'{col}',
                                showlegend=True)
                        )

        fig.update_layout(height=400,width=900, template='plotly_white',
                        title=dict(text=f'{col} with different Q values', font=dict(size=18), x=.5, y=.95),
                        yaxis=dict(title=f'{col}', side='left', showgrid=True,),
                        xaxis=dict(title='Q', showgrid=False),
                        legend=dict(orientation="h", yanchor="bottom", y=1, x=0.5, xanchor="center"),
                        )

        fig.show()

In [40]:
results = pd.concat([results_combined, results_valve1, results_valve2, results_other_anomaly], axis=0).round(3)

display(results)

Unnamed: 0,Dataset,Q,TPR,TNR,PPV,NPV,FPR,FNR,FDR,ACC
0,Combined,0.25,0.0,1.0,,0.633,0.0,1.0,,0.633
1,Combined,0.3,0.0,1.0,,0.633,0.0,1.0,,0.633
2,Combined,0.35,0.0,1.0,,0.633,0.0,1.0,,0.633
3,Combined,0.4,0.0,1.0,,0.633,0.0,1.0,,0.633
4,Combined,0.45,0.0,1.0,,0.633,0.0,1.0,,0.633
5,Combined,0.5,0.0,1.0,,0.633,0.0,1.0,,0.633
6,Combined,0.55,0.0,1.0,,0.633,0.0,1.0,,0.633
7,Combined,0.6,0.0,1.0,,0.633,0.0,1.0,,0.633
8,Combined,0.65,0.0,1.0,,0.633,0.0,1.0,,0.633
9,Combined,0.7,0.0,1.0,,0.633,0.0,1.0,,0.633


In [41]:
results.to_csv('Conv_AE_base_model_Q_values_evaluation.csv')