In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import pickle
import csv

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.utils import to_categorical
#from tensorflow.keras.optimizers import Adam
#from tensorflow.keras.callbacks import CSVLogger
import tensorflow as tf

**Data Loading**

In [25]:
colab_path = '/content/drive/MyDrive/squishy/'
Sampled_train = pd.read_csv("dataset/train.csv")
Sampled_test = pd.read_csv("dataset/test.csv")
Sampled_cv = pd.read_csv('dataset/cv.csv')

**Data Preprocessing**

In [26]:
# Drop some mysterious fault type
Sampled_train.drop(Sampled_train[(Sampled_train.faultNumber == 3) | (Sampled_train.faultNumber == 9) | (Sampled_train.faultNumber == 15)].index, inplace = True)
Sampled_test.drop(Sampled_test[(Sampled_test.faultNumber == 3) | (Sampled_test.faultNumber == 9) | (Sampled_test.faultNumber == 15)].index, inplace = True)
Sampled_cv.drop(Sampled_cv[(Sampled_cv.faultNumber == 3) | (Sampled_cv.faultNumber == 9) | (Sampled_cv.faultNumber == 15)].index, inplace = True)

In [27]:
# make the Y value usable in LSTM
y_train = to_categorical(Sampled_train['faultNumber'],num_classes=21)
y_test = to_categorical(Sampled_test['faultNumber'],num_classes=21)
y_cv = to_categorical(Sampled_cv['faultNumber'],num_classes=21)

In [28]:
# drop unused meta data from x
x_train_df = Sampled_train.drop(['faultNumber','simulationRun','sample'],axis=1)
x_test_df = Sampled_test.drop(['faultNumber','simulationRun','sample'],axis =1)
x_cv_df = Sampled_cv.drop(['faultNumber','simulationRun','sample'],axis =1)

**Parameters**

In [29]:
sensors = ['xmv_10', 'xmv_11', 'xmeas_19', 'xmeas_21', 'xmv_9', 'xmv_4', 'xmv_5', 'xmeas_17', 'xmeas_18', 'xmeas_9']
model_path = 'models/evsi/' + '0_' + "none/"

ratios = [2**j for j in range(1, 6)]

# get prior probability
temp = Sampled_train['faultNumber'].value_counts()
non_fault = temp[0]
total = temp.sum()

temp = Sampled_cv['faultNumber'].value_counts()
non_fault += temp[0]
total += temp.sum()

P_present = non_fault/total
P_absent = 1 - P_present

**Utility Functions**

In [30]:
def feature_remover(features_names):
    # remove a list of features from x
    
    dimension = dict()
    
    # row dimension
    dimension['train_row'] = len(x_train_df)
    dimension['test_row'] = len(x_test_df)
    dimension['cv_row'] = len(x_cv_df)
    
    # create a copy so we don't change the original dataframe
    x_train_masked_df = x_train_df.copy()
    x_test_masked_df = x_test_df.copy()
    x_cv_masked_df = x_cv_df.copy()
    
    for feature in features_names:
        x_train_masked_df.drop([feature], axis = 1, inplace = True)
        x_test_masked_df.drop([feature], axis = 1, inplace = True)
        x_cv_masked_df.drop([feature], axis = 1, inplace = True)
        
    # column dimension
    dimension['train_col'] = x_train_masked_df.shape[1]
    dimension['test_col'] = x_test_masked_df.shape[1]
    dimension['cv_col'] = x_cv_masked_df.shape[1]
    
    standard_scalar = StandardScaler()
    x_train_masked_df = standard_scalar.fit_transform(x_train_masked_df)
    x_test_masked_df = standard_scalar.transform(x_test_masked_df)
    x_cv_masked_df = standard_scalar.transform(x_cv_masked_df)    
    
    x_train = np.resize(x_train_masked_df, (dimension['train_row'], dimension['train_col'], 1))
    x_test = np.resize(x_test_masked_df, (dimension['test_row'], dimension['test_col'], 1))
    x_cv = np.resize(x_cv_masked_df, (dimension['cv_row'], dimension['cv_col'], 1))
    
    return dimension, x_train, x_test, x_cv

In [31]:
# helper function to calculate probability of correctly giving signal when present
def get_signal_present(prediction, ground_truth, cost_false_positive, cost_false_negative):
    present_index = list()
    for i in range(len(ground_truth)):
        if ground_truth[i] == 0:
            present_index.append(i)
    
    counter = 0
    for index in present_index:
        if prediction[index] == 0:
            counter += 1
    
    return counter/len(present_index)

# helper function to calculate probability of correctly giving signal when present
# there should be a more generic way using operator module to merge this with the one above.
def get_no_signal_absent(prediction, ground_truth, cost_false_positive, cost_false_negative):
    absent_index = list()
    for i in range(len(ground_truth)):
        if ground_truth[i] != 0:
            absent_index.append(i)
    
    counter = 0
    for index in absent_index:
        if prediction[index] != 0:
            counter += 1
    return counter/len(absent_index)

In [32]:
def get_expected_cost(prediction, cost_false_positive, cost_false_negative):
  # get P(signal|present) and P(no signal|absent)
    ground_truth = Sampled_test['faultNumber'].tolist()
    P_signal_present = get_signal_present(prediction, ground_truth, cost_false_positive, cost_false_negative)
    P_no_signal_absent = get_no_signal_absent(prediction, ground_truth, cost_false_positive, cost_false_negative)
    P_signal_absent = 1 - P_no_signal_absent
    P_no_signal_present = 1 - P_signal_present

  # get P(signal)
    P_signal = P_present * P_signal_present + P_absent * P_signal_absent
    P_no_signal = 1 - P_signal

  # bayesian probability
    P_absent_signal = (P_signal_absent * P_absent) / P_signal
    P_present_signal = (P_signal_present * P_present) / P_signal
    P_absent_no_signal = (P_no_signal_absent * P_absent) / P_no_signal
    P_present_no_signal = (P_no_signal_present * P_present) / P_no_signal

  #calculate the evoi
    signal_action = None
    no_signal_action = None
    evoi = P_signal * min(cost_false_positive * P_absent_signal, cost_false_negative * P_present_signal) + P_no_signal * min(cost_false_positive * P_absent_no_signal, cost_false_negative * P_present_no_signal)
    if cost_false_positive * P_absent_signal >= cost_false_negative * P_present_signal:
        signal_action = 'No Fix'
    else:
        signal_action = 'Fix'
    
    if cost_false_positive * P_absent_no_signal >= cost_false_negative * P_present_no_signal:
        no_signal_action = 'No Fix'
    else:
        no_signal_action = 'Fix'
    return evoi, signal_action, no_signal_action

In [33]:
def plot_dict(dictionary):
    x, y = [], []
    for key, value in dictionary.items():
        x.append(key)
        y.append(value)
    return x, y

**Sensitivity Analysis**

In [34]:
with open('log/sensitivity_analysis.csv', 'a', newline='') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['sensor', 'evsi', 'signal action', 'no signal action'])

for ratio in ratios:
    print(ratio)
    
    # prepare the data
    dimension, x_train, x_test, x_cv = feature_remover(features_names = sensors)

    # load the model 
    base_model = load_model(model_path + 'base', compile = False)

    # get prediction from the base model
    base_prediction = base_model.predict_classes(x_test, verbose = 0)

    base_cost, _, _ = get_expected_cost(base_prediction, 2, 2*ratio)

    upper_dict = dict()
    for sensor in sensors:
        remove_sensors = sensors.copy()
        remove_sensors.remove(sensor)
        dimension, x_train, x_test, x_cv = feature_remover(features_names = remove_sensors)

        upper_model = load_model(model_path + '+' + sensor, compile= False)
        upper_prediction = upper_model.predict_classes(x_test, verbose = 0)
        upper_dict[sensor] = list(get_expected_cost(upper_prediction, 2, 2*ratio))
        print(upper_dict[sensor])

    # Calculate the evoi
    evoi_dict = dict()
    for sensor in sensors:
        evoi_dict[sensor] = [base_cost - upper_dict[sensor][0], upper_dict[sensor][1], upper_dict[sensor][2]]
    ranked_sensors = sorted(evoi_dict.items(), key=lambda x: x[1][0], reverse= True)
    
    with open('log/sensitivity_analysis.csv', 'a', newline='') as out:
        csv_out = csv.writer(out)
        for row in ranked_sensors:
            csv_out.writerow(row)
        csv_out.writerow([])

2
[0.41331370262390665, 'Fix', 'No Fix']
[0.641731778425656, 'Fix', 'No Fix']
[0.6148594752186589, 'Fix', 'No Fix']
[0.5183790087463557, 'Fix', 'No Fix']
[0.6037720116618076, 'Fix', 'No Fix']
[0.6387072886297376, 'Fix', 'No Fix']
[0.6082489795918368, 'Fix', 'No Fix']
[0.632660058309038, 'Fix', 'No Fix']
[0.594401749271137, 'Fix', 'No Fix']
[0.4891871720116618, 'Fix', 'No Fix']
4
[0.6250483965014577, 'Fix', 'No Fix']
[0.9748221574344023, 'Fix', 'No Fix']
[0.9873084548104956, 'Fix', 'No Fix']
[0.7348513119533527, 'Fix', 'No Fix']
[0.8993259475218659, 'Fix', 'No Fix']
[0.9728909620991254, 'Fix', 'No Fix']
[0.8637154518950438, 'Fix', 'No Fix']
[0.9504443148688048, 'Fix', 'No Fix']
[0.9078128279883382, 'Fix', 'No Fix']
[0.7001930029154518, 'Fix', 'No Fix']
8
[1.0485177842565596, 'Fix', 'No Fix']
[1.6355685131195337, 'Fix', 'Fix']
[1.6355685131195337, 'Fix', 'Fix']
[1.167795918367347, 'Fix', 'No Fix']
[1.4904338192419826, 'Fix', 'No Fix']
[1.6355685131195337, 'Fix', 'Fix']
[1.374648396501457

MemoryError: Unable to allocate 91.8 MiB for an array with shape (51, 235840) and data type float64

In [None]:
# x_1, y_1 = plot_dict(evoi_dict)
# #y_1_100 = [z * 100 for z in y_1]
# plt.figure(figsize=(20,10))
# sns.barplot(x_1, y_1, palette = mpl.cm.ScalarMappable(cmap=sns.dark_palette("#69d", reverse=False, as_cmap=True)).to_rgba(y_1))
# #plt.xticks(rotation=45);
# #plt.ylim((-5.0, 16.0))
# plt.title('Complete Feature Validation Accuracy Advantage')
# plt.ylabel('%')
# plt.xlabel('Masked Feature Name');