In [156]:
import os
import glob
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, GRU
from keras.metrics import AUC,Precision,Recall
from sklearn.model_selection import StratifiedKFold
from tensorflow import keras

In [None]:
NORMAL_INPUT_DATA_DIR = "./split_data/normal/"
FAULT_INPUT_DATA_DIR = "./split_data/fault/"
TRAIN_RESULTS_DIR = './train_results'

In [157]:
files = glob.glob('./models/*')
for f in files:
    print(f)
    os.remove(f)

./models\new.hdf5
./models\vl0.041945va0.989097m0s2f2.hdf5
./models\vl0.053769va0.984424m0s2f1.hdf5


In [158]:
def get_data(input_dir, series_size=31):
    data = []
    for (dirpath, dirnames, filenames) in os.walk(input_dir):
        for filename in filenames:
            try:
                path = input_dir+"/" + filename
                frame = pd.read_csv(path, header=None)
                if frame.to_numpy().shape == (31,10):
                    data.append(frame.to_numpy().tolist())
            except Exception as exp:
                print(filename)
                print(exp)

    split_data = []
    for i in range(0,len(data)):
        for j in range(0,len(data[i])-series_size+1):
            split_data.append(data[i][j:j+series_size])
    return np.array(split_data)[:,:,1:10]

In [159]:
def load_data(series_size):
    normal_data = get_data(NORMAL_INPUT_DATA_DIR,series_size)
    fault_data = get_data(FAULT_INPUT_DATA_DIR,series_size)
    data = np.concatenate((normal_data, fault_data))
    labels = np.concatenate((np.zeros(len(normal_data)), np.ones(len(fault_data))))
    return data, labels

In [160]:
def create_model(data, architecture=0, n_timesteps=31, n_features=9,n_outputs=1):
    tf.keras.backend.clear_session()
    model = None
    input_shape = (n_timesteps,n_features)
    layer = tf.keras.layers.Normalization(axis=2,input_shape=input_shape)
    layer.adapt(data)
    model = Sequential()
    model.add(layer)
    
    if architecture == 0:
        model.add(LSTM(100, kernel_initializer='random_normal',bias_initializer='zeros'))
        model.add(Dropout(0.5))
        model.add(Dense(100, activation='relu'))
        
    if architecture == 1:
        model.add(LSTM(units = 50, return_sequences = True, kernel_initializer='random_normal',bias_initializer='zeros'))
        model.add(Dropout(0.4))

        model.add(LSTM(units = 50, return_sequences = True))
        model.add(Dropout(0.4))

        model.add(LSTM(units = 50, return_sequences = True))
        model.add(Dropout(0.4))

        model.add(LSTM(units = 50))
        model.add(Dropout(0.4))
        
    if architecture == 2:
        model.add(GRU(100, kernel_initializer='random_normal',bias_initializer='zeros'))
        model.add(Dropout(0.5))
        model.add(Dense(100, activation='relu'))
        
    if architecture == 3:
        model.add(GRU(units = 50, return_sequences = True, kernel_initializer='random_normal',bias_initializer='zeros'))
        model.add(Dropout(0.4))

        model.add(GRU(units = 50, return_sequences = True))
        model.add(Dropout(0.4))

        model.add(GRU(units = 50, return_sequences = True))
        model.add(Dropout(0.4))

        model.add(GRU(units = 50))
        model.add(Dropout(0.4))
        
    model.add(Dense(n_outputs, activation='sigmoid'))
    opt = keras.optimizers.Adam(learning_rate=0.0005)
    
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy',AUC(),Precision(),Recall()])
    return model

In [161]:
def train_and_evaluate_model(model, data_train, labels_train, data_test, labels_test):
    checkpoint_filepath = './models/new.hdf5'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor='val_loss',
        mode='min',
        save_best_only=True)
    return model.fit(data_train, labels_train,
                     validation_data=(data_test, labels_test), 
                     epochs=30, batch_size=32, verbose=0,
                     callbacks=[model_checkpoint_callback],)

In [162]:
title = []
avg = []
desc = []
history = []
models = []
conf_mat =[]
params = []

n_splits = 5
number_of_models = [
    0,
    1,
    2,
    3,
]
series_sizes = [
    2,
    5, 
    10,
    31,
]
cross_validation = True

for num_model in number_of_models:
    for series_size in series_sizes:
        
        title.append("Model number: "+str(num_model)+"   Series size: "+ str(series_size))
        params.append([num_model,series_size])
        print(title[-1])
        desc.append([])
        history.append([])
        models.append([])
        conf_mat.append([])
        

        avg.append([0.0,0.0,0.0])
        
        data, labels = load_data(series_size)
        skf = StratifiedKFold(n_splits=n_splits, random_state=None, shuffle=True)
        for i,(train, test) in enumerate(skf.split(data, labels)):
            if cross_validation == True or i == 0:
                start_time = datetime.now()
                
                models[-1].append(create_model(data,num_model, series_size))
                
                history[-1].append(train_and_evaluate_model(models[-1][-1], data[train], labels[train], data[test], labels[test]).history)
                end_time = datetime.now()
                
                models[-1][-1] = keras.models.load_model('./models/new.hdf5')
                
                y_pred = models[-1][-1].predict(data)
                conf_mat[-1].append(sklearn.metrics.confusion_matrix(labels[test], y_pred[test].round()))
                
                his = history[-1][-1]
                val_loss = np.min(his['val_loss'])
                val_acc_when_min_loss = his['val_accuracy'][np.argmin(his['val_loss'])]
                val_acc = np.max(his['val_accuracy'])
                avg[-1][0]+=val_loss
                avg[-1][1]+=val_acc_when_min_loss
                avg[-1][2]+=val_acc
                desc[-1].append("m"+str(num_model)+"s"+str(series_size)+"f"+str(i+1)
                           + "   min_val_loss = {0:.6f}".format(val_loss) + 
                                "   val_acc_when_min_loss = {0:.6f}".format(val_acc_when_min_loss) +
                                "   max_val_acc = {0:.6f}".format(val_acc))
                print(desc[-1][-1])
                print('Duration: {}'.format(end_time - start_time))

                os.rename("./models/new.hdf5", "./models/vl{0:.6f}".format(val_loss)
                          +"va{0:.6f}".format(val_acc_when_min_loss)
                          +"m"+str(num_model)+"s"+str(series_size)+"f"+str(i+1)+".hdf5")
        if cross_validation == True:
            avg[-1][0] /= n_splits
            avg[-1][1] /= n_splits
            avg[-1][2] /= n_splits
        print("Average min val loss: ", avg[-1][0], "  Average val accuracy when min loss: ",avg[-1][1],"  Average max val accuracy: ",avg[-1][1],"\n\n")

Model number: 0   Series size: 2
m0s2f1   min_val_loss = 0.048448   val_acc_when_min_loss = 0.985981   max_val_acc = 0.987539
Duration: 0:00:09.578989
m0s2f2   min_val_loss = 0.072309   val_acc_when_min_loss = 0.978193   max_val_acc = 0.979751
Duration: 0:00:09.294176
m0s2f3   min_val_loss = 0.063943   val_acc_when_min_loss = 0.979751   max_val_acc = 0.981308
Duration: 0:00:11.351029
m0s2f4   min_val_loss = 0.053028   val_acc_when_min_loss = 0.982866   max_val_acc = 0.982866
Duration: 0:00:09.493590
m0s2f5   min_val_loss = 0.062140   val_acc_when_min_loss = 0.981308   max_val_acc = 0.982866
Duration: 0:00:09.595014
Average min val loss:  0.059973782300949095   Average val accuracy when min loss:  0.981619930267334   Average max val accuracy:  0.981619930267334 


Model number: 0   Series size: 5
m0s5f1   min_val_loss = 0.023125   val_acc_when_min_loss = 0.994810   max_val_acc = 0.994810
Duration: 0:00:09.008574
m0s5f2   min_val_loss = 0.005486   val_acc_when_min_loss = 0.996540   max_v

m2s10f5   min_val_loss = 0.005740   val_acc_when_min_loss = 0.997872   max_val_acc = 0.997872
Duration: 0:00:07.581639
Average min val loss:  0.0014277545706136152   Average val accuracy when min loss:  0.9995744705200196   Average max val accuracy:  0.9995744705200196 


Model number: 2   Series size: 31
m2s31f1   min_val_loss = 0.002103   val_acc_when_min_loss = 1.000000   max_val_acc = 1.000000
Duration: 0:00:02.781695
m2s31f2   min_val_loss = 0.000468   val_acc_when_min_loss = 1.000000   max_val_acc = 1.000000
Duration: 0:00:02.838503
m2s31f3   min_val_loss = 0.051157   val_acc_when_min_loss = 0.952381   max_val_acc = 1.000000
Duration: 0:00:02.650136
m2s31f4   min_val_loss = 0.005239   val_acc_when_min_loss = 1.000000   max_val_acc = 1.000000
Duration: 0:00:02.790665
m2s31f5   min_val_loss = 0.007441   val_acc_when_min_loss = 1.000000   max_val_acc = 1.000000
Duration: 0:00:03.309929
Average min val loss:  0.013281805632868782   Average val accuracy when min loss:  0.9904761910438

In [163]:
print(history[0][0].keys())

dict_keys(['loss', 'accuracy', 'auc', 'precision', 'recall', 'val_loss', 'val_accuracy', 'val_auc', 'val_precision', 'val_recall'])


In [164]:
pickle.dump(title, open(TRAIN_RESULTS_DIR + 'title.p', 'wb'))
pickle.dump(params, open(TRAIN_RESULTS_DIR + 'params.p', 'wb'))
pickle.dump(avg, open(TRAIN_RESULTS_DIR + 'avg.p', 'wb'))
pickle.dump(desc, open(TRAIN_RESULTS_DIR+ 'desc.p', 'wb'))
pickle.dump(history, open(TRAIN_RESULTS_DIR + 'history.p', 'wb'))
pickle.dump(conf_mat, open(TRAIN_RESULTS_DIR + 'conf_mat.p', 'wb'))