# Compare AUC 

The following code aims to compare the performances on different methods 

In [8]:
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model
import tensorflow_probability as tfp
from tensorflow.keras import backend as K
import numpy as np
## import functions
import sys
sys.path.append("..")
from src import GetParquet
from src import Train_test_sets_maker
from src import MinMaxNormalisation
from src import CoxboxTransform
from src import Visualize
from src import MIMII_AE
from src import AE_variant1

# Lots of intermediary functions

TO UPDATE : These should be directly included in 'src' folder or be part of a constructor in a OOP implementation.

In [42]:
def lossCalcMSE(model,data):
    reconstructions = model(data)
    return tf.keras.losses.mse(reconstructions,data)

def computeLossesMSE(model, data):
    lossValues = []
    for raw_index in range(data.shape[0]):
        # parcour les 313 vecteurs
        spectrogram_flat = np.array(test_set.iloc[raw_index])
        spectrogram = spectrogram_flat.reshape(313,64)
        spec_df = pd.DataFrame(spectrogram)
        spec_tf = tf.cast(spec_df, tf.float32)
        ### Function used for AUC computation :
        lossValues.append(np.mean(lossCalcMSE(model, spec_tf)))  
    return lossValues
    
def lossCalcMAE(model,data):
    reconstructions = model(data)
    return tf.keras.losses.mae(reconstructions,data)

def computeLossesMAE(model, data):
    lossValues = []
    for raw_index in range(data.shape[0]):
        # parcour les 313 vecteurs
        spectrogram_flat = np.array(test_set.iloc[raw_index])
        spectrogram = spectrogram_flat.reshape(313,64)
        spec_df = pd.DataFrame(spectrogram)
        spec_tf = tf.cast(spec_df, tf.float32)
        ### Function used for AUC computation :
        lossValues.append(np.mean(lossCalcMAE(model, spec_tf)))  
    return lossValues
def AUC(test_labels,test_set,model,metric = "mse"):
    
    if metric == "mse":
        
        lossValues = computeLossesMSE(model,test_set)
        
    if metric == "mae":
        
        lossValues = computeLossesMAE(model,test_set)
    
    return 1 - metrics.roc_auc_score(test_labels, lossValues)



In [48]:
def GetNormalizeData(method,ID): 
       # import data
    df_normal, df_abnormal = GetParquet.fun(1,ID,method)
    # create datasets
    train_set, test_set, test_labels = Train_test_sets_maker.fun(df_normal,df_abnormal)
    # normalize [0,1]
    MIN,MAX = MinMaxNormalisation.getMinMax(train_set)
    
    train_set = MinMaxNormalisation.fun(train_set,MIN,MAX)
    test_set = MinMaxNormalisation.fun(test_set,MIN,MAX)
    
    return train_set,test_set,test_labels

In [49]:
import pandas as pd
def reshape_Spec(tf_set):
    
    """ Takes the (1,20032) input vectors and reshape them to become (1,64) """
    df = None
    for raw in range(tf_set.shape[0]):
    # parcour les 313 vecteurs
        spectrogram_flat = np.array(tf_set.iloc[raw])
        spectrogram = spectrogram_flat.reshape(313,64)
        spec_df = pd.DataFrame(spectrogram)

        if df is None:
            df = spec_df        
        else:
            df = pd.concat([df,spec_df],ignore_index = True)

    return df

In [50]:
def MIMII_AUC_calc(AUC_list,train_set,test_set,test_labels):
    
    train_set_RESHAPE = tf.cast(reshape_Spec(train_set),float)
    test_set_RESHAPE = tf.cast(reshape_Spec(test_set),float)
    
    InputSize = test_set_RESHAPE.shape[1]
    
    """ uses the MSE as loss fun"""

    autoencoder = MIMII_AE.fun(InputSize)
    # Train algorithm
    history = autoencoder.fit(train_set_RESHAPE, train_set_RESHAPE, 
                                  epochs=50, 
                                  batch_size=512,
                                  validation_data=(test_set_RESHAPE,test_set_RESHAPE),
                                  validation_split = 0.1,
                                  verbose = 0,
                                  shuffle=True)
                        
    AUC_list.append(AUC(test_labels, test_set, autoencoder))
    
    """ uses the MAE as loss fun"""

    
    autoencoder = MIMII_AE.fun(InputSize,'mae')
        # Train algorithm
    history = autoencoder.fit(train_set_RESHAPE, train_set_RESHAPE, 
                                  epochs=50, 
                                  batch_size=512,
                                  validation_data=(test_set_RESHAPE,test_set_RESHAPE),
                                  validation_split = 0.1,
                                  verbose = 0,
                                  shuffle=True)
    AUC_list.append(AUC(test_labels, test_set, autoencoder, 'mae'))
    print("AUC computed")
    return AUC_list

In [51]:
def Variant_AUC_calc(AUC_list,train_set,test_set,test_labels):
    
    train_set_RESHAPE = tf.cast(reshape_Spec(train_set),float)
    test_set_RESHAPE = tf.cast(reshape_Spec(test_set),float)
    
    InputSize = test_set_RESHAPE.shape[1]
    
    """ uses the MSE as loss fun"""
    
    autoencoder = AE_variant1.fun(InputSize,128,8)
        # Train algorithm
    history = autoencoder.fit(train_set_RESHAPE, train_set_RESHAPE, 
                                  epochs=50, 
                                  batch_size=512,
                                  validation_data=(test_set_RESHAPE,test_set_RESHAPE),
                                  validation_split = 0.1,
                                  verbose = 0,
                                  shuffle=True)
                        
    AUC_list.append(AUC(test_labels, test_set, autoencoder))
    
    """ uses the MAE as loss fun"""

    
    autoencoder = AE_variant1.fun(InputSize,128,8,'mae')
        # Train algorithm
    history = autoencoder.fit(train_set_RESHAPE, train_set_RESHAPE, 
                                  epochs=50, 
                                  batch_size=512,
                                  validation_data=(test_set_RESHAPE,test_set_RESHAPE),
                                  validation_split = 0.1,
                                  verbose = 0,
                                  shuffle=True)
    AUC_list.append(AUC(test_labels, test_set, autoencoder, 'mae'))
    print("AUC computed !")
    return AUC_list

# __Main__ function

In [52]:
method_to_test = [0] 
ID_to_test = [1,2,3,4]

AUCs = []
for method in method_to_test:
    for ID in ID_to_test:
        AUC_list = []
        # Get datasets
        train_set, test_set, test_labels = GetNormalizeData(method,ID)
        
        
        # All datapoints are shifter by 1, as the coxbox supports only STRICTLY positive numbers
        CB_train_set = CoxboxTransform.fun(train_set+1)
        CB_test_set = CoxboxTransform.fun(test_set+1)
        
        # Cast all functions as Tensorflow
        #train_set, test_set, CB_train_set, CB_test_set = tf.cast(df_train, tf.float32), tf.cast(df_test, tf.float32), tf.cast(CB_train_set, tf.float32), tf.cast(CB_test_set, tf.float32)
        
        # Get MIMII_AE for normal datasets
        AUC_list = MIMII_AUC_calc(AUC_list,train_set,test_set,test_labels)
        # Get MIMII_AE for gaussian distributed datasets
        AUC_list = MIMII_AUC_calc(AUC_list,CB_train_set,CB_test_set,test_labels)
        # Get Variant AE for normal datasets
        AUC_list = Variant_AUC_calc(AUC_list,train_set,test_set,test_labels)
        # Get Variant AE for gaussian distributed datasets
        AUC_list = Variant_AUC_calc(AUC_list,CB_train_set,CB_test_set,test_labels)
        print("ID" + str(ID) + "completed with method" + str(method) + "!")
        AUCs.append(AUC_list)



Data acquired !
Train & test sets created !
AUC computed
AUC computed
AUC computed !
AUC computed !
ID1completed with method0!
Data acquired !
Train & test sets created !
AUC computed
AUC computed
AUC computed !
AUC computed !
ID2completed with method0!
Data acquired !
Train & test sets created !
AUC computed
AUC computed
AUC computed !
AUC computed !
ID3completed with method0!
Data acquired !
Train & test sets created !
AUC computed
AUC computed
AUC computed !
AUC computed !
ID4completed with method0!
Data acquired !
Train & test sets created !


ValueError: cannot reshape array of size 320 into shape (313,64)

# Performances from MIMII Paper : Fan + 6dB

In [65]:
import pandas as pd
df_MIMII = pd.DataFrame([0.75, 0.99, 0.92, 0.99], columns = ['MIMII_ref']) 
df_MIMII


Unnamed: 0,MIMII_ref
0,0.75
1,0.99
2,0.92
3,0.99


# Performances for different implementation of the algorithm

In [53]:
import pandas as pd
columns = ['MSE_MIMII', 'MAE_MIMII', 'MSE_Variant','MAE_Variant','CB_MSE_MIMII', 'CB_MAE_MIMII', 'CB_MSE_Variant','CB_MAE_Variant']
df = pd.DataFrame(AUCs, columns = columns)

Unnamed: 0,MSE_MIMII,MAE_MIMII,MSE_Variant,MAE_Variant,CB_MSE_MIMII,CB_MAE_MIMII,CB_MSE_Variant,CB_MAE_Variant
0,0.80622,0.831002,0.44196,0.532936,0.83759,0.821253,0.513213,0.549904
1,0.995443,0.99187,0.412257,0.517072,0.996309,0.994593,0.151064,0.309689
2,0.935644,0.925753,0.291772,0.583187,0.922987,0.924532,0.320175,0.647335
3,0.998881,0.999244,0.415532,0.611019,0.99909,0.999545,0.784769,0.662785


In [66]:
print('MIMII paper mean AUC : ' + str(round(np.mean(df_MIMII['MIMII_ref']),4)))
print('MSE MIMII mean AUC : ' + str(round(np.mean(df['MSE_MIMII']),4)))
print('MAE MIMII mean AUC : ' + str(round(np.mean(df['MAE_MIMII']),4)))
print('box cox MSE MIMII mean AUC : ' + str(round(np.mean(df['CB_MSE_MIMII']),4)))

MIMII paper mean AUC : 0.9125
MSE MIMII mean AUC : 0.934
MAE MIMII mean AUC : 0.937
box cox MSE MIMII mean AUC : 0.939


# Conclusions

1) Comparison with MIMII paper :
- Greater AUC were achieved comparatively with some combinations of implementations
- The use of MAE or MSE doesn't seem to change significantly the performances of detection on dataset if these performances are already quite high (~ 0.01 AUC difference if AUC > 0.9). However, it seems greater AUC is obtained in case of lower performances : ~0.03 AUC difference. The influence of outliers in the training model should be investigated.
2) Variant AE :
- The variant introduces 2 additionnals layers [128,1] which considerably reduced the performances and introduced greater training times
3) Box-Cox transformation :
- Comparatively, the Box-Cox transformation induced better results on average and pre-processing the dataset for a having a Normal distribution in not to be overlooked. However, in this case the ratio of improvement over additionnal computationnal time is so low that the transformation is not worth it.
4) Changes compared to MIMII paper :
- Channel selection : Instead of using the 1st channel I used the 4th channel where the microphone is closer and directly pointing towards the fan. Therefore the data should carry clearer information.
- MinMax Normalisation : the train and test set are normalized (same normalization for both !) such that the NN can converges faster.

# Further research :