<div align='center'><font size="5" color="#000000"><center><h1 style="text-transform: uppercase; text-shadow: 1px 1px;"> PySaDML </h1></center></font></div> <br>
<div align='center'><font size="4" color="#000000"><center><h1 style="text-transform: uppercase; text-shadow: 1px 1px;"> PRE-PROCESSING. </h1></center></font></div>

<div align='center'><font size="2" color="#000000"><center><h1 style="text-transform: uppercase; text-shadow: 1px 1px;"> Détection de son anormal dans les pièces industrielles </h1></center></font></div>
<br>

# Data pre-processing

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# =============================================================================
# Created By  : Mike G
# Created Date: Thursday Nov 25 15:00:00 UTC 2021
# =============================================================================
# Required libraries
from pathlib import Path
import time, datetime

import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook

# for plotting
%matplotlib inline
import matplotlib.pyplot as plt
import librosa
import librosa.display

from include import common

import warnings

# Silence every warning category for the rest of the notebook session.
warnings.filterwarnings(action='ignore')

# Let pandas render up to 111 rows and 111 columns before eliding the output.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 111)

# Show the pandas version in use.
pd.__version__

In [None]:
# Extension of the metadata file to import.
EXT = '.csv'

# Root directory of the audio data (dev_data and eval_data).
DATASET_ROOT = './../data'

# Sub-folder that holds the audio files.
AUDIO_SUBFOLDER = '/dev_data'

# Sub-folder that receives the extracted features as numpy files.
NUMPY_SUBFOLDER = '/numpy_files'

DATASET_AUDIO_PATH = Path(DATASET_ROOT + AUDIO_SUBFOLDER)

DATASET_NUMPY_PATH = Path(DATASET_ROOT + NUMPY_SUBFOLDER)

# Create both folders when they are missing (no-op otherwise). The numpy
# folder must exist as well: every extraction cell below ends with np.save
# calls that write into it and would fail on a missing directory.
for _directory in (DATASET_AUDIO_PATH, DATASET_NUMPY_PATH):
    _directory.mkdir(parents=True, exist_ok=True)

In [None]:
# Shell escape: run the external `tree` command on the audio dataset folder
# to display its directory layout ({DATASET_AUDIO_PATH} is interpolated by IPython).
!tree ./{DATASET_AUDIO_PATH}

### 1. Chargement des données audio

In [None]:
# Build the metadata table from the audio folder through the project helper.
df = common.load_metadata(DATASET_AUDIO_PATH, EXT)

# (rows, columns) of the metadata table.
print(df.shape)

# Five random rows as a quick sanity check.
df.sample(n=5)

In [None]:
# Number of metadata rows per machine type.
df['machine_type'].value_counts()

In [None]:
# Distinct machine types, in order of first appearance in the metadata.
list_machine = df['machine_type'].unique()
list_machine

### 2. Paramètres

In [None]:
# Parameters used to extract the mel / MFCC values.
#   n_mels (integer)     - number of Mel buckets
#   n_mfcc (integer)     - number of MFCCs
#   n_fft (integer)      - passed to common.get_mel (presumably the FFT window size - confirm in the helper)
#   hop_length (integer) - how far the analysis window is shifted along the audio signal
n_mels = 64
n_mfcc = 40
n_fft = 1 << 13
hop_length = 1 << 11

# Placeholders; the real values are computed in the next cell.
frame, sec = 0, 0

# Sampling rate: first distinct value of the metadata column
# (assumes all files share one rate - TODO confirm).
samplingrate = int(df.samplingrate.unique()[0])

# Longest duration found in the metadata, and the duration used when truncating.
sec_max = float(df.durations.max())
sec_cut = 4.0

In [None]:
# trunc = True: features are extracted with duration=sec_cut
# (passed as `duration` to the common.get_* helpers).
trunc = True

# normalize = True: the helpers are asked to normalise the extracted mel / MFCC data.
normalize = False

# extractimage = True: a spectrogram image is saved for every audio file.
extractimage = True

# Suffix of the feature files written by the extraction cells. It stays empty
# when neither option is active (np.save then appends '.npy' itself).
suffixe = ''

# Duration used to derive the target number of frames.
# Fix: without truncation it must be the longest recording (sec_max) whether
# or not normalisation is requested. Previously `sec` kept its placeholder
# value 0 when both flags were False, giving frame == 1 and disabling padding.
sec = sec_cut if trunc else sec_max

if trunc and normalize:
    suffixe = '_trunc_norm.npy'
elif trunc:
    suffixe = '_trunc.npy'
elif normalize:
    suffixe = '_norm.npy'

# Frame count used as the padding target by the extraction cells.
frame = int((sec * samplingrate) // hop_length + 1)

print(samplingrate, sec, frame)

### 3. Extraction des caractéristiques MFCC

#### 3.1 Par type de machine

In [None]:
# Extract the MFCC features one machine type at a time. For each machine type
# the features are split into train/test according to the `data_split` column
# and saved, together with the condition labels, as numpy files.

# np.save does not create missing directories; fail early rather than after the loop.
DATASET_NUMPY_PATH.mkdir(parents=True, exist_ok=True)

# Optional keyword arguments forwarded to common.get_mfcc: `duration` only
# when truncating, `normalize` only when normalising (same four call
# variants as before, built once).
mfcc_kwargs = {}
if trunc:
    mfcc_kwargs['duration'] = sec
if normalize:
    mfcc_kwargs['normalize'] = normalize

for machinetype in list_machine:
    # Accumulators for the features and their labels.
    train_data = []
    test_data = []
    train_labels = []
    test_labels = []

    # Rows of this machine type, re-indexed from 0 so that row j can be read
    # by label. A new frame is built instead of resetting the index in place
    # on a filtered slice of df.
    df_machine = df[df.machine_type == machinetype].reset_index(drop=True)

    # Iterate through all audio files of the machine and extract the MFCCs.
    start = time.time()
    for j in tqdm_notebook(range(len(df_machine)), desc='Extraction des caractéristiques MFCC pour la machine '+machinetype):
        status = df_machine.condition[j]
        data_split = df_machine.data_split[j]
        file_path = Path(df_machine.pathname[j])

        mfccs, sr = common.get_mfcc(file_path, hop_length, n_mels, n_mfcc, **mfcc_kwargs)

        if not trunc:
            # Axis 1 is treated as the frame axis: pad it up to `frame` so all
            # arrays can be stacked. `size` is passed by keyword because it is
            # keyword-only in recent librosa releases.
            if mfccs.shape[1] < frame:
                mfccs = librosa.util.fix_length(mfccs, size=frame, axis=1)

        # Store the features and the label in the split they belong to.
        if data_split == 'test':
            test_data.append(mfccs)
            test_labels.append(status)
        else:
            train_data.append(mfccs)
            train_labels.append(status)

        if extractimage:
            # Destination: start from the directory four levels above the audio
            # file, substitute its name with 'images' (note: str.replace swaps
            # every occurrence of that name in the path), then append
            # <data_split>/<machine type>/<audio name>_mfcc.png.
            image_path = file_path.parent.parent.parent.parent
            image_path = str(image_path).replace(image_path.name,'images')
            image_path = Path(image_path,data_split, machinetype, file_path.name.replace('.wav','_mfcc.png'))

            image_path.parent.mkdir(parents=True, exist_ok=True)

            # Render the features without axes; hop_length is passed for
            # consistency with the other extraction cells.
            librosa.display.specshow(mfccs, sr=sr, hop_length=hop_length, y_axis='log')
            plt.axis('off');

            plt.savefig(image_path, bbox_inches='tight', pad_inches=0, format='png',dpi=100)
            # Clear the figure so the next file starts from an empty canvas.
            plt.clf()

    # Convert once; the arrays are reused for saving and for the report.
    test_data = np.array(test_data)
    train_data = np.array(train_data)
    test_labels = np.array(test_labels)
    train_labels = np.array(train_labels)

    # save data to disk
    np.save(Path(DATASET_NUMPY_PATH, "test_data_mfcc_"+machinetype+suffixe), test_data)
    np.save(Path(DATASET_NUMPY_PATH, "train_data_mfcc_"+machinetype+suffixe), train_data)
    # save label to disk
    np.save(Path(DATASET_NUMPY_PATH, "test_labels_"+machinetype+".npy"), test_labels)
    np.save(Path(DATASET_NUMPY_PATH, "train_labels_"+machinetype+".npy"), train_labels)

    end = time.time()
    duree = int(end - start)
    print("La durée d'extraction est de: {}s".format(datetime.timedelta(seconds =duree)))
    print("Sauvegarde des extractions MFCC en tableau numpy.")
    print("\tTest split: {} \t\t\t Train split: {}".format(test_data.shape[0], train_data.shape[0]))
    print("\ttest_data shape: {} \t train_data shape: {}".format(test_data.shape, train_data.shape))
    print("\ttest_labels shape: {} \t\t train_labels shape: {}".format(test_labels.shape, train_labels.shape))

    # Release the per-machine objects before processing the next machine.
    del mfccs
    del df_machine, status, data_split, file_path
    del test_data, train_data, test_labels, train_labels


#### 3.2 Pour l'ensemble des machines

In [None]:
# Extract the MFCC features of every audio file listed in df, all machine
# types together. Features are split into train/test according to the
# `data_split` column; the machine type and the condition of every file are
# kept as two parallel label lists.
train_data = []
test_data = []
train_labels_type = []
train_labels_status = []
test_labels_type = []
test_labels_status = []

# np.save does not create missing directories; fail early rather than after the loop.
DATASET_NUMPY_PATH.mkdir(parents=True, exist_ok=True)

# Optional keyword arguments forwarded to common.get_mfcc: `duration` only
# when truncating, `normalize` only when normalising.
mfcc_kwargs = {}
if trunc:
    mfcc_kwargs['duration'] = sec
if normalize:
    mfcc_kwargs['normalize'] = normalize

start = time.time()

for index in tqdm_notebook(range(len(df)), desc='Extraction des caractéristiques MFCC'+suffixe):
    machinetype = df.machine_type[index]
    status = df.condition[index]
    data_split = df.data_split[index]
    file_path = Path(df.pathname[index])

    mfccs, sr = common.get_mfcc(file_path, hop_length, n_mels, n_mfcc, **mfcc_kwargs)

    if extractimage:
        # Destination: start from the directory four levels above the audio
        # file, substitute its name with 'images' (note: str.replace swaps
        # every occurrence of that name in the path), then append
        # <data_split>/<machine type>/<audio name>_mfcc.png.
        image_path = file_path.parent.parent.parent.parent
        image_path = str(image_path).replace(image_path.name,'images')
        image_path = Path(image_path,data_split, machinetype, file_path.name.replace('.wav','_mfcc.png'))

        image_path.parent.mkdir(parents=True, exist_ok=True)

        librosa.display.specshow(mfccs, sr=sr, hop_length=hop_length, y_axis='log')
        plt.axis('off');

        plt.savefig(image_path, bbox_inches='tight', pad_inches=0, format='png',dpi=100)
        # Clear the figure so the next file starts from an empty canvas.
        plt.clf()

    if not trunc:
        # Fix: the padding target is `frame` (computed in the parameter cell);
        # the former name `frames_max` is not defined anywhere in the notebook
        # and raised a NameError. Axis 1 is treated as the frame axis, and
        # `size` is passed by keyword because it is keyword-only in recent
        # librosa releases.
        if mfccs.shape[1] < frame:
            mfccs = librosa.util.fix_length(mfccs, size=frame, axis=1)

    # Store the features and both labels in the split they belong to.
    if data_split == 'test':
        test_data.append(mfccs)
        test_labels_type.append(machinetype)
        test_labels_status.append(status)
    else:
        train_data.append(mfccs)
        train_labels_type.append(machinetype)
        train_labels_status.append(status)

# Convert once; the arrays are reused for saving and for the report.
test_data = np.array(test_data)
train_data = np.array(train_data)
test_labels_type = np.array(test_labels_type)
train_labels_type = np.array(train_labels_type)
test_labels_status = np.array(test_labels_status)
train_labels_status = np.array(train_labels_status)

# save data to disk
np.save(Path(DATASET_NUMPY_PATH, "test_data_mfcc"+suffixe), test_data)
np.save(Path(DATASET_NUMPY_PATH, "train_data_mfcc"+suffixe), train_data)
# save label to disk
np.save(Path(DATASET_NUMPY_PATH, "test_labels_type.npy"), test_labels_type)
np.save(Path(DATASET_NUMPY_PATH, "train_labels_type.npy"), train_labels_type)
np.save(Path(DATASET_NUMPY_PATH, "test_labels_status.npy"), test_labels_status)
np.save(Path(DATASET_NUMPY_PATH, "train_labels_status.npy"), train_labels_status)

end = time.time()
duree = int(end - start)
print("La durée d'extraction est de: {}s".format(datetime.timedelta(seconds =duree)))
print("Sauvegarde des extractions MFCC en tableau numpy.")
print("\tTest split: {} \t\t\t Train split: {}".format(test_data.shape[0], train_data.shape[0]))
print("\ttest_data shape: {} \t train_data shape: {}".format(test_data.shape, train_data.shape))
print("\ttest_labels_type shape: {} \t\t train_labels_type shape: {}".format(test_labels_type.shape, train_labels_type.shape))
print("\ttest_labels_status shape: {} \t\t train_labels_status shape: {}".format(test_labels_status.shape, train_labels_status.shape))

# Distinct conditions and machine types found in the test split.
print("\nNumber of Status: {}".format(len(np.unique(test_labels_status))))
print("Conditions: {}".format(np.unique(test_labels_status)))
print("\nNumber of machine type: {}".format(len(np.unique(test_labels_type))))
print("Machine Type: {}".format(np.unique(test_labels_type)))

# Release the large objects once everything is on disk.
del mfccs

del status, data_split, file_path
del test_data, train_data, train_labels_type, train_labels_status, test_labels_type, test_labels_status
    



### 4. Extraction des caractéristiques MEL

#### 4.1 Par type de machine

In [None]:
# Extract the MEL features one machine type at a time. For each machine type
# the features are split into train/test according to the `data_split` column
# and saved, together with the condition labels, as numpy files.

# np.save does not create missing directories; fail early rather than after the loop.
DATASET_NUMPY_PATH.mkdir(parents=True, exist_ok=True)

# Optional keyword arguments forwarded to common.get_mel: `duration` only
# when truncating, `normalize` only when normalising (same four call
# variants as before, built once).
mel_kwargs = {}
if trunc:
    mel_kwargs['duration'] = sec
if normalize:
    mel_kwargs['normalize'] = normalize

for machinetype in list_machine:
    # Accumulators for the MEL features and their labels.
    train_data = []
    test_data = []
    train_labels = []
    test_labels = []

    # Rows of this machine type, re-indexed from 0 so that row j can be read
    # by label. A new frame is built instead of resetting the index in place
    # on a filtered slice of df.
    df_machine = df[df.machine_type == machinetype].reset_index(drop=True)

    # Iterate through all audio files of the machine and extract the MEL features.
    start = time.time()
    for j in tqdm_notebook(range(len(df_machine)), desc='Extraction des caractéristiques MEL pour la machine '+machinetype):
        status = df_machine.condition[j]
        data_split = df_machine.data_split[j]
        file_path = Path(df_machine.pathname[j])

        mels, sr = common.get_mel(file_path, hop_length, n_mels, n_fft, **mel_kwargs)

        if not trunc:
            # Axis 1 is treated as the frame axis: pad it up to `frame` so all
            # arrays can be stacked. `size` is passed by keyword because it is
            # keyword-only in recent librosa releases.
            if mels.shape[1] < frame:
                mels = librosa.util.fix_length(mels, size=frame, axis=1)

        # Store the features and the label in the split they belong to.
        if data_split == 'test':
            test_data.append(mels)
            test_labels.append(status)
        else:
            train_data.append(mels)
            train_labels.append(status)

        if extractimage:
            # Destination: start from the directory four levels above the audio
            # file, substitute its name with 'images' (note: str.replace swaps
            # every occurrence of that name in the path), then append
            # <data_split>/<machine type>/<audio name>_mel.png.
            image_path = file_path.parent.parent.parent.parent
            image_path = str(image_path).replace(image_path.name,'images')
            image_path = Path(image_path,data_split, machinetype, file_path.name.replace('.wav','_mel.png'))

            image_path.parent.mkdir(parents=True, exist_ok=True)

            librosa.display.specshow(mels, sr=sr, hop_length=hop_length, y_axis='log')
            plt.axis('off');

            plt.savefig(image_path, bbox_inches='tight', pad_inches=0, format='png',dpi=100)
            # Clear the figure so the next file starts from an empty canvas.
            plt.clf()

    # Convert once; the arrays are reused for saving and for the report.
    test_data = np.array(test_data)
    train_data = np.array(train_data)
    test_labels = np.array(test_labels)
    train_labels = np.array(train_labels)

    # save data to disk
    np.save(Path(DATASET_NUMPY_PATH, "test_data_mel_"+machinetype+suffixe), test_data)
    np.save(Path(DATASET_NUMPY_PATH, "train_data_mel_"+machinetype+suffixe), train_data)
    # save label to disk
    np.save(Path(DATASET_NUMPY_PATH, "test_labels_"+machinetype+".npy"), test_labels)
    np.save(Path(DATASET_NUMPY_PATH, "train_labels_"+machinetype+".npy"), train_labels)

    end = time.time()
    duree = int(end - start)
    print("La durée d'extraction est de: {}s".format(datetime.timedelta(seconds =duree)))
    # Fix: this cell saves MEL features; the message used to say MFCC.
    print("Sauvegarde des extractions MEL en tableau numpy.")
    print("\tTest split: {} \t\t\t Train split: {}".format(test_data.shape[0], train_data.shape[0]))
    print("\ttest_data shape: {} \t train_data shape: {}".format(test_data.shape, train_data.shape))
    print("\ttest_labels shape: {} \t\t train_labels shape: {}".format(test_labels.shape, train_labels.shape))

    # Release the per-machine objects before processing the next machine.
    del mels
    del df_machine, status, data_split, file_path
    del test_data, train_data, test_labels, train_labels


#### 4.2 Pour l'ensemble des machines

In [None]:
# Extract the MEL features of every audio file listed in df, all machine
# types together. Features are split into train/test according to the
# `data_split` column; the machine type and the condition of every file are
# kept as two parallel label lists.
train_data = []
test_data = []
train_labels_type = []
train_labels_status = []
test_labels_type = []
test_labels_status = []

# np.save does not create missing directories; fail early rather than after the loop.
DATASET_NUMPY_PATH.mkdir(parents=True, exist_ok=True)

# Optional keyword arguments forwarded to common.get_mel: `duration` only
# when truncating, `normalize` only when normalising.
mel_kwargs = {}
if trunc:
    mel_kwargs['duration'] = sec
if normalize:
    mel_kwargs['normalize'] = normalize

start = time.time()

# Fix: the progress bar now names the feature actually extracted (MEL, not MFCC).
for index in tqdm_notebook(range(len(df)), desc='Extraction des caractéristiques MEL'+suffixe):
    machinetype = df.machine_type[index]
    status = df.condition[index]
    data_split = df.data_split[index]
    file_path = Path(df.pathname[index])

    mels, sr = common.get_mel(file_path, hop_length, n_mels, n_fft, **mel_kwargs)

    if extractimage:
        # Destination: start from the directory four levels above the audio
        # file, substitute its name with 'images' (note: str.replace swaps
        # every occurrence of that name in the path), then append
        # <data_split>/<machine type>/<audio name>_mel.png.
        # Fix: the suffix was '_mfcc.png', which overwrote the MFCC images.
        image_path = file_path.parent.parent.parent.parent
        image_path = str(image_path).replace(image_path.name,'images')
        image_path = Path(image_path,data_split, machinetype, file_path.name.replace('.wav','_mel.png'))

        image_path.parent.mkdir(parents=True, exist_ok=True)

        # Fix: plot the MEL features of the current file. The previous code
        # plotted `mfccs`, a variable that the MFCC cell deletes when it
        # finishes (NameError) and that never held this file's MEL data.
        librosa.display.specshow(mels, sr=sr, hop_length=hop_length, y_axis='log')
        plt.axis('off');

        plt.savefig(image_path, bbox_inches='tight', pad_inches=0, format='png',dpi=100)
        # Clear the figure so the next file starts from an empty canvas.
        plt.clf()

    if not trunc:
        # Fix: the padding target is `frame` (computed in the parameter cell);
        # the former name `frames_max` is not defined anywhere in the notebook
        # and raised a NameError. Axis 1 is treated as the frame axis, and
        # `size` is passed by keyword because it is keyword-only in recent
        # librosa releases.
        if mels.shape[1] < frame:
            mels = librosa.util.fix_length(mels, size=frame, axis=1)

    # Store the features and both labels in the split they belong to.
    if data_split == 'test':
        test_data.append(mels)
        test_labels_type.append(machinetype)
        test_labels_status.append(status)
    else:
        train_data.append(mels)
        train_labels_type.append(machinetype)
        train_labels_status.append(status)

# Convert once; the arrays are reused for saving and for the report.
test_data = np.array(test_data)
train_data = np.array(train_data)
test_labels_type = np.array(test_labels_type)
train_labels_type = np.array(train_labels_type)
test_labels_status = np.array(test_labels_status)
train_labels_status = np.array(train_labels_status)

# save data to disk
np.save(Path(DATASET_NUMPY_PATH, "test_data_mel"+suffixe), test_data)
np.save(Path(DATASET_NUMPY_PATH, "train_data_mel"+suffixe), train_data)
# save label to disk
np.save(Path(DATASET_NUMPY_PATH, "test_labels_type.npy"), test_labels_type)
np.save(Path(DATASET_NUMPY_PATH, "train_labels_type.npy"), train_labels_type)
np.save(Path(DATASET_NUMPY_PATH, "test_labels_status.npy"), test_labels_status)
np.save(Path(DATASET_NUMPY_PATH, "train_labels_status.npy"), train_labels_status)

end = time.time()
duree = int(end - start)
print("La durée d'extraction est de: {}s".format(datetime.timedelta(seconds =duree)))
print("Sauvegarde des extractions MEL en tableau numpy.")
print("\tTest split: {} \t\t\t Train split: {}".format(test_data.shape[0], train_data.shape[0]))
print("\ttest_data shape: {} \t train_data shape: {}".format(test_data.shape, train_data.shape))
print("\ttest_labels_type shape: {} \t\t train_labels_type shape: {}".format(test_labels_type.shape, train_labels_type.shape))
print("\ttest_labels_status shape: {} \t\t train_labels_status shape: {}".format(test_labels_status.shape, train_labels_status.shape))

# Distinct conditions and machine types found in the test split.
print("\nNumber of Status: {}".format(len(np.unique(test_labels_status))))
print("Conditions: {}".format(np.unique(test_labels_status)))
print("\nNumber of machine type: {}".format(len(np.unique(test_labels_type))))
print("Machine Type: {}".format(np.unique(test_labels_type)))

# Release the large objects once everything is on disk.
del mels

del status, data_split, file_path
del test_data, train_data, train_labels_type, train_labels_status, test_labels_type, test_labels_status
    

