In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import pyarrow
import pyarrow.parquet as pq
import itertools

import librosa
from scipy.signal import stft
from scipy import signal
from scipy.fft import rfft
import pywt

from tqdm import tqdm
import os
import gc
import time
from scipy.io import loadmat

# Preparation

In [44]:
# Folder holding the fault recordings; its entries are kept in `speeds`.
fault_dir = 'CWRU/12DriveEndFault/'
speeds = os.listdir(fault_dir)

# faults[i] is the list of file names found under the i-th entry of `speeds`.
faults = [os.listdir(fault_dir + speed_name) for speed_name in speeds]

# One output folder per file of the first speed folder, named after the file
# with its last four characters removed.
for fault_file in faults[0]:
    directory = f'Conditions/BA/{fault_file[:-4]}'
    if os.path.exists(directory):
        continue
    os.makedirs(directory)

In [None]:
# Convert every recording to parquet, keeping only the variables whose name
# ends with 'BA_time'. Output layout: Conditions/BA/<file stem>/<speed>.parquet
#
# Each speed folder is paired with its own file listing (faults[i] belongs to
# speeds[i]) rather than reusing the listing of the first speed folder.
for s, speed_files in zip(speeds, faults):
    for f in speed_files:
        mat_data = loadmat('CWRU/12DriveEndFault/' + s + '/' + f)

        # Skip the '__header__'-style metadata keys; arrays with two or more
        # dimensions are flattened to 1-D so they can become DataFrame columns.
        data_dict = {
            key: (value.flatten() if value.ndim >= 2 else value)
            for key, value in mat_data.items()
            if not key.startswith('__') and key.endswith('BA_time')
        }
        if not data_dict:
            # Nothing to export for this recording.
            continue

        # The whole dict goes into a single frame, so the file is written once
        # per recording (not once per key, which rewrote the identical file).
        out_dir = 'Conditions/BA/' + f[:-4]
        os.makedirs(out_dir, exist_ok=True)
        table = pyarrow.Table.from_pandas(pd.DataFrame(data_dict))
        pq.write_table(table, out_dir + '/' + s + '.parquet')

In [22]:
# Folder holding the baseline recordings. NOTE: this reassigns `speeds`.
normal_dir = 'CWRU/NormalBaseline/'
speeds = os.listdir(normal_dir)

# normals[i] is the list of file names found under the i-th entry of `speeds`.
normals = [os.listdir(normal_dir + speed_name) for speed_name in speeds]

# One output folder per file of the first speed folder, named after the file
# with its last four characters removed.
for normal_file in normals[0]:
    directory = f'Conditions/{normal_file[:-4]}'
    if os.path.exists(directory):
        continue
    os.makedirs(directory)

In [29]:
# Export every variable whose name ends with 'DE_time' from the baseline
# recordings, one parquet file per variable:
# Conditions/<file stem>/<variable>_<speed>.parquet
for speed_name in speeds:
    for normal_file in normals[0]:
        mat_data = loadmat(f'CWRU/NormalBaseline/{speed_name}/{normal_file}')

        data_dict = {}
        for key, value in mat_data.items():
            # Ignore metadata keys and every channel other than '*DE_time'.
            if key.startswith('__') or not key.endswith('DE_time'):
                continue
            # Arrays with two or more dimensions are flattened to 1-D.
            data_dict[key] = value.flatten() if value.ndim >= 2 else value

        for key, series in data_dict.items():
            output_filename = f'Conditions/{normal_file[:-4]}/{key}_{speed_name}.parquet'
            table = pyarrow.Table.from_pandas(pd.DataFrame(series))
            pq.write_table(table, output_filename)

In [31]:
# Read one of the parquet files under Conditions/Normal back into `data`.
data = pd.read_parquet("Conditions/Normal/X097_DE_time_1797.parquet")

In [None]:
# Convert every fault recording to parquet, keeping only the variables whose
# name ends with 'FE_time'. Output layout: Conditions/FE/<file stem>/<speed>.parquet
#
# `speeds` is re-listed from the fault folder here because an earlier cell
# reassigns it from the NormalBaseline listing.
speeds = os.listdir('CWRU/12DriveEndFault/')
faults = [os.listdir('CWRU/12DriveEndFault/' + s) for s in speeds]

# Each speed folder is paired with its own file listing.
for s, speed_files in zip(speeds, faults):
    for f in speed_files:
        mat_data = loadmat('CWRU/12DriveEndFault/' + s + '/' + f)

        # Skip the '__header__'-style metadata keys; arrays with two or more
        # dimensions are flattened to 1-D so they can become DataFrame columns.
        data_dict = {
            key: (value.flatten() if value.ndim >= 2 else value)
            for key, value in mat_data.items()
            if not key.startswith('__') and key.endswith('FE_time')
        }
        if not data_dict:
            # Nothing to export for this recording.
            continue

        # Create the folder that is actually written to. Previously the
        # folders were created under Conditions/DE/ while the files were
        # written under Conditions/FE/.
        out_dir = 'Conditions/FE/' + f[:-4]
        os.makedirs(out_dir, exist_ok=True)

        # All selected variables go into one frame, written once per recording.
        table = pyarrow.Table.from_pandas(pd.DataFrame(data_dict))
        pq.write_table(table, out_dir + '/' + s + '.parquet')

# Functions

In [48]:
def read(path=''):
    """Load a parquet file and expose its first column under the name "Value".

    The number of rows is printed as a side effect and the resulting
    DataFrame is returned.
    """
    frame = pd.read_parquet(path)
    first_column = frame.columns[0]
    frame = frame.rename(columns={first_column: "Value"})
    print(len(frame))
    return frame
    
def generate_spectr_coeff(data, kind="mfcc", period=10):
    """Cut a signal into fixed-length segments and compute one coefficient array per segment.

    Parameters
    ----------
    data : DataFrame-like
        Must support ``len()`` and expose a ``Value`` column that can be
        sliced with ``.iloc`` (``read`` returns such a frame).
    kind : {"mfcc", "stft", "cwt", "fft"}
        Transform applied to every segment.
    period : int
        Segment length is ``period * 120`` samples.

    Returns
    -------
    list
        One array per processed segment:
        * "mfcc": MFCCs stacked with their delta coefficients along axis 0;
        * "stft": STFT magnitude in dB;
        * "cwt":  Morlet CWT coefficients for scales 1..127;
        * "fft":  magnitude of the real FFT without the first (DC) bin.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported transforms.
    """
    supported = ("mfcc", "stft", "cwt", "fft")
    if kind not in supported:
        raise ValueError(f"Unknown kind {kind!r}; expected one of {supported}")

    # Params
    sample_rate = 2
    frame_size = 256
    hop = 32
    n_coeff = 20
    lifter = 0
    n_mels = 20
    dct = 2
    win_length = 32
    wind = 'tukey'

    step = period * 120
    split = [data.Value.iloc[i:i + step].tolist() for i in range(0, len(data), step)]

    coeff = []

    ############
    ### MFCC ###
    if kind == "mfcc":
        # NOTE: the last chunk is always skipped for MFCC (it is normally a
        # short remainder); the other transforms process every chunk.
        for segment in tqdm(split[:-1]):
            coef = librosa.feature.mfcc(y=np.array(segment), sr=sample_rate, lifter=lifter, n_fft=frame_size, hop_length=hop, win_length=win_length, n_mfcc=n_coeff, n_mels=n_mels, dct_type=dct, window=wind)
            # Delta coefficients of the MFCCs
            deltas = librosa.feature.delta(coef)
            # Stack MFCCs and deltas into a single feature matrix
            coeff.append(np.concatenate((coef, deltas), axis=0))

    ############
    ### STFT ###
    elif kind == "stft":
        # Length of each STFT window in samples, with 50% overlap.
        nperseg = sample_rate * 50
        noverlap = nperseg // 2
        for segment in tqdm(split):
            _, _, Zxx = stft(segment, fs=sample_rate, window=wind, nperseg=nperseg, noverlap=noverlap)
            # Magnitude in dB. eps is added to the magnitude (not to the
            # complex value) so the argument of log10 can never be zero.
            coeff.append(20 * np.log10(np.abs(Zxx) + np.finfo(float).eps))

    ###########
    ### CWT ###
    elif kind == "cwt":
        scales = np.arange(1, 128)
        for segment in tqdm(split):
            coef, _ = pywt.cwt(segment, scales, "morl")
            # Keep the result: previously it was computed and then discarded.
            coeff.append(coef)

    ###########
    ### FFT ###
    else:
        for segment in tqdm(split):
            fft_sig = rfft(segment, axis=-1, norm='ortho', overwrite_x=False)
            coeff.append(np.abs(fft_sig[1:]))

    del split
    if coeff:
        print(f'Coefficient computed {np.max(coeff[0])}, {np.min(coeff[0])}')
    else:
        # Too little input to produce a single segment for this transform.
        print('Coefficient computed: no segments produced')
    return coeff

def generate_spectr(coeff, condition, folder_name="", kind="mfcc"):
    """Render coefficient arrays as PNG images under ``<folder_name>/<condition>``.

    Parameters
    ----------
    coeff : iterable of 2-D arrays
        One array per image, as returned by ``generate_spectr_coeff``.
    condition : str
        Sub-folder name the images are written to.
    folder_name : str
        Parent folder of ``condition``.
    kind : str
        Only "mfcc" is rendered; for any other value the output folder is
        created but no image is written.

    Returns
    -------
    int
        The value returned by ``gc.collect()``.
    """
    # `librosa.display` is a submodule; import it explicitly rather than
    # relying on `import librosa` to expose it (older librosa releases do not).
    import librosa.display

    directory = folder_name+'/'+condition
    os.makedirs(directory, exist_ok=True)
    print(f"Saving images in {directory}")
    if kind == "mfcc":
        for i, co in enumerate(tqdm(coeff)):
            fig, ax = plt.subplots(figsize=(5,5))
            librosa.display.specshow(co, cmap='inferno', vmin=-10, ax=ax)
            fig.savefig(f'{directory}/_{i}.png', bbox_inches='tight', pad_inches=0)
            # Close each figure right away so open figures do not accumulate.
            plt.close(fig)
    else:
        # Make the no-op visible instead of silently producing an empty folder.
        print(f"No renderer implemented for kind={kind!r}; no images were written")
    return gc.collect()

def generate_data(data_folder:str='', condition='', ouput_name='Images', kind="mfcc", location='DE'):
    """Build the images for one condition folder.

    Reads every file under ``<data_folder>/<location>/<condition>``,
    concatenates them into one signal, computes the ``kind`` coefficients and
    writes the images to ``<ouput_name>_<kind>/<location>/<condition>``.

    Parameters
    ----------
    data_folder : str
        Root folder of the parquet recordings.
    condition : str
        Name of the condition sub-folder to process.
    ouput_name : str
        Prefix of the image output folder (parameter name kept as-is for
        existing callers).
    kind : str
        Transform passed on to ``generate_spectr_coeff`` / ``generate_spectr``.
    location : str
        Sub-folder between ``data_folder`` and ``condition``.
    """
    # Use the `condition` argument; the function previously read a global
    # named `d` that only existed because the calling loop happened to use it.
    source_dir = f'{data_folder}/{location}/{condition}'
    # Sorted so the concatenation order does not depend on os.listdir ordering.
    records = sorted(os.listdir(source_dir))
    print(source_dir)
    record_data = []
    for r in records:
        record_data.append(read(f'{source_dir}/{r}'))
    print(len(record_data))
    record_data = pd.concat(record_data).reset_index(drop=True)
    print(len(record_data))
    coeff = generate_spectr_coeff(record_data, kind=kind)
    # Release the raw signal before rendering the images.
    del record_data
    generate_spectr(coeff, condition=condition, folder_name=f'{ouput_name}_{kind}/{location}', kind=kind)
    del coeff, records
    gc.collect()

In [49]:
# Condition folders under Conditions/BA. os.listdir returns entries in
# arbitrary order, so sort them: the following cells select batches by position.
directory = sorted(os.listdir('Conditions/BA'))

In [50]:
# Batch 1: generate images for the first four entries of `directory`
# (location 'BA'; `kind` is left at its default).
for d in directory[0:4]:
    generate_data(data_folder=f'Conditions', condition=d, ouput_name='Images', location='BA')

Conditions/BA/0.007-Ball
121556
121556
121410
122571
4
487093


100%|███████████████████████████████████████████████████████████████████████████████| 405/405 [00:00<00:00, 426.63it/s]


Coefficient computed 45.62387572272087, -25.623604313146128
Saving images in Images_mfcc/BA/0.007-Ball


100%|████████████████████████████████████████████████████████████████████████████████| 405/405 [00:22<00:00, 17.71it/s]


Conditions/BA/0.007-InnerRace
122917
122136
121991
121265
4
488309


100%|███████████████████████████████████████████████████████████████████████████████| 406/406 [00:01<00:00, 259.25it/s]


Coefficient computed 52.89669042367298, -31.560188835173065
Saving images in Images_mfcc/BA/0.007-InnerRace


100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [00:23<00:00, 17.30it/s]


Conditions/BA/0.007-OuterRace12
122136
122281
121991
122281
4
488689


100%|███████████████████████████████████████████████████████████████████████████████| 407/407 [00:00<00:00, 434.24it/s]


Coefficient computed 39.1624085737935, -22.84398174967926
Saving images in Images_mfcc/BA/0.007-OuterRace12


100%|████████████████████████████████████████████████████████████████████████████████| 407/407 [00:24<00:00, 16.78it/s]


Conditions/BA/0.007-OuterRace3
122281
121556
121846
122281
4
487964


100%|███████████████████████████████████████████████████████████████████████████████| 406/406 [00:01<00:00, 325.66it/s]


Coefficient computed 48.64139139303589, -26.15474407117673
Saving images in Images_mfcc/BA/0.007-OuterRace3


100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [00:27<00:00, 14.61it/s]


In [51]:
# Force a garbage-collection pass between batches; the cell output is the value returned by gc.collect().
gc.collect()

3319456

In [52]:
# Batch 2: generate images for entries 4-7 of `directory`
# (location 'BA'; `kind` is left at its default).
for d in directory[4:8]:
    generate_data(data_folder=f'Conditions', condition=d, ouput_name='Images', location='BA')

Conditions/BA/0.007-OuterRace6
122571
121410
122426
121991
4
488398


100%|███████████████████████████████████████████████████████████████████████████████| 406/406 [00:00<00:00, 408.11it/s]


Coefficient computed 50.53325495838148, -31.96074552658546
Saving images in Images_mfcc/BA/0.007-OuterRace6


100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [00:21<00:00, 18.50it/s]


Conditions/BA/0.014-Ball
122136
121991
122136
121846
4
488109


100%|███████████████████████████████████████████████████████████████████████████████| 406/406 [00:00<00:00, 412.54it/s]


Coefficient computed 45.14663745143722, -24.420568051490754
Saving images in Images_mfcc/BA/0.014-Ball


100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [00:21<00:00, 18.88it/s]


Conditions/BA/0.014-InnerRace
121701
121846
121846
121846
4
487239


100%|███████████████████████████████████████████████████████████████████████████████| 406/406 [00:01<00:00, 363.87it/s]


Coefficient computed 33.66853610026884, -26.68821380616653
Saving images in Images_mfcc/BA/0.014-InnerRace


100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [00:21<00:00, 19.20it/s]


Conditions/BA/0.014-OuterRace6
121991
121846
122136
121846
4
487819


100%|███████████████████████████████████████████████████████████████████████████████| 406/406 [00:01<00:00, 352.07it/s]


Coefficient computed 40.76615401662785, -25.013669842669522
Saving images in Images_mfcc/BA/0.014-OuterRace6


100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [00:21<00:00, 18.80it/s]


In [53]:
# Force a garbage-collection pass between batches; the cell output is the value returned by gc.collect().
gc.collect()

3319456

In [54]:
# Batch 3: generate images for entries 8-11 of `directory`
# (location 'BA'; `kind` is left at its default).
for d in directory[8:12]:
    generate_data(data_folder=f'Conditions', condition=d, ouput_name='Images', location='BA')

Conditions/BA/0.021-Ball
122136
122136
121701
121991
4
487964


100%|███████████████████████████████████████████████████████████████████████████████| 406/406 [00:00<00:00, 431.19it/s]


Coefficient computed 40.03463151806926, -21.27301841691733
Saving images in Images_mfcc/BA/0.021-Ball


100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [00:19<00:00, 20.57it/s]


Conditions/BA/0.021-InnerRace
121991
121846
121556
122136
4
487529


100%|███████████████████████████████████████████████████████████████████████████████| 406/406 [00:01<00:00, 386.00it/s]


Coefficient computed 45.72566432055882, -19.775027959317974
Saving images in Images_mfcc/BA/0.021-InnerRace


100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [00:21<00:00, 18.64it/s]


Conditions/BA/0.021-OuterRace12
121701
122716
122426
121846
4
488689


100%|███████████████████████████████████████████████████████████████████████████████| 407/407 [00:00<00:00, 429.87it/s]


Coefficient computed 47.52352733348701, -27.01387684228397
Saving images in Images_mfcc/BA/0.021-OuterRace12


100%|████████████████████████████████████████████████████████████████████████████████| 407/407 [00:18<00:00, 22.01it/s]


Conditions/BA/0.021-OuterRace3
122136
122281
121991
121701
4
488109


100%|███████████████████████████████████████████████████████████████████████████████| 406/406 [00:00<00:00, 437.40it/s]


Coefficient computed 57.745868719800896, -17.664197176546644
Saving images in Images_mfcc/BA/0.021-OuterRace3


100%|████████████████████████████████████████████████████████████████████████████████| 406/406 [00:18<00:00, 21.39it/s]


In [55]:
# Force a garbage-collection pass between batches; the cell output is the value returned by gc.collect().
gc.collect()

3321500

In [56]:
# Batch 4: generate images for every remaining entry of `directory` from index 12 on
# (location 'BA'; `kind` is left at its default).
for d in directory[12:]:
    generate_data(data_folder=f'Conditions', condition=d, ouput_name='Images', location='BA')

Conditions/BA/0.021-OuterRace6
121991
122281
121991
122426
4
488689


100%|███████████████████████████████████████████████████████████████████████████████| 407/407 [00:00<00:00, 456.99it/s]


Coefficient computed 49.45102114236879, -33.10648539342995
Saving images in Images_mfcc/BA/0.021-OuterRace6


100%|████████████████████████████████████████████████████████████████████████████████| 407/407 [00:18<00:00, 22.44it/s]


In [58]:
# Final garbage-collection pass after the last batch; the cell output is the value returned by gc.collect().
gc.collect()

0

# Image Splitting

In [59]:
import shutil
import random

In [60]:
# Set the paths
data_folder = 'Images_mfcc/BA'
train_folder = 'Model_Image/train'
val_folder = 'Model_Image/val'
test_folder = 'Model_Image/test'

train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15  # informational: the test split receives whatever is left after train and val

os.makedirs(train_folder, exist_ok=True)
os.makedirs(val_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# A dedicated, seeded generator plus sorted listings make the split
# reproducible: re-running the cell assigns every image to the same split
# instead of scattering copies of one image across train/val/test.
# NOTE: files already present in the target folders from earlier runs are not removed.
split_rng = random.Random(42)

# Loop through each label subfolder
for label_folder in sorted(os.listdir(data_folder)):
    label_path = os.path.join(data_folder, label_folder)
    if not os.path.isdir(label_path):
        continue

    images = sorted(os.listdir(label_path))
    split_rng.shuffle(images)
    num_images = len(images)

    # Calculate split indices
    train_split = int(train_ratio * num_images)
    val_split = train_split + int(val_ratio * num_images)

    # Pair every target folder with its share of the shuffled images
    partitions = (
        (train_folder, images[:train_split]),
        (val_folder, images[train_split:val_split]),
        (test_folder, images[val_split:]),
    )

    # Copy (not move) the images; the originals stay in data_folder
    for split_folder, split_images in partitions:
        dest_dir = os.path.join(split_folder, label_folder)
        # Created even when the share is empty, so every split has every label folder
        os.makedirs(dest_dir, exist_ok=True)
        for img in split_images:
            shutil.copy(os.path.join(label_path, img), os.path.join(dest_dir, img))

In [61]:
# Count the files per split; the label list is taken from the train folder.
labels = os.listdir('Model_Image/train')
train_samples = sum(len(os.listdir('Model_Image/train/' + label)) for label in labels)
val_samples = sum(len(os.listdir('Model_Image/val/' + label)) for label in labels)
test_samples = sum(len(os.listdir('Model_Image/test/' + label)) for label in labels)

In [62]:
# Report the number of files found in each split.
split_counts = (
    ("train_samples : ", train_samples),
    ("val_samples : ", val_samples),
    ("test_samples : ", test_samples),
)
for caption, count in split_counts:
    print(caption, count)

train_samples :  12909
val_samples :  2741
test_samples :  2815
