In [12]:
import glob
import os, sys

import librosa
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

In [22]:
# Root directory of the recordings (machine-specific absolute path).
data_path = "/home/ubuntu/anudeep/machine_sound/"
# glob.glob returns matches in arbitrary order; sort them so that a given
# dataset index refers to the same file on every run.
paths = sorted(glob.glob(data_path+"0_dB_fan/*/*/*/*"))

In [31]:
from tqdm import trange

class MIMII(Dataset):
    """In-memory dataset of log-mel spectrograms built from audio file paths.

    Every file is decoded and converted once, in ``__init__``; indexing
    afterwards only wraps the cached array and label in tensors.

    Labels are derived from the file path: 1 if it contains "abnormal",
    0 if it contains "normal", and -1 otherwise.
    """

    def __init__(self, data_paths):
        """Convert every file in ``data_paths`` to a spectrogram and cache it.

        Args:
            data_paths: iterable of audio file paths readable by
                ``librosa.load``.
        """
        self.n_mels = 64
        self.frames = 5  # not read anywhere in this class
        self.n_fft = 2048
        self.hop_length = 512
        self.power = 2.0

        # Keep our own copy of the path list so later changes by the caller
        # cannot desynchronise it from the cached spectrograms.
        self.data_paths = list(data_paths)
        self.spectrograms = []
        self.labels = []

        # Convert audio to spectrograms, reporting progress as we go.
        total = len(self.data_paths)
        t = trange(total, desc='Converting audio files to spectrograms', leave=True)
        for index in t:
            # 1-based so the text agrees with the bar's own counter.
            t.set_description("Converting file no. %i of %i" % (index + 1, total))

            wav_file_path = self.data_paths[index]
            self.spectrograms.append(self.convert_to_spectrogram(wav_file_path))
            self.labels.append(self._label_from_path(wav_file_path))

    @staticmethod
    def _label_from_path(wav_file_path):
        """Return 1 for an "abnormal" path, 0 for a "normal" one, else -1."""
        # "normal" is a substring of "abnormal", so "abnormal" must be
        # tested first.
        if "abnormal" in wav_file_path:
            return 1
        if "normal" in wav_file_path:
            return 0
        return -1

    def __getitem__(self, index):
        """Return ``(spectrogram, label)`` for sample ``index`` as tensors.

        Returns:
            A float32 tensor with the same axis order as the cached array,
            and a scalar int64 tensor holding the label.
        """
        # torch.as_tensor may share memory with the cached array; do not
        # modify the returned tensor in place.
        spectrogram = torch.as_tensor(self.spectrograms[index], dtype=torch.float32)
        label = torch.tensor(self.labels[index], dtype=torch.long)
        return spectrogram, label

    def __len__(self):
        """Return the number of samples that were converted and cached."""
        return len(self.spectrograms)

    def convert_to_spectrogram(self, wav_file_path):
        """Load one audio file and return its per-channel log-mel spectrograms.

        Args:
            wav_file_path: path of the audio file to convert.

        Returns:
            A numpy array with one dB-scaled mel spectrogram per audio
            channel, stacked along the first axis.
        """
        signal, sampling_rate = self.load_sound_file(wav_file_path)

        # A single-channel file loads as a 1-D array; promote it to
        # (1, n_samples) so the loop below iterates over channels rather
        # than over individual samples.
        signal = np.atleast_2d(signal)

        db_mels = []
        for channel_signal in signal:
            # The audio is passed as ``y=`` because recent librosa releases
            # no longer accept it positionally.
            mel = librosa.feature.melspectrogram(
                y=channel_signal,
                sr=sampling_rate,
                n_fft=self.n_fft,
                hop_length=self.hop_length,
                n_mels=self.n_mels,
                power=self.power,
            )
            # Each channel is scaled relative to its own maximum.
            db_mels.append(librosa.power_to_db(mel, ref=np.max))

        return np.array(db_mels)

    def load_sound_file(self, wav_name, mono=False, channel=0):
        """Load an audio file at its native sampling rate.

        Args:
            wav_name: path of the audio file.
            mono: forwarded to ``librosa.load``; when False the channels
                are kept separate.
            channel: currently unused; kept so existing callers that pass
                it keep working.

        Returns:
            ``(signal, sampling_rate)`` where ``signal`` is a numpy array.
        """
        # sr=None keeps the file's own sampling rate instead of resampling.
        multi_channel_data, sampling_rate = librosa.load(wav_name, sr=None, mono=mono)
        return np.asarray(multi_channel_data), sampling_rate
        

In [None]:
# Build the dataset; every file in `paths` is converted during construction.
a = MIMII(data_paths=paths)

Converting file no. 714 of 5550:  13%|█▎        | 715/5550 [02:26<21:07,  3.81it/s]