In [35]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import windows
from scipy.fftpack import rfft
from tqdm import tqdm

In [36]:
# 將音訊檔轉換為MFCC特徵的函式
def wav2mfcc(file_path, max_pad_len=11):
    wave, sr = librosa.load(file_path, mono=True, sr=None)
    wave = wave[::3]
    wave = np.array(wave, order='F')
    
    # 明確地以關鍵字形式傳遞參數 y 和 sr
    mfcc = librosa.feature.mfcc(y=wave, sr=8000)
    
    if mfcc.shape[1] > max_pad_len:
        mfcc = mfcc[:, :max_pad_len]
    else:
        pad_width = max_pad_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    
    return mfcc


In [37]:
def stft(x, n_fft, hop_length, window):
    num_frames = 1 + (len(x) - n_fft) // hop_length
    frames = np.lib.stride_tricks.as_strided(x, shape=(n_fft, num_frames),
                                             strides=(x.itemsize, hop_length*x.itemsize))
    return rfft(frames * window[:, None], n=n_fft, axis=0)

In [39]:
def load_wav_files(directory, target_sr=8000):
    wav_files = []
    for root, dirs, files in os.walk(directory):
        with tqdm(total=len(files), desc='Loading files', unit='file') as pbar:
            for file in files:
                if file.endswith(".wav") and file != 'all_channel.wav':
                    file_path = os.path.join(root, file)
                    y, sr = librosa.load(file_path, sr=None)
                    if sr != target_sr:
                        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
                    path = root.split(os.path.sep)
                    filename = f'{path[-1]}_{file}'
                    pbar.set_postfix(file=filename, )
                    wav_files.append((y, filename))
                pbar.update(1)
    return wav_files


In [40]:
data = load_wav_files('training_data/leak')

Loading files: 100%|██████████| 2076/2076 [01:24<00:00, 24.50file/s, file=training_data/leak_ITRI0011003_rate_70_2023-07-17T170135.wav]      


In [43]:
import IPython.display as display

display.Audio(data[2][0], rate=8000)

In [45]:
def convert_to_mel_spectrogram(audio, n_fft, hop_length, n_mels, sr=8000):
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length,
                                                     n_mels=n_mels)
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
    return mel_spectrogram_db

In [46]:
def plot_mel_spectrogram(audio_data, sample_rate, filename=None, save_dir='images/mel_spectrograms'):
    # 確保音頻數據是單聲道的
    if len(audio_data.shape) > 1:
        audio_data = audio_data[:, 0]

    # 將音頻數據轉換為浮點型並歸一化
    if audio_data.dtype.kind in 'iu':
        audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
    elif audio_data.dtype.kind == 'f':
        audio_data = audio_data.astype(np.float32)
        max_value = np.max(np.abs(audio_data))
        if max_value > 1.0:
            audio_data /= max_value

    # 設置STFT參數
    n_fft = sample_rate // 5
    hop_length = n_fft // 4  # 75% 重疊

    # 應用漢寧窗
    window = windows.hann(n_fft, sym=False)

    # 執行STFT
    stft_result = stft(audio_data, n_fft, hop_length, window)

    # 計算功率譜
    power_spectrum = np.abs(stft_result) ** 2

    # 創建梅爾濾波器組
    n_mels = 128
    mel_filterbank = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mels)

    # 將功率譜轉換為梅爾頻譜
    mel_spectrogram = np.dot(mel_filterbank, power_spectrum)

    # 轉換為分貝刻度
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # 創建時間軸
    time = np.arange(mel_spectrogram.shape[1]) * hop_length / sample_rate

    # 繪製梅爾頻譜圖
    plt.figure(figsize=(15, 10))
    plt.imshow(mel_spectrogram_db, aspect='auto', origin='lower',
               extent=[time.min(), time.max(), 0, n_mels],
               cmap='jet')

    plt.colorbar(label='amplitude (dB)')
    plt.xlabel('time (sec)')
    plt.ylabel('mel frequency')
    plt.title(f'Mel Spectrogram of Audio Signal (n_fft = {n_fft}, n_mels = {n_mels})')
    plt.savefig(f'{save_dir}/{filename}_mel_spectrogram.png')


In [50]:
mel = convert_to_mel_spectrogram(data[2][0], n_fft=2048, hop_length=512, n_mels=256)
plot_mel_spectrogram(mel, sample_rate=8000, filename='leak', save_dir='images')

ValueError: negative dimensions are not allowed