# <center>Study of the Acoustic Signatures of the UrbanSound8K Classes</center>

## Libraries Import

In [None]:
import numpy as np
import pandas as pd
import psycopg2
import os
import matplotlib.pyplot as plt
import math
import IPython
import datetime
import librosa.display
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as transforms

## Import of the Metadata File

In [None]:
# Load the UrbanSound8K annotation table; the cells below read its
# "slice_file_name", "fold", "class" and "salience" columns
metadata_csv_path = "dataset/UrbanSound8K.csv"
metadata = pd.read_csv(metadata_csv_path)
# Bare expression so that the notebook renders the table as the cell output
metadata

## Torchaudio Transforms

### Spectrogram

In [None]:
# Keyword arguments of the STFT spectrogram transform, gathered in one place.
# The "(Default: ...)" notes quote the defaults of torchaudio.transforms.Spectrogram.
spectrogram_kwargs = dict(
    # Size of FFT, creates n_fft // 2 + 1 bins. (Default: 400)
    n_fft=256,
    # Window size. (Default: n_fft)
    win_length=256,
    # Length of hop between STFT windows. (Default: win_length // 2)
    hop_length=128,
    # Two sided padding of signal. (Default: 0)
    pad=0,
    # Function creating the window tensor applied to each frame. (Default: torch.hann_window)
    window_fn=torch.hann_window,
    # Exponent for the magnitude spectrogram (must be > 0), e.g. 1 for energy, 2 for power. (Default: 2)
    power=2,
    # Whether to normalize by magnitude after stft. (Default: False)
    normalized=True,
    # Arguments for window function. (Default: None)
    wkwargs=None,
    # Whether to pad the waveform on both sides so that the t-th frame is
    # centered at time t x hop_length. (Default: True)
    center=False,
    # Padding method used when center is True. (Default: "reflect")
    pad_mode="reflect",
    # Whether to return half of the results to avoid redundancy. (Default: True)
    onesided=True,
    # Selects native complex dtype vs. a real tensor with an extra (real, imag)
    # dimension (see torch.view_as_real). Only effective when power=None; it is
    # ignored here because a numeric power yields a real-valued spectrogram.
    return_complex=False,
)

spectrogram_transform = torchaudio.transforms.Spectrogram(**spectrogram_kwargs)

### Mel-Spectrogram

In [None]:
# Mel-scaled power spectrogram transform.
# The "(Default: ...)" notes quote the defaults of torchaudio.transforms.MelSpectrogram.
mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(
                    # Sample rate of audio signal. (Default: 16000)
                    sample_rate = 22050,
                    # Size of FFT, creates n_fft // 2 + 1 bins. (Default: 400)
                    n_fft = 256,
                    # Window size. (Default: n_fft)
                    win_length = 256,
                    # Length of hop between STFT windows. (Default: win_length // 2)
                    hop_length = 128,
                    # Minimum frequency. (Default: 0.)
                    f_min = 0.,
                    # Maximum frequency. (Default: None)
                    # None lets torchaudio use the Nyquist frequency (sample_rate // 2 = 11025 Hz).
                    # The previous value of 20000 Hz was above Nyquist, so the upper mel filters
                    # covered frequencies that do not exist in the STFT and stayed empty.
                    f_max = None,
                    # Two sided padding of signal. (Default: 0)
                    pad = 0,
                    # Number of mel filterbanks. (Default: 128)
                    # NOTE(review): 128 mel bands for only n_fft // 2 + 1 = 129 frequency bins is
                    # likely to leave some low-frequency filters empty as well - confirm whether
                    # n_mels should be lowered or n_fft raised.
                    n_mels = 128,
                    # A function to create a window tensor that is applied/multiplied to each frame/window. (Default: torch.hann_window)
                    window_fn = torch.hann_window,
                    # Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: 2)
                    power = 2,
                    # Whether to normalize by magnitude after stft. (Default: False)
                    normalized = True,
                    # Arguments for window function. (Default: None)
                    wkwargs = None,
                    # Whether to pad waveform on both sides so that the t-th frame is centered at time t x hop_length (Default: True)
                    center = True,
                    # Controls the padding method used when center is True. (Default: "reflect")
                    pad_mode = "reflect",
                    # Controls whether to return half of results to avoid redundancy. (Default: True)
                    onesided = True,
                    # If 'slaney', divide the triangular mel weights by the width of the mel band (area normalization). (Default: None)
                    norm = None,
                    # Scale to use: htk or slaney. (Default: "htk")
                    mel_scale = "htk"
                    )

### MFCC

In [None]:
# Keyword arguments of the MFCC transform.
# The "(Default: ...)" notes quote the defaults of torchaudio.transforms.MFCC.
mfcc_kwargs = dict(
    # Sample rate of audio signal. (Default: 16000)
    sample_rate=22050,
    # Number of mfc coefficients to retain. (Default: 40)
    n_mfcc=40,
    # Type of DCT (discrete cosine transform) to use. (Default: 2)
    dct_type=2,
    # Norm to use. (Default: "ortho")
    norm="ortho",
    # Whether to use log-mel spectrograms instead of db-scaled. (Default: False)
    log_mels=True,
    # Arguments for MelSpectrogram. (Default: None)
    melkwargs=None,
)

mfcc_transform = torchaudio.transforms.MFCC(**mfcc_kwargs)

### Amplitude to dB

In [None]:
# Conversion of a spectrogram to the decibel scale.
#   stype:  scale of the input tensor, "power" or "magnitude" (the power being the
#           elementwise square of the magnitude). (Default: "power")
#   top_db: minimum negative cut-off in decibels; a reasonable number is 80. (Default: None)
amplitude_to_db_kwargs = {"stype": "power", "top_db": None}
amplitude_to_db_transform = torchaudio.transforms.AmplitudeToDB(**amplitude_to_db_kwargs)

## Audio Processing Parameters

In [None]:
# Sampling rate (Hz) to which every audio is resampled
target_sample_rate = 22050
# Duration (s) to which every audio is cut or padded
target_length = 4
# Resulting number of samples of a pre-processed audio
n_samples = target_sample_rate * target_length

## Study of a Class

As a reminder, the classes present in the UrbanSound8K dataset are:
- air_conditioner
- car_horn
- children_playing
- dog_bark
- drilling
- engine_idling
- gun_shot
- jackhammer
- siren
- street_music

### Selection of Multiple Audios From One Class

In [None]:
# Criteria used to choose the audios to study
label = "dog_bark"
salience = 1
n_audios = 10

# Keep only the annotations that match both the class and the salience
has_label = metadata["class"] == label
has_salience = metadata["salience"] == salience
filtered_metadata = metadata.loc[has_label & has_salience]

# Randomly draw n_audios rows from the filtered annotations
selected_audios = filtered_metadata.sample(n=n_audios)
display(selected_audios)

In [None]:
# Dictionary mapping the path of each selected audio to its pre-processed
# temporal signal: mono, resampled to target_sample_rate and cut / right-padded
# to exactly n_samples samples
audios_signal = {}

for _, row in selected_audios.iterrows():
    # Audio files are stored as dataset/fold<fold>/<slice_file_name>
    audio_path = os.path.join("dataset", f"fold{row['fold']}", row["slice_file_name"])

    ### Load the audio and pre-process it
    # Load the audio signal as a (channels, samples) tensor with its native sample rate
    audio_signal, sr = torchaudio.load(audio_path)
    # Mix it down to mono if necessary
    if audio_signal.shape[0] > 1:
        audio_signal = torch.mean(audio_signal, dim=0, keepdim=True)
    # Resample it
    resample_transform = transforms.Resample(sr, target_sample_rate)
    audio_signal = resample_transform(audio_signal)
    # Cut if necessary (this branch previously read an undefined `event_signal`
    # and raised a NameError for any audio longer than n_samples)
    if audio_signal.shape[1] > n_samples:
        audio_signal = audio_signal[:, :n_samples]
    # Right pad with zeros if necessary
    if audio_signal.shape[1] < n_samples:
        n_missing_samples = n_samples - audio_signal.shape[1]
        last_dim_padding = (0, n_missing_samples)
        audio_signal = nn.functional.pad(audio_signal, last_dim_padding)

    audios_signal[audio_path] = audio_signal

### Temporal Representation

In [None]:
# Waveform of every pre-processed audio, one subplot per audio.
# The grid is sized from the dictionary that is actually iterated, and
# squeeze=False keeps `axs` two-dimensional so that indexing also works when a
# single audio is selected.
fig, axs = plt.subplots(len(audios_signal), 1, figsize=(15, 50), squeeze=False)

for ax, audio_signal in zip(axs[:, 0], audios_signal.values()):
    # Time stamp (in seconds) of each sample of the signal
    num_frames = audio_signal.shape[1]
    time_axis = torch.arange(0, num_frames) / target_sample_rate

    # Only the first channel is drawn
    ax.plot(time_axis, audio_signal[0], linewidth=1)
    ax.set_ylabel("Amplitude")
    ax.set_xlabel("Time (s)")
    # The x-range follows target_length instead of a hard-coded 4 seconds
    ax.set_xlim([0, target_length])
    ax.set_ylim([-1, 1])

plt.show()

In [None]:
# Spectrogram (in dB) of every pre-processed audio, one subplot per audio.
# squeeze=False keeps `axs` two-dimensional so that indexing also works when a
# single audio is selected.
fig, axs = plt.subplots(len(audios_signal), 1, figsize=(15, 50), squeeze=False)

for ax, audio_signal in zip(axs[:, 0], audios_signal.values()):
    # Power spectrogram of the first channel, converted to decibels
    spectrogram = torch.squeeze(spectrogram_transform(audio_signal)[0])
    spectrogram_db = amplitude_to_db_transform(spectrogram)

    # The rows are mapped onto 0 Hz .. target_sample_rate / 2 (Nyquist frequency).
    # The former frequency-axis computation was unused and read an undefined
    # `frequency_axis` (NameError), so it has been removed.
    ax.imshow(
        spectrogram_db,
        extent=[0, target_length, 0, target_sample_rate / 2],
        origin="lower",
        aspect="auto",
    )
    ax.set_ylabel("Frequency (Hz)")
    ax.set_xlabel("Time (s)")

plt.show()

### Mel-Spectrogram

In [None]:
# Mel-spectrogram of every pre-processed audio, one subplot per audio.
# squeeze=False keeps `axs` two-dimensional so that indexing also works when a
# single audio is selected.
fig, axs = plt.subplots(len(audios_signal), 1, figsize=(15, 50), squeeze=False)

for ax, audio_signal in zip(axs[:, 0], audios_signal.values()):
    # Mel-spectrogram of the first channel; its first dimension indexes the mel bands
    mel_spectrogram = torch.squeeze(mel_spectrogram_transform(audio_signal)[0])
    n_mel_bands = mel_spectrogram.shape[0]

    # The n_mel_bands rows span exactly [0, n_mel_bands] on the y-axis
    # (the previous upper bound of n_mel_bands + 1 shifted the band labels)
    ax.imshow(
        mel_spectrogram,
        extent=[0, target_length, 0, n_mel_bands],
        origin="lower",
        aspect="auto",
    )
    ax.set_ylabel("Mel Bands")
    ax.set_xlabel("Time (s)")

plt.show()

### MFCCs

In [None]:
# MFCCs of every pre-processed audio, one subplot per audio.
# squeeze=False keeps `axs` two-dimensional so that indexing also works when a
# single audio is selected.
fig, axs = plt.subplots(len(audios_signal), 1, figsize=(15, 50), squeeze=False)

for ax, audio_signal in zip(axs[:, 0], audios_signal.values()):
    # MFCCs of the first channel; the first dimension indexes the coefficients
    mfcc = torch.squeeze(mfcc_transform(audio_signal)[0])
    n_coefficients = mfcc.shape[0]

    # The n_coefficients rows span exactly [0, n_coefficients] on the y-axis
    # (the previous upper bound of n_coefficients + 1 shifted the labels)
    ax.imshow(
        mfcc,
        extent=[0, target_length, 0, n_coefficients],
        origin="lower",
        aspect="auto",
    )
    ax.set_ylabel("MFCC")
    ax.set_xlabel("Time (s)")

plt.show()

## Comparison of Multiple Audios From the Same Class

## Comparison of Classes

In [None]:
# Pick one random audio per class and store the path of its file
classes = metadata["class"].unique()
selected_audios = {}

for classe in classes:
    # Annotation row of one randomly drawn audio of the current class
    picked_row = metadata.loc[metadata["class"] == classe].sample(n=1).iloc[0]
    # Audio files are stored as dataset/fold<fold>/<slice_file_name>
    selected_audios[classe] = os.path.join(
        "dataset",
        f"fold{picked_row['fold']}",
        picked_row["slice_file_name"],
    )

In [None]:
# One row (sub-figure) per class showing, side by side, the spectrogram (dB),
# the mel-spectrogram and the MFCCs of the audio selected for that class
fig = plt.figure(figsize=(20,80), constrained_layout=True)
subfigs = fig.subfigures(len(selected_audios), 1)

for i, (classe, audio_path) in enumerate(selected_audios.items()):
    ### Load the audio and pre-process it
    # Load the audio signal as a (channels, samples) tensor with its native sample rate
    audio_signal, sr = torchaudio.load(audio_path)
    # Mix it down to mono if necessary
    if audio_signal.shape[0] > 1:
        audio_signal = torch.mean(audio_signal, dim=0, keepdim=True)
    # Resample it
    resample_transform = transforms.Resample(sr, target_sample_rate)
    audio_signal = resample_transform(audio_signal)
    # Cut if necessary (this branch previously read an undefined `event_signal`
    # and raised a NameError for any audio longer than n_samples)
    if audio_signal.shape[1] > n_samples:
        audio_signal = audio_signal[:, :n_samples]
    # Right pad with zeros if necessary
    if audio_signal.shape[1] < n_samples:
        n_missing_samples = n_samples - audio_signal.shape[1]
        last_dim_padding = (0, n_missing_samples)
        audio_signal = nn.functional.pad(audio_signal, last_dim_padding)

    ### Compute the Spectrogram
    spectrogram = torch.squeeze(spectrogram_transform(audio_signal)[0])
    spectrogram_db = amplitude_to_db_transform(spectrogram)

    ### Compute the Mel-Spectrogram
    mel_spectrogram = torch.squeeze(mel_spectrogram_transform(audio_signal)[0])

    ### Compute the MFCCs
    mfcc = torch.squeeze(mfcc_transform(audio_signal)[0])

    ### Plot the three representations of the current class
    subfig = subfigs[i]
    subfig.suptitle(classe, fontsize=16, fontweight="bold")
    axs = subfig.subplots(1, 3)
    # The dB-scaled spectrogram is shown (it was computed but never used before),
    # which matches the per-audio spectrogram plots
    axs[0].imshow(spectrogram_db, extent=[0, target_length, 0, target_sample_rate/2], origin="lower", aspect="auto")
    axs[0].set_title("Spectrogram")
    axs[0].set_xlabel("Time (s)")
    axs[0].set_ylabel("Frequency (Hz)")
    # The rows of an image span exactly [0, number of rows] on the y-axis
    # (the previous "+ 1" shifted the band / coefficient labels)
    axs[1].imshow(mel_spectrogram, extent=[0, target_length, 0, mel_spectrogram.shape[0]], origin="lower", aspect="auto")
    axs[1].set_title("Mel-Spectrogram")
    axs[1].set_xlabel("Time (s)")
    axs[1].set_ylabel("Mel Bands")
    axs[2].imshow(mfcc, extent=[0, target_length, 0, mfcc.shape[0]], origin="lower", aspect="auto")
    axs[2].set_title("MFCC")
    axs[2].set_xlabel("Time (s)")
    axs[2].set_ylabel("MFCC")
