# Preparing training data

In [1]:
#-----------------------------------------
# Prepare training data from Metadata file
#-----------------------------------------
import pandas as pd
from pathlib import Path

# Root folder of the UrbanSound8K dataset, looked up in the current working directory
download_path = Path.cwd() / 'UrbanSound8K'

# Load the metadata CSV into a DataFrame
metadata_file = download_path / 'metadata' / 'UrbanSound8k.csv'
df = pd.read_csv(metadata_file)
df.head()

# Build each clip's path as '/fold<fold>/<slice_file_name>' and keep only
# the path and class-label columns
df = df.assign(
    relative_path='/fold' + df['fold'].astype(str) + '/' + df['slice_file_name'].astype(str)
)[['relative_path', 'classID']]
df.head()

Unnamed: 0,relative_path,classID
0,/fold5/100032-3-0-0.wav,3
1,/fold5/100263-2-0-117.wav,2
2,/fold5/100263-2-0-121.wav,2
3,/fold5/100263-2-0-126.wav,2
4,/fold5/100263-2-0-137.wav,2


# Audio Preprocessing: Define Transforms

In [2]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

class AudioUtil():
    #----------------------------------------
    # Load an audio file, Return the signal as a tensor and the sample rate
    #----------------------------------------
    @staticmethod
    def open(audio_file):
        """Load an audio file.

        Args:
            audio_file: Location of the audio file, passed straight to
                ``torchaudio.load``.

        Returns:
            A ``(signal, sample_rate)`` tuple as returned by ``torchaudio.load``.
        """
        waveform, sample_rate = torchaudio.load(audio_file)
        return waveform, sample_rate

In [3]:
    #----------------------------------------
    # Convert the given audio to the desired number of channels
    #----------------------------------------
    @staticmethod
    def rechannel(aud, new_channel):
        """Convert the audio to the desired number of channels.

        Args:
            aud: ``(signal, sample_rate)`` tuple; the first dimension of
                ``signal`` is the channel dimension.
            new_channel: Target channel count (1 for mono, otherwise the
                signal is duplicated along the channel dimension).

        Returns:
            ``aud`` itself when the channel count already matches, otherwise
            a new ``(signal, sample_rate)`` tuple.
        """
        sig, sr = aud

        if sig.shape[0] == new_channel:
            # Nothing to do
            return aud

        if new_channel == 1:
            # Convert to mono by keeping only the first channel
            resig = sig[:1, :]
        else:
            # Convert from mono to stereo by duplicating the channel
            resig = torch.cat([sig, sig])

        return resig, sr

In [4]:
    #--------------------------------------
    # Since Resample applies to a single channel, we resample one channel at a time
    #--------------------------------------
    @staticmethod
    def resample(aud, newsr):
        """Resample the audio to the sample rate ``newsr``.

        Args:
            aud: ``(signal, sample_rate)`` tuple; the first dimension of
                ``signal`` is the channel dimension.
            newsr: Target sample rate.

        Returns:
            ``aud`` itself when it already has the target rate, otherwise a
            new ``(signal, newsr)`` tuple.
        """
        sig, sr = aud

        if sr == newsr:
            # Nothing to do
            return aud

        # Build the transform once and reuse it for every channel group
        # instead of constructing an identical transform per call site.
        resampler = torchaudio.transforms.Resample(sr, newsr)

        # Resample the first channel
        resig = resampler(sig[:1, :])
        if sig.shape[0] > 1:
            # Resample the remaining channel(s) and merge with the first
            retwo = resampler(sig[1:, :])
            resig = torch.cat([resig, retwo])

        return resig, newsr

In [1]:
    #----------------------------------------
    # Pad (or truncate) the signal to a fixed length 'max_ms' in milliseconds
    # ---------------------------------------
    @staticmethod
    def pad_trunc(aud, max_ms):
        """Pad or truncate the signal to a fixed duration.

        Longer signals are cut at the end; shorter signals are zero-padded,
        with the padding split at random between the beginning and the end.

        Args:
            aud: ``(signal, sample_rate)`` tuple; ``signal`` is a 2-D tensor
                of shape ``[channels, samples]``.
            max_ms: Target duration in milliseconds.

        Returns:
            A ``(signal, sample_rate)`` tuple whose signal has exactly
            ``sample_rate * max_ms // 1000`` samples per channel.
        """
        sig, sr = aud
        _, sig_len = sig.shape
        # Multiply before dividing so the sub-kHz part of the sample rate is
        # not lost (e.g. 22050 Hz is 22.05 samples per millisecond, not 22).
        max_len = int(sr * max_ms // 1000)

        if sig_len > max_len:
            # Truncate the signal to the given length
            sig = sig[:, :max_len]

        elif sig_len < max_len:
            # Length of padding to add at the beginning and end of the signal
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len

            # Zero-pad along the last (time) dimension; the result keeps the
            # dtype and device of the input signal.
            sig = torch.nn.functional.pad(sig, (pad_begin_len, pad_end_len))

        return sig, sr

In [6]:
    #-------------------------------
    # Shifts the signal to the left or right by some percent. Values at the end
    # are 'wrapped around' to the start of the transformed signal.
    #-------------------------------
    @staticmethod
    def time_shift(aud, shift_limit):
        """Shift the signal in time by a random amount, wrapping around.

        The shift is a random number of samples between 0 and
        ``shift_limit * signal_length``. Samples pushed past the end of a
        channel re-enter at the start of that same channel.

        Args:
            aud: ``(signal, sample_rate)`` tuple; ``signal`` is a 2-D tensor
                of shape ``[channels, samples]``.
            shift_limit: Maximum shift as a fraction of the signal length.

        Returns:
            A ``(shifted_signal, sample_rate)`` tuple.
        """
        sig, sr = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        # Roll along the time axis only. Without `dims`, roll flattens the
        # tensor first, so samples of one channel would wrap into the next.
        return sig.roll(shift_amt, dims=-1), sr

In [7]:
    #-------------------------------
    # Generate a Spectrogram
    #-------------------------------
    @staticmethod
    def spectro_gram(aud, n_mels = 64, n_fft = 1024, hop_len = None, top_db = 80):
        """Generate a Mel spectrogram in decibels from the audio.

        Args:
            aud: ``(signal, sample_rate)`` tuple.
            n_mels: Number of Mel bands, passed to ``MelSpectrogram``.
            n_fft: FFT size, passed to ``MelSpectrogram``.
            hop_len: Hop length, passed to ``MelSpectrogram`` as
                ``hop_length``.
            top_db: ``top_db`` value passed to ``AmplitudeToDB``.

        Returns:
            The decibel-scaled spectrogram of shape
            ``[channel, n_mels, time]``.
        """
        sig, sr = aud

        # spec has shape [channel, n_mels, time], where channel is mono, stereo, etc
        spec = transforms.MelSpectrogram(sr, n_fft = n_fft, hop_length = hop_len, n_mels = n_mels)(sig)

        # Convert to decibels
        spec = transforms.AmplitudeToDB(top_db = top_db)(spec)
        return spec

In [8]:
    #------------------------------
    # Augment the spectrogram by masking out some sections of it in both the frequency
    # dimension (i.e. horizontal bars) and the time dimension (i.e. vertical bars) to prevent
    # overfitting and to help the model generalise better. The masked sections are
    # replaced with the mean value
    #------------------------------
    @staticmethod
    def spectro_augment(spec, max_mask_pct = 0.1, n_freq_masks = 1, n_time_masks = 1):
        """Augment a spectrogram with frequency and time masking.

        Applies ``n_freq_masks`` frequency masks followed by ``n_time_masks``
        time masks. Masked sections are filled with the mean of the input
        spectrogram.

        Args:
            spec: Spectrogram of shape ``[channel, n_mels, time]``.
            max_mask_pct: Mask parameter as a fraction of the size of the
                masked dimension.
            n_freq_masks: Number of frequency masks to apply.
            n_time_masks: Number of time masks to apply.

        Returns:
            The masked spectrogram, or ``spec`` itself when both mask counts
            are 0.
        """
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec

        # Frequency masking
        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

        # Time masking
        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

        return aug_spec