In [None]:
import librosa
import soundfile as sf
import numpy as np
from audiomentations import AddGaussianNoise
import random
from scipy.signal import butter, lfilter
import matplotlib.pyplot as plt
import tensorflow as tf
from IPython.display import Audio

In [None]:
#--------AUGMENTATION-----------#

class Augmenter:
    """Randomly applies waveform-level distortions to mono (1-D) audio.

    Each call to ``augment`` decides independently, per distortion, whether to
    apply it (using the ``*_prob`` settings) and then applies the selected ones
    in a fixed order: noise, reverb, segment shuffle, time stretch, missing
    gaps, frequency masking.

    Every distortion helper accepts a 1-D NumPy array or an eager ``tf.Tensor``
    and returns a *new* 1-D ``float32`` NumPy array without modifying its
    input. Only ``augment`` converts the final result to a ``tf.Tensor``.

    Randomness comes from Python's ``random`` module and ``np.random`` (plus
    whatever ``AddGaussianNoise`` uses internally), so seed those for
    reproducible runs.
    """

    def __init__(self, sr=16000,
                 noise_prob=0.5, noise_max_amp=0.01,
                 reverb_prob=0.3, reverb_delay=0.025, reverb_decay=0.2,
                 shuffle_prob=0.3, shuffle_segments=3,
                 time_stretch_prob=0.3, time_stretch_range=(0.9, 1.1),
                 gaps_prob=0.3, gaps_n=2, gaps_max_duration=0.3,
                 freq_mask_prob=0.3, freq_mask_n=1,
                 shuffle_seg_dur=0.08, shuffle_overlap=0.02, shuffle_local_range=3):
        """Store the augmentation settings.

        Args:
            sr: Sample rate in Hz; used to turn durations into sample counts.
            noise_prob: Probability of adding Gaussian noise.
            noise_max_amp: ``max_amplitude`` passed to ``AddGaussianNoise``
                (its minimum amplitude is left at the library default).
            reverb_prob: Probability of adding a single echo.
            reverb_delay: Echo delay in seconds.
            reverb_decay: Gain applied to the delayed copy.
            shuffle_prob: Probability of local segment shuffling.
            shuffle_segments: Kept for backward compatibility; the current
                shuffle implementation does not use it.
            time_stretch_prob: Probability of time stretching.
            time_stretch_range: ``(low, high)`` bounds of the stretch rate.
            gaps_prob: Probability of inserting gaps.
            gaps_n: Number of gaps inserted per call.
            gaps_max_duration: Maximum gap length in seconds.
            freq_mask_prob: Probability of band-stop filtering.
            freq_mask_n: Number of band-stop filters applied per call.
            shuffle_seg_dur: Micro-segment length in seconds.
            shuffle_overlap: Overlap between neighbouring micro-segments in seconds.
            shuffle_local_range: Maximum distance (in segments) a segment may
                be taken from when jittering.
        """
        self.sr = sr
        # p=1.0: the transform itself always fires; whether noise is used at
        # all is decided by noise_prob inside augment().
        self.noise_aug = AddGaussianNoise(p=1.0, max_amplitude=noise_max_amp)

        self.noise_prob = noise_prob
        self.reverb_prob = reverb_prob
        self.reverb_delay = reverb_delay
        self.reverb_decay = reverb_decay
        self.shuffle_prob = shuffle_prob
        self.shuffle_segments = shuffle_segments
        self.time_stretch_prob = time_stretch_prob
        self.time_stretch_range = time_stretch_range
        self.gaps_prob = gaps_prob
        self.gaps_n = gaps_n
        self.gaps_max_duration = gaps_max_duration
        self.freq_mask_prob = freq_mask_prob
        self.freq_mask_n = freq_mask_n
        self.shuffle_seg_dur = shuffle_seg_dur
        self.shuffle_overlap = shuffle_overlap
        self.shuffle_local_range = shuffle_local_range

    @staticmethod
    def _to_numpy(audio):
        """Return ``audio`` as a ``float32`` NumPy array.

        No copy is made when ``audio`` already is a ``float32`` array, so
        callers must not write into the result in place.
        """
        # Duck-typed on purpose: eager tf.Tensor objects expose .numpy(),
        # plain arrays and lists fall through to np.asarray.
        if hasattr(audio, "numpy"):
            audio = audio.numpy()
        return np.asarray(audio, dtype=np.float32)

    def augment(self, audio):
        """Apply a random subset of the distortions to ``audio``.

        Args:
            audio: 1-D waveform as a NumPy array, list or eager ``tf.Tensor``.

        Returns:
            A 1-D ``tf.float32`` tensor. Its length differs from the input
            when time stretching was selected.
        """
        samples = self._to_numpy(audio)

        # (probability, distortion) pairs in application order.
        pipeline = (
            (self.noise_prob, self._add_noise),                 # 1. grainy Gaussian noise
            (self.reverb_prob, self._add_reverb),               # 2. single delayed echo
            (self.shuffle_prob, self._segment_shuffle),         # 3. local temporal jitter
            (self.time_stretch_prob, self._time_stretch),       # 4. slower / faster, same pitch
            (self.gaps_prob, self._add_missing_gaps),           # 5. short drop-outs
            (self.freq_mask_prob, self._add_frequency_mask),    # 6. band-stop filtering
        )
        # All coin flips are drawn before any distortion runs, so the random
        # numbers consumed by a distortion never influence the selection.
        selected = [distort for prob, distort in pipeline if random.random() < prob]

        for distort in selected:
            samples = distort(samples)

        return tf.convert_to_tensor(samples, dtype=tf.float32)

    # Augmentation methods
    def _add_noise(self, audio):
        """Add Gaussian noise using the configured ``AddGaussianNoise`` transform."""
        samples = self._to_numpy(audio)
        return np.asarray(self.noise_aug(samples, self.sr), dtype=np.float32)

    def _add_reverb(self, audio):
        """Add one echo: a copy delayed by ``reverb_delay`` s and scaled by ``reverb_decay``.

        The output has the same length as the input; the part of the echo that
        would extend past the end is dropped.
        """
        samples = self._to_numpy(audio)
        total = samples.shape[0]
        delay = max(0, int(self.reverb_delay * self.sr))  # delay in samples

        echo = np.zeros_like(samples)
        if delay < total:
            echo[delay:] = samples[:total - delay] * self.reverb_decay
        return samples + echo

    def _segment_shuffle(self, audio, n_segments=None):
        """Simulate temporal jitter by locally swapping overlapping micro-segments.

        The waveform is cut into segments of ``shuffle_seg_dur`` seconds that
        overlap by ``shuffle_overlap`` seconds. A few regions are picked at
        random and, around each, segments are replaced by a neighbour at most
        ``shuffle_local_range`` segments away. The result is rebuilt by
        cross-faded overlap-add, each segment being written back at its
        original slot, so the output has exactly the input's length and is
        unchanged wherever no segment was replaced.

        Args:
            audio: 1-D waveform.
            n_segments: Unused; kept so existing calls keep working.

        Returns:
            ``float32`` NumPy array with the same length as ``audio``.
        """
        samples = self._to_numpy(audio)
        total = samples.shape[0]
        if total == 0:
            return samples.copy()

        seg_len = max(1, int(self.shuffle_seg_dur * self.sr))
        # Keep the hop at least one sample, otherwise segmentation never advances.
        overlap = min(max(0, int(self.shuffle_overlap * self.sr)), seg_len - 1)
        hop = seg_len - overlap
        local_range = max(0, int(self.shuffle_local_range))

        starts = list(range(0, total, hop))
        segments = [samples[s:s + seg_len] for s in starts]
        n_seg = len(segments)

        n_regions = min(4, max(1, n_seg // 10))
        region_centres = random.sample(range(n_seg), n_regions)

        shuffled = list(segments)
        for centre in region_centres:
            for offset in range(-local_range, local_range + 1):
                idx = centre + offset
                if 0 <= idx < n_seg:
                    shift = random.randint(-local_range, local_range)
                    src = max(0, min(n_seg - 1, idx + shift))
                    # Truncated tail segments are too short to fill an earlier
                    # slot; skipping them keeps every output sample covered.
                    if segments[src].shape[0] >= segments[idx].shape[0]:
                        shuffled[idx] = segments[src]

        # Cross-fade window: linear ramps over the overlap, flat in between.
        # Ramp endpoints 0 and 1 are excluded so the weights stay positive.
        window = np.ones(seg_len, dtype=np.float32)
        if overlap > 0:
            ramp = np.linspace(0.0, 1.0, overlap + 2, dtype=np.float32)[1:-1]
            window[:overlap] = ramp
            window[-overlap:] = ramp[::-1]

        mixed = np.zeros(total, dtype=np.float32)
        weight = np.zeros(total, dtype=np.float32)
        for start, natural, seg in zip(starts, segments, shuffled):
            span = natural.shape[0]  # slot length (shorter at the very end)
            taper = window[:span]
            mixed[start:start + span] += seg[:span] * taper
            weight[start:start + span] += taper

        # Normalising by the summed weights makes untouched regions reproduce
        # the input (up to float rounding).
        return mixed / weight

    def _time_stretch(self, audio):
        """Slow down or speed up the audio without changing pitch.

        The rate is drawn uniformly from ``time_stretch_range``; the output
        length therefore differs from the input length.
        """
        samples = self._to_numpy(audio)
        rate = random.uniform(*self.time_stretch_range)
        stretched = librosa.effects.time_stretch(samples, rate=rate)
        return np.asarray(stretched, dtype=np.float32)

    def _add_missing_gaps(self, audio, n_gaps=None, max_gap_duration=None):
        """Insert short drop-outs filled with very low-level noise.

        Each gap fades the signal out, replaces the middle with Gaussian noise
        (stddev 0.001) instead of hard zeros, and fades the signal back in, so
        the edges do not click.

        Args:
            audio: 1-D waveform.
            n_gaps: Number of gaps; defaults to ``self.gaps_n``.
            max_gap_duration: Maximum gap length in seconds; defaults to
                ``self.gaps_max_duration``. Gaps are at least 0.1 s long
                unless this maximum is smaller.

        Returns:
            ``float32`` NumPy array with the same length as ``audio``.
        """
        if n_gaps is None:
            n_gaps = self.gaps_n
        if max_gap_duration is None:
            max_gap_duration = self.gaps_max_duration

        gap_audio = self._to_numpy(audio).copy()
        total = gap_audio.shape[0]
        if total == 0:
            return gap_audio

        min_gap_duration = min(0.1, max_gap_duration)

        for _ in range(n_gaps):
            gap_duration = random.uniform(min_gap_duration, max_gap_duration)
            # Never ask for a gap longer than the clip itself.
            gap_samples = min(int(gap_duration * self.sr), total)
            if gap_samples <= 0:
                continue
            start = random.randint(0, total - gap_samples)
            stop = start + gap_samples

            # Fades take 5% of the gap on each side (capped at a quarter).
            fade_len = min(int(0.05 * gap_samples), gap_samples // 4)
            mid_start = start + fade_len
            mid_stop = stop - fade_len

            if fade_len > 0:
                gap_audio[start:mid_start] *= np.linspace(1.0, 0.0, fade_len, dtype=np.float32)
                gap_audio[mid_stop:stop] *= np.linspace(0.0, 1.0, fade_len, dtype=np.float32)

            if mid_stop > mid_start:
                noise = np.random.normal(0.0, 0.001, mid_stop - mid_start)
                gap_audio[mid_start:mid_stop] = noise.astype(np.float32)

        return gap_audio

    # Frequency masking with a Butterworth filter.
    # Filtering the waveform is preferred over zeroing spectrogram bins because
    # real-world frequency loss happens during capture/transmission, i.e. on the
    # raw audio. The filtered waveform then goes through the rest of the
    # pipeline, and phase relations and harmonics are affected naturally
    # instead of by crude zeroing of frequency bins.
    def _add_frequency_mask(self, audio, n_masks=None):
        """Remove random frequency bands with 4th-order Butterworth band-stop filters.

        Each band starts between 500 and 5000 Hz and is 500 to 2000 Hz wide.
        Both edges are kept at least 100 Hz below Nyquist (``sr / 2``, the
        highest frequency a signal sampled at ``sr`` can represent), and the
        lower edge is clamped so that a band of at least 500 Hz always fits;
        if the sample rate is too low for that, the mask is skipped.

        Args:
            audio: 1-D waveform.
            n_masks: Number of bands to remove; defaults to ``self.freq_mask_n``.

        Returns:
            ``float32`` NumPy array with the same length as ``audio``.
        """
        if n_masks is None:
            n_masks = self.freq_mask_n

        # astype() copies, so the caller's array is never filtered in place;
        # filtering itself runs in double precision.
        masked_audio = self._to_numpy(audio).astype(np.float64)

        nyquist = self.sr / 2
        ceiling = nyquist - 100
        for _ in range(n_masks):
            l_freq = random.uniform(500, 5000)
            band_width = random.uniform(500, 2000)
            l_freq = min(l_freq, ceiling - 500)
            if l_freq <= 0:
                continue
            h_freq = min(l_freq + band_width, ceiling)
            # N=4 gives a dip in the frequency response with smooth edges.
            b, a = butter(N=4, Wn=[l_freq, h_freq], btype="bandstop", fs=self.sr)
            masked_audio = lfilter(b, a, masked_audio)

        return masked_audio.astype(np.float32)

# Module-level Augmenter built with the default settings; mel_log_gen() uses
# this shared instance.
augmenter = Augmenter()

In [None]:
def mel_log_gen(audio_file, augment=True):
    """Load an audio file and return its log-mel spectrogram.

    Args:
        audio_file: Path (or other source accepted by ``librosa.load``) of the
            audio to read.
        augment: When True (default) the waveform is passed through the
            module-level ``augmenter`` before feature extraction. Pass False
            to get clean features, e.g. for validation or inference.

    Returns:
        The 80-band mel power spectrogram converted to decibels with
        ``librosa.power_to_db`` (library default reference).
    """
    # Load at the augmenter's sample rate so its duration-based settings
    # (delays, gap lengths, segment lengths) map to the right sample counts.
    audio, sr = librosa.load(audio_file, sr=augmenter.sr)
    if augment:
        audio = augmenter.augment(audio)
    # librosa expects NumPy input; augment() may hand back a tf.Tensor.
    if isinstance(audio, tf.Tensor):
        audio = audio.numpy()
    mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=80)
    return librosa.power_to_db(mel_spectrogram)