In [1]:
%load_ext autoreload
%autoreload 2

In [46]:
import numpy as np
import os
from scipy.io import wavfile
from IPython.display import Audio
import glob

import audiomentations
from KeywordDataset import KeywordDataset, get_fns

In [4]:
def _ms_to_samples(duration_ms, sample_rate):
    """Convert a duration in milliseconds to a whole number of samples.

    The result is truncated towards zero, matching ``int(rate * ms / 1000)``.
    """
    return int(sample_rate * duration_ms / 1000)


# Training configuration.
training_meta = dict(
    wanted_words = ['on', 'off'],
    # Dataset root. Set the KEYWORD_DATA_PATH environment variable to point the
    # notebook at another machine's copy; the original location is the default.
    data_path = os.environ.get(
        'KEYWORD_DATA_PATH',
        '/home/average-joe/coding_data/keyword_detection_nano/dataset/',
    ),
    epochs = 5,
    learning_rate = 1e-3,
    batch_size = 32,
)

# Audio front-end configuration. clip_duration is divided by 1000 below, i.e.
# it is treated as milliseconds, as are window_size_ms and window_stride.
audio_meta = dict(
    sample_rate = 16000,
    clip_duration = 1000,
    window_size_ms = 30,
    window_stride = 20,
    feature_bin_count = 40,
)

# Derived sizes, expressed in samples / frames.
desired_samples = _ms_to_samples(audio_meta['clip_duration'], audio_meta['sample_rate'])
window_size_samples = _ms_to_samples(audio_meta['window_size_ms'], audio_meta['sample_rate'])
window_stride_samples = _ms_to_samples(audio_meta['window_stride'], audio_meta['sample_rate'])
length_minus_window = desired_samples - window_size_samples
# One frame for the first window plus one per full stride that still fits.
spectrogram_lenght = 1 + int(length_minus_window / window_stride_samples)

# NOTE(review): the key 'spectrogram_lenght' is misspelled, but it is kept
# verbatim because meta_dict is handed to KeywordDataset, which presumably
# looks it up under this exact spelling -- confirm before renaming.
audio_meta['desired_samples'] = desired_samples
audio_meta['spectrogram_lenght'] = spectrogram_lenght
audio_meta['fingerprint_size'] = spectrogram_lenght * audio_meta['feature_bin_count']

# Augmentation settings; their interpretation lives in the consumer of
# meta_dict and is not visible from this notebook.
augmentation_meta = dict(
    background_frequency = 0.8,
    background_volume_range = 0.1,
    time_shift_ms = 100.0,
    silence_percentage = 0.2,
    unknown_percentage = 0.2,
)

# Single bundle holding the three configuration groups.
meta_dict = dict(
    audio = audio_meta,
    augmentation = augmentation_meta,
    training = training_meta
)

In [5]:
# get_fns returns three file-name collections, unpacked here as the training,
# validation and background lists.
fn_splits = get_fns(training_meta['data_path'], training_meta['wanted_words'])
train_fns, val_fns, background_fns = fn_splits

# NOTE(review): the literal 32 equals training_meta['batch_size']; presumably
# this argument is the batch size -- confirm against KeywordDataset's signature.
ds = KeywordDataset(train_fns, background_fns, meta_dict, 32)

2022-08-09 15:05:43.130999: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-09 15:05:43.158109: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-09 15:05:43.158305: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-09 15:05:43.172051: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [30]:
# Pick one example clip from the 'one' sub-directory of the dataset.
# NOTE(review): 'one' is not in training_meta['wanted_words'] (['on', 'off']);
# presumably this is a deliberate out-of-vocabulary sample, or a typo for 'on'
# -- confirm.
data_path = training_meta['data_path']
example_dir = os.path.join(data_path, 'one')

# glob returns matches in arbitrary order; sorting makes fns[0] the same file
# on every run and every machine.
fns = sorted(glob.glob(os.path.join(example_dir, '*.wav')))
if not fns:
    # Fail with a message naming the directory instead of a bare IndexError.
    raise FileNotFoundError(f"No .wav files found in {example_dir!r}")
fn = fns[0]

In [37]:
# Read the untouched clip from disk; wavfile.read yields (sample_rate, samples).
sr, original_audio = wavfile.read(fn)

# Last expression of the cell: inline player for the raw recording.
Audio(data=original_audio, rate=sr)

In [43]:
# Run the same file through the dataset's own loader, passing 'one' as the
# second argument (presumably the label -- TODO confirm in KeywordDataset).
audio_tensor = ds.get_audio(fn, 'one')

# Convert to a NumPy array and collapse it to one dimension for playback.
audio = audio_tensor.numpy().flatten()

0.019334113811379974


In [44]:
# Inline player for the clip returned by ds.get_audio.
# NOTE(review): `sr` is the rate read from the original wav file; this assumes
# get_audio does not resample -- confirm.
Audio(audio, rate=sr)

In [83]:
# Transforms are applied in list order. Each `p` is the probability that the
# transform fires on a given call, so only the pitch shift (p=1.0) is applied
# on every call; the rest fire about half the time.
augmentation_steps = [
    # Fixed shift: min and max are both 2 semitones.
    audiomentations.PitchShift(min_semitones=2, max_semitones=2, p=1.0),
    audiomentations.BandPassFilter(p=0.5),
    audiomentations.HighPassFilter(p=0.5),
    audiomentations.LowPassFilter(p=0.5),
    audiomentations.RoomSimulator(leave_length_unchanged=True, p=0.5),
    audiomentations.TanhDistortion(p=0.5),
]
# Transforms tried earlier and currently disabled:
#   audiomentations.BandStopFilter(p=.5)            (was after BandPassFilter)
#   audiomentations.ClippingDistortion(p=.5)        (was after BandStopFilter)
#   audiomentations.LowShelfFilter(p=.5)            (was after LowPassFilter)
#   audiomentations.Mp3Compression(min_bitrate=128, max_bitrate = 128, p=1.)
augment = audiomentations.Compose(augmentation_steps)

# `audio / 1.0` hands the pipeline a fresh array rather than `audio` itself.
augmented_audio = augment(audio / 1.0, sample_rate=sr)

# Last expression of the cell: inline player for the augmented clip.
Audio(augmented_audio, rate=sr)

In [84]:
# Show the augmented samples; NumPy abbreviates the repr of a long array,
# so this also reveals the dtype without flooding the output.
augmented_audio

array([-2.88889492e-33, -2.21122646e-05, -1.30879198e-04, ...,
       -3.19431492e-05,  3.09086499e-05, -1.09599336e-04], dtype=float32)