In [25]:
# OS and filesystem
import os
import sys
from pathlib import Path
from timeit import default_timer as timer
from datetime import timedelta

# Math
import random
import numpy

# Data
import pandas
from matplotlib import pyplot

# Sound processing
import soundfile
import audiomentations

# Model processing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow import keras

# Console output
from colorama import Style
from tqdm.notebook import tqdm

# Jupyter output
import IPython.display

# Local files
sys.path.append(os.path.join(os.pardir, os.pardir))
import helpers

In [26]:
# Keras: treat the last axis of the model input shape as the color channel
keras.backend.set_image_data_format("channels_last")

# Matplotlib: render every figure with the "ggplot" style sheet
pyplot.style.use("ggplot")

In [27]:
# --- Filesystem layout -------------------------------------------------------
# Every folder is derived from the current working directory; the project root
# is expected two levels above it.
PARENT_FOLDER = Path.cwd()
DATA_FOLDER = PARENT_FOLDER.joinpath("..", "..", "data").resolve()
URBAN_DATASET_FOLDER = DATA_FOLDER.joinpath("UrbanSound8K")
MODELS_FOLDER = PARENT_FOLDER.joinpath("..", "..", "models").resolve()
TEMP_FOLDER = PARENT_FOLDER.joinpath("..", "..", "temp").resolve()

# --- Dataset -----------------------------------------------------------------
# Class labels of the dataset.
# NOTE(review): the list order looks aligned with the `classID` column
# (2 -> children_playing, 3 -> dog_bark in the metadata preview) - confirm.
CLASSES = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]
N_MELS = 128  # presumably the number of mel bands used for spectrograms - confirm

# --- Misc. -------------------------------------------------------------------
RANDOM_STATE = 2077  # shared seed value for anything stochastic

In [28]:
# Read the dataset's metadata table from the "metadata" sub-folder
metadata_csv = URBAN_DATASET_FOLDER / "metadata" / "UrbanSound8K.csv"
data = pandas.read_csv(metadata_csv)

# Report the table size, then preview the first rows as the cell output
row_count, column_count = data.shape
print(f"{row_count} rows, {column_count} columns")
data.head(5)

8732 rows, 8 columns


Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [33]:
# Augment every clip listed in `data` once, write the result to
# <dataset>/audio_augmented/fold<N>/<slice_file_name>, and collect one metadata
# record per written file in `augmented_data`.

# The pipeline is stochastic: seed both RNGs so that re-running this cell
# (or Restart & Run All) regenerates the same augmented audio.
random.seed(RANDOM_STATE)
numpy.random.seed(RANDOM_STATE)

# SomeOf picks a random subset of the transforms for each clip; (1, None) asks
# for at least one transform with no explicit upper bound.
# NOTE(review): each transform keeps its own p=0.5 - presumably a selected
# transform can still be skipped, so some clips may be written unchanged;
# confirm against the audiomentations docs.
augment_pipeline = audiomentations.SomeOf(
    num_transforms=(1, None),
    transforms=[
        audiomentations.AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
        audiomentations.TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        audiomentations.PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        audiomentations.Shift(min_fraction=-0.5, max_fraction=0.5, rollover=True, fade=False, p=0.5),
    ],
)

# Create the per-fold output folders once instead of once per row
augmented_audio_folder = URBAN_DATASET_FOLDER / "audio_augmented"
for fold in data["fold"].unique():
    (augmented_audio_folder / f"fold{fold}").mkdir(parents=True, exist_ok=True)

augmented_data = []

for _index, row in tqdm(data.iterrows(), total=data.shape[0], desc="Augmenting the dataset..."):
    # `samples`, `sampling_rate` and `samples_augmented` keep their names
    # because later inspection cells read the values of the last iteration.
    samples, sampling_rate = helpers.urban_df.load_audio(dataset_folder=URBAN_DATASET_FOLDER, fold=row["fold"], file_name=row["slice_file_name"])
    samples_augmented = augment_pipeline(samples=samples, sample_rate=sampling_rate)

    # The augmented clip keeps the original file name inside its fold folder
    save_path = augmented_audio_folder / f"fold{row['fold']}" / row["slice_file_name"]
    soundfile.write(file=save_path, data=samples_augmented, samplerate=sampling_rate)

    # Same metadata as the source row, flagged as augmented
    augmented_data.append({**row.to_dict(), "augmented": True})

Augmenting the dataset...:   0%|          | 0/8732 [00:00<?, ?it/s]

  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [34]:
# Turn the records collected by the augmentation loop into a table
# (one row per augmented file) and preview its first rows.
augmented_df = pandas.DataFrame(data=augmented_data)
augmented_df.head(n=5)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,augmented
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark,True
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing,True
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing,True
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing,True
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing,True


In [35]:
# Persist the augmented metadata next to the original metadata file.
# index=False keeps the RangeIndex out of the CSV: otherwise it is written as a
# nameless first column, and a plain `pandas.read_csv` (as used for the original
# metadata) would load it back as an extra "Unnamed: 0" column.
augmented_metadata_csv = URBAN_DATASET_FOLDER / "metadata" / "UrbanSound8K_augmented.csv"
augmented_df.to_csv(augmented_metadata_csv, index=False)

In [16]:
# Inspect the un-augmented clip currently held in `samples` (the last one
# assigned by the augmentation loop): dump the raw waveform values, then show
# an audio player as the cell output.
print(samples)
original_player = IPython.display.Audio(samples, rate=sampling_rate)
original_player

[-0.00341243 -0.00506065 -0.00463294 ... -0.00247565 -0.00155365
 -0.00035246]


In [17]:
# Inspect the augmented counterpart of the clip held in `samples`.
# The cell used to read `samples_2`, a name that no cell of this notebook
# defines, so it raised NameError on a fresh kernel; `samples_augmented` is the
# augmented waveform produced by the augmentation loop for that same clip.
print(samples_augmented)
IPython.display.Audio(data=samples_augmented, rate=sampling_rate)

[ 0.0238811   0.00217778  0.01567344 ...  0.00105772  0.00804857
 -0.00377326]
