In [None]:
import soundata
from torch.utils.data import Dataset
import torch
import sys
from pathlib import Path
# from IPython.display import Audio breaks the notebook
import torch
import numpy as np
import pandas as pd
import librosa.feature
from joblib import Parallel, delayed
import matplotlib.pyplot as plt

project_root = Path("/Users/daniellevy/bioacoustic-classifier")
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
from src.data_utils import UrbanSoundTorchDataset
from src.eda import plot_grouped_histograms

In [None]:
# Initialise the soundata loader registered as "esc50" and check that the local
# copy of the dataset is complete and valid before any audio is read.
# The data directory is derived from `project_root` so the location is defined
# in exactly one place instead of being repeated as an absolute path.
esc50_home = project_root / "data" / "ESC-50-master"
dataset = soundata.initialize("esc50", data_home=str(esc50_home))
dataset.validate()

# Load the ESC-50 metadata table. `filename` is renamed to `cid` and its ".wav"
# suffix is stripped so the table can later be merged with per-clip features on `cid`.
meta = (
    pd.read_csv(esc50_home / "meta" / "esc50.csv")
    .rename(columns={"filename": "cid"})
    .assign(cid=lambda frame: frame["cid"].str.removesuffix(".wav"))
)

100%|██████████| 1/1 [00:00<00:00, 1077.40it/s]
100%|██████████| 2000/2000 [00:02<00:00, 851.70it/s]
INFO: Success: the dataset is complete and all files are valid.
INFO: --------------------


In [None]:

# Small demonstration of the torch dataset wrapper on a subset of folds
train_dataset = UrbanSoundTorchDataset(dataset, fold=[1, 2, 3])
print(f"There are {len(train_dataset)} samples in this example training dataset")

# Pull one (audio, label) example out of the dataset
audio, label = train_dataset[1]

# Playback needs a NumPy array, so unwrap the sample if it came back as a tensor
audio_np = audio.detach().cpu().numpy() if torch.is_tensor(audio) else np.array(audio)

# NOTE(review): `Audio` is not imported in the visible import cell (the
# IPython.display import there is commented out) — confirm it is in scope
# before running this cell on a fresh kernel.
Audio(audio_np, rate=44100)  # sr is constant throughout dataset

In [3]:
# Wrap folds 1 through 5 in a single dataset object used for the analysis below
esc_50 = UrbanSoundTorchDataset(dataset, fold=list(range(1, 6)))

In [None]:
# get some insights about our data from basic class descriptions to more descriptive information about the audio samples
def compute_features(cid):
    """Summarise one clip as a flat record of waveform and spectral statistics.

    The clip is looked up by `cid` in the module-level `dataset`. Spectral
    descriptors are computed from the magnitude of the clip's short-time
    Fourier transform and reduced to a single number with `np.mean`.

    Args:
        cid: clip identifier understood by `dataset.clip`.

    Returns:
        dict with keys `cid`, `fold`, `label`, `audio_len`, `sr`, `duration`,
        `rms`, `zcr`, `spec_cent`, `spec_bw`, `rolloff`, `flatness`, `contrast`.
    """
    clip = dataset.clip(cid)
    waveform, sr = clip.audio  # 1D waveform: amplitude over time, no explicit frequency information
    n_samples = len(waveform)

    # The magnitude STFT gives a 2D view of how energy is spread over frequency and time
    magnitude = np.abs(librosa.stft(waveform))

    def averaged(descriptor, **kwargs):
        # Mean of a librosa spectral descriptor evaluated on the shared spectrogram
        return np.mean(descriptor(S=magnitude, **kwargs))

    return {
        "cid": cid,
        "fold": clip.fold,
        "label": clip.target,
        "audio_len": n_samples,
        "sr": sr,
        "duration": n_samples / sr,
        "rms": np.sqrt(np.mean(waveform ** 2)),
        "zcr": np.mean(librosa.feature.zero_crossing_rate(y=waveform)),
        "spec_cent": averaged(librosa.feature.spectral_centroid, sr=sr),
        "spec_bw": averaged(librosa.feature.spectral_bandwidth, sr=sr),
        "rolloff": averaged(librosa.feature.spectral_rolloff, sr=sr, roll_percent=0.85),
        "flatness": averaged(librosa.feature.spectral_flatness),
        "contrast": averaged(librosa.feature.spectral_contrast, sr=sr),
    }

# Compute one feature record per clip, spread across 8 worker processes
rows = Parallel(n_jobs=8)(delayed(compute_features)(cid) for cid in esc_50.clip_ids)

# Attach the human-readable category name from the metadata table.
# `validate` makes pandas raise if `cid` is duplicated on either side, and the
# row-count assertion catches clips that the inner join would otherwise drop silently.
features = pd.DataFrame(rows)
df = features.merge(meta[['cid', 'category']], on='cid', how='inner', validate='one_to_one')
assert len(df) == len(rows), "Some clips have no matching metadata row"

# Sanity checks on dataset structure. Earlier manual inspection noted
# 40 samples per class and 400 samples per fold.
assert len(df['category'].unique()) == 50, "There are not 50 classes present."
assert df['category'].value_counts().nunique() == 1, "Classes are not equally balanced"
assert len(df['fold'].unique()) == 5, "There are not 5 folds present."
assert df['fold'].value_counts().nunique() == 1, "folds are not equally balanced"
assert df['audio_len'].nunique() == 1, "More than 1 audio length value"
assert df['sr'].nunique() == 1, "More than 1 sampling rate value"

# Persist the feature table. The output directory is created first so the
# export does not fail on a fresh checkout. The index column is still written,
# as before, so existing readers of this CSV are unaffected.
baseline_csv = project_root / "data" / "processed_data" / "baseline_model.csv"
baseline_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(baseline_csv)

In [None]:


def play_category_sound(category, random_state=None):
    """Return an audio player for one randomly chosen clip of `category`.

    Args:
        category: value to match against the `category` column of the
            module-level `df`.
        random_state: optional seed forwarded to `Series.sample` so the chosen
            clip can be reproduced. The default `None` keeps the draw random.

    Returns:
        An `Audio` object for the sampled clip, played at the clip's own
        sampling rate.

    Raises:
        ValueError: if no row of `df` has the requested category.
    """
    candidates = df.loc[df['category'] == category, 'cid']
    if candidates.empty:
        # Fail with a clear message instead of pandas' generic sampling error
        raise ValueError(f"No clips found for category {category!r}")

    cid_ex = candidates.sample(1, random_state=random_state).iloc[0]
    clip = dataset.clip(cid_ex)
    audio, sr = clip.audio

    # Use the rate returned with the clip rather than a hard-coded 44100, so
    # playback stays correct if the data is ever resampled.
    # NOTE(review): `Audio` is not imported in the visible import cell (the
    # IPython.display import there is commented out) — confirm it is in scope.
    return Audio(audio, rate=sr)

play_category_sound('laughing')

In [5]:
# Categories treated as the animal-sound subset in the plots below
animal_noises = [
    "crow",
    "cow",
    "cat",
    "chirping_birds",
    "pig",
    "crickets",
    "frog",
    "insects",
    "rooster",
    "hen",
    "crying_baby",
    "dog",
    "sheep",
]

# Numeric feature columns that get per-category summary statistics
quantitative = [
    "rms",
    "zcr",
    "spec_cent",
    "spec_bw",
]

In [None]:
# RMS histograms, one group per category, restricted to the animal-sound subset
plot_grouped_histograms(
    df=df[df["category"].isin(animal_noises)],
    group_col="category",
    value_col="rms",
    figsize=(30, 30),
)

In [7]:
# Per-category summary statistics (`describe`) for every quantitative feature,
# keyed by feature name
descriptors = {
    feature: df.groupby("category")[feature].describe()
    for feature in quantitative
}

In [8]:
"""
rms: Root-mean-square amplitude, i.e. overall loudness or signal energy
zcr: Zero-crossing rate, i.e. how often the waveform crosses zero amplitude (high: noisy, percussive, high-frequency; low: smooth, tonal sounds)
Spectral Centroid: The centre of mass of the frequency spectrum (high: bright, sharp, high-pitched; low: dark, bassy)
Spectral Bandwidth: The spread (variance) of frequencies around the spectral centroid (high: broad, complex, noisy; low: pure, simple tones)
"""

# Rank categories by median ("50%") spectral bandwidth, narrowest first
descriptors["spec_bw"].sort_values(by="50%")

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
rooster,40.0,1888.55399,1047.108547,760.762487,1007.602677,1491.303705,2707.727203,4044.019872
glass_breaking,40.0,2068.672641,1396.622224,640.821334,784.504166,1492.869739,3137.945367,4947.861126
airplane,40.0,1713.237954,671.792442,702.101971,1219.102434,1545.374345,2130.76676,3131.049243
church_bells,40.0,1812.295108,607.40417,895.513957,1352.314001,1641.174147,2214.072114,3583.793525
door_wood_knock,40.0,2054.217615,1278.258374,519.80232,922.355256,1687.672879,3045.675056,4812.275784
coughing,40.0,2196.702562,1416.358491,437.352255,889.937238,1701.8848,3479.277387,5510.148075
helicopter,40.0,2089.583928,1129.802011,503.517642,1319.257942,1759.715588,2702.45524,4656.492854
siren,40.0,1729.547238,496.640884,809.357424,1502.948393,1783.513869,2058.425823,2595.12393
sneezing,40.0,2424.056935,1318.539807,766.911975,1342.285754,1918.800387,3594.010411,4707.442817
thunderstorm,40.0,2097.346157,826.032475,768.491533,1555.622787,1975.052605,2450.026839,4288.656854
