# Audio Deep Learning Made Simple: Sound Classification, Step-by-Step
## Dataset Usage Testing

In [None]:
import pandas as pd
from pathlib import Path 

# Dataset root: <home>/dataset/UrbanSound8K.
datasource_path = Path.home().joinpath('dataset', 'UrbanSound8K')

# Load the CSV found at <root>/metadata/UrbanSound8K.csv into a DataFrame
# and preview its first rows.
metadata_file_path = datasource_path.joinpath('metadata', 'UrbanSound8K.csv')
df = pd.read_csv(metadata_file_path)
df.head()

In [None]:
# Build a '/fold<fold>/<slice_file_name>' string for every row, then keep
# only that path column together with the class id.
# NOTE(review): presumably WavDataset combines this with audio_path — confirm.
df = df.assign(
    path='/fold' + df['fold'].astype(str) + '/' + df['slice_file_name'].astype(str)
)
df = df.loc[:, ['path', 'classID']]
df.head()

## Audio File Reading
```bash
pip install playsound
```

In [None]:
from playsound import playsound
# 'audio' folder under the dataset root, and one sample clip inside fold5.
audio_path = datasource_path.joinpath('audio')
audio_sample_path = audio_path.joinpath('fold5', '100032-3-0-0.wav')
print(audio_sample_path)

# playsound is handed the location as a plain string rather than a Path.
playsound(str(audio_sample_path))

In [None]:
from lib.wavUtil import WavOps
# Open the sample clip; the result is unpacked as (sig, sr) — presumably the
# signal tensor and its sample rate, to be confirmed against WavOps.open.
sig, sr = WavOps.open(audio_sample_path)
(sig.shape, sr)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot the raw waveform of the sample clip against elapsed time instead of
# the bare sample index, so the 'Time' axis carries a real unit.
# NOTE(review): assumes sig is laid out as (channels, samples) and sr is the
# sample rate in Hz — confirm against WavOps.open.
waveform = sig.numpy().T  # transposed so each column is drawn as its own line
time_s = np.arange(waveform.shape[0]) / sr  # sample index -> seconds

plt.figure(figsize=(12, 4))
plt.plot(time_s, waveform)
plt.title('Audio Waveform')
plt.ylabel('Amplitude')
plt.xlabel('Time (s)')
plt.show()

In [None]:
# Run the sample through the WavOps preprocessing chain one step at a time;
# every step consumes the previous step's output.
resampled = WavOps.resampleRate(audio=(sig, sr), new_sample_rate=44100)
rechanneled = WavOps.rechannel(audio=resampled, channel_num=2)
padded = WavOps.pad_trunc(audio=rechanneled, max_ms=4000)
shifted = WavOps.time_shift(audio=padded, shift_limit=.4)
mel_spec = WavOps.spectro_gram(audio=shifted, n_mels=64, n_fft=1024, hop_len=None)
# `audio` ends up bound to the augmented spectrogram; later cells index it.
audio = WavOps.spectro_augment(
    spec=mel_spec, max_mask_perctage=.1, freq_mask_num=2, time_mask_num=2
)

# Render entry 0 of the result as an image with a colour scale.
fig, ax = plt.subplots(figsize=(10, 4))
image = ax.imshow(audio[0].detach().numpy(), cmap='viridis', origin='lower', aspect='auto')
ax.set_title('Mel Spectrogram in channel 0')
ax.set_xlabel('Time')
ax.set_ylabel('Frequency')
fig.colorbar(image, ax=ax, format='%+2.0f dB')
plt.show()

In [None]:
# Render entry 1 of `audio` as an image with a colour scale.
fig, ax = plt.subplots(figsize=(10, 4))
image = ax.imshow(audio[1].detach().numpy(), cmap='viridis', origin='lower', aspect='auto')
ax.set_title('Mel Spectrogram in channel 1')
ax.set_xlabel('Time')
ax.set_ylabel('Frequency')
fig.colorbar(image, ax=ax, format='%+2.0f dB')
plt.show()

## Prepare the DataLoader

In [None]:
from torch.utils.data import DataLoader, random_split
from lib.wavDataUtil import WavDataset

# Hand the metadata frame and the audio folder to the project's dataset class.
audioDS = WavDataset(df, audio_path)

# Fetch the first item and unpack it; note that `audio` is rebound here.
first_item = audioDS[0]
audio, class_id = first_item
(audio.shape, class_id)

In [None]:
# 80/20 split sizes: the validation share is whatever remains after rounding
# the training share, so the two lengths always sum to the dataset size.
sample_num = len(audioDS)
train_num = round(.8 * sample_num)
val_num = sample_num - train_num
split_sizes = [train_num, val_num]

train_ds, val_ds = random_split(audioDS, split_sizes)

# Batches of 16 for both loaders; only the training loader shuffles.
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=16, shuffle=False)

# Draw a single training batch to inspect what the loader yields.
first_batch = next(iter(train_dl))
feature, label = first_batch
(feature.shape, label)

In [None]:
# Render entry 0 of `audio` (bound to the first dataset item when the cells
# run in order) as an image with a colour scale.
fig, ax = plt.subplots(figsize=(10, 4))
image = ax.imshow(audio[0].detach().numpy(), cmap='viridis', origin='lower', aspect='auto')
ax.set_title('Mel Spectrogram')
ax.set_xlabel('Time')
ax.set_ylabel('Frequency')
fig.colorbar(image, ax=ax, format='%+2.0f dB')
plt.show()

## Training

In [None]:
import torch 
from lib.acModel import AudioClassifier

# Use the GPU when CUDA is available, otherwise stay on the CPU. The device
# is kept as a plain string because it is passed on to the training call.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the classifier and move it to the selected device.
model = AudioClassifier()
model = model.to(device=device)
model

In [None]:
from lib.acModel import Processor

# Train the model on the training loader for 20 epochs on the chosen device.
Processor.training(
    model=model,
    train_dl=train_dl,
    device=device,
    num_epochs=20,
)