In [44]:
import soundata

dataset = soundata.initialize('urbansound8k')
#dataset.download()  # download the dataset
dataset.validate()  # validate that all the expected files are there

example_clip = dataset.choice_clip()  # choose a random example clip
print(example_clip)  # see the available data

100%|██████████| 1/1 [00:00<00:00, 7049.25it/s]
100%|██████████| 8732/8732 [00:00<00:00, 243463.24it/s]
INFO: Files missing for metadata:
INFO: UrbanSound8K.csv
INFO: --------------------
INFO: Files missing for clips:
INFO: 135776-2-0-49
INFO: 46654-6-0-0
INFO: 57320-0-0-24
INFO: 134717-0-0-26
INFO: 174276-7-5-0
INFO: 180937-7-2-6
INFO: 17913-4-0-1
INFO: 103074-7-4-6
INFO: 176787-5-0-9
INFO: 55020-4-0-11
INFO: 138015-3-0-7
INFO: 180937-7-0-4
INFO: 180937-7-1-13
INFO: 157867-8-0-10
INFO: 103074-7-2-0
INFO: 113205-5-1-4
INFO: 57584-4-0-8
INFO: 118279-8-0-8
INFO: 137156-9-0-73
INFO: 165645-4-1-0
INFO: 182800-2-2-0
INFO: 103074-7-0-2
INFO: 159738-8-0-1
INFO: 180937-7-4-0
INFO: 164053-8-1-0
INFO: 159738-8-0-0
INFO: 132016-9-0-11
INFO: 182800-2-2-1
INFO: 180937-7-3-17
INFO: 125791-3-0-9
INFO: 108041-9-0-9
INFO: 197318-6-1-0
INFO: 118279-8-0-9
INFO: 191431-9-0-8
INFO: 139951-9-0-33
INFO: 113205-5-1-5
INFO: 180937-7-1-12
INFO: 180937-7-0-5
INFO: 157867-8-0-11
INFO: 99180-9-0-7
INFO: 176787-5-

Clip(
  audio_path="/Users/cafalena/sound_datasets/urbansound8k/audio/fold4/55728-9-0-30.wav",
  clip_id="55728-9-0-30",
  audio: The clip's audio
            * np.ndarray - audio signal
            * float - sample rate,
  class_id: The clip's class id.
            * int - integer representation of the class label (0-9). See Dataset Info in the documentation for mapping,
  class_label: The clip's class label.
            * str - string class name: air_conditioner, car_horn, children_playing, dog_bark, drilling, engine_idling, gun_shot, jackhammer, siren, street_music,
  fold: The clip's fold.
            * int - fold number (1-10) to which this clip is allocated. Use these folds for cross validation,
  freesound_end_time: The clip's end time in Freesound.
            * float - end time in seconds of the clip in the original freesound recording,
  freesound_id: The clip's Freesound ID.
            * str - ID of the freesound.org recording from which this clip was taken,
  freesound_sta

In [45]:
import matplotlib.pyplot as plt
import os
import torch
import librosa
import numpy as np
import pandas as pd
from pydub import AudioSegment


class UrbanSoundDataset(torch.utils.data.Dataset):
    def __init__(self, annotations, audio_dir):
        if isinstance(annotations, pd.DataFrame):
            self.annotations = annotations
        else:
            self.annotations = pd.read_csv(annotations)
        self.audio_dir = audio_dir

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        audio_path = os.path.join(self.audio_dir, self.annotations.loc[index, 'slice_file_name'])
        class_id = self.annotations.loc[index, 'classID']
        audio, sr = librosa.load(audio_path, sr=None, mono=True)
        return torch.FloatTensor(audio), torch.LongTensor([class_id])

    def _get_audio_sample_path(self, index):
        fold = f"fold{self.annotations.iloc[index]['fold']}"
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[index]['slice_file_name'])
        return path


In [46]:
def calc_fft(y, rate):
    n = len(y)
    freq = np.fft.rfftfreq(n, d=1/rate)
    Y = abs(np.fft.rfft(y)/n)
    return Y, freq

def plot_signal_fft(signal, rate):
    fig, axs = plt.subplots(2, 1, figsize=(20, 10))
    axs[0].plot(signal)
    axs[0].set_title('Signal')
    Y, freq = calc_fft(signal, rate)
    axs[1].plot(freq, Y)
    axs[1].set_title('FFT')
    plt.show()

def calc_spectrogram(signal, rate):
    n_fft = 2048
    hop_length = 512
    spectrogram = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
    spectrogram = np.abs(spectrogram)
    log_spectrogram = librosa.amplitude_to_db(spectrogram)
    return log_spectrogram

def plot_spectrogram(signal, rate):
    log_spectrogram = calc_spectrogram(signal, rate)
    fig, axs = plt.subplots(1, 1, figsize=(20, 10))
    axs.imshow(log_spectrogram, aspect='auto', origin='lower', cmap='jet')


In [47]:
import torchvision.models as models

model = models.resnet18(pretrained=False)
num_ftrs = model.fc.in_features
model.fc = torch.nn.Linear(num_ftrs, 10)




In [48]:
from sklearn.model_selection import train_test_split

# Load the dataset
csvdataset = pd.read_csv('/Users/cafalena/sound_datasets/urbansound8k/UrbanSound8K/metadata/UrbanSound8K.csv')

# Split the dataset into 80% training and 20% temporary
train_data, temp = train_test_split(csvdataset, test_size=0.2, random_state=42)

# Split the temporary set into 50% validation and 50% testing
validation_data, test_data = train_test_split(temp, test_size=0.5, random_state=42)

# Now, `train_data` is your training set (80% of total), 
# `validation_data` is your validation set (10% of total), and 
# `test_data` is your testing set (10% of total).

In [49]:
train_set = UrbanSoundDataset(train_data, 'audio')
train_loader = torch.utils.data.DataLoader(train_set, batch_size=16, shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

model.train()

for epoch in range(10):
    running_loss = 0.0
    for data, target in train_loader:
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target.squeeze())
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * data.size(0)
    epoch_loss = running_loss / len(train_set)
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, 10, epoch_loss))


  return f(*args, **kwargs)


FileNotFoundError: [Errno 2] No such file or directory: 'audio/169466-4-3-3.wav'

In [None]:
valid_set = UrbanSoundDataset('valid.csv', 'audio')
valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=16, shuffle=False)

model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for data, target in valid_loader:
        output = model(data)
        _, predicted = torch.max(output.data, 1)
        total += target.size(0)
        correct += (predicted == target.squeeze()).sum().item()
    print('Accuracy of the model on the validation set: {:.2f}%'.format(100 * correct / total))
