## Real-Time Sound Classification using PyTorch


This project aims to develop an artificial intelligence system capable of classifying live sounds using the PyTorch framework. The goal is to build a simple yet effective sound classification model that can accurately identify different types of sounds in real-time.

In [64]:
##Hardware
import torch
if torch.cuda.is_available() == True:
    device = 'cuda'
    templist = [1,2,3]
    templist = torch.FloatTensor(templist).to(device)
    print("Cuda torch working : ",end="")
    print(templist.is_cuda)
    print("current device no. : ",end="")
    print(torch.cuda.current_device())
    print("GPU device count : ",end="")
    print(torch.cuda.device_count())
    print("GPU name : ",end="")
    print(torch.cuda.get_device_name(0))
    print("device : ",device)
    ! nvidia-smi
elif torch.backends.mps.is_available() == True:
    print("Apple device detected\nActivating Apple Silicon GPU")
    device = torch.device("mps")
else:
    print("cant use gpu , activating cpu")
    device = 'cpu'

Apple device detected
Activating Apple Silicon GPU


In [65]:
'''SEED Everything'''
import random
import numpy as np
def seed_everything(SEED=42):
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = True # keep True if all the input have same size.
SEED=42
seed_everything(SEED=SEED)

In [66]:
try:
    import soundata

    dataset = soundata.initialize('urbansound8k')
    #dataset.download()  # download the dataset
    #dataset.validate()  # validate that all the expected files are there

    example_clip = dataset.choice_clip()  # choose a random example clip
    print(example_clip)  # see the available data
except:
    print("SKIP")

Clip(
  audio_path="/Users/cafalena/sound_datasets/urbansound8k/audio/fold3/65750-3-3-48.wav",
  clip_id="65750-3-3-48",
  audio: The clip's audio
            * np.ndarray - audio signal
            * float - sample rate,
  class_id: The clip's class id.
            * int - integer representation of the class label (0-9). See Dataset Info in the documentation for mapping,
  class_label: The clip's class label.
            * str - string class name: air_conditioner, car_horn, children_playing, dog_bark, drilling, engine_idling, gun_shot, jackhammer, siren, street_music,
  fold: The clip's fold.
            * int - fold number (1-10) to which this clip is allocated. Use these folds for cross validation,
  freesound_end_time: The clip's end time in Freesound.
            * float - end time in seconds of the clip in the original freesound recording,
  freesound_id: The clip's Freesound ID.
            * str - ID of the freesound.org recording from which this clip was taken,
  freesound_sta

In [67]:
import matplotlib.pyplot as plt
import os
import torch
import librosa
import numpy as np
import pandas as pd
from pydub import AudioSegment

class UrbanSoundDataset(torch.utils.data.Dataset):  # torch.utils.data.Dataset를 상속하는 UrbanSoundDataset 클래스를 정의
    def __init__(self, annotations):  # 초기화 함수, annotations와 audio_dir를 매개변수로 받음(annotations은 오디오 파일의 이름, 클래스 ID, 폴더 번호가 있음)
        if isinstance(annotations, pd.DataFrame):  # annotations가 DataFrame 형태인지 확인
            self.annotations = annotations  # 맞다면 그대로 저장
        else:
            self.annotations = pd.read_csv(annotations)  # 아니라면 csv 파일을 읽어와 DataFrame으로 변환

    def __len__(self):  # 데이터셋의 길이를 반환하는 함수
        return len(self.annotations)  # annotations DataFrame의 길이를 반환

    def __getitem__(self, index):
        try:
            audio_path = os.path.join("/Users/cafalena/sound_datasets/urbansound8k/UrbanSound8K/audio","fold"+str(self.annotations.iloc[index]['fold']),self.annotations.loc[index, 'slice_file_name'])  
        except:
            audio_path = os.path.join("/Users/cafalena/sound_datasets/urbansound8k/UrbanSound8K/audio","fold"+str(self.annotations.iloc[index]['fold']),self.annotations.loc[index, 'slice_file_name'])  
        class_id = self.annotations.loc[index, 'classID']
        audio, _ = librosa.load(audio_path, sr=44100, mono=True)  # setting standard sampling rate

        # Handle variable length audio files
        fixed_length = 44100 * 4  # 4 seconds // datashape (batch, 44100*4)
        if len(audio) < fixed_length:
            audio = np.pad(audio, (0, fixed_length - len(audio)))  # pad with zeros (zeros are silence)
        elif len(audio) > fixed_length:
            audio = audio[:fixed_length]  # trim to fixed length
        # Transform audio into Mel spectrogram
        return torch.from_numpy(audio), torch.tensor([class_id]) 


#dataset = UrbanSoundDataset()


In [68]:
#/ 전처리 나중에 적용

def calc_fft(y, rate):
    n = len(y)
    freq = np.fft.rfftfreq(n, d=1/rate)
    Y = abs(np.fft.rfft(y)/n)
    return Y, freq

def plot_signal_fft(signal, rate):
    fig, axs = plt.subplots(2, 1, figsize=(20, 10))
    axs[0].plot(signal)
    axs[0].set_title('Signal')
    Y, freq = calc_fft(signal, rate)
    axs[1].plot(freq, Y)
    axs[1].set_title('FFT')
    plt.show()

def calc_spectrogram(signal, rate):
    n_fft = 2048
    hop_length = 512
    spectrogram = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
    spectrogram = np.abs(spectrogram)
    log_spectrogram = librosa.amplitude_to_db(spectrogram)
    return log_spectrogram

def plot_spectrogram(signal, rate):
    log_spectrogram = calc_spectrogram(signal, rate)
    fig, axs = plt.subplots(1, 1, figsize=(20, 10))
    axs.imshow(log_spectrogram, aspect='auto', origin='lower', cmap='jet')


In [69]:
#paremeters

BATCHSIZE = 16

In [70]:
import torch.nn as nn
import torch.nn.functional as F
from torch import nn
"""model 1 39%
class SoundClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SoundClassifier, self).__init__()

        self.network = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        x = x.view(x.size(0), -1)  # flatten the input
        return self.network(x)


#chaning the last line to 10 number since there is total 10 class
# Define some parameters
num_classes = 10  # for UrbanSound8K dataset, there are 10 classes
input_size = 44100 * 4  # based on the fixed length of the audio samples

# Create the model
model = SoundClassifier(input_size, num_classes)
    """
    
import torch.nn as nn
import torch.nn.functional as F

class SoundClassifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(SoundClassifier, self).__init__()

        self.network = nn.Sequential(
            nn.Linear(input_size, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),
            
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.5),
            
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.5),
            
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.5),
            
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        x = x.view(x.size(0), -1)  # flatten the input
        return self.network(x)


# Define some parameters
num_classes = 10  # for UrbanSound8K dataset, there are 10 classes
input_size = 44100 * 4  # based on the fixed length of the audio samples

# Create the model
model = SoundClassifier(input_size, num_classes)

# Initialize weights
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

model.apply(init_weights)



# 0 = air_conditioner
# 1 = car_horn
# 2 = children_playing
# 3 = dog_bark
# 4 = drilling
# 5 = engine_idling
# 6 = gun_shot
# 7 = jackhammer
# 8 = siren
# 9 = street_music

SoundClassifier(
  (network): Sequential(
    (0): Linear(in_features=176400, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=512, out_features=256, bias=True)
    (9): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.5, inplace=False)
    (12): Linear(in_features=256, out_features=128, bias=True)
    (13): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): ReLU()
    (15): Dropout(p=0.5, inplace=False)
    (16): Linear(in_features=128, out_features=64, bias=True)
    (17): BatchNorm1d(64, eps=1e-05, momentum=0.1, affin

In [71]:


from sklearn.model_selection import train_test_split

# Load the dataset
try: 
    csvdataset = pd.read_csv('/Users/cafalena/sound_datasets/urbansound8k/UrbanSound8K/metadata/UrbanSound8K.csv')
except:
    csvdataset = pd.read_csv('/Users/cafalena/sound_datasets/urbansound8k/UrbanSound8K/metadata/UrbanSound8K.csv')



# Split the dataset into 80% training and 20% temporary
##train_data, temp = train_test_split(csvdataset, test_size=0.2, random_state=42)

# Split the temporary set into 50% validation and 50% testing
##validation_data, test_data = train_test_split(temp, test_size=0.5, random_state=42)

# Now, `train_data` is your training set (80% of total), 
# `validation_data` is your validation set (10% of total), and 
# `test_data` is your testing set (10% of total).


In [72]:

#train_set = UrbanSoundDataset(train_data)
#train_loader = torch.utils.data.DataLoader(train_set, batch_size=16, shuffle=True)


In [73]:
LR_list = [100,10,1,1e-1,1e-2,1e-3,1e-5,1e-7,1e-10]
NB_EPOCH = 1


for LR in LR_list:

    #train_set = UrbanSoundDataset(train_data)
    train_set = UrbanSoundDataset(csvdataset)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCHSIZE, shuffle=True)
    #// we have to change that it uses many lr 
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    criterion = torch.nn.CrossEntropyLoss()
    model = SoundClassifier(input_size, num_classes)

    #////
    #////
    #// the data shape is batch 44100*4
    from tqdm import tqdm

    model.train()
    model.to(device)
    for epoch in range(NB_EPOCH):
        running_loss = 0.0
        for data, target in tqdm(train_loader):
            data = data.to(device)
            target = target.to(device)
            #print(data.shape)
            #print(target.shape)
            #print(data)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target.squeeze())
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * data.size(0)
        epoch_loss = running_loss / len(train_set)
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, 10, epoch_loss))
        


KeyboardInterrupt: 

In [None]:
print(0)

print(1)
testdataset = pd.read_csv('/Users/cafalena/sound_datasets/urbansound8k/UrbanSound8K/metadata/UrbanSound8K.csv')
print(2)
test_loader = UrbanSoundDataset(testdataset)
train_loader = torch.utils.data.DataLoader(test_loader, batch_size=BATCHSIZE, shuffle=True)
print(3)
model.eval()

with torch.no_grad():
    print(4)
    correct = 0
    total = 0
    for data, target in test_loader:
        #data = data.to(device)
        #target = target.to(device)
        
        output = model(data)
        _, predicted = torch.max(output.data, 1)
        total += target.size(0)
        correct += (predicted == target.squeeze()).sum().item()
    print('Accuracy of the model on the validation set: {:.2f}%'.format(100 * correct / total))

0
1
2
3
4


RuntimeError: Placeholder storage has not been allocated on MPS device!

#### Diary

**5/14 1300**: I have resolved the folder location issue, but now I am facing a new problem. The folder location and the sound file do not match. It's strange because the folder and the CSV files are fine. However, the `audio_path` is pointing in the wrong direction.

**5/14 1310**: I noticed that the folder and the file name were slightly off, which indicates that it's not entirely random. So, I decided to avoid using split and shuffle. Surprisingly, it worked. It seems like the split function was causing the problem, but I will keep monitoring the situation. Although the location error still persists, I tried specifying the complete location path as "/Users/cafalena/sound_datasets/urbansound8k/UrbanSound8K/". This resolved the issue, and I observed that it recognized multiple sound files. However, now I am facing a tensor problem. I need to address this next.

**5/14 1752**: I was planning to use a CNN (Convolutional Neural Network), but I realized that sound waves are 1-dimensional. I'm struggling to figure out how to utilize a CNN with sound. Therefore, for now, I will stick with an NN (Neural Network). Once I successfully implement the NN, I can revisit using a CNN. Additionally, I need to work on the accuracy and test code sections.
