In [1]:
import os

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch import tensor

import torchaudio
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np

import librosa


In [2]:
# Custom dataset that turns audio files into log-mel spectrograms
class AudioDataset(Dataset):
    """Map-style dataset yielding ``(log_mel_spectrogram, label)`` pairs.

    Parameters
    ----------
    file_list : sequence of str
        Paths of the audio files, one entry per sample.
    label_list : sequence
        Label of each file, aligned index-by-index with ``file_list``.
    sr : int, default 16000
        Sampling rate passed to ``librosa.load``.
    n_fft : int, default 1024
        FFT window size of the mel spectrogram.
    hop_length : int, default 512
        Hop size (in samples) between successive spectrogram frames.
    n_mels : int, default 80
        Number of mel bands.

    The defaults reproduce the values that used to be hard-coded in
    ``__getitem__``.
    """

    def __init__(self, file_list, label_list, sr=16000, n_fft=1024,
                 hop_length=512, n_mels=80):
        self.file_list = file_list
        self.label_list = label_list
        self.sr = sr
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels

    def __len__(self):
        """Return the number of samples, i.e. the length of ``file_list``."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(mel_spec_db, label)``.

        ``mel_spec_db`` is a NumPy array with a leading axis of size 1 added
        in front of the spectrogram returned by librosa.
        NOTE(review): no padding or cropping is done here, so batching with the
        default collate function presumably requires clips of equal length.
        """
        file_path = self.file_list[idx]
        label = self.label_list[idx]

        # Load the audio with librosa and convert it to a mel spectrogram
        y, sr = librosa.load(file_path, sr=self.sr)
        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
        )
        # Decibel scale relative to the loudest bin of this particular clip
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        # Add the leading size-1 axis (fed to Conv2d as the single input channel)
        mel_spec_db = np.expand_dims(mel_spec_db, axis=0)

        # Return the mel spectrogram together with its label
        return mel_spec_db, label


In [18]:
class AudioCNN(nn.Module):
    """Two conv/BN/ReLU/max-pool stages followed by a two-layer classifier.

    Parameters
    ----------
    num_classes : int, default 20
        Number of output logits produced by ``fc2``.
    flat_features : int, default 49920
        Size of the flattened feature map fed into ``fc1``. Each pooling stage
        floor-halves both spatial dimensions and the last conv has 64 channels,
        so for an input of shape ``(batch, 1, H, W)`` this must equal
        ``64 * (H // 2 // 2) * (W // 2 // 2)``. The default 49920 = 64 * 20 * 39
        corresponds to H = 80 and W between 156 and 159.

    ``forward`` returns raw logits of shape ``(batch, num_classes)``; no
    softmax is applied.
    """

    def __init__(self, num_classes=20, flat_features=49920):
        super().__init__()
        # Stage 1: 1 -> 32 channels; 3x3 conv with padding 1 keeps the spatial
        # size, the 2x2 max-pool then halves it.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU(inplace=True)
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # Stage 2: 32 -> 64 channels, same layout as stage 1.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))

        # Classifier head. Attribute names are kept so existing state_dict
        # checkpoints stay loadable.
        self.fc1 = nn.Linear(flat_features, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.relu3 = nn.ReLU(inplace=True)

        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Compute class logits for a batch ``x`` of shape ``(batch, 1, H, W)``."""
        x = self.pool1(self.relu1(self.bn1(self.conv1(x))))
        x = self.pool2(self.relu2(self.bn2(self.conv2(x))))

        # Collapse (channels, height, width) into one feature axis per sample.
        x = torch.flatten(x, 1)
        x = self.relu3(self.bn3(self.fc1(x)))
        return self.fc2(x)


In [4]:
# Species names in class order; list position i belongs to class id i + 1.
idx_to_label = "Red-billed Starling,Intermediate Egret,Blue-and-white Flycatcher,Pin-tailed Snipe,Eastern Marsh-Harrier,Manchurian Reed Warbler,Chinese Pond-Heron,Rock Bunting,Isabelline Shrike,Japanese Scops-Owl,Red-backed Shrike,Bronzed Drongo,Claudia's Leaf Warbler,Common Myna,Koklass Pheasant,Barred Warbler,Besra,Pallid Harrier,Tickell's Leaf Warbler,Gray-cheeked Warbler".split(',')
NUM_CLASSES = len(idx_to_label)

# Root folders of the two data splits.
train_data_path = 'data/train'
test_data_path = 'data/test'

# NOTE(review): despite its name, this dict maps the 1-based class id to the
# species name (1 -> first name, ..., NUM_CLASSES -> last name).
label_to_idx = dict(enumerate(idx_to_label, start=1))

In [5]:
# Collect every training file together with its class index.
train_files = []
train_labels = []
# The keys of label_to_idx name the per-class sub-directories of train_data_path.
# The stored label is the 0-based position of the class in that dict, because
# nn.CrossEntropyLoss expects targets in the range [0, number_of_logits).
for class_index, label in enumerate(label_to_idx):
    label_dir = f'{train_data_path}/{label}'
    # sorted(): os.listdir returns entries in arbitrary order; sorting makes the
    # resulting lists identical from run to run.
    for wav_file in tqdm(sorted(os.listdir(label_dir))):
        train_files.append(label_dir + f'/{wav_file}')
        train_labels.append(class_index)

100%|███████████████████████████████████████| 99/99 [00:00<00:00, 231200.50it/s]
100%|██████████████████████████████████████████| 6/6 [00:00<00:00, 24197.91it/s]
100%|█████████████████████████████████████| 100/100 [00:00<00:00, 164160.63it/s]
100%|███████████████████████████████████████| 99/99 [00:00<00:00, 239743.70it/s]
100%|████████████████████████████████████████| 21/21 [00:00<00:00, 34595.59it/s]
100%|████████████████████████████████████████| 39/39 [00:00<00:00, 93526.50it/s]
100%|███████████████████████████████████████| 16/16 [00:00<00:00, 104530.94it/s]
100%|█████████████████████████████████████| 100/100 [00:00<00:00, 418593.21it/s]
100%|███████████████████████████████████████| 98/98 [00:00<00:00, 201688.81it/s]
100%|████████████████████████████████████████| 15/15 [00:00<00:00, 51569.31it/s]
100%|█████████████████████████████████████| 100/100 [00:00<00:00, 129854.61it/s]
100%|█████████████████████████████████████| 100/100 [00:00<00:00, 170085.32it/s]
100%|███████████████████████

In [6]:
# Sanity check: total number of audio files collected.
len(train_files)

1359

In [7]:
# Sanity check: number of collected labels (compare with the file count).
len(train_labels)

1359

In [9]:
# train_files is filled one class directory after another, so cutting it by
# position would put only the last classes into the hold-out set and none of
# them into the training set. Shuffle the indices first; the fixed seed keeps
# the split identical after a kernel restart.
split_seed = 42
num_train = 1100
shuffled_indices = np.random.default_rng(split_seed).permutation(len(train_files))
train_indices = shuffled_indices[:num_train]
test_indices = shuffled_indices[num_train:]

train_f = [train_files[i] for i in train_indices]
train_l = [train_labels[i] for i in train_indices]
test_f = [train_files[i] for i in test_indices]
test_l = [train_labels[i] for i in test_indices]

In [10]:
# Sanity check: number of samples in the training part of the split.
len(train_f)

1100

In [11]:
# Wrap both parts of the split in datasets.
train_dataset, test_dataset = AudioDataset(train_f, train_l), AudioDataset(test_f, test_l)

# Batches of 32; only the training loader reshuffles its samples.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)


In [19]:
# Network, loss and optimiser shared by the train() and test() cells.
model = AudioCNN()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
# Cross-entropy on raw logits with its default settings.
criterion = nn.CrossEntropyLoss()

In [20]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Moves the model's parameters to the chosen device; as the cell's last
# expression the returned module is also displayed.
model.to(device)

AudioCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace=True)
  (pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace=True)
  (pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=49920, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU(inplace=True)
  (fc2): Linear(in_features=128, out_features=20, bias=True)
)

In [21]:
def train(epoch):
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``criterion``, ``optimizer`` and
    ``device``. The loss of every 10th batch is printed.

    Parameters
    ----------
    epoch : int
        Only used as a label in the printed log lines; each call performs
        exactly one pass over the loader regardless of its value.
    """
    # Put the model in training mode explicitly so BatchNorm layers normalise
    # with batch statistics even if an earlier cell switched it to eval mode.
    model.train()

    for idx, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if idx % 10 == 0:
            print('epoch: {}, loss: {}'.format(epoch, loss.item()))

In [22]:
def test():
    loss_list = []
    sample_num = 0
    acc_num = 0

    for idx, (inputs, labels) in enumerate(test_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs)

            cur_loss = criterion(outputs, labels).cpu()
            # outputs中的每一项均为包含两个位于0和1之间的浮点数的数组，较大浮点数所在位置即为预测值
            pred = outputs.argmax(dim=1, keepdim=True)
            # 统计预测正确的个数
            acc_num = acc_num + pred.eq(labels.view_as(pred)).sum().item()
            # 记录预测的样本数
            sample_num = sample_num + labels.size()[0]

            loss_list.append(cur_loss)

    print('平均损失率:{}, 平均准确率:{}'.format(np.mean(loss_list), acc_num / sample_num))

In [23]:
# Runs a single pass over train_loader; the argument is only the epoch number
# shown in the printed log lines, not a number of epochs to run.
train(10)

epoch: 10, loss: 3.003204107284546
epoch: 10, loss: 2.3967456817626953
epoch: 10, loss: 2.2718915939331055
epoch: 10, loss: 2.063397169113159
