In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch import tensor

import torchaudio
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np

import librosa


In [2]:
# 定义一个自定义数据集类
class AudioDataset(Dataset):
    def __init__(self, file_list, label_list):
        self.file_list = file_list
        self.label_list = label_list

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = self.file_list[idx]
        label = self.label_list[idx]

        # 使用 Librosa 加载音频文件，并将其转换为梅尔频谱图
        y, sr = librosa.load(file_path, sr=16000)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=512, n_mels=80)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec_db = np.expand_dims(mel_spec_db, axis=0)

        # 返回梅尔频谱图和标签
        return mel_spec_db, label


In [3]:
class AudioCNN(nn.Module):
    def __init__(self):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU(inplace=True)
        self.pool1 = nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))

        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3,3), stride=(1,1), padding=(1,1))
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU(inplace=True)
        self.pool2 = nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))

        self.fc1 = nn.Linear(49920, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.relu3 = nn.ReLU(inplace=True)

        self.fc2 = nn.Linear(128, 20)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.pool1(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.pool2(x)

        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.fc2(x)

        return x


In [4]:
idx_to_label = "Red-billed Starling,Intermediate Egret,Blue-and-white Flycatcher,Pin-tailed Snipe,Eastern Marsh-Harrier,Manchurian Reed Warbler,Chinese Pond-Heron,Rock Bunting,Isabelline Shrike,Japanese Scops-Owl,Red-backed Shrike,Bronzed Drongo,Claudia's Leaf Warbler,Common Myna,Koklass Pheasant,Barred Warbler,Besra,Pallid Harrier,Tickell's Leaf Warbler,Gray-cheeked Warbler".split(',')

NUM_CLASSES = len(idx_to_label)

label_to_idx = {idx_to_label[i]: i for i in range(NUM_CLASSES)}

train_data_path = 'data/train'
test_data_path = 'data/test'

label_to_idx = {value: key for key, value in label_to_idx.items()}

In [5]:
label_to_idx

{0: 'Red-billed Starling',
 1: 'Intermediate Egret',
 2: 'Blue-and-white Flycatcher',
 3: 'Pin-tailed Snipe',
 4: 'Eastern Marsh-Harrier',
 5: 'Manchurian Reed Warbler',
 6: 'Chinese Pond-Heron',
 7: 'Rock Bunting',
 8: 'Isabelline Shrike',
 9: 'Japanese Scops-Owl',
 10: 'Red-backed Shrike',
 11: 'Bronzed Drongo',
 12: "Claudia's Leaf Warbler",
 13: 'Common Myna',
 14: 'Koklass Pheasant',
 15: 'Barred Warbler',
 16: 'Besra',
 17: 'Pallid Harrier',
 18: "Tickell's Leaf Warbler",
 19: 'Gray-cheeked Warbler'}

In [6]:
train_files=[]
train_labels =[]
for label in label_to_idx:
        label_dir = f'{train_data_path}/{label}'
        for wav_file in tqdm(os.listdir(label_dir)):
            train_files.append(label_dir + f'/{wav_file}')
            train_labels.append(label)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 99/99 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 99/99 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 99/99 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 20991.51it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<?, ?it/s]
100%|███████████████████████████████████

In [7]:
len(train_files)

1359

In [8]:
type(train_files)

list

In [9]:
len(train_labels)

1359

In [10]:
from sklearn.model_selection import train_test_split
'''
(1)random_state不填或者为0时，每次都不同；其余值表示不同随机数
(2)shuffle表示是否在分割之前对数据进行洗牌（默认True）
'''
train_f, test_f, train_l, test_l = train_test_split(train_files, train_labels, test_size=0.20,random_state=42,shuffle=True)


In [11]:
train_l

[12,
 18,
 4,
 11,
 0,
 13,
 11,
 1,
 8,
 16,
 12,
 11,
 14,
 9,
 19,
 16,
 14,
 6,
 1,
 13,
 11,
 3,
 8,
 1,
 15,
 11,
 2,
 4,
 8,
 12,
 0,
 1,
 9,
 3,
 3,
 11,
 3,
 16,
 16,
 19,
 5,
 9,
 8,
 5,
 5,
 1,
 19,
 4,
 14,
 12,
 11,
 8,
 1,
 16,
 16,
 16,
 17,
 9,
 7,
 8,
 12,
 13,
 4,
 0,
 13,
 16,
 13,
 8,
 1,
 16,
 8,
 11,
 14,
 13,
 6,
 8,
 8,
 4,
 4,
 4,
 9,
 8,
 19,
 11,
 16,
 12,
 4,
 1,
 1,
 3,
 19,
 19,
 3,
 15,
 4,
 4,
 16,
 13,
 0,
 11,
 9,
 8,
 11,
 9,
 11,
 16,
 3,
 3,
 13,
 18,
 16,
 17,
 3,
 4,
 4,
 5,
 0,
 6,
 18,
 6,
 13,
 16,
 7,
 12,
 13,
 4,
 14,
 14,
 9,
 11,
 8,
 15,
 13,
 4,
 13,
 11,
 4,
 14,
 4,
 16,
 0,
 1,
 14,
 0,
 15,
 1,
 12,
 17,
 3,
 14,
 11,
 14,
 15,
 11,
 4,
 4,
 1,
 4,
 3,
 15,
 8,
 1,
 12,
 16,
 8,
 1,
 16,
 14,
 3,
 13,
 3,
 4,
 6,
 5,
 4,
 9,
 3,
 12,
 6,
 12,
 18,
 12,
 16,
 14,
 11,
 12,
 14,
 17,
 8,
 16,
 11,
 9,
 11,
 9,
 11,
 4,
 8,
 12,
 11,
 14,
 13,
 0,
 0,
 3,
 11,
 16,
 1,
 4,
 16,
 9,
 8,
 16,
 19,
 19,
 8,
 0,
 1,
 19,
 16,
 12,
 14,
 13,

In [12]:
type(train_f)

list

In [13]:
len(train_f)

1087

In [14]:
train_dataset = AudioDataset(train_f, train_l)
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)

test_dataset = AudioDataset(test_f, test_l)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)


In [15]:
for idx, (inputs, labels) in enumerate(train_loader):
    print(inputs.shape)
    break

torch.Size([100, 1, 80, 157])


In [15]:
model = AudioCNN()
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [19]:
model = torch.load('CNN.pt')

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
#device

AudioCNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace=True)
  (pool1): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace=True)
  (pool2): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=49920, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU(inplace=True)
  (fc2): Linear(in_features=128, out_features=20, bias=True)
)

In [27]:
def train(epoch):
    min_loss = 999
    for i in range(epoch):
        for idx, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if idx % 10 == 0:
                print('epoch: {}, loss: {}'.format(i+1, loss.item()))
                #torch.save(model.state_dict(), './model/yesno_net.pkl')
                #torch.save(optimizer.state_dict(), './model/yesno_optimizer.pkl')
            if loss < min_loss:
                min_loss = loss
                print("save model")
                # 保存模型语句
                torch.save(model, 'CNN-50.pt') 
            idx = idx + 1

In [18]:
from sklearn.metrics import f1_score

def test(model):
    loss_list = []
    sample_num = 0
    acc_num = 0
    f1 = 0

    for idx, (inputs, labels) in enumerate(test_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs)

            #cur_loss = criterion(outputs, labels).cpu()
            # outputs中的每一项均为包含两个位于0和1之间的浮点数的数组，较大浮点数所在位置即为预测值
            pred = outputs.argmax(dim=1, keepdim=True).cpu()  # 将pred转移到CPU上
            labels = labels.cpu()  # 将labels转移到CPU上

            # 统计预测正确的个数
            acc_num = acc_num + pred.eq(labels.view_as(pred)).sum()
            # 记录预测的样本数
            #sample_num = sample_num + labels.size()[0]

            # 计算F1分数，注意我们需要将预测和标签转换为NumPy数组
            f1 += f1_score(labels.numpy(), pred.numpy(), average='micro')

            #loss_list.append(cur_loss)

    # 计算平均F1分数
    f1 = f1 / len(test_loader)

    print('平均准确率:{}'.format(acc_num /len(test_f)))
    print('F1 micro 分数为：',f1)


In [19]:
train(10)

epoch: 1, loss: 3.047656774520874
epoch: 1, loss: 2.409719467163086
epoch: 2, loss: 2.262383222579956
epoch: 2, loss: 1.8428598642349243
epoch: 3, loss: 1.8027180433273315
epoch: 3, loss: 1.5218979120254517
epoch: 4, loss: 1.4019790887832642
epoch: 4, loss: 1.1832630634307861
epoch: 5, loss: 0.9899139404296875
epoch: 5, loss: 0.8611960411071777
epoch: 6, loss: 0.651847243309021
epoch: 6, loss: 0.5434620976448059
epoch: 7, loss: 0.3716967701911926
epoch: 7, loss: 0.22023546695709229
epoch: 8, loss: 0.20557396113872528
epoch: 8, loss: 0.15621672570705414
epoch: 9, loss: 0.09665294736623764
epoch: 9, loss: 0.07904089242219925
epoch: 10, loss: 0.07197050005197525
epoch: 10, loss: 0.045750755816698074


In [28]:
train(40)

epoch: 1, loss: 0.05406859889626503
save model
save model
save model
save model
save model
epoch: 1, loss: 0.0598292201757431
epoch: 2, loss: 0.05651723966002464
save model
save model
epoch: 2, loss: 0.05248680338263512
epoch: 3, loss: 0.047463979572057724
epoch: 3, loss: 0.050165899097919464
epoch: 4, loss: 0.047072701156139374
epoch: 4, loss: 0.0549277700483799
epoch: 5, loss: 0.048995841294527054
epoch: 5, loss: 0.051910948008298874
epoch: 6, loss: 0.053177233785390854
save model
epoch: 6, loss: 0.0578417107462883
epoch: 7, loss: 0.04878412187099457
epoch: 7, loss: 0.0479990690946579
epoch: 8, loss: 0.05275847762823105
epoch: 8, loss: 0.055494632571935654
epoch: 9, loss: 0.05251884460449219
epoch: 9, loss: 0.0545039027929306
epoch: 10, loss: 0.053538255393505096
epoch: 10, loss: 0.05004219710826874
epoch: 11, loss: 0.05123279243707657
epoch: 11, loss: 0.04203002154827118
epoch: 12, loss: 0.04982112720608711
epoch: 12, loss: 0.05193329229950905
epoch: 13, loss: 0.04068469628691673
sa

In [20]:
torch.save(model, 'CNN.pt') 

In [29]:
test(model)

平均准确率:0.5551470518112183
F1 micro 分数为： 0.5551851851851851
