In [28]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch import tensor

import torchaudio
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np

import librosa


In [29]:
# Custom dataset that turns audio files into log-mel spectrograms on demand.
class AudioDataset(Dataset):
    """Map-style dataset yielding ``(log_mel_spectrogram, label)`` pairs.

    Every item is produced lazily: the audio file is decoded with librosa,
    converted to a mel power spectrogram, rescaled to decibels relative to
    its own peak value, and given a leading axis of size one.

    Args:
        file_list: Sequence of audio file paths.
        label_list: Sequence of labels aligned index-by-index with
            ``file_list``.
        sr: Sample rate every file is resampled to when it is loaded.
        n_fft: FFT window size used to compute the spectrogram.
        hop_length: Number of samples between successive frames.
        n_mels: Number of mel bands in the spectrogram.

    Raises:
        ValueError: If ``file_list`` and ``label_list`` differ in length.
    """

    def __init__(self, file_list, label_list, sr=16000, n_fft=1024, hop_length=512, n_mels=80):
        # Fail early instead of hitting an IndexError (or silently ignoring
        # surplus labels) in the middle of an epoch.
        if len(file_list) != len(label_list):
            raise ValueError(
                'file_list and label_list must have the same length, got '
                '{} and {}'.format(len(file_list), len(label_list))
            )
        self.file_list = file_list
        self.label_list = label_list
        self.sr = sr
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load file ``idx`` and return ``(mel_spec_db, label)``."""
        file_path = self.file_list[idx]
        label = self.label_list[idx]

        # Decode the waveform at the configured sample rate.
        y, sr = librosa.load(file_path, sr=self.sr)
        # Mel-scaled power spectrogram of the waveform.
        # NOTE(review): the number of frames follows the clip length, so
        # batching with the default collate function presumably requires
        # clips of equal duration - confirm against the data.
        mel_spec = librosa.feature.melspectrogram(
            y=y, sr=sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels
        )
        # Power scale -> decibel scale, referenced to the clip's maximum.
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        # Prepend an axis so the 2-D spectrogram becomes a 3-D array.
        mel_spec_db = np.expand_dims(mel_spec_db, axis=0)

        return mel_spec_db, label


In [52]:
# Model hyperparameters. Batches arrive shaped [100, 1, 80, 157], so one
# flattened sample carries 80 * 157 features.
num_classes = 20
num_layers = 5
hidden_dim = 128
input_dim = 80 * 157

# Model definition
class LSTM(nn.Module):
    """Bidirectional multi-layer LSTM followed by a linear classification head.

    Args:
        input_dim: Number of features per time step fed to the LSTM.
        hidden_dim: Hidden-state size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of logits produced per sample.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_classes):
        super().__init__()
        # Kept on the instance so forward() does not depend on a global.
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        # Both directions are concatenated in the LSTM output, hence * 2.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)``.

        ``x`` is reshaped to ``(batch, -1, input_dim)``; everything after the
        batch axis is therefore cut into steps of ``input_dim`` features.
        When ``input_dim`` equals the flattened size of one sample, the
        resulting sequence has a single step.
        """
        # Use the size given to the constructor rather than a module-level
        # variable that may be missing or stale in a fresh kernel.
        x = x.reshape(x.size(0), -1, self.input_dim)
        # With no initial state supplied, nn.LSTM starts from zero hidden and
        # cell states on the input's device.
        out, _ = self.lstm(x)
        # Classify from the output at the last sequence step.
        return self.fc(out[:, -1, :])

In [53]:
# Root folders holding the training and test audio.
train_data_path = 'data/train'
test_data_path = 'data/test'

# Species names; a name's position in this list is its class index.
idx_to_label = "Red-billed Starling,Intermediate Egret,Blue-and-white Flycatcher,Pin-tailed Snipe,Eastern Marsh-Harrier,Manchurian Reed Warbler,Chinese Pond-Heron,Rock Bunting,Isabelline Shrike,Japanese Scops-Owl,Red-backed Shrike,Bronzed Drongo,Claudia's Leaf Warbler,Common Myna,Koklass Pheasant,Barred Warbler,Besra,Pallid Harrier,Tickell's Leaf Warbler,Gray-cheeked Warbler".split(',')
NUM_CLASSES = len(idx_to_label)

# NOTE(review): despite its name, this dictionary is keyed by the integer
# class index and holds the species name as value (index -> name).
label_to_idx = dict(enumerate(idx_to_label))

In [54]:
# Show the mapping: keys are the integer class indices, values the species names.
label_to_idx

{0: 'Red-billed Starling',
 1: 'Intermediate Egret',
 2: 'Blue-and-white Flycatcher',
 3: 'Pin-tailed Snipe',
 4: 'Eastern Marsh-Harrier',
 5: 'Manchurian Reed Warbler',
 6: 'Chinese Pond-Heron',
 7: 'Rock Bunting',
 8: 'Isabelline Shrike',
 9: 'Japanese Scops-Owl',
 10: 'Red-backed Shrike',
 11: 'Bronzed Drongo',
 12: "Claudia's Leaf Warbler",
 13: 'Common Myna',
 14: 'Koklass Pheasant',
 15: 'Barred Warbler',
 16: 'Besra',
 17: 'Pallid Harrier',
 18: "Tickell's Leaf Warbler",
 19: 'Gray-cheeked Warbler'}

In [33]:
# Collect every training file together with the label of its directory.
train_files = []
train_labels = []
for label in label_to_idx:
    # Each class is read from the sub-directory named after the iterated key.
    label_dir = f'{train_data_path}/{label}'
    # os.listdir returns entries in arbitrary order; sorting keeps the file
    # list - and any seeded split derived from it - identical across runs
    # and machines.
    for wav_file in tqdm(sorted(os.listdir(label_dir))):
        train_files.append(f'{label_dir}/{wav_file}')
        train_labels.append(label)

100%|███████████████████████████████████████| 99/99 [00:00<00:00, 241979.08it/s]
100%|███████████████████████████████████████| 99/99 [00:00<00:00, 439868.75it/s]
100%|██████████████████████████████████████████| 6/6 [00:00<00:00, 50840.05it/s]
100%|█████████████████████████████████████| 100/100 [00:00<00:00, 577727.82it/s]
100%|███████████████████████████████████████| 99/99 [00:00<00:00, 268935.30it/s]
100%|████████████████████████████████████████| 21/21 [00:00<00:00, 65633.67it/s]
100%|███████████████████████████████████████| 39/39 [00:00<00:00, 168985.39it/s]
100%|████████████████████████████████████████| 16/16 [00:00<00:00, 80082.18it/s]
100%|█████████████████████████████████████| 100/100 [00:00<00:00, 538421.57it/s]
100%|███████████████████████████████████████| 98/98 [00:00<00:00, 844028.32it/s]
100%|████████████████████████████████████████| 15/15 [00:00<00:00, 24595.22it/s]
100%|█████████████████████████████████████| 100/100 [00:00<00:00, 152188.10it/s]
100%|███████████████████████

In [34]:
# Total number of audio files gathered across all class directories.
len(train_files)

1359

In [35]:
# Confirm the container type of the collected file paths.
type(train_files)

list

In [36]:
# Should equal len(train_files): one label is appended for every file.
len(train_labels)

1359

In [37]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the collected files for evaluation.
#   random_state: an integer seed makes the shuffle repeatable; leaving it as
#                 None produces a different split on every run.
#   shuffle:      whether to shuffle the data before splitting (True by default).
train_f, test_f, train_l, test_l = train_test_split(
    train_files,
    train_labels,
    test_size=0.20,
    random_state=42,
    shuffle=True,
)


In [38]:
# Display the labels of the training split (integer class indices).
# NOTE(review): this prints the entire list; a slice such as train_l[:10]
# would keep the notebook output short.
train_l

[12,
 18,
 4,
 11,
 0,
 13,
 11,
 1,
 8,
 16,
 12,
 11,
 14,
 9,
 19,
 16,
 14,
 6,
 1,
 13,
 11,
 3,
 8,
 1,
 15,
 11,
 2,
 4,
 8,
 12,
 0,
 1,
 9,
 3,
 3,
 11,
 3,
 16,
 16,
 19,
 5,
 9,
 8,
 5,
 5,
 1,
 19,
 4,
 14,
 12,
 11,
 8,
 1,
 16,
 16,
 16,
 17,
 9,
 7,
 8,
 12,
 13,
 4,
 0,
 13,
 16,
 13,
 8,
 1,
 16,
 8,
 11,
 14,
 13,
 6,
 8,
 8,
 4,
 4,
 4,
 9,
 8,
 19,
 11,
 16,
 12,
 4,
 1,
 1,
 3,
 19,
 19,
 3,
 15,
 4,
 4,
 16,
 13,
 0,
 11,
 9,
 8,
 11,
 9,
 11,
 16,
 3,
 3,
 13,
 18,
 16,
 17,
 3,
 4,
 4,
 5,
 0,
 6,
 18,
 6,
 13,
 16,
 7,
 12,
 13,
 4,
 14,
 14,
 9,
 11,
 8,
 15,
 13,
 4,
 13,
 11,
 4,
 14,
 4,
 16,
 0,
 1,
 14,
 0,
 15,
 1,
 12,
 17,
 3,
 14,
 11,
 14,
 15,
 11,
 4,
 4,
 1,
 4,
 3,
 15,
 8,
 1,
 12,
 16,
 8,
 1,
 16,
 14,
 3,
 13,
 3,
 4,
 6,
 5,
 4,
 9,
 3,
 12,
 6,
 12,
 18,
 12,
 16,
 14,
 11,
 12,
 14,
 17,
 8,
 16,
 11,
 9,
 11,
 9,
 11,
 4,
 8,
 12,
 11,
 14,
 13,
 0,
 0,
 3,
 11,
 16,
 1,
 4,
 16,
 9,
 8,
 16,
 19,
 19,
 8,
 0,
 1,
 19,
 16,
 12,
 14,
 13,

In [39]:
# Check the container type of the training file split.
type(train_f)

list

In [40]:
# Number of files that ended up in the training split.
len(train_f)

1087

In [41]:
# Wrap both splits in datasets first, then expose them through loaders.
train_dataset = AudioDataset(train_f, train_l)
test_dataset = AudioDataset(test_f, test_l)

# Training batches are reshuffled; evaluation batches keep a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=100, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=100, shuffle=False)


In [42]:
# Peek at a single training batch to confirm the tensor layout, then stop.
for idx, (inputs, labels) in enumerate(train_loader):
    print(inputs.shape)
    break

torch.Size([100, 1, 80, 157])


In [55]:
# Seed PyTorch's global RNG before the network is built so that weight
# initialisation (and later torch-driven shuffling) is repeatable after a
# kernel restart.
torch.manual_seed(42)

# Network, loss and optimiser used by the training and evaluation cells.
model = LSTM(input_dim, hidden_dim, num_layers, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [56]:
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the parameters to that device; as the last expression of the cell this
# also displays the module.
model.to(device)

LSTM(
  (lstm): LSTM(12560, 128, num_layers=5, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=20, bias=True)
)

In [57]:
def train(epoch):
    """Train the module-level ``model`` on the batches of ``train_loader``.

    Relies on the module-level ``optimizer``, ``criterion`` and ``device``.
    The loss of every tenth batch of an epoch (starting with the first) is
    printed.

    Args:
        epoch: Number of full passes to make over ``train_loader``.
    """
    # Put the model in training mode explicitly, in case an earlier step left
    # it in evaluation mode.
    model.train()
    for i in range(epoch):
        for idx, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Standard step: clear old gradients, forward, backward, update.
            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if idx % 10 == 0:
                print('epoch: {}, loss: {}'.format(i+1, loss.item()))

In [58]:
from sklearn.metrics import f1_score

def test(model):
    loss_list = []
    sample_num = 0
    acc_num = 0
    f1 = 0

    for idx, (inputs, labels) in enumerate(test_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs)

            #cur_loss = criterion(outputs, labels).cpu()
            # outputs中的每一项均为包含两个位于0和1之间的浮点数的数组，较大浮点数所在位置即为预测值
            pred = outputs.argmax(dim=1, keepdim=True).cpu()  # 将pred转移到CPU上
            labels = labels.cpu()  # 将labels转移到CPU上

            # 统计预测正确的个数
            acc_num = acc_num + pred.eq(labels.view_as(pred)).sum()
            # 记录预测的样本数
            #sample_num = sample_num + labels.size()[0]

            # 计算F1分数，注意我们需要将预测和标签转换为NumPy数组
            f1 += f1_score(labels.numpy(), pred.numpy(), average='micro')

            #loss_list.append(cur_loss)

    # 计算平均F1分数
    f1 = f1 / len(test_loader)

    print('平均准确率:{}'.format(acc_num /len(test_loader)))
    print('F1 micro 分数为：',f1)


In [59]:
# Run 10 training epochs (the train loader above uses batch_size=100).
train(10)

epoch: 1, loss: 2.9976468086242676
epoch: 1, loss: 2.9598212242126465
epoch: 2, loss: 2.9358997344970703
epoch: 2, loss: 2.7830705642700195
epoch: 3, loss: 2.8587982654571533
epoch: 3, loss: 2.8541505336761475
epoch: 4, loss: 2.8182289600372314
epoch: 4, loss: 2.8701162338256836
epoch: 5, loss: 2.8159255981445312
epoch: 5, loss: 2.8412883281707764
epoch: 6, loss: 2.922105312347412
epoch: 6, loss: 2.821270227432251
epoch: 7, loss: 2.7436883449554443
epoch: 7, loss: 2.843710422515869
epoch: 8, loss: 2.7752902507781982
epoch: 8, loss: 2.8272457122802734
epoch: 9, loss: 2.768798828125
epoch: 9, loss: 2.858407974243164
epoch: 10, loss: 2.813053846359253
epoch: 10, loss: 2.8590445518493652


In [20]:
# Write the trained network to disk.
# NOTE(review): this serialises the whole module object rather than only its
# state_dict, so loading it later needs the same class definition available;
# the file name reads 'LTSM' - confirm that spelling is intended.
torch.save(obj=model, f='LTSM.pt')

In [60]:
# Evaluate the trained model on the held-out split and print its metrics.
test(model)

平均准确率:4.666666507720947
F1 micro 分数为： 0.050555555555555555
