In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch import tensor

import torchaudio
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np

import librosa


In [2]:
# 定义一个自定义数据集类
class AudioDataset(Dataset):
    def __init__(self, file_list, label_list):
        self.file_list = file_list
        self.label_list = label_list

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = self.file_list[idx]
        label = self.label_list[idx]

        # 使用 Librosa 加载音频文件，并将其转换为梅尔频谱图
        y, sr = librosa.load(file_path, sr=16000)
        mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=512, n_mels=80)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec_db = np.expand_dims(mel_spec_db, axis=0)

        # 返回梅尔频谱图和标签
        return mel_spec_db, label


In [3]:
#残差模型
import torch
import torch.nn as nn
import torch.nn.functional as F
 
#定义残差块ResBlock
class ResBlock(nn.Module):
    def __init__(self, inchannel, outchannel, stride=1):
        super(ResBlock, self).__init__()
        #残差块内连续的2个卷积层
        self.left = nn.Sequential(
            nn.Conv2d(inchannel, outchannel, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(outchannel),
            nn.ReLU(inplace=True),
            nn.Conv2d(outchannel, outchannel, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(outchannel)
        )
        self.shortcut = nn.Sequential()
        if stride != 1 or inchannel != outchannel:
            #shortcut，这里为了跟2个卷积层的结果结构一致，要做处理
            self.shortcut = nn.Sequential(
                nn.Conv2d(inchannel, outchannel, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(outchannel)
            )
            
    def forward(self, x):
        out = self.left(x)
        #将2个卷积层的输出跟处理过的x相加，实现ResNet的基本结构
        out = out + self.shortcut(x)
        out = F.relu(out)
        
        return out
 
#实现ResNet-18模型
class ResNet(nn.Module):
    def __init__(self, ResBlock, num_classes=20):
        super(ResNet, self).__init__()
        self.inchannel = 64
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.layer1 = self.make_layer(ResBlock, 64, 2, stride=1)
        self.layer2 = self.make_layer(ResBlock, 128, 2, stride=2)
        self.layer3 = self.make_layer(ResBlock, 256, 2, stride=2)        
        self.layer4 = self.make_layer(ResBlock, 512, 2, stride=2)        
        self.fc = nn.Linear(5120, num_classes)
    #这个函数主要是用来，重复同一个残差块    
    def make_layer(self, block, channels, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.inchannel, channels, stride))
            self.inchannel = channels
        return nn.Sequential(*layers)
    
    def forward(self, x):
        #在这里，整个ResNet18的结构就很清晰了
        out = self.conv1(x)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.fc(out)
        return out

In [3]:
idx_to_label = "Red-billed Starling,Intermediate Egret,Blue-and-white Flycatcher,Pin-tailed Snipe,Eastern Marsh-Harrier,Manchurian Reed Warbler,Chinese Pond-Heron,Rock Bunting,Isabelline Shrike,Japanese Scops-Owl,Red-backed Shrike,Bronzed Drongo,Claudia's Leaf Warbler,Common Myna,Koklass Pheasant,Barred Warbler,Besra,Pallid Harrier,Tickell's Leaf Warbler,Gray-cheeked Warbler".split(',')

NUM_CLASSES = len(idx_to_label)

label_to_idx = {idx_to_label[i]: i for i in range(NUM_CLASSES)}

train_data_path = 'data/train'
test_data_path = 'data/test'

label_to_idx = {value: key for key, value in label_to_idx.items()}

In [4]:
train_files=[]
train_labels =[]
for label in label_to_idx:
        label_dir = f'{train_data_path}/{label}'
        for wav_file in tqdm(os.listdir(label_dir)):
            train_files.append(label_dir + f'/{wav_file}')
            train_labels.append(label)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 99/99 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 99/99 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 100054.96it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 99/99 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<?, ?it/s]
100%|███████████████████████████████████

In [6]:
len(train_files)

1359

In [7]:
len(train_labels)

1359

In [5]:
from sklearn.model_selection import train_test_split
'''
(1)random_state不填或者为0时，每次都不同；其余值表示不同随机数
(2)shuffle表示是否在分割之前对数据进行洗牌（默认True）
'''
train_f, test_f, train_l, test_l = train_test_split(train_files, train_labels, test_size=0.20,random_state=42,shuffle=True)


In [21]:
len(test_f)

272

In [6]:
train_dataset = AudioDataset(train_f, train_l)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

test_dataset = AudioDataset(test_f, test_l)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [10]:
#model = ResNet()
model = ResNet(ResBlock)
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [12]:
#导入模型（要导入必须得再定义一下模型（class ResNet））
model = torch.load('models/ResNet18-50.pt')
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

ResNet(
  (conv1): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (layer1): Sequential(
    (0): ResBlock(
      (left): Sequential(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (shortcut): Sequential()
    )
    (1): ResBlock(
      (left): Sequential(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)

In [14]:
def train(epoch):
    min_loss = 0.01
    for i in range(epoch):
        for idx, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if idx % 10 == 0:
                print('epoch: {}, loss: {}'.format(i+1, loss.item()))
                #torch.save(model.state_dict(), './model/yesno_net.pkl')
                #torch.save(optimizer.state_dict(), './model/yesno_optimizer.pkl')
            if loss < min_loss:
                min_loss = loss
                print("save model")
                # 保存模型语句
                torch.save(model, 'ResNet18-100.pt') 
            idx = idx + 1
    print(min_loss)

In [11]:
from sklearn.metrics import f1_score

def test(model):
    loss_list = []
    #sample_num = 0
    acc_num = 0
    f1 = 0

    for idx, (inputs, labels) in enumerate(test_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs)

            #cur_loss = criterion(outputs, labels).cpu()
            # outputs中的每一项均为包含两个位于0和1之间的浮点数的数组，较大浮点数所在位置即为预测值
            pred = outputs.argmax(dim=1, keepdim=True).cpu()  # 将pred转移到CPU上
            labels = labels.cpu()  # 将labels转移到CPU上

            # 统计预测正确的个数
            acc_num += pred.eq(labels.view_as(pred)).sum()
            #print(acc_num)
            # 记录预测的样本数
            #sample_num = sample_num + labels.size()[0]

            # 计算F1分数，注意我们需要将预测和标签转换为NumPy数组
            #f1 += f1_score(labels.numpy(), pred.numpy(), average='micro')

            #loss_list.append(cur_loss)

    # 计算平均F1分数
    #f1 = f1 / len(test_loader)

    print('平均准确率:{}'.format(acc_num / len(test_f)))
    #print('F1 micro 分数为：',f1)
    return acc_num / len(test_f)


In [16]:
train(50)

epoch: 1, loss: 0.0010396133875474334
save model
epoch: 1, loss: 0.4945110082626343
epoch: 2, loss: 0.309893399477005
epoch: 2, loss: 0.25253257155418396
epoch: 3, loss: 0.06578624248504639
epoch: 3, loss: 0.4016568660736084
epoch: 4, loss: 0.18287107348442078
epoch: 4, loss: 0.07961609959602356
epoch: 5, loss: 0.05695413053035736
epoch: 5, loss: 0.03353823721408844
epoch: 6, loss: 0.040758974850177765
epoch: 6, loss: 0.02274332568049431
epoch: 7, loss: 0.011924360878765583
epoch: 7, loss: 0.020801041275262833
epoch: 8, loss: 0.016204487532377243
epoch: 8, loss: 0.05376236140727997
epoch: 9, loss: 0.012404079549014568
epoch: 9, loss: 0.027407461777329445
epoch: 10, loss: 0.011023937724530697
epoch: 10, loss: 0.08822924643754959
epoch: 11, loss: 0.06331238150596619
epoch: 11, loss: 0.017375413328409195
epoch: 12, loss: 0.012449712492525578
epoch: 12, loss: 0.050837546586990356
epoch: 13, loss: 0.06738303601741791
epoch: 13, loss: 0.05519557744264603
epoch: 14, loss: 0.03657029569149017


KeyboardInterrupt: 

In [23]:
test(model)

tensor(23)
tensor(39)
tensor(63)
tensor(82)
tensor(99)
tensor(125)
tensor(146)
tensor(164)
tensor(176)
平均准确率:0.6470588445663452
F1 micro 分数为： 0.6527777777777778


In [17]:
test(model)

tensor(23)
tensor(39)
tensor(60)
tensor(80)
tensor(103)
tensor(129)
tensor(152)
tensor(173)
tensor(185)
平均准确率:0.6801470518112183
F1 micro 分数为： 0.6840277777777778


In [7]:
import torchvision.models as models

# 导入预训练的ResNet模型
model = models.resnet18(pretrained=True)
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
conv1 = model.conv1
new_conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
new_conv1.weight.data = conv1.weight.data[:, :1, :, :]
model.conv1 = new_conv1

model.fc = nn.Linear(512,20,bias=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [13]:
def train(epoch):
    min_loss = 0.05
    max_acc = 0
    for i in range(epoch):
        for idx, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if idx % 10 == 0:
                print('epoch: {}, loss: {}'.format(i+1, loss.item()))
                #torch.save(model.state_dict(), './model/yesno_net.pkl')
                #torch.save(optimizer.state_dict(), './model/yesno_optimizer.pkl')
            if loss < min_loss:
                acc = test(model)
                if(acc>max_acc):
                    max_acc = acc
                    print("save model")
                    # 保存模型语句
                    torch.save(model, 'ResNet18.pt') 
            idx = idx + 1
    print(max_acc)

In [12]:
train(10)

epoch: 1, loss: 0.8279810547828674
epoch: 1, loss: 0.822098970413208
epoch: 2, loss: 0.4800509512424469
epoch: 2, loss: 0.28710657358169556
epoch: 3, loss: 0.11849711835384369
epoch: 3, loss: 0.35848909616470337
epoch: 4, loss: 0.301026314496994
epoch: 4, loss: 0.11221900582313538
epoch: 5, loss: 0.054176609963178635
epoch: 5, loss: 0.0823306143283844
epoch: 6, loss: 0.05871972814202309
epoch: 6, loss: 0.04745430126786232
epoch: 7, loss: 0.013080925680696964
epoch: 7, loss: 0.03298962488770485
epoch: 8, loss: 0.012336009182035923
epoch: 8, loss: 0.0439365990459919
tensor(27)
tensor(51)
tensor(75)
tensor(104)
tensor(131)
tensor(159)
tensor(186)
tensor(203)
tensor(217)
平均准确率:0.7977941036224365
F1 micro 分数为： 0.8020833333333334
save model
epoch: 9, loss: 0.08921092748641968
epoch: 9, loss: 0.03342179208993912
epoch: 10, loss: 0.06828559190034866
epoch: 10, loss: 0.09834416955709457
tensor(0.7978)


In [14]:
train(20)

epoch: 1, loss: 0.0265464149415493
tensor(26)
tensor(54)
tensor(80)
tensor(109)
tensor(136)
tensor(160)
tensor(185)
tensor(208)
tensor(223)
平均准确率:0.8198529481887817
F1 micro 分数为： 0.8263888888888888
save model
tensor(24)
tensor(52)
tensor(78)
tensor(106)
tensor(132)
tensor(156)
tensor(180)
tensor(203)
tensor(218)
平均准确率:0.8014705777168274
F1 micro 分数为： 0.8090277777777778
tensor(24)
tensor(52)
tensor(78)
tensor(106)
tensor(132)
tensor(156)
tensor(181)
tensor(204)
tensor(219)
平均准确率:0.8051470518112183
F1 micro 分数为： 0.8125
tensor(24)
tensor(52)
tensor(79)
tensor(107)
tensor(133)
tensor(157)
tensor(182)
tensor(204)
tensor(218)
平均准确率:0.8014705777168274
F1 micro 分数为： 0.8055555555555556
tensor(24)
tensor(53)
tensor(81)
tensor(107)
tensor(134)
tensor(158)
tensor(186)
tensor(208)
tensor(223)
平均准确率:0.8198529481887817
F1 micro 分数为： 0.8263888888888888
tensor(26)
tensor(55)
tensor(80)
tensor(107)
tensor(135)
tensor(159)
tensor(185)
tensor(207)
tensor(223)
平均准确率:0.8198529481887817
F1 micro 分数为： 0.82986

KeyboardInterrupt: 