In [2]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torchvision
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import librosa
import glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchaudio
import math
import random


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/MyDrive/Colab Notebooks/sound classification

/content/drive/MyDrive/Colab Notebooks/sound classification


In [4]:
def read_noiseDir(noiseDir_path, index):
    # 列出文件夹中的所有文件
    files = os.listdir(noiseDir_path)

    #选择第index个文件
    file_to_read = files[index]
    noise_path = os.path.join(noiseDir_path, file_to_read)

    noise_audio = librosa.load(noise_path)

    return noise_audio



In [5]:
#想把train数据上的鸟叫声扩大一点
def add_audio(audio, mm):
    y, sr = audio
    # mm为原音频的放大倍数
    y2 = y * mm
    return y2, sr


#想加噪声
#def add_noise(audio, noise_path,out_path, SNR, sr=16000):
def add_noise(audio, SNR): #只传入原始音频和信噪比（SNR）
    #读取语音文件data和fs
    y, sr = audio
    random_values = np.random.rand(len(y))#生成高斯白噪声
    #计算语音信号功率Ps和噪声功率Pn1
    Ps = np.sum(y ** 2) / len(y)
    Pn1 = np.sum(random_values ** 2) / len(random_values)

    # 计算k值
    k=math.sqrt(Ps/(10**(SNR/10)*Pn1))
    #将噪声数据乘以k,
    random_values_we_need=random_values*k
    #计算新的噪声数据的功率
    Pn=np.sum(random_values_we_need**2)/len(random_values_we_need)
    #以下开始计算信噪比
    snr=10*math.log10(Ps/Pn)
    print("当前信噪比：",snr)

    """
    #单独将噪音数据写入文件
    sf.write(noise_path,random_values_we_need, sr)
    # 将叠加噪声的数据写入文件
    sf.write(out_path, outdata, sr)
    """
    #将噪声数据叠加到纯净音频上去
    out_y = y+random_values_we_need
    out_y = out_y.astype(np.float32)

    return out_y, sr


#加上风的噪声，假设时长已经相同
def add_ECS_noise(audio, noise, SNR):
    y, sr = audio
    y_noise, sr_noise = noise #不知道要不要统一noise和audio的采样率？

    print(noise)

    #计算语音信号功率Ps和噪声功率Pn1
    Ps = np.sum(y ** 2) / len(y)
    Pn1 = np.sum(y_noise ** 2) / len(y_noise)

     # 计算k值
    k=math.sqrt(Ps/(10**(SNR/10)*Pn1))
    #将噪声数据乘以k,
    y_noise_we_need = y_noise*k

    #将噪声数据叠加到纯净音频上去
    outdata = y+y_noise_we_need

    return outdata, sr


def time_shift(audio, shift_limit):
    sig,sr = audio
    sig_len = len(sig)
    #_, sig_len = sig.shape
    shift_amt = int(random.random() * shift_limit * sig_len)
    return (np.roll(sig, shift_amt), sr)






In [13]:
# 训练数据预处理
def preprocess_data(directory):
    features = []
    labels = []
    max_length = 5 * 22050  # 5 seconds
    i = 0 #用来遍历noise文件夹用
    for label_dir in glob.glob(os.path.join(directory, '*')):  # '*'是一个通配符，代表该目录下的所有文件
        # glob.glob(os.path.join(directory, '*'))就会返回你要遍历的目录下的所有文件和子目录的路径列表。
        # 提取标签
        label = os.path.basename(label_dir)
        # os.path.basename(label_dir) 是Python的os模块中的一个函数，用于返回路径最后的文件名
        # label_dir 是 'train/1'，那么 os.path.basename(label_dir) 就会返回 '1'
        for file in glob.glob(os.path.join(label_dir, '*.ogg')):
            # 加载音频文件
            y, sr = librosa.load(file)  # y 就是个向量，代表音频的长度。 sr是采样率，采样率是 22050 Hz
            # 如果音频文件太短，则填充到max_length
            if len(y) < max_length:
                y = np.pad(y, (0, max_length - len(y)))
            # 如果音频文件太长，则裁剪到max_length
            elif len(y) > max_length:
                y = y[:max_length]


            #遍历noise文件夹里的噪声加到当前音频
            i = (i+1)%19 #目前noise文件夹有20个文件
            noise_path = "noise"
            noise_audio = read_noiseDir(noise_path, i)
            y, sr = add_ECS_noise((y, sr), noise_audio, 18)


            #将原本的鸟叫声音增强
            #y, sr = add_audio((y, sr), mm=1.5)

            #加入高斯噪声
            #y, sr = add_noise(audio=(y, sr), SNR=25)

            #进行时移的数据增广
            #shift_aud = time_shift((y, sr), shift_limit=0.5)



            # #加入ECS里面的噪声
            # #先读几个噪声
            # rain_noise = librosa.load("1-101296-A-19.wav")
            # wind_noise = librosa.load("1-103999-A-30.wav")

            # y, sr = add_ECS_noise((y, sr), rain_noise, 15)
            # y, sr = add_ECS_noise((y, sr), wind_noise, 15)


            # 提取特征（例如梅尔谱图）
            # mfcc = librosa.feature.mfcc(y=y, sr=sr)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=80, n_fft=2048, hop_length=512)
            features.append(mfcc)
            labels.append(int(label))



    return np.array(features), np.array(labels)

In [14]:
# 封装训练验证数据
X_train, y_train = preprocess_data("data/train")
X_train = X_train.reshape(X_train.shape[0],1,X_train.shape[1],X_train.shape[2])#在第二维加上通道数，方便训练
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1)
train_data = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_data = torch.utils.data.TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
batch_size = 64

trainloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valloader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

(array([0.0094367 , 0.0073174 , 0.00290787, ..., 0.17852183, 0.07076039,
       0.04447392], dtype=float32), 22050)
(array([ 0.00531473,  0.00899529,  0.01308785, ..., -0.00741944,
       -0.00352798,  0.00518125], dtype=float32), 22050)
(array([ 0.00171803,  0.0075047 ,  0.00646336, ..., -0.00132402,
        0.00294633,  0.00442741], dtype=float32), 22050)
(array([0.03421553, 0.05269937, 0.05240169, ..., 0.        , 0.        ,
       0.        ], dtype=float32), 22050)
(array([0.14434524, 0.84565383, 0.8856815 , ..., 0.        , 0.        ,
       0.        ], dtype=float32), 22050)
(array([-3.1491788e-04, -5.3233549e-04,  2.4739944e-05, ...,
       -2.8975770e-02, -1.6098242e-02, -4.4580154e-02], dtype=float32), 22050)
(array([ 0.03304563,  0.01924987,  0.02399522, ..., -0.07866564,
       -0.03130605, -0.00190654], dtype=float32), 22050)
(array([-0.03270131, -0.05648448, -0.06062448, ...,  0.34354794,
        0.34124264,  0.3105651 ], dtype=float32), 22050)
(array([-0.00132208, -0.

In [15]:
print(X_train.shape)
print(y_train.shape)

(1223, 1, 80, 216)
(1223,)


In [16]:
import torchvision.models as models

# 导入预训练的ResNet模型
model = models.resnet18(pretrained=True)
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
conv1 = model.conv1
new_conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
new_conv1.weight.data = conv1.weight.data[:, :1, :, :]
model.conv1 = new_conv1

model.fc = nn.Linear(512,20,bias=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 76.0MB/s]


ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [25]:
from sklearn.metrics import f1_score
def test(model):
    loss_list = []
    #sample_num = 0
    acc_num = 0
    f1 = 0

    for idx, (inputs, labels) in enumerate(valloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(inputs)
            #logits = outputs.logits

            #cur_loss = criterion(outputs, labels).cpu()
            # outputs中的每一项均为包含两个位于0和1之间的浮点数的数组，较大浮点数所在位置即为预测值
            pred = outputs.argmax(dim=1, keepdim=True).cpu()  # 将pred转移到CPU上
            labels = labels.cpu()  # 将labels转移到CPU上

            # 统计预测正确的个数
            acc_num += pred.eq(labels.view_as(pred)).sum()
            #print(acc_num)
            # 记录预测的样本数
            #sample_num = sample_num + labels.size()[0]

            # 计算F1分数，注意我们需要将预测和标签转换为NumPy数组
            #f1 += f1_score(labels.numpy(), pred.numpy(), average='micro')

            #loss_list.append(cur_loss)

    # 计算平均F1分数
    #f1 = f1 / len(test_loader)

    print('平均准确率:{}'.format(acc_num / len(val_data)))
    #print('F1 micro 分数为：',f1)
    return acc_num / len(val_data)


In [29]:
def train(epoch):
    min_loss = 0.05
    max_acc = 0
    for i in range(epoch):
        for idx, (inputs, labels) in enumerate(trainloader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if idx % 10 == 0:
                print('epoch: {}, loss: {}'.format(i+1, loss.item()))

            if loss < min_loss:
                acc = test(model)
                if(acc>max_acc):
                    max_acc = acc
                    print("save model")
                    # 保存模型语句
                    torch.save(model, 'ResNet18.pt')
            idx = idx + 1
    print(max_acc)

In [30]:
train(50)

epoch: 1, loss: 0.0021562804467976093
平均准确率:0.5147058963775635
save model
平均准确率:0.5147058963775635
平均准确率:0.5147058963775635
平均准确率:0.5073529481887817
平均准确率:0.5
平均准确率:0.5
平均准确率:0.5
平均准确率:0.5
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
epoch: 1, loss: 0.0014204960316419601
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
epoch: 2, loss: 0.001238782424479723
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
平均准确率:0.4852941036224365
平均准确率:0.4852941036224365
平均准确率:0.4852941036224365
平均准确率:0.4852941036224365
平均准确率:0.4852941036224365
平均准确率:0.4852941036224365
平均准确率:0.4852941036224365
epoch: 2, loss: 0.0018101726891472936
平均准确率:0.4852941036224365
平均准确率:0.4852941036224365
平均准确率:0.4852941036224365
平均准确率:0.49264705181121826
平均准确率:0.49264705181121826
平均准确率: