# SVC

In [1]:
from pathlib import Path

import librosa
import numpy as np
from sklearn.svm import SVC

from env import DATA_PATH


def load_dataset(dir_path: Path) -> (np.ndarray, np.ndarray):
    data_files = list(dir_path.glob("*.wav"))
    features = []
    labels = []
    for file in data_files:
        label = file.stem.split("_")[0]

        x, sr = librosa.load(file, sr=None)
        mfccs = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=40)
        mfccs_scaled = np.mean(mfccs.T, axis=0)
        features.append(mfccs_scaled)
        labels.append(label)

    return np.array(features), np.array(labels)


def train(x, y):
    model = SVC(kernel="linear")
    model.fit(x, y)
    return model


def eval(model, x, y):
    acc = model.score(x, y)
    return acc


def test(model, dataset_path):
    # random one
    import random
    import librosa
    import numpy as np
    files = list(dataset_path.glob("*.wav"))
    a_sound = files[random.randint(0, len(files))]
    x, sr = librosa.load(a_sound, sr=None)
    mfccs = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=40)
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    print(a_sound.stem, model.predict([mfccs_scaled]))


def main_svc():
    dataset_path = DATA_PATH / "animal" / "all"
    x, y = load_dataset(dataset_path)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    model = train(x_train, y_train)
    acc = eval(model, x_test, y_test)
    print(f"Accuracy: {acc}")

    test(model, dataset_path)


main_svc()

Accuracy: 0.7
bird_11-2_1595274334120 ['bird']


# Transformer

## Model

In [2]:
# -*- coding:utf-8 -*-
# @FileName : model.py
# @Time : 2024/3/20 17:48
# @Author : fiv
import math
from typing import Callable
from typing import Union

import torch.nn as nn
from torch import Tensor
from torch.nn import functional as F, TransformerEncoderLayer, TransformerEncoder
from torch.nn.modules.normalization import LayerNorm


class Transformer(nn.Module):
    def __init__(self, n_out=4, wav_length=512, d_model=40, nhead: int = 8, num_encoder_layers: int = 6,
                 dim_feedforward: int = 2048, dropout: float = 0.1,
                 activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
                 layer_norm_eps: float = 1e-5, batch_first: bool = True, norm_first: bool = False,
                 bias: bool = True, device=None, dtype=None):
        super(Transformer, self).__init__()
        factory_kwargs = {'device': device, 'dtype': dtype}
        self.d_model = d_model
        encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout,
                                                activation, layer_norm_eps, batch_first, norm_first,
                                                bias, **factory_kwargs)
        encoder_norm = LayerNorm(d_model, eps=layer_norm_eps, bias=bias, **factory_kwargs)
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)

        # self.embedding = nn.Embedding(n_out, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.linear = nn.Linear(d_model * wav_length, n_out)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        # self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, x):
        x = self.pos_encoder(x)
        x = self.encoder(x)
        x = x.view(x.size(0), -1)
        x = self.linear(x)
        return x


class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

## 训练函数

In [3]:
# -*- coding:utf-8 -*-
# @FileName : train.py
# @Time : 2024/3/20 16:13
# @Author : fiv
from torch import nn
from tqdm import tqdm
from env import MODEL_PATH


#
def train(model, dataloader, total_run=10, output_path=MODEL_PATH / "animal.pth"):
    model = model.cuda()
    criterion = nn.CrossEntropyLoss()
    optim = torch.optim.Adam(model.parameters(), lr=0.001)
    model.train()
    pbar = tqdm(range(total_run))
    min_loss = 100
    for _ in pbar:
        loss = 0
        for x, y in dataloader:
            x = x.cuda()
            y = y.cuda()
            output = model(x)
            loss = criterion(output, y)
            optim.zero_grad()
            loss.backward()
            optim.step()
            loss += loss.item()
        loss = loss / len(dataloader)
        pbar.set_description(f"loss: {loss:.4f}")
        if loss < min_loss:
            min_loss = loss
            torch.save(model.state_dict(), output_path)
    print(f"Min loss: {min_loss}")

# 测试函数

In [4]:
# -*- coding:utf-8 -*-
# @FileName : eval.py
# @Time : 2024/3/21 16:31
# @Author : fiv


def eval(model, dataloader):
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for x, y in dataloader:
            x = x.cuda()
            y = y.cuda()
            output = model(x)
            _, predicted = torch.max(output, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
            # print(y, predicted)
    acc = correct / total
    print(f"Accuracy: {acc:.4f}")

In [None]:
## 测试集

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from script.preprocess import to_fbank


class AnimalDataset(Dataset):
    def __init__(self, dataset_dir=None):
        self.label2idx = {"bird": 0, "cat": 1, "dog": 2, "tiger": 3}
        self.idx2label = {v: k for k, v in self.label2idx.items()}
        if dataset_dir is None:
            from env import DATA_PATH
            self.dataset_dir = DATA_PATH / "animal" / "all"
        else:
            self.dataset_dir = dataset_dir
        self.file_path = list(self.dataset_dir.glob("*.wav"))

    def __len__(self):
        return len(self.file_path)

    def __getitem__(self, idx):
        fbank = to_fbank(self.file_path[idx])
        length = 512
        if fbank.shape[0] < length:
            # repeat fbank to fill length
            fbank = torch.cat([fbank] * (length // fbank.shape[0] + 1), dim=0)[:length]
        else:
            start = torch.randint(0, fbank.shape[0] - length, (1,))
            fbank = fbank[start:start + length]
        label = self.file_path[idx].stem.split("_")[0]
        return fbank, self.label2idx[label]

    def idx2label(self, idx):
        return self.idx2label[idx]


def get_animal_dataloader(dataset_dir=None, batch_size=8, shuffle=True):
    if dataset_dir is None:
        from env import DATA_PATH
        dataset_dir = DATA_PATH / "animal"

    train_dir = dataset_dir / "train"
    test_dir = dataset_dir / "test"

    train = AnimalDataset(train_dir)
    test = AnimalDataset(test_dir)
    train_loader = DataLoader(train, batch_size=batch_size, shuffle=shuffle)
    test_loader = DataLoader(test, batch_size=batch_size, shuffle=shuffle)
    return train_loader, test_loader

In [7]:
def run_animal():
    torch.cuda.empty_cache()
    train_dataloader, test_dataloader = get_animal_dataloader()
    output_path = MODEL_PATH / "animal.pth"
    if output_path.exists():
        output_path.unlink()
    total_run = 1
    model = Transformer(n_out=4, num_encoder_layers=1, dropout=0.2)
    model = model.cuda()
    train(model, train_dataloader, total_run, output_path)
    model.load_state_dict(torch.load(output_path))
    eval(model, test_dataloader)


run_animal()

loss: 0.2162: 100%|██████████| 1/1 [00:01<00:00,  1.07s/it]

Min loss: 0.2162065953016281
Accuracy: 0.6250





# 狗叫测试集 from https://github.com/suzuki256/dog-dataset

In [8]:
class DogDataset(Dataset):
    def __init__(self, file_path):
        self.label2idx = {"adult": 0, "dogs": 1, "puppy": 2}
        self.idx2label = {v: k for k, v in self.label2idx.items()}
        self.file_path = file_path

    def __len__(self):
        return len(self.file_path)

    def __getitem__(self, idx):
        fbank = to_fbank(self.file_path[idx])
        length = 512
        if fbank.shape[0] < length:
            # repeat fbank to fill length
            fbank = torch.cat([fbank] * (length // fbank.shape[0] + 1), dim=0)[:length]
        else:
            start = torch.randint(0, fbank.shape[0] - length, (1,))
            fbank = fbank[start:start + length]
        label = self.file_path[idx].stem.split("_")[0]
        return fbank, self.label2idx[label]

    def idx2label(self, idx):
        return self.idx2label[idx]


def get_dog_dataloader(batch_size=8, shuffle=True):
    from script.util import split_dataset
    from env import DATA_PATH
    dataset_dir = DATA_PATH / "dog"
    train_files, test_files = split_dataset(dataset_dir)
    train_dataset = DogDataset(train_files)
    test_dataset = DogDataset(test_files)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=shuffle)
    return train_loader, test_loader


def run_animal():
    torch.cuda.empty_cache()
    train_dataloader, test_dataloader = get_animal_dataloader()
    output_path = MODEL_PATH / "animal.pth"
    if output_path.exists():
        output_path.unlink()
    total_run = 1
    model = Transformer(n_out=4, num_encoder_layers=1, dropout=0.2)
    model = model.cuda()
    train(model, train_dataloader, total_run, output_path)
    model.load_state_dict(torch.load(output_path))
    eval(model, test_dataloader)


run_animal()

loss: 0.6940: 100%|██████████| 1/1 [00:00<00:00,  1.50it/s]

Min loss: 0.6939533352851868
Accuracy: 0.6250





In [None]:
# 附

In [10]:
# -*- coding:utf-8 -*-
# @FileName : split.py
# @Time : 2024/3/21 19:46
# @Author : fiv

from sklearn.model_selection import train_test_split
from pathlib import Path


def split_dataset(dataset_dir: Path):
    dataset_class = list(dataset_dir.glob("*"))
    # print(dataset_class)
    train_files, test_files = [], []
    for cla in dataset_class:
        if cla == "train" or cla == "test":
            continue
        files_path = list(cla.glob("*"))
        train, test = train_test_split(files_path, test_size=0.2)
        train_files.extend(train)
        test_files.extend(test)
    return train_files, test_files

# if __name__ == '__main__':
#     print(split_dataset(Path("../../data/dog")))

In [9]:
# -*- coding:utf-8 -*-
# @FileName : to_fbank.py
# @Time : 2024/3/20 14:14
# @Author : fiv

import torchaudio
from pathlib import Path

"""
Fbank：FilterBank：人耳对声音频谱的响应是非线性的，Fbank就是一种前端处理算法，
以类似于人耳的方式对音频进行处理，可以提高语音识别的性能。
获得语音信号的fbank特征的一般步骤是：预加重、分帧、加窗、短时傅里叶变换（STFT）、mel滤波、去均值等。
对fbank做离散余弦变换（DCT）即可获得mfcc特征。

MFCC(Mel-frequency cepstral coefficients):梅尔频率倒谱系数。
梅尔频率是基于人耳听觉特性提出来的， 它与Hz频率成非线性对应关系。
梅尔频率倒谱系数(MFCC)则是利用它们之间的这种关系，计算得到的Hz频谱特征。
主要用于语音数据特征提取和降低运算维度。例如：对于一帧有512维(采样点)数据，
经过MFCC后可以提取出最重要的40维(一般而言)数据同时也达到了降维的目的。
"""


def to_fbank(wav_path: Path):
    # from wav to fbank
    wav, sr = torchaudio.load(wav_path)
    fbank = torchaudio.compliance.kaldi.fbank(wav, num_mel_bins=40)
    # fbank = fbank.unsqueeze(0)
    return fbank

# from env import DATA_PATH
#
# print(to_fbank(DATA_PATH / "demo.wav").shape)
