## config

In [15]:
# speech-command (keyword spotting) classification config

# hyperparameter configuration
# yaml
class Hyperparameter:
    """Central configuration for the speech-command classifier.

    Every setting is a plain class attribute, grouped into data locations,
    model structure and experiment (training) settings. The shared instance
    ``HP`` created below is what the rest of the file reads.
    """

    # ################################################################
    #                             Data
    # ################################################################
    device = 'cuda'  # torch device string used for the model, features and targets
    data_root = './data/'
    # Folder holding the raw background-noise recordings (input of chop_bgn).
    origin_bgn_root = '../input/gru-speech-command-prepare/_background_noise_'
    # Folder the chopped background-noise clips are written to.
    bgn_root = './data/bgn'
    # JSON file holding the class-name <-> id mapping.
    cls_mapper_path = './data/cls_mapper.json'

    # Metadata files, one "cls_id|path" record per line.
    metadata_train_path = './data/train_speech_commands.txt'
    metadata_eval_path = './data/eval_speech_commands.txt'
    metadata_test_path = './data/test_speech_commands.txt'

    # The position of a name in this list is its class id.
    cls_name_list = ['bgn', 'down', 'go', 'left', 'off', 'on', 'right', 'stop']
    # Folders scanned for samples; the preprocessing step takes the last path
    # component of a sample's folder as its class name.
    cls_folder_name_list = [
        './data/bgn',
        '../input/gru-speech-command-prepare/down',
        '../input/gru-speech-command-prepare/go',
        '../input/gru-speech-command-prepare/left',
        '../input/gru-speech-command-prepare/off',
        '../input/gru-speech-command-prepare/on',
        '../input/gru-speech-command-prepare/right',
        '../input/gru-speech-command-prepare/stop',
    ]

    # Derived from the class list so the two can never drift apart.
    class_num = len(cls_name_list)
    mel_size = 40  # number of mel bins per frame; load_mel produces 40 by default
    seed = 1234  # random seed

    # ################################################################
    #                             Model Structure
    # ################################################################
    data_point_channel = mel_size  # GRU input size (one mel frame per time step)
    rnn_hidden_dim = 256
    rnn_layer_num = 2
    is_bidirection = True
    fc_drop = 0.3  # dropout probability inside the classifier head

    # ################################################################
    #                             Experiment
    # ################################################################
    batch_size = 8
    init_lr = 5e-4  # learning rate handed to the optimizer
    epochs = 5
    verbose_step = 50  # evaluate on the dev set every this many steps
    save_step = 500  # write a checkpoint every this many steps


HP = Hyperparameter()


## utils

In [16]:
import os
from PIL import Image
import torchaudio

def gen_cls_mapper(cls_name_list):
    """Build the two-way lookup between class names and integer ids.

    The id of a class is its position in ``cls_name_list``.

    Args:
        cls_name_list: ordered sequence of class names.

    Returns:
        A dict with two entries: ``'cls2id'`` maps name -> id and
        ``'id2cls'`` maps id -> name.
    """
    name_to_id = {label: index for index, label in enumerate(cls_name_list)}
    id_to_name = dict(enumerate(cls_name_list))
    return {'cls2id': name_to_id, 'id2cls': id_to_name}

# Collect every file below a folder whose extension is in `suffix`; returns a list of paths.
def recursive_fetching(root, suffix=('jpg', 'png')):
    """Recursively list the files under ``root`` that have a wanted extension.

    Args:
        root: folder to scan; sub-folders are visited depth-first.
        suffix: extension or iterable of extensions to keep. Matching is
            case-insensitive and a leading dot is ignored, so ``'wav'``,
            ``'WAV'`` and ``'.wav'`` are equivalent.

    Returns:
        List of matching file paths (each joined onto ``root``). Directory
        entries are visited in sorted order, so the result is deterministic
        across runs and file systems; callers that shuffle with a fixed seed
        depend on that.
    """
    # A bare string would otherwise be treated as a container of characters
    # and turn the membership test into a substring match.
    if isinstance(suffix, str):
        suffix = (suffix,)
    wanted = {ext.lower().lstrip('.') for ext in suffix}

    matches = []

    def _walk(folder):
        # os.listdir returns entries in arbitrary order; sort for reproducibility.
        for entry in sorted(os.listdir(folder)):
            full_path = os.path.join(folder, entry)
            if os.path.isdir(full_path):
                _walk(full_path)
            elif os.path.isfile(full_path):
                # Extension = text after the last '.' of the file name
                # (the whole name when it contains no dot).
                if entry.split('.')[-1].lower() in wanted:
                    matches.append(full_path)

    _walk(root)

    return matches


def load_meta(meta_path):
    """Read a metadata file of ``cls_id|path`` records.

    Args:
        meta_path: path of the metadata text file, one record per line.

    Returns:
        A list with one ``[cls_id, path]`` list of strings per non-blank
        line. Only the first ``'|'`` separates the fields, so a path that
        itself contains ``'|'`` stays intact. Blank lines are skipped
        instead of producing an unusable ``['']`` record.
    """
    records = []
    with open(meta_path, 'r') as fr:
        for line in fr:
            line = line.strip()
            if not line:
                continue
            records.append(line.split('|', 1))
    return records


def load_mel(audio_path, n_mels=40):
    """Load an audio file and return its mel spectrogram.

    Args:
        audio_path: path of the audio file, read with ``torchaudio.load``.
        n_mels: number of mel bins; the default of 40 keeps the previous
            hard-coded behaviour.

    Returns:
        The spectrogram with the leading channel axis squeezed away
        (``[n_mels, frames]`` for the single remaining channel).
    """
    wave, sampling_rate = torchaudio.load(audio_path)
    # A multi-channel file would survive squeeze(0) as a 3-D tensor and break
    # the batching code downstream, so average the channels down to one.
    # Single-channel input takes the original path untouched.
    if wave.dim() > 1 and wave.size(0) > 1:
        wave = wave.mean(dim=0, keepdim=True)
    mel = torchaudio.transforms.MelSpectrogram(sample_rate=sampling_rate, n_mels=n_mels)(wave).squeeze(0)
    return mel


## preprocess

In [None]:
import os
import random
import json
from scipy.io import wavfile as wf

# Seed Python's RNG so the shuffles and the dataset split below are repeatable.
random.seed(HP.seed)

# Working folders used by preprocessing and training. makedirs with
# exist_ok=True replaces the check-then-create pair and also creates missing
# parent folders.
for foldername in ['data', 'log', 'model_save', 'data/bgn']:
    os.makedirs(foldername, exist_ok=True)

def chop_bgn(segment_ms=1000, hop_ms=200):
    '''
    Cut every raw background-noise recording under HP.origin_bgn_root into
    clips of `segment_ms` milliseconds, taken every `hop_ms` milliseconds,
    and write them to HP.bgn_root as "seg-<index>-<original file name>".

    Only full-length windows are written: the previous count,
    round(data_len / hop), also emitted trailing clips that ran past the end
    of the recording and were therefore shorter than the documented length.
    A recording shorter than one window yields no clip.

    Args:
        segment_ms: clip length in milliseconds (default 1000).
        hop_ms: step between clip starts in milliseconds (default 200).

    Raises:
        ValueError: if `segment_ms` or `hop_ms` is not positive.
    '''
    if segment_ms <= 0 or hop_ms <= 0:
        raise ValueError("segment_ms and hop_ms must be positive")

    wav_files = recursive_fetching(HP.origin_bgn_root, suffix=['wav', 'WAV'])

    for wav_file in wav_files:
        file_name = os.path.split(wav_file)[-1]
        sampling_rate, data = wf.read(wav_file)
        data_len = data.shape[0]
        # Convert milliseconds to sample counts; keep the hop at least one
        # sample so the window always advances.
        hop_len = max(1, sampling_rate * hop_ms // 1000)
        segment_len = sampling_rate * segment_ms // 1000
        if segment_len <= 0 or data_len < segment_len:
            continue

        # Number of windows that fit entirely inside the recording.
        count = (data_len - segment_len) // hop_len + 1

        for i in range(count):
            start = i * hop_len
            segment = data[start:start + segment_len]
            output_file_name = "seg-%04d-%s" % (i, file_name)
            wf.write(os.path.join(HP.bgn_root, output_file_name), sampling_rate, segment)

chop_bgn()

# Build the class-name <-> id mapping and store it as JSON. The with-block
# makes sure the file is flushed and closed.
cls_mapper = gen_cls_mapper(HP.cls_name_list)
with open(HP.cls_mapper_path, 'w') as fw:
    json.dump(cls_mapper, fw)


def _cls_id_of(path):
    """Return the class id of a sample; its class name is the name of its parent folder."""
    return cls_mapper['cls2id'][os.path.basename(os.path.dirname(path))]


def _write_metadata(meta_path, paths):
    """Write one "cls_id|path" line per sample to `meta_path`."""
    with open(meta_path, 'w') as fw:
        for path in paths:
            fw.write('%d|%s\n' % (_cls_id_of(path), path))


# Gather the wav files of every class folder into one list.
dataset = []
for cls_folder_name in HP.cls_folder_name_list:
    dataset.extend(recursive_fetching(cls_folder_name, suffix=['wav']))
dataset_num = len(dataset)
print("Number of total items is", dataset_num)
random.shuffle(dataset)

# Group the samples by class id so each class is split separately.
dataset_dict = {}
for it in dataset:
    dataset_dict.setdefault(_cls_id_of(it), []).append(it)

# Split every class into train / eval / test; the test split takes whatever
# remains after the train and eval shares.
train_ratio, eval_ratio, test_ratio = 0.8, 0.1, 0.1
train_set, eval_set, test_set = [], [], []
for set_list in dataset_dict.values():
    length = len(set_list)
    train_num, eval_num = int(length * train_ratio), int(length * eval_ratio)
    random.shuffle(set_list)
    train_set.extend(set_list[:train_num])
    eval_set.extend(set_list[train_num:train_num + eval_num])
    test_set.extend(set_list[train_num + eval_num:])

# Shuffle once more so the classes are interleaved inside each split.
random.shuffle(train_set)
random.shuffle(eval_set)
random.shuffle(test_set)

print('num of trainset : %d' % (len(train_set)))
print('num of evalset : %d' % (len(eval_set)))
print('num of testset : %d' % (len(test_set)))

_write_metadata(HP.metadata_train_path, train_set)
_write_metadata(HP.metadata_eval_path, eval_set)
_write_metadata(HP.metadata_test_path, test_set)

## dataset_kws

In [17]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

class KWSDataset(torch.utils.data.Dataset):
    """Dataset of speech-command samples described by a metadata file.

    Each metadata record provides a class id and an audio path; the mel
    features are computed when the item is requested.
    """

    def __init__(self, metadata_path):
        # One list of '|'-separated fields per metadata line.
        self.dataset = load_meta(metadata_path)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(mel, cls_id)`` for sample ``idx``, with the mel on HP.device."""
        record = self.dataset[idx]
        label = int(record[0])
        # Sequence length differs per file; per the original note the layout
        # is [data_point_dim, sequence_len] = [40, ?]. Padding is done later
        # when the batch is assembled.
        features = load_mel(record[1])
        return features.to(HP.device), label


# batch : 8
def collate_fn(batch):
    """Assemble a list of ``(mel, cls_id)`` samples into one padded batch.

    Samples are ordered from longest to shortest sequence, each mel is turned
    from ``[datapoint_dim, L]`` into ``[L, datapoint_dim]`` and the sequences
    are zero-padded to the longest one.

    Returns:
        ``(mel_padded, mel_lengths, labels)``: the padded features as
        ``[batch, sequence, datapoint_dim]``, the unpadded lengths and the
        class ids, the latter two as long tensors in the same sorted order.
    """
    # Longest first; a stable sort on the negated length keeps ties in their
    # incoming order.
    ordered = sorted(batch, key=lambda sample: -sample[0].size(1))

    sequences = []
    targets = []
    for features, cls_id in ordered:
        sequences.append(features.transpose(0, 1))
        targets.append(cls_id)

    mel_padded = pad_sequence(sequences, batch_first=True)
    labels = torch.tensor(targets, dtype=torch.long)
    mel_lengths = torch.tensor([seq.size(0) for seq in sequences], dtype=torch.long)
    return mel_padded, mel_lengths, labels


## model

In [18]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class SpeechCommandModel(nn.Module):
    """GRU encoder followed by a fully connected classification head.

    The GRU reads a padded batch of mel sequences; its final hidden state
    (both directions when bidirectional) feeds a three-layer MLP that outputs
    one score per class.
    """

    def __init__(self):
        super(SpeechCommandModel, self).__init__()

        self.rnn = nn.GRU(
            input_size=HP.data_point_channel,
            hidden_size=HP.rnn_hidden_dim,
            num_layers=HP.rnn_layer_num,
            bidirectional=HP.is_bidirection
        )

        # A bidirectional GRU contributes one hidden vector per direction.
        fc_in_dim = 2 * HP.rnn_hidden_dim if HP.is_bidirection else HP.rnn_hidden_dim

        self.fc = nn.Sequential(
            nn.Linear(fc_in_dim, 1024),
            nn.Mish(),
            nn.Dropout(HP.fc_drop),
            nn.Linear(1024, 512),
            nn.Mish(),
            nn.Dropout(HP.fc_drop),
            nn.Linear(512, HP.class_num)
        )

    def forward(self, mel_input, mel_lengths):
        """Classify a padded batch of mel sequences.

        Args:
            mel_input: padded features, batch first
                ([batch, sequence, datapoint_dim] as produced by collate_fn).
            mel_lengths: unpadded length of every sequence, in descending
                order (pack_padded_sequence is called with its default
                enforce_sorted=True).

        Returns:
            Class scores of shape [batch, HP.class_num].
        """
        # The GRU is not batch_first, so move time to the front.
        mel_input = mel_input.permute(1, 0, 2)
        mel_packed = pack_padded_sequence(mel_input, mel_lengths)
        _, hn = self.rnn(mel_packed)

        # Use the final hidden state instead of indexing the padded output:
        # after pad_packed_sequence, output[-1] is zero padding for every
        # sequence shorter than the longest one in the batch, so it is not
        # the last real forward step. With packed input, h_n holds the state
        # at each sequence's own last valid step. h_n is laid out as
        # [num_layers * num_directions, batch, hidden], so the last layer's
        # forward state is hn[-2] and its backward state is hn[-1].
        #
        # Branching inside forward can cause trouble once the model is
        # exported or traced; it is kept here for teaching purposes. The flag
        # is read from the GRU itself so it always matches the built module.
        if self.rnn.bidirectional:
            fc_in = torch.cat((hn[-2], hn[-1]), dim=-1)
        else:
            fc_in = hn[-1]

        return self.fc(fc_in)


## trainer

In [None]:
import os.path
import random
import torch
import numpy as np
from tensorboardX import SummaryWriter
from torch import nn
from torch import optim
from torch.utils.data import DataLoader

# TensorBoard writer shared by the training loop; events go to ./log.
logger = SummaryWriter('./log')

# Seed every random number generator in use so runs are reproducible.
for seed_fn in (torch.manual_seed, random.seed, np.random.seed, torch.cuda.manual_seed):
    seed_fn(HP.seed)


def evaluate(model, devloader, crit):
    """Return the mean per-batch loss of ``model`` over ``devloader``.

    The model is put in eval mode and gradients are disabled for the pass.
    Afterwards the model is returned to whichever mode it was in on entry,
    rather than being forced into training mode.

    Args:
        model: module called as ``model(x, x_lens)``.
        devloader: iterable of ``(x, x_lens, y)`` batches that supports ``len``.
        crit: loss function called as ``crit(pred, y)``.

    Returns:
        The average of the batch losses, or NaN when the loader has no
        batches (previously a ZeroDivisionError).
    """
    was_training = model.training
    model.eval()
    sum_loss = 0.
    batch_count = 0
    with torch.no_grad():
        for batch in devloader:
            x, x_lens, y = batch
            pred = model(x, x_lens)
            loss = crit(pred, y.to(HP.device))
            sum_loss += loss.item()
            batch_count += 1

    model.train(was_training)
    if batch_count == 0:
        return float('nan')
    return sum_loss / batch_count


def save_checkpoint(model, epoch, opt, save_path):
    """Write a training checkpoint to ``save_path``.

    The checkpoint is a dict with the epoch number plus the state dicts of
    the model and of the optimizer, serialized with ``torch.save``.
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=opt.state_dict(),
    )
    torch.save(checkpoint, save_path)


def train():
    """Train the speech-command model and save the result.

    Builds the model, the loss, the Adam optimizer and the train/dev data
    loaders from ``HP``, then runs ``HP.epochs`` epochs. Every step the
    training loss is logged; every ``HP.verbose_step`` steps the dev loss is
    computed and logged; every ``HP.save_step`` steps a checkpoint is written
    to ``model_save``. At the end the whole model is saved to
    ``kws_model.dm`` and the logger is closed.
    """
    model = SpeechCommandModel().to(HP.device)

    criterion = nn.CrossEntropyLoss()

    opt = optim.Adam(model.parameters(), lr=HP.init_lr)

    trainset = KWSDataset(HP.metadata_train_path)
    train_loader = DataLoader(trainset, batch_size=HP.batch_size, shuffle=True, drop_last=True, collate_fn=collate_fn)

    devset = KWSDataset(HP.metadata_eval_path)
    dev_loader = DataLoader(devset, batch_size=HP.batch_size, shuffle=True, drop_last=False, collate_fn=collate_fn)

    start_epoch, step = 0, 0
    # Bound up front so the progress print never depends on the dev
    # evaluation having run on the very first step.
    eval_loss = float('nan')

    model.train()

    for epoch in range(start_epoch, HP.epochs):
        print('Start Epoch: %d, Steps: %d' % (epoch, len(train_loader)))
        for batch in train_loader:
            x, x_len, y = batch  # features arrive on HP.device from the dataset
            opt.zero_grad()  # reset accumulated gradients
            pred = model(x, x_len)
            loss = criterion(pred, y.to(HP.device))

            loss.backward()
            opt.step()

            # Read the scalar once and log a plain float rather than the
            # graph-attached tensor.
            train_loss = loss.item()
            logger.add_scalar('Loss/Train', train_loss, step)

            if not step % HP.verbose_step:
                eval_loss = evaluate(model, dev_loader, criterion)
                logger.add_scalar('Loss/Dev', eval_loss, step)

            if not step % HP.save_step:
                model_path = 'model_%d_%d.model' % (epoch, step)
                save_checkpoint(model, epoch, opt, os.path.join('model_save', model_path))

            step += 1
            logger.flush()
            # Dev Loss shows the most recent dev evaluation.
            print('Epoch:[%d/%d], step:%d, Train Loss:%.5f, Dev Loss:%.5f' % (
                epoch, HP.epochs, step, train_loss, eval_loss))

    torch.save(model, "kws_model.dm")

    logger.close()


## 训练

In [None]:
# Run the full training loop defined by train() above.
train()