In [95]:
import torchaudio
import matplotlib.pyplot as plt
import IPython.display as ipd
import os
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import datetime
from pathlib import Path

from sklearn.model_selection import train_test_split
from torchinfo import summary

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

%pwd

'/home/cuilab/AAI2022Fall-Project/tutorial'

In [96]:
SAMPLE_RATE = 16000
SEED = 23333

if torch.cuda.is_available():
    GPU_ID = 0
    DEVICE = torch.device(f"cuda:{GPU_ID}")
else:
    DEVICE = torch.device("cpu")

In [97]:
def load_audio_files(path):

    dataset = []
    walker = sorted(str(p) for p in Path(path).glob(f"*.flac"))

    for i, file_path in enumerate(walker):
        path, filename = os.path.split(file_path)
        waveform, sample_rate = torchaudio.load(file_path)
        dataset.append(waveform)

    return dataset

In [98]:
data_spk1 = load_audio_files("../LibriSpeech-SI/train/spk001/")
data_spk2 = load_audio_files("../LibriSpeech-SI/train/spk002/")

print("Len of spk1 dataset:", len(data_spk1))
print("Len of spk2 dataset:", len(data_spk2))

Len of spk1 dataset: 100
Len of spk2 dataset: 102


In [99]:
def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1):
    """
    equivalent of tf.signal.frame
    """
    signal_length = signal.shape[axis]
    if pad_end:
        frames_overlap = frame_length - frame_step
        rest_samples = np.abs(signal_length - frames_overlap) % np.abs(
            frame_length - frames_overlap
        )
        pad_size = int(frame_length - rest_samples)
        if pad_size != 0:
            pad_axis = [0] * signal.ndim
            pad_axis[axis] = pad_size
            signal = nn.functional.pad(signal, pad_axis, "constant", pad_value)
    frames = signal.unfold(axis, frame_length, frame_step)
    return frames


frame_time = 0.025  # ms
offset_time = 0.01  # ms

window_size = int(SAMPLE_RATE * frame_time)
offset = int(SAMPLE_RATE * offset_time)

frames = frame(data_spk1[0], window_size, offset)
frames.shape

torch.Size([1, 840, 400])

然而，MFCC 自带分帧功能，因为它本来的设计就是针对一帧做的，如果把一帧扔进 MFCC 那出来的图像 size 巨小 

我们的目标不是把原音频切成一帧一帧这么细，主要是切成等长，所以这里直接利用分帧函数实现一个滑动窗口，把一段音频切成 3s 一个

In [100]:
def slide_window(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1):
    signal_length = signal.shape[axis]

    if pad_end:
        frames_overlap = frame_length - frame_step
        rest_samples = np.abs(signal_length - frames_overlap) % np.abs(
            frame_length - frames_overlap
        )
        pad_size = int(frame_length - rest_samples)
        if pad_size != 0:
            pad_axis = [0] * signal.ndim
            pad_axis[axis] = pad_size
            signal = nn.functional.pad(signal, pad_axis, "constant", pad_value)

    frames = signal.unfold(axis, frame_length, frame_step)
    return frames


audio_time = 3  # s
offset_time = audio_time / 2  # s

window_size = int(SAMPLE_RATE * audio_time)  # 1s 16000个采样点 所以一个window 3s 48000个点
offset = int(SAMPLE_RATE * offset_time)

sub_audios = slide_window(data_spk1[0], window_size, offset)  # 这里不用pad 不足3s的剩余部分会扔掉
sub_audios.shape  # 1声道 4个sub-sample 长度48000

torch.Size([1, 4, 48000])

In [101]:
mfcc_transformer = torchaudio.transforms.MFCC(
    sample_rate=SAMPLE_RATE,
    n_mfcc=64,
    melkwargs={
        "n_fft": 750,
        "hop_length": 750,
        "n_mels": 64,
        "center": False,
        "normalized": True,
    },
)

mfcc_spectrogram = mfcc_transformer(sub_audios[:, 0, :])
print("Shape of spectrogram: {}".format(mfcc_spectrogram.size()))

Shape of spectrogram: torch.Size([1, 64, 64])


H=n_mfcc W=(48000-n_fft)/hop_len + 1

所以如果 W=64, 则 48000-n_fft 是 63 的倍数, 48000-750=63*750, 所以 n_fft=750 hop_len=750 就可以硬凑出 (64, 64) 的图片

但是是为了尺寸瞎调的

In [102]:
mfcc_spectrogram

tensor([[[-4.4087e+02, -4.4447e+02, -4.7305e+02,  ..., -2.4523e+02,
          -2.4430e+02, -2.6662e+02],
         [ 4.9197e+01,  5.1823e+01,  6.3053e+01,  ...,  6.6830e+01,
           6.5495e+01,  7.4473e+01],
         [-2.2102e+01, -1.9115e+01, -7.9149e+00,  ..., -6.4991e+01,
          -6.9019e+01, -3.8841e+01],
         ...,
         [ 1.6700e+00,  1.0622e+00,  8.0348e-01,  ..., -6.2060e+00,
          -6.6392e+00,  2.7292e+00],
         [ 1.2171e+00,  5.3775e-01,  1.9720e+00,  ..., -9.9025e+00,
          -2.5611e+00,  6.2259e+00],
         [ 1.2759e-01, -1.0304e+00,  2.1441e-01,  ..., -1.7091e+00,
          -6.0795e-01,  8.3641e-01]]])

In [103]:
def minmax_scale(x):
    return (x - x.min()) / (x.max() - x.min())


minmax_scale(mfcc_spectrogram)

tensor([[[0.0879, 0.0818, 0.0329,  ..., 0.4224, 0.4240, 0.3858],
         [0.9258, 0.9303, 0.9494,  ..., 0.9559, 0.9536, 0.9690],
         [0.8039, 0.8090, 0.8281,  ..., 0.7305, 0.7237, 0.7752],
         ...,
         [0.8445, 0.8435, 0.8430,  ..., 0.8310, 0.8303, 0.8463],
         [0.8437, 0.8426, 0.8450,  ..., 0.8247, 0.8373, 0.8523],
         [0.8419, 0.8399, 0.8420,  ..., 0.8387, 0.8406, 0.8431]]])

In [104]:
audio_time = 3  # s
offset_time = audio_time / 2  # s

window_size = int(SAMPLE_RATE * audio_time)  # 1s 16000个采样点 所以一个window 3s 48000个点
offset = int(SAMPLE_RATE * offset_time)

mfcc_transformer = torchaudio.transforms.MFCC(
    sample_rate=SAMPLE_RATE,
    n_mfcc=64,
    melkwargs={
        "n_fft": 750,
        "hop_length": 750,
        "n_mels": 64,
        "center": False,
        "normalized": True,
    },
)


def create_mfccs(data):
    mfcc_list = []
    for waveform in data:
        pad = False
        if waveform.shape[-1] < window_size:
            pad = True
        sub_waveforms = slide_window(
            waveform, window_size, offset, pad_end=pad
        ).squeeze(0)

        for sub_waveform in sub_waveforms:
            mfcc = mfcc_transformer(sub_waveform[None, :])  # (1, 64, 64)
            mfcc_list.append(mfcc.numpy())

    return mfcc_list


xs = []
ys = []
for i, data in enumerate([data_spk1, data_spk2]):
    mfccs = create_mfccs(data)
    xs += mfccs
    ys += [i] * len(mfccs)

xs = np.array(xs)
ys = np.array(ys)

xs.shape  # (N_all, C, H, W)
ys.shape

(1339, 1, 64, 64)

(1339,)

In [122]:
def get_dataloaders(x, y, train_size=0.7, val_size=0.1, batch_size=32):
    num_samples = x.shape[0]
    split1 = int(num_samples * train_size)
    split2 = int(num_samples * (train_size + val_size))

    # shuffle
    indices = np.random.permutation(np.arange(num_samples))
    x = x[indices]
    y = y[indices]

    x = torch.FloatTensor(x)
    y = torch.LongTensor(y)

    x_train = x[:split1]
    x_val = x[split1:split2]
    x_test = x[split2:]

    y_train = y[:split1]
    y_val = y[split1:split2]
    y_test = y[split2:]

    print(f"Trainset:\tx-{x_train.size()}\ty-{y_train.size()}")
    print(f"Valset:  \tx-{x_val.size()}\ty-{y_val.size()}")
    print(f"Testset:\tx-{x_test.size()}\ty-{y_test.size()}")

    trainset = torch.utils.data.TensorDataset(x_train, y_train)
    valset = torch.utils.data.TensorDataset(x_val, y_val)
    testset = torch.utils.data.TensorDataset(x_test, y_test)

    trainset_loader = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True
    )
    valset_loader = torch.utils.data.DataLoader(
        valset, batch_size=batch_size, shuffle=True
    )
    testset_loader = torch.utils.data.DataLoader(
        testset, batch_size=batch_size, shuffle=False
    )

    return trainset_loader, valset_loader, testset_loader


def onehot_decode(label):
    return torch.argmax(label, dim=1)


def accuracy(predictions, targets):
    pred_decode = onehot_decode(predictions)
    true_decode = targets

    assert len(pred_decode) == len(true_decode)

    acc = torch.mean((pred_decode == true_decode).float())

    return float(acc)

In [128]:
class ConvBNReLU(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super(ConvBNReLU, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        self.conv = nn.Conv2d(
            in_channels=self.in_channels,
            out_channels=self.out_channels,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            bias=True,
        )
        self.bn = nn.BatchNorm2d(num_features=self.out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.conv(x)
        out = self.bn(out)
        out = self.relu(out)
        return out

    def combine_conv_bn(self):
        conv_result = nn.Conv2d(
            self.in_channels,
            self.out_channels,
            self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            bias=True,
        )

        scales = self.bn.weight / torch.sqrt(self.bn.running_var + self.bn.eps)
        conv_result.bias[:] = (
            self.conv.bias - self.bn.running_mean
        ) * scales + self.bn.bias
        for ch in range(self.out_channels):
            conv_result.weight[ch, :, :, :] = self.conv.weight[ch, :, :, :] * scales[ch]

        return conv_result


class SimpleCLS(nn.Module):
    def __init__(self, input_size=64, num_cls=2):
        super(SimpleCLS, self).__init__()

        self.input_size = input_size

        self.backbone = nn.Sequential(
            ConvBNReLU(1, 8, 3, 2, 1),  # 64 -> 32
            nn.MaxPool2d(2, 2),  # 32 -> 16
            ConvBNReLU(8, 16, 3, 1),  # 16 -> 14
            nn.MaxPool2d(2, 2),  # 14 -> 7
            ConvBNReLU(16, 16, 3, 2, 1),  # 7 -> 4
        )

        self.classifier = nn.Sequential(
            nn.Linear(in_features=16 * 4 * 4, out_features=num_cls, bias=True)
        )

        self.softmax = nn.Softmax(dim=-1)

        self.set_params()
        self.train_phase()

    def set_params(self):
        for m in self.backbone.children():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_normal_(m.weight.data)
                m.bias.data.fill_(0.02)
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
        for m in self.classifier.children():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
            elif isinstance(m, nn.BatchNorm1d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def train_phase(self):
        self.phase = "train"

    def test_phase(self):
        self.phase = "test"

    def forward(self, x):
        out = self.backbone(x)
        # out = self.classifier(out.view(x.size(0), -1))
        out = torch.flatten(out, 1)
        out = self.classifier(out)

        return self.softmax(out) if self.phase == "test" else out


model = SimpleCLS()
summary(model, [32, 1, 64, 64])

Layer (type:depth-idx)                   Output Shape              Param #
SimpleCLS                                [32, 2]                   --
├─Sequential: 1-1                        [32, 16, 4, 4]            --
│    └─ConvBNReLU: 2-1                   [32, 8, 32, 32]           --
│    │    └─Conv2d: 3-1                  [32, 8, 32, 32]           80
│    │    └─BatchNorm2d: 3-2             [32, 8, 32, 32]           16
│    │    └─ReLU: 3-3                    [32, 8, 32, 32]           --
│    └─MaxPool2d: 2-2                    [32, 8, 16, 16]           --
│    └─ConvBNReLU: 2-3                   [32, 16, 14, 14]          --
│    │    └─Conv2d: 3-4                  [32, 16, 14, 14]          1,168
│    │    └─BatchNorm2d: 3-5             [32, 16, 14, 14]          32
│    │    └─ReLU: 3-6                    [32, 16, 14, 14]          --
│    └─MaxPool2d: 2-4                    [32, 16, 7, 7]            --
│    └─ConvBNReLU: 2-5                   [32, 16, 4, 4]            --
│    │    └─

In [129]:
@torch.no_grad()
def eval_model(model, valset_loader, criterion):
    model.eval()
    batch_loss_list = []
    batch_acc_list = []
    for x_batch, y_batch in valset_loader:
        x_batch = x_batch.to(DEVICE)
        y_batch = y_batch.to(DEVICE)

        out_batch = model.forward(x_batch)
        loss = criterion.forward(out_batch, y_batch)
        batch_loss_list.append(loss.item())

        acc = accuracy(out_batch, y_batch)
        batch_acc_list.append(acc)

    return np.mean(batch_loss_list), np.mean(batch_acc_list)


def train_one_epoch(model, trainset_loader, optimizer, criterion):
    model.train()
    batch_loss_list = []
    batch_acc_list = []
    for x_batch, y_batch in trainset_loader:
        x_batch = x_batch.to(DEVICE)
        y_batch = y_batch.to(DEVICE)

        out_batch = model.forward(x_batch)
        loss = criterion.forward(out_batch, y_batch)
        batch_loss_list.append(loss.item())

        acc = accuracy(out_batch, y_batch)
        batch_acc_list.append(acc)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return np.mean(batch_loss_list), np.mean(batch_acc_list)


def train(
    model,
    trainset_loader,
    valset_loader,
    optimizer,
    criterion,
    max_epochs=100,
    early_stop=10,
    verbose=1,
    plot=False,
    log="train.log",
):
    if log:
        log = open(log, "a")
        log.seek(0)
        log.truncate()

    wait = 0
    min_val_loss = np.inf

    train_loss_list = []
    train_acc_list = []
    val_loss_list = []
    val_acc_list = []

    for epoch in range(max_epochs):
        train_loss, train_acc = train_one_epoch(
            model, trainset_loader, optimizer, criterion
        )
        train_loss_list.append(train_loss)
        train_acc_list.append(train_acc)

        val_loss, val_acc = eval_model(model, valset_loader, criterion)
        val_loss_list.append(val_loss)
        val_acc_list.append(val_acc)

        if (epoch + 1) % verbose == 0:
            print(
                datetime.datetime.now(),
                "Epoch",
                epoch + 1,
                "\tTrain Loss = %.5f" % train_loss,
                "Train acc = %.5f " % train_acc,
                "Val Loss = %.5f" % val_loss,
                "Val acc = %.5f " % val_acc,
            )

            if log:
                print(
                    datetime.datetime.now(),
                    "Epoch",
                    epoch + 1,
                    "\tTrain Loss = %.5f" % train_loss,
                    "Train acc = %.5f " % train_acc,
                    "Val Loss = %.5f" % val_loss,
                    "Val acc = %.5f " % val_acc,
                    file=log,
                )
                log.flush()

        if val_loss < min_val_loss:
            wait = 0
            min_val_loss = val_loss
            best_epoch = epoch
            best_state_dict = model.state_dict()
        else:
            wait += 1
            if wait >= early_stop:
                print(f"Early stopping at epoch: {epoch+1}")
                print(f"Best at epoch {best_epoch+1}:")
                print(
                    "Train Loss = %.5f" % train_loss_list[best_epoch],
                    "Train acc = %.5f " % train_acc_list[best_epoch],
                )
                print(
                    "Val Loss = %.5f" % val_loss_list[best_epoch],
                    "Val acc = %.5f " % val_acc_list[best_epoch],
                )

                if log:
                    print(f"Early stopping at epoch: {epoch+1}", file=log)
                    print(f"Best at epoch {best_epoch+1}:", file=log)
                    print(
                        "Train Loss = %.5f" % train_loss_list[best_epoch],
                        "Train acc = %.5f " % train_acc_list[best_epoch],
                        file=log,
                    )
                    print(
                        "Val Loss = %.5f" % val_loss_list[best_epoch],
                        "Val acc = %.5f " % val_acc_list[best_epoch],
                        file=log,
                    )
                    log.flush()
                break

    if plot:
        plt.plot(range(0, epoch + 1), train_loss_list, "-", label="Train Loss")
        plt.plot(range(0, epoch + 1), val_loss_list, "-", label="Val Loss")
        plt.title("Epoch-Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()

        plt.plot(range(0, epoch + 1), train_acc_list, "-", label="Train Acc")
        plt.plot(range(0, epoch + 1), val_acc_list, "-", label="Val Acc")
        plt.title("Epoch-Accuracy")
        plt.xlabel("Epoch")
        plt.ylabel("Accuracy")
        plt.legend()
        plt.show()

    if log:
        log.close()

    # torch.save(best_state_dict, "./saved/best_state_dict.pkl")
    model.load_state_dict(best_state_dict)
    return model

In [130]:
batch_size = 32
max_epochs = 100
lr = 0.001
log_file = "temp.log"

train_loader, val_loader, test_loader = get_dataloaders(
    xs, ys, batch_size=batch_size, train_size=0.7, val_size=0.1
)

model = SimpleCLS().to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
model = train(
    model,
    train_loader,
    val_loader,
    optimizer,
    criterion,
    max_epochs=max_epochs,
    early_stop=10,
    verbose=1,
    plot=False,
    log=log_file,
)

# model.test_phase()
test_loss, test_acc = eval_model(model, test_loader, criterion)
print("Test Loss = %.5f" % test_loss, "Test acc = %.5f " % test_acc)
with open(log_file, "a") as f:
    print("Test Loss = %.5f" % test_loss, "Test acc = %.5f " % test_acc, file=f)

Trainset:	x-torch.Size([937, 1, 64, 64])	y-torch.Size([937])
Valset:  	x-torch.Size([134, 1, 64, 64])	y-torch.Size([134])
Testset:	x-torch.Size([268, 1, 64, 64])	y-torch.Size([268])
2022-12-24 20:07:36.672369 Epoch 1 	Train Loss = 0.44425 Train acc = 0.79375  Val Loss = 0.29630 Val acc = 0.86458 
2022-12-24 20:07:36.836856 Epoch 2 	Train Loss = 0.11316 Train acc = 0.97396  Val Loss = 0.08759 Val acc = 0.96250 
2022-12-24 20:07:37.002215 Epoch 3 	Train Loss = 0.04687 Train acc = 0.99687  Val Loss = 0.04430 Val acc = 0.98750 
2022-12-24 20:07:37.159402 Epoch 4 	Train Loss = 0.02748 Train acc = 1.00000  Val Loss = 0.04934 Val acc = 0.97500 
2022-12-24 20:07:37.275428 Epoch 5 	Train Loss = 0.01894 Train acc = 1.00000  Val Loss = 0.02194 Val acc = 1.00000 
2022-12-24 20:07:37.393836 Epoch 6 	Train Loss = 0.01346 Train acc = 0.99792  Val Loss = 0.01981 Val acc = 1.00000 
2022-12-24 20:07:37.516273 Epoch 7 	Train Loss = 0.01268 Train acc = 1.00000  Val Loss = 0.01529 Val acc = 1.00000 
2022-1