In [1]:
import torchaudio
import matplotlib.pyplot as plt
import IPython.display as ipd
import os
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import datetime
from pathlib import Path

from sklearn.model_selection import train_test_split
from torchinfo import summary

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Show the kernel's working directory; the relative dataset paths used below are resolved against it.
%pwd

'/home/cuilab/AAI2022Fall-Project/model-dz'

In [2]:
SAMPLE_RATE = 16000  # samples per second; window and hop sizes below are derived from it
SEED = 23333

# Apply the seed to NumPy's global RNG and to PyTorch's RNG so that the
# NumPy-based shuffles/splits and torch-side randomness repeat under
# Restart & Run All.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    GPU_ID = 0
    DEVICE = torch.device(f"cuda:{GPU_ID}")
else:
    DEVICE = torch.device("cpu")

In [3]:
def load_audio_files(path):
    """Load every ``*.flac`` file located directly inside ``path``.

    Files are visited in sorted path order, so the result does not depend on
    the order in which the filesystem lists them.

    Args:
        path: Directory to search (sub-directories are not visited).

    Returns:
        list: One waveform per file, i.e. the first element returned by
        ``torchaudio.load``. The sample rate reported for each file is
        discarded.
    """
    flac_paths = sorted(str(p) for p in Path(path).glob("*.flac"))

    # Only the waveform is kept; the `path` argument is no longer rebound
    # inside the loop and no unused index/filename locals are created.
    return [torchaudio.load(file_path)[0] for file_path in flac_paths]

In [4]:
# Load all recordings of the first two speakers; each entry is one waveform.
data_spk1 = load_audio_files("../LibriSpeech-SI/train/spk001/")
data_spk2 = load_audio_files("../LibriSpeech-SI/train/spk002/")

# Number of recordings (files) per speaker.
print("Len of spk1 dataset:", len(data_spk1))
print("Len of spk2 dataset:", len(data_spk2))

Len of spk1 dataset: 100
Len of spk2 dataset: 102


In [5]:
def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1):
    """Slice ``signal`` into overlapping frames along ``axis``.

    PyTorch counterpart of ``tf.signal.frame``: the framed axis is replaced by
    the number of frames and a new trailing dimension of size ``frame_length``
    is appended (this is what ``Tensor.unfold`` produces).

    Args:
        signal: Tensor to frame.
        frame_length: Number of samples per frame.
        frame_step: Number of samples between the starts of consecutive frames.
        pad_end: When True, pad the end of ``axis`` with ``pad_value`` so that
            every frame whose start lies inside the signal is returned
            (``ceil(length / frame_step)`` frames). When False, a trailing
            partial frame is dropped.
        pad_value: Constant used for padding.
        axis: Dimension to frame; negative values count from the end.

    Returns:
        Tensor of frames.

    Raises:
        RuntimeError: Raised by ``unfold`` when the (possibly padded) signal is
            shorter than ``frame_length``; this includes zero-length signals.
    """
    dim = axis if axis >= 0 else axis + signal.ndim
    signal_length = signal.shape[dim]

    if pad_end:
        # Same arithmetic as tf.signal.frame: one frame per start position
        # inside the signal, padded just enough to complete the last frame.
        num_frames = -(-signal_length // frame_step)
        pad_size = max(0, frame_length + frame_step * (num_frames - 1) - signal_length)
        if pad_size > 0:
            # F.pad expects (left, right) pairs listed from the LAST dimension
            # backwards, so the right-hand pad of `dim` sits at this index.
            pad_spec = [0, 0] * signal.ndim
            pad_spec[2 * (signal.ndim - 1 - dim) + 1] = pad_size
            signal = nn.functional.pad(signal, pad_spec, "constant", pad_value)

    return signal.unfold(dim, frame_length, frame_step)


frame_time = 0.025  # s (i.e. a 25 ms frame)
offset_time = 0.01  # s (i.e. a 10 ms hop)

window_size = int(SAMPLE_RATE * frame_time)  # samples per frame
offset = int(SAMPLE_RATE * offset_time)  # samples between consecutive frame starts

# Frame the first spk1 recording without padding; a trailing partial frame is dropped.
frames = frame(data_spk1[0], window_size, offset)
frames.shape

torch.Size([1, 840, 400])

然而，MFCC 变换内部已经包含分帧这一步（按 `n_fft` / `hop_length` 切帧）；如果先手动切成 25 ms 的短帧再送进 MFCC，得到的图像尺寸会非常小。

我们的目标不是把原音频切成一帧一帧这么细，主要是切成等长，所以这里直接利用分帧函数实现一个滑动窗口，把一段音频切成 3s 一个

In [6]:
def slide_window(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1):
    """Cut ``signal`` into equally long, possibly overlapping windows along ``axis``.

    The windowed axis is replaced by the number of windows and a new trailing
    dimension of size ``frame_length`` is appended (the layout produced by
    ``Tensor.unfold``).

    Args:
        signal: Tensor to window.
        frame_length: Number of samples per window.
        frame_step: Number of samples between the starts of consecutive windows.
        pad_end: When True, pad the end of ``axis`` with ``pad_value`` so that
            every window whose start lies inside the signal is returned
            (``ceil(length / frame_step)`` windows). When False, a trailing
            partial window is dropped.
        pad_value: Constant used for padding.
        axis: Dimension to window; negative values count from the end.

    Returns:
        Tensor of windows.

    Raises:
        RuntimeError: Raised by ``unfold`` when the (possibly padded) signal is
            shorter than ``frame_length``; this includes zero-length signals.
    """
    dim = axis if axis >= 0 else axis + signal.ndim
    signal_length = signal.shape[dim]

    if pad_end:
        # One window per start position inside the signal; pad only as much
        # as is needed to complete the last of those windows.
        num_windows = -(-signal_length // frame_step)
        pad_size = max(0, frame_length + frame_step * (num_windows - 1) - signal_length)
        if pad_size > 0:
            # F.pad expects (left, right) pairs listed from the LAST dimension
            # backwards, so the right-hand pad of `dim` sits at this index.
            pad_spec = [0, 0] * signal.ndim
            pad_spec[2 * (signal.ndim - 1 - dim) + 1] = pad_size
            signal = nn.functional.pad(signal, pad_spec, "constant", pad_value)

    return signal.unfold(dim, frame_length, frame_step)


audio_time = 3  # s
offset_time = audio_time / 2  # s

window_size = int(SAMPLE_RATE * audio_time)  # 16000 samples per second, so one 3 s window is 48000 samples
offset = int(SAMPLE_RATE * offset_time)

sub_audios = slide_window(data_spk1[0], window_size, offset)  # no padding here: the trailing remainder shorter than 3 s is dropped
sub_audios.shape  # (1 channel, number of sub-clips, 48000 samples each)

torch.Size([1, 4, 48000])

In [7]:
# n_fft == hop_length == 750 with center=False gives (48000 - 750) / 750 + 1 = 64
# frames per 3 s window, so together with n_mfcc=64 each window becomes a 64x64 map.
mfcc_transformer = torchaudio.transforms.MFCC(
    sample_rate=SAMPLE_RATE,
    n_mfcc=64,
    melkwargs={
        "n_fft": 750,
        "hop_length": 750,
        "n_mels": 64,
        "center": False,
        "normalized": True,
    },
)

# MFCC of the first 3 s window; the leading channel axis is kept by the slice.
mfcc_spectrogram = mfcc_transformer(sub_audios[:, 0, :])
print("Shape of spectrogram: {}".format(mfcc_spectrogram.size()))

Shape of spectrogram: torch.Size([1, 64, 64])


$H = n_{\text{mfcc}}$，$W = \dfrac{48000 - n_{\text{fft}}}{\text{hop\_length}} + 1$（`center=False` 时）

所以如果 W=64, 则 48000-n_fft 是 63 的倍数, 48000-750=63*750, 所以 n_fft=750 hop_len=750 就可以硬凑出 (64, 64) 的图片

但这组参数只是为了凑出 64×64 的尺寸而调的，并没有其他依据。

In [8]:
# Inspect the raw (unscaled) MFCC values of the first window.
mfcc_spectrogram

tensor([[[-4.4087e+02, -4.4447e+02, -4.7305e+02,  ..., -2.4523e+02,
          -2.4430e+02, -2.6662e+02],
         [ 4.9197e+01,  5.1823e+01,  6.3053e+01,  ...,  6.6830e+01,
           6.5495e+01,  7.4473e+01],
         [-2.2102e+01, -1.9115e+01, -7.9149e+00,  ..., -6.4991e+01,
          -6.9019e+01, -3.8841e+01],
         ...,
         [ 1.6700e+00,  1.0622e+00,  8.0348e-01,  ..., -6.2060e+00,
          -6.6392e+00,  2.7292e+00],
         [ 1.2171e+00,  5.3775e-01,  1.9720e+00,  ..., -9.9025e+00,
          -2.5611e+00,  6.2259e+00],
         [ 1.2759e-01, -1.0304e+00,  2.1441e-01,  ..., -1.7091e+00,
          -6.0795e-01,  8.3641e-01]]])

In [9]:
def minmax_scale(x):
    """Linearly rescale ``x`` so its smallest value maps to 0 and its largest to 1.

    The minimum and maximum are taken over the whole input, not per axis.

    Args:
        x: Array-like object providing ``min()``/``max()`` and arithmetic
            (e.g. a torch tensor or a NumPy array).

    Returns:
        The rescaled values. When every element of ``x`` is equal, all zeros
        are returned instead of the NaNs a division by a zero range produces.
    """
    lowest = x.min()
    value_range = x.max() - lowest
    if value_range == 0:
        # Constant input: dividing by 1 keeps the result type consistent with
        # the normal path while avoiding 0 / 0.
        value_range = 1
    return (x - lowest) / value_range


# Rescale the example spectrogram to [0, 1] using its global min and max.
minmax_scale(mfcc_spectrogram)

tensor([[[0.0879, 0.0818, 0.0329,  ..., 0.4224, 0.4240, 0.3858],
         [0.9258, 0.9303, 0.9494,  ..., 0.9559, 0.9536, 0.9690],
         [0.8039, 0.8090, 0.8281,  ..., 0.7305, 0.7237, 0.7752],
         ...,
         [0.8445, 0.8435, 0.8430,  ..., 0.8310, 0.8303, 0.8463],
         [0.8437, 0.8426, 0.8450,  ..., 0.8247, 0.8373, 0.8523],
         [0.8419, 0.8399, 0.8420,  ..., 0.8387, 0.8406, 0.8431]]])

In [10]:
# Windowing and MFCC settings used to build the dataset; they repeat the values
# from the exploratory cells above.
audio_time = 3  # s
offset_time = audio_time / 2  # s

window_size = int(SAMPLE_RATE * audio_time)  # 16000 samples per second, so one 3 s window is 48000 samples
offset = int(SAMPLE_RATE * offset_time)

# n_fft == hop_length == 750 with center=False yields 64 frames per 48000-sample window.
mfcc_transformer = torchaudio.transforms.MFCC(
    sample_rate=SAMPLE_RATE,
    n_mfcc=64,
    melkwargs={
        "n_fft": 750,
        "hop_length": 750,
        "n_mels": 64,
        "center": False,
        "normalized": True,
    },
)


def data_split(data, train_size=0.7, val_size=0.1, rng=None):
    """Randomly split ``data`` into train / validation / test parts.

    The input is left untouched: a shuffled copy is sliced, so calling the
    function again (e.g. when re-running a cell) starts from the same data.

    Args:
        data: Sequence of samples (a list, or a NumPy array split along axis 0).
        train_size: Fraction of samples assigned to the training part.
        val_size: Fraction of samples assigned to the validation part; the
            remainder goes to the test part.
        rng: Optional object with a ``permutation(n)`` method (for example
            ``np.random.RandomState`` or ``np.random.default_rng``). Defaults
            to NumPy's global RNG.

    Returns:
        tuple: ``(x_train, x_val, x_test)``; lists for list input, arrays for
        NumPy array input.
    """
    num_samples = len(data)

    # Fractions such as 0.7 + 0.1 evaluate to 0.7999999999999999; the small
    # tolerance keeps int() from truncating e.g. 7.999999999999999 down to 7.
    tolerance = 1e-9
    split1 = int(num_samples * train_size + tolerance)
    split2 = int(num_samples * (train_size + val_size) + tolerance)

    # Shuffle indices rather than the caller's container.
    generator = np.random if rng is None else rng
    order = generator.permutation(num_samples)
    if isinstance(data, np.ndarray):
        shuffled = data[order]
    else:
        shuffled = [data[index] for index in order]

    return shuffled[:split1], shuffled[split1:split2], shuffled[split2:]


def create_mfccs(data):
    """Turn a collection of waveforms into a flat list of MFCC arrays.

    Each waveform is cut into windows of ``window_size`` samples spaced
    ``offset`` samples apart; padding is requested only for waveforms shorter
    than a single window. Every window is passed through ``mfcc_transformer``
    and stored as a NumPy array.

    Args:
        data: Iterable of waveform tensors.

    Returns:
        list: MFCC arrays, in the order of the input waveforms and their windows.
    """
    spectrograms = []
    for clip in data:
        # Only clips that cannot fill a single window are padded.
        needs_padding = clip.shape[-1] < window_size
        windows = slide_window(clip, window_size, offset, pad_end=needs_padding)

        # Drop the leading (channel) axis, then re-add a leading axis per window
        # before handing it to the transformer.
        spectrograms.extend(
            mfcc_transformer(window.unsqueeze(0)).numpy()
            for window in windows.squeeze(0)
        )

    return spectrograms


# Build the three splits speaker by speaker. Recordings are split BEFORE
# windowing so that overlapping windows of one recording never end up in
# different splits.
x_train, x_val, x_test = [], [], []
y_train, y_val, y_test = [], [], []
for i, data in enumerate([data_spk1, data_spk2]):
    x_train_i, x_val_i, x_test_i = data_split(data)

    mfccs_train = create_mfccs(x_train_i)
    mfccs_val = create_mfccs(x_val_i)
    mfccs_test = create_mfccs(x_test_i)

    x_train += mfccs_train
    x_val += mfccs_val
    x_test += mfccs_test

    # The speaker's position in the list doubles as its class label.
    y_train += [i] * len(mfccs_train)
    y_val += [i] * len(mfccs_val)
    y_test += [i] * len(mfccs_test)

# Stack the per-window arrays into one array per split.
x_train = np.array(x_train)
x_val = np.array(x_val)
x_test = np.array(x_test)

y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

x_train.shape  # (N_train, C, H, W)
y_train.shape

(946, 1, 64, 64)

(946,)

这里之前的dataloader构建错误

应该先分好train test split再进行mfcc变换

这是因为一个音频通过滑动窗口会产生好几个片段，因为窗口有重叠，所以这些片段首尾是有重复的，那么进行mfcc变换之后，就会出现相邻两个图片之间的内容有重叠

如果这时候再shuffle然后划分的话，那么训练和测试集中的图像就有交集了

所以在滑动窗口之前就应该划分好三个集，也就是相当于先把一个用户的所有flac文件先划分好

In [11]:
def shuffle_xy_totensor(x, y):
    """Shuffle features and labels with one shared permutation and convert them to tensors.

    Args:
        x: Feature array, indexable with an integer index array.
        y: Label array with the same length as ``x``.

    Returns:
        tuple: ``(torch.FloatTensor, torch.LongTensor)`` holding the shuffled
        features and labels.

    Raises:
        AssertionError: If ``x`` and ``y`` differ in length.
    """
    n_samples = len(x)
    assert n_samples == len(y)

    # One permutation drawn from NumPy's global RNG keeps each sample paired with its label.
    order = np.random.permutation(np.arange(n_samples))

    features = torch.FloatTensor(x[order])
    labels = torch.LongTensor(y[order])
    return features, labels


def get_dataloaders(x_train, x_val, x_test, y_train, y_val, y_test, batch_size=32):
    """Wrap the three splits in ``DataLoader`` objects.

    Each split is shuffled once and converted to tensors by
    ``shuffle_xy_totensor``, its tensor sizes are printed, and it is then
    wrapped in a ``TensorDataset``. The train and validation loaders reshuffle
    on every pass; the test loader keeps a fixed order.

    Args:
        x_train, x_val, x_test: Feature arrays of the three splits.
        y_train, y_val, y_test: Matching label arrays.
        batch_size: Batch size shared by all three loaders.

    Returns:
        tuple: ``(trainset_loader, valset_loader, testset_loader)``.
    """
    dataset_cls = torch.utils.data.TensorDataset
    loader_cls = torch.utils.data.DataLoader

    # Convert in the order train -> val -> test.
    x_train, y_train = shuffle_xy_totensor(x_train, y_train)
    x_val, y_val = shuffle_xy_totensor(x_val, y_val)
    x_test, y_test = shuffle_xy_totensor(x_test, y_test)

    print(f"Trainset:\tx-{x_train.size()}\ty-{y_train.size()}")
    print(f"Valset:  \tx-{x_val.size()}\ty-{y_val.size()}")
    print(f"Testset:\tx-{x_test.size()}\ty-{y_test.size()}")

    # (features, labels, reshuffle each pass?)
    splits = (
        (x_train, y_train, True),
        (x_val, y_val, True),
        (x_test, y_test, False),
    )
    trainset_loader, valset_loader, testset_loader = (
        loader_cls(dataset_cls(features, labels), batch_size=batch_size, shuffle=reshuffle)
        for features, labels, reshuffle in splits
    )

    return trainset_loader, valset_loader, testset_loader

In [12]:
# Training helpers and the classifier come from the mfcc_cnn module (not shown in this notebook).
from mfcc_cnn import onehot_decode, accuracy
from mfcc_cnn import eval_model, train_one_epoch, train
from mfcc_cnn import SimpleCLS

# This instance is only used to print the layer summary for a (32, 1, 64, 64)
# input batch; the training cell creates its own model.
model = SimpleCLS()
summary(model, [32, 1, 64, 64])

Layer (type:depth-idx)                   Output Shape              Param #
SimpleCLS                                [32, 2]                   --
├─Sequential: 1-1                        [32, 16, 4, 4]            --
│    └─ConvBNReLU: 2-1                   [32, 8, 32, 32]           --
│    │    └─Conv2d: 3-1                  [32, 8, 32, 32]           80
│    │    └─BatchNorm2d: 3-2             [32, 8, 32, 32]           16
│    │    └─ReLU: 3-3                    [32, 8, 32, 32]           --
│    └─MaxPool2d: 2-2                    [32, 8, 16, 16]           --
│    └─ConvBNReLU: 2-3                   [32, 16, 14, 14]          --
│    │    └─Conv2d: 3-4                  [32, 16, 14, 14]          1,168
│    │    └─BatchNorm2d: 3-5             [32, 16, 14, 14]          32
│    │    └─ReLU: 3-6                    [32, 16, 14, 14]          --
│    └─MaxPool2d: 2-4                    [32, 16, 7, 7]            --
│    └─ConvBNReLU: 2-5                   [32, 16, 4, 4]            --
│    │    └─

In [13]:
# Training hyper-parameters and the file that receives the final test metrics.
batch_size = 32
max_epochs = 100
lr = 0.001
log_file = "temp.log"

train_loader, val_loader, test_loader = get_dataloaders(
    x_train, x_val, x_test, y_train, y_val, y_test, batch_size=batch_size
)

# Fresh model on the selected device.
model = SimpleCLS().to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# NOTE(review): `train` is defined in mfcc_cnn; the meaning of early_stop/verbose/plot/log
# is presumed from their names - confirm against that module.
model = train(
    model,
    train_loader,
    val_loader,
    optimizer,
    criterion,
    max_epochs=max_epochs,
    early_stop=10,
    verbose=1,
    plot=False,
    log=log_file,
)

# model.test_phase()
# Evaluate on the held-out test loader, then print the metrics and append them to the log file.
test_loss, test_acc = eval_model(model, test_loader, criterion)
print("Test Loss = %.5f" % test_loss, "Test acc = %.5f " % test_acc)
with open(log_file, "a") as f:
    print("Test Loss = %.5f" % test_loss, "Test acc = %.5f " % test_acc, file=f)

Trainset:	x-torch.Size([946, 1, 64, 64])	y-torch.Size([946])
Valset:  	x-torch.Size([127, 1, 64, 64])	y-torch.Size([127])
Testset:	x-torch.Size([266, 1, 64, 64])	y-torch.Size([266])
2022-12-25 10:26:07.760253 Epoch 1 	Train Loss = 0.48501 Train acc = 0.75729  Val Loss = 0.52299 Val acc = 0.66028 
2022-12-25 10:26:07.860755 Epoch 2 	Train Loss = 0.10778 Train acc = 0.97708  Val Loss = 0.04629 Val acc = 1.00000 
2022-12-25 10:26:07.963804 Epoch 3 	Train Loss = 0.03652 Train acc = 0.99687  Val Loss = 0.02384 Val acc = 1.00000 
2022-12-25 10:26:08.063118 Epoch 4 	Train Loss = 0.01900 Train acc = 0.99583  Val Loss = 0.01187 Val acc = 1.00000 
2022-12-25 10:26:08.161216 Epoch 5 	Train Loss = 0.01462 Train acc = 0.99896  Val Loss = 0.00900 Val acc = 1.00000 
2022-12-25 10:26:08.261300 Epoch 6 	Train Loss = 0.01136 Train acc = 0.99896  Val Loss = 0.00643 Val acc = 1.00000 
2022-12-25 10:26:08.359687 Epoch 7 	Train Loss = 0.00717 Train acc = 1.00000  Val Loss = 0.00666 Val acc = 1.00000 
2022-1

---

In [1]:
# Two small dicts of lists used as sample payloads in the serialization cells below.
d={"a": [1, 2], "b": [3, 4], "c": [5]}
g={"a": [1, 8], "b": [0, 4], "c": [9]}

# dict.values() can be unpacked directly; the values come out in insertion order.
x, y, z=d.values()

print(x, y, z)
print(d.values())
print(d.items())

[1, 2] [3, 4] [5]
dict_values([[1, 2], [3, 4], [5]])
dict_items([('a', [1, 2]), ('b', [3, 4]), ('c', [5])])


In [9]:
import numpy as np

# Store both dicts in one compressed .npz archive under the keys "x" and "y".
# The dicts are not arrays, so reading them back needs allow_pickle=True.
np.savez_compressed("temp.npz", x=d, y=g)

In [10]:
# allow_pickle=True is required to read the stored Python objects back. Only use it on
# files you trust: unpickling can execute arbitrary code.
npz=np.load("temp.npz", allow_pickle=True)

print(npz.files)
# .item() unwraps the stored entry back into the original dict.
npz["y"].item()["a"]

['x', 'y']


[1, 8]

In [13]:
import pickle
# Default pickle protocol of the running interpreter.
pickle.DEFAULT_PROTOCOL

4

In [16]:
import torch
# Save both dicts in a single file with torch.save, pinning the pickle protocol explicitly.
torch.save({"x": d, "y":g}, "temp.pkl", pickle_protocol=4)

In [17]:
# Round-trip check: load the file written above and display its content.
# torch.load unpickles data, so only load files from a trusted source.
pkl=torch.load("temp.pkl")
pkl

{'x': {'a': [1, 2], 'b': [3, 4], 'c': [5]},
 'y': {'a': [1, 8], 'b': [0, 4], 'c': [9]}}