<a href="https://colab.research.google.com/github/Da-Capo/Colabstuffs/blob/master/1906_masr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 预处理backup

In [0]:
# !wget http://www.openslr.org/resources/33/data_aishell.tgz
# !wget http://www.openslr.org/resources/33/resource_aishell.tgz
# !mv data_aishell.tgz drive/My\ Drive/MASS_DATA/

In [0]:
# 转移解压数据 1
# !tar -zxvf data_aishell.tgz >/dev/null 2>&1
# !tar -zxvf resource_aishell.tgz >/dev/null 2>&1
# %cd /content/data_aishell
# !for tar in wav/*.tar.gz;  do tar xvf $tar; done >/dev/null 2>&1
# %cd /content

In [0]:
# 数据预处理
# from pathlib import Path
# with open("data_aishell/transcript/aishell_transcript_v0.8.txt") as f:
#     label_dict = {}
#     for i,line in enumerate(f):
#         key,value = line.strip().split(" ",1)
#         label_dict[key] = value.replace(" ","")
# train_f  = open("data_aishell/train-sort.manifest","w")
# dev_f    = open("data_aishell/dev.manifest","w")

# for f_path in Path("/content/data_aishell").rglob('*.wav'):
#     x = str(f_path)
#     try:
#         y = label_dict[f_path.name.split(".")[0]]
#         if "train" in x:
#             train_f.write(x+","+y+"\n")
#         if "test" in x:
#             dev_f.write(x+","+y+"\n")
#     except:
#         print(x)
# train_f.close()
# dev_f.close()

In [0]:
# 保存数据
# !zip -r data_aishell.zip data_aishell >/dev/null 2>&1
# !cp data_aishell.zip /content/drive/My\ Drive/MASS_DATA/

# run

In [0]:
# Install the Python packages needed by the cells below (gdown is used to fetch the dataset).
!pip install gdown
# PyTorch nightly wheel taken from the CUDA 10.0 nightly index.
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu100/torch_nightly.html
!pip install python-levenshtein
!pip install tensorboardx
# Download the masr sources at a pinned commit, unpack them and rename the folder to ./masr.
# Output of wget/unzip is discarded, so failures in these two steps are silent.
!wget https://github.com/libai3/masr/archive/18905979fc091f2256be18a4328805a0b7613704.zip -O masr.zip >/dev/null 2>&1
!unzip masr.zip -y >/dev/null 2>&1
!mv masr-18905979fc091f2256be18a4328805a0b7613704 masr

In [0]:
# Prepare the data: download the dataset archive from Google Drive and unpack it.
# NOTE(review): the download is presumably saved as data_aishell.zip, the file unpacked below — confirm.
!gdown https://drive.google.com/uc?id=1FYj9p5OyvZqgJUCGWXTgJMJbqCSx_GBh
# The printed message warns that unzipping takes quite a while.
print("解压要挺久的...........")
# unzip output is discarded, so a failed extraction is silent.
!unzip data_aishell.zip >/dev/null 2>&1

In [11]:
#@title data.py
%%writefile /content/masr/data.py
import torch
import librosa
import wave
import numpy as np
import scipy
import json
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# Audio front-end settings used by the STFT in spectrogram().
sample_rate = 16000  # samples per second
window_size = 0.02  # analysis window length, in seconds
window_stride = 0.01  # step between consecutive windows, in seconds
window = "hamming"  # window function name passed to librosa.stft

# The same settings expressed in samples; the FFT size equals the window length.
n_fft = win_length = int(sample_rate * window_size)
hop_length = int(sample_rate * window_stride)


def load_audio(wav_path, normalize=True):  # -> numpy array
    """Read a WAV file and return its samples as a float numpy array.

    The raw frames are interpreted as 16-bit integers and converted to float.
    NOTE(review): frames of a multi-channel file would stay interleaved in the
    returned 1-D array; the code assumes mono 16-bit PCM — confirm for new data.

    Args:
        wav_path: Path of the WAV file to read.
        normalize: When true, subtract the mean and divide by the standard
            deviation of the samples.

    Returns:
        1-D float numpy array of samples. Empty audio is returned unchanged,
        and constant audio (zero standard deviation) is only mean-centered,
        so the result never contains NaN/inf from a division by zero.
    """
    # Use a distinct name for the file handle instead of rebinding the
    # context-manager variable inside its own ``with`` block.
    with wave.open(wav_path) as wav_file:
        frames = wav_file.readframes(wav_file.getnframes())
    samples = np.frombuffer(frames, dtype="int16").astype("float")

    if not normalize or samples.size == 0:
        return samples

    centered = samples - samples.mean()
    std = samples.std()
    if std == 0:
        # Silent/constant clip: dividing would produce NaNs that poison training.
        return centered
    return centered / std


def spectrogram(wav, normalize=True):
    """Turn a waveform into a log-magnitude STFT spectrogram tensor.

    The STFT uses the module-level ``n_fft``, ``hop_length``, ``win_length``
    and ``window`` settings. Magnitudes are compressed with ``log1p``.

    Args:
        wav: Waveform array handed to ``librosa.stft``.
        normalize: When true, subtract the mean and divide by the standard
            deviation computed over the whole spectrogram.

    Returns:
        A ``torch.FloatTensor`` holding the (optionally normalized) features.
    """
    stft = librosa.stft(
        wav,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
    )

    # Only the magnitude is kept; the phase component is discarded.
    magnitude, _ = librosa.magphase(stft)
    features = torch.FloatTensor(np.log1p(magnitude))

    if not normalize:
        return features
    return (features - features.mean()) / features.std()


class MASRDataset(Dataset):
    """Dataset of (spectrogram, label-index list) pairs read from a manifest.

    The manifest is a text file with one ``<wav path>,<transcript>`` entry per
    line (split on the first comma only). The labels file is a JSON list whose
    positions define the integer id of each label.
    """

    def __init__(self, index_path, labels_path):
        """Load the manifest and build the label lookup table.

        Args:
            index_path: Path of the manifest file.
            labels_path: Path of the JSON file containing the label list.
        """
        # Read as UTF-8 explicitly: transcripts hold non-ASCII text and the
        # platform default encoding is not guaranteed to be UTF-8.
        with open(index_path, encoding="utf-8") as f:
            # Blank lines are skipped; they would otherwise become one-element
            # entries that fail to unpack in __getitem__.
            self.idx = [line.strip().split(",", 1) for line in f if line.strip()]
        with open(labels_path, encoding="utf-8") as f:
            labels = json.load(f)
        # label -> integer id (its position in the label list)
        self.labels = {label: i for i, label in enumerate(labels)}
        self.labels_str = labels

    def __getitem__(self, index):
        """Return ``(spectrogram, transcript_ids)`` for manifest entry ``index``."""
        wav, transcript = self.idx[index]
        wav = load_audio(wav)
        spect = spectrogram(wav)
        # Characters missing from the label table map to id 1; filter(None, ...)
        # then drops falsy ids, i.e. any character whose id is 0.
        transcript = list(filter(None, [self.labels.get(x, 1) for x in transcript]))

        return spect, transcript

    def __len__(self):
        """Number of manifest entries."""
        return len(self.idx)


def _collate_fn(batch):
    def func(p):
        return p[0].size(1)

    batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True)
    longest_sample = max(batch, key=func)[0]
    freq_size = longest_sample.size(0)
    minibatch_size = len(batch)
    max_seqlength = longest_sample.size(1)
    inputs = torch.zeros(minibatch_size, freq_size, max_seqlength)
    input_lens = torch.IntTensor(minibatch_size)
    target_lens = torch.IntTensor(minibatch_size)
    targets = []
    for x in range(minibatch_size):
        sample = batch[x]
        tensor = sample[0]
        target = sample[1]
        seq_length = tensor.size(1)
        inputs[x].narrow(1, 0, seq_length).copy_(tensor)
        input_lens[x] = seq_length
        target_lens[x] = len(target)
        targets.extend(target)
    targets = torch.IntTensor(targets)
    return inputs, targets, input_lens, target_lens


class MASRDataLoader(DataLoader):
    """DataLoader that always batches samples with ``_collate_fn``."""

    def __init__(self, *args, **kwargs):
        """Build a regular DataLoader, then replace its collate function."""
        super().__init__(*args, **kwargs)
        # Assigned after construction, so it overrides any collate_fn argument.
        self.collate_fn = _collate_fn

Overwriting /content/masr/data.py


In [54]:
#@title train.py
%%writefile /content/masr/train.py
import torch
import torch.nn as nn
import data
from models.conv import GatedConv
from tqdm import tqdm
from decoder import GreedyDecoder
import tensorboardX as tensorboard
import torch.nn.functional as F
import json


def train(
    model,
    epochs=1000,
    batch_size=64,
    train_index_path="/content/data_aishell/train-sort.manifest",
    dev_index_path="/content/data_aishell/dev.manifest",
    labels_path="/content/data_aishell/labels.json",
    learning_rate=0.6,
    momentum=0.8,
    max_grad_norm=0.2,
    weight_decay=0,
):
    """Train ``model`` with CTC loss and Nesterov SGD, evaluating every epoch.

    Per-step loss, per-epoch loss, learning rate and dev-set CER are logged to
    TensorBoard, and the whole model is saved to ``pretrained/model_<epoch>.pth``
    after each epoch.

    Args:
        model: Network called as ``model(x, x_lens)`` returning
            ``(out, out_lens)``. Batches are moved to the device of its first
            parameter (falls back to "cuda" if it has none).
        epochs: Number of passes over the training set.
        batch_size: Batch size for both training and dev loaders.
        train_index_path: Manifest of training samples.
        dev_index_path: Manifest of evaluation samples.
        labels_path: JSON file with the label list.
        learning_rate: SGD learning rate.
        momentum: SGD momentum.
        max_grad_norm: Gradient-norm clipping threshold.
        weight_decay: SGD weight decay.
    """
    import os  # local import: only needed to create the checkpoint folder

    train_dataset = data.MASRDataset(train_index_path, labels_path)
    batchs = (len(train_dataset) + batch_size - 1) // batch_size
    dev_dataset = data.MASRDataset(dev_index_path, labels_path)
    train_dataloader = data.MASRDataLoader(
        train_dataset, batch_size=batch_size, num_workers=8
    )
    train_dataloader_shuffle = data.MASRDataLoader(
        train_dataset, batch_size=batch_size, num_workers=8, shuffle=True
    )
    dev_dataloader = data.MASRDataLoader(
        dev_dataset, batch_size=batch_size, num_workers=8
    )

    # Follow the model's device instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    parameters = model.parameters()
    optimizer = torch.optim.SGD(
        parameters,
        lr=learning_rate,
        momentum=momentum,
        nesterov=True,
        weight_decay=weight_decay,
    )
    ctcloss = nn.CTCLoss()
    # lr_sched = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.985)

    # Create the checkpoint folder up front so a missing directory does not
    # surface only after a whole epoch of training.
    os.makedirs("pretrained", exist_ok=True)

    writer = tensorboard.SummaryWriter()
    gstep = 0
    try:
        for epoch in range(epochs):
            epoch_loss = 0
            # The first epoch walks the manifest in file order; later epochs
            # shuffle. NOTE(review): the default manifest name suggests it is
            # sorted (presumably by length) — confirm.
            if epoch > 0:
                train_dataloader = train_dataloader_shuffle
            # lr_sched.step()
            lr = get_lr(optimizer)
            writer.add_scalar("lr/epoch", lr, epoch)
            for i, (x, y, x_lens, y_lens) in enumerate(train_dataloader):
                x = x.to(device)
                y = y.to(device)
                out, out_lens = model(x, x_lens)
                # Reorder dims (0, 1, 2) -> (2, 0, 1) and take log-softmax over
                # the last dim, the layout nn.CTCLoss expects.
                out = out.transpose(0, 1).transpose(0, 2).log_softmax(2)
                loss = ctcloss(out, y, out_lens, y_lens)
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                epoch_loss += loss.item()
                writer.add_scalar("loss/step", loss.item(), gstep)
                gstep += 1
                print(
                    "[{}/{}][{}/{}]\tLoss = {}".format(
                        epoch + 1, epochs, i, int(batchs), loss.item()
                    )
                )
            # max(..., 1) keeps an empty training set from dividing by zero.
            epoch_loss = epoch_loss / max(batchs, 1)
            cer = eval(model, dev_dataloader)
            writer.add_scalar("loss/epoch", epoch_loss, epoch)
            writer.add_scalar("cer/epoch", cer, epoch)
            print("Epoch {}: Loss= {}, CER = {}".format(epoch, epoch_loss, cer))
            torch.save(model, "pretrained/model_{}.pth".format(epoch))
    finally:
        # Flush and release the event file even when training is interrupted.
        writer.close()


def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    groups = iter(optimizer.param_groups)
    try:
        first_group = next(groups)
    except StopIteration:
        return None
    return first_group["lr"]


def eval(model, dataloader):
    """Compute the average character error rate of ``model`` on ``dataloader``.

    Each batch is decoded with a ``GreedyDecoder`` built from the dataset's
    label list. The per-utterance edit distance is divided by the reference
    length and the sum is averaged over the dataset size.

    Args:
        model: Network called as ``model(x, x_lens)`` returning
            ``(outs, out_lens)``. Inputs are moved to the device of its first
            parameter (falls back to "cuda" if it has none).
        dataloader: Loader yielding ``(x, y, x_lens, y_lens)`` batches whose
            ``dataset`` exposes ``labels_str``.

    Returns:
        Mean CER over the dataset; 0 for an empty dataset.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    model.eval()
    decoder = GreedyDecoder(dataloader.dataset.labels_str)
    cer = 0
    print("decoding")
    try:
        with torch.no_grad():
            for x, y, x_lens, y_lens in tqdm(dataloader):
                x = x.to(device)
                outs, out_lens = model(x, x_lens)
                # Softmax over dim 1, then swap dims 1 and 2 for the decoder.
                # NOTE(review): presumably (batch, classes, time) -> (batch, time, classes) — confirm.
                outs = F.softmax(outs, 1)
                outs = outs.transpose(1, 2)
                # y is one flat tensor of all targets; cut it back into
                # per-utterance slices using the recorded lengths.
                ys = []
                offset = 0
                for y_len in y_lens.tolist():
                    ys.append(y[offset : offset + y_len])
                    offset += y_len
                out_strings, out_offsets = decoder.decode(outs, out_lens)
                y_strings = decoder.convert_to_strings(ys)
                for pred, truth in zip(out_strings, y_strings):
                    trans, ref = pred[0], truth[0]
                    # max(..., 1) keeps an empty reference from dividing by zero.
                    cer += decoder.cer(trans, ref) / float(max(len(ref), 1))
            n_samples = len(dataloader.dataset)
            if n_samples:
                cer /= n_samples
    finally:
        # Always hand the model back in training mode, even if decoding failed.
        model.train()
    return cer


if __name__ == "__main__":
    # Script entry point: build the vocabulary string from the label list,
    # create the model on the GPU and start training with default settings.
    with open("/content/data_aishell/labels.json") as label_file:
        label_list = json.load(label_file)
    vocabulary = "".join(label_list)

    model = GatedConv(vocabulary)
    model.to("cuda")
    train(model)

Overwriting /content/masr/train.py


In [0]:
# Run training from inside the masr checkout so train.py's local imports
# (data, models.conv, decoder) resolve.
%cd /content/masr
!python train.py

/content/masr
[1/1000][0/1877]	Loss = 130.9761199951172
[1/1000][1/1877]	Loss = 129.25137329101562
[1/1000][2/1877]	Loss = 106.64618682861328
[1/1000][3/1877]	Loss = 24.069087982177734
[1/1000][4/1877]	Loss = 20.969749450683594
[1/1000][5/1877]	Loss = 9.54920768737793
[1/1000][6/1877]	Loss = 44.24606704711914
[1/1000][7/1877]	Loss = 12.4298677444458
[1/1000][8/1877]	Loss = 16.844316482543945
[1/1000][9/1877]	Loss = 8.909416198730469
[1/1000][10/1877]	Loss = 23.530921936035156
[1/1000][11/1877]	Loss = 11.121297836303711
[1/1000][12/1877]	Loss = 12.508390426635742
[1/1000][13/1877]	Loss = 9.294649124145508
[1/1000][14/1877]	Loss = 9.199973106384277
[1/1000][15/1877]	Loss = 9.430288314819336
[1/1000][16/1877]	Loss = 9.182867050170898
[1/1000][17/1877]	Loss = 8.904518127441406
[1/1000][18/1877]	Loss = 8.962848663330078
[1/1000][19/1877]	Loss = 8.982231140136719
[1/1000][20/1877]	Loss = 9.240804672241211
[1/1000][21/1877]	Loss = 9.356718063354492
[1/1000][22/1877]	Loss = 12.035593032836914
