# 构建语音识别系统

## 数据集下载

1. 下载数据集：[BZNSYP](https://www.data-baker.com/data/index/TNtts/)
- 大约12小时
- 来自同一个说话人
- 本次实验我们采用pinyin作为我们的识别结果，来构建一个语音识别系统


2. 构建数据集
- 将数据集放到dataset目录下，dataset目录如下：

    ```
    dataset
    ├── PhoneLabeling
    │   ├── 000001.interval
    ├── ProsodyLabeling
    │   ├── 000001-010000.txt
    ├── Wave
    │   ├── 000001.wav
    ```

- 运行 `splitdata/split_data.py` 划分数据集，最后dataset目录下会多一个split目录

    ```
    dataset
    ├── split
    │   ├── train
    │   │   ├── wav.scp
    │   │   ├── pinyin
    │   ├── dev
    │   │   ├── wav.scp
    │   │   ├── pinyin
    │   ├── test
    │   │   ├── wav.scp
    │   │   ├── pinyin
    ```

## 数据提取

数据提取的框架已经构建好，位于 data/dataloader.py 的BZNSYP类中，需要完成：
- 语音特征提取
- 文本处理

1. 语音特征提取(`data.dataloader.extract_audio_features`函数)

    -  可以把实验一完成的特征提取给放进去
    -  要保证返回的结果为一个tensor，并且维度为(L,f)

2. 文本处理
    - 构建tokenizer：`tokenizer.tokenizer.Tokenizer`
    - tokenizer需要做到：
        - 将一段字符映射成token id
        - 需要完成Tokenizer框架中的TODO

### tokenizer构建

1. 构建字典
    - 构建字典的文件已经写好，tokenizer/gen_vocab.py
    - 可以修改一下对应的路径，vocab结果如下

        ```
        huang
        cheng
        lo
        ...
        ```

2. 完成Tokenizer的TODO部分
    - call函数
    - decode函数
    - 注意特殊字符\<pad\>,\<unk\>, \<sos\>, \<eos\>,\<blk\>, " "

In [1]:
from tokenizer.tokenizer import Tokenizer
pinyin_list1 = ['wo', 'men', 'cheng', 'shi', 'de', 'fu', 'su', 'you', 'lai']
pinyin_list2 = ["A", 'wo', 'men', 'cheng']
tokenizer = Tokenizer("./tokenizer/vocab.txt")

In [2]:
id_list1 = tokenizer(pinyin_list1)
print(tokenizer.decode(id_list1))

['wo', 'men', 'cheng', 'shi', 'de', 'fu', 'su', 'you', 'lai']


In [3]:
id_list2 = tokenizer(pinyin_list2)
print(tokenizer.decode(id_list2))

['wo', 'men', 'cheng']


## 数据构建

In [5]:
import numpy as np

def pre_emphasis(x: np.ndarray, alpha: float = 0.97) -> np.ndarray:
    return np.append(x[0], x[1:] - alpha * x[:-1])

def framing(x: np.ndarray, sr: int, frame_lenth: float = 0.025, frame_gap: float = 0.010) -> np.ndarray:
    frame_len, frame_step = int(round(frame_lenth * sr)), int(round(frame_gap * sr))
    signal_len = len(x)

    if signal_len <= frame_len:
        num_frames = 1
    else:
        num_frames = 1 + int(np.ceil((signal_len - frame_len) / frame_step))

    pad_signal_length = (num_frames - 1) * frame_step + frame_len
    amount_to_pad = pad_signal_length - signal_len
    
    pad_signal = np.pad(x, (0, max(0, amount_to_pad)), mode='constant', constant_values=0)
    
    frame_indices_offset = np.arange(frame_len)
    frame_start_points = np.arange(num_frames) * frame_step
    
    indices = frame_start_points[:, np.newaxis] + frame_indices_offset[np.newaxis, :]
    
    frames = pad_signal[indices.astype(np.int32, copy=False)]
    return frames

def add_window(frame_sig: np.ndarray, sr: int, frame_len_s: float = 0.025) -> np.ndarray:
    window = np.hamming(int(round(frame_len_s * sr)))
    return frame_sig * window

def stft(frame_sig: np.ndarray, nfft: int = 512) -> tuple[np.ndarray, np.ndarray]:
    frame_spec = np.fft.rfft(frame_sig, n=nfft)
    frame_mag = np.abs(frame_spec)
    frame_pow = (frame_mag ** 2) / nfft
    return frame_mag, frame_pow

def get_filter_banks(sr, n_filters=40, nfft=512):
    low_freq_mel = 0
    high_freq_mel = 2595 * np.log10(1 + (sr / 2) / 700)
    
    mel_points = np.linspace(low_freq_mel, high_freq_mel, n_filters + 2)
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)
    bins = np.floor((nfft + 1) * hz_points / sr).astype(int)
    filter_banks = np.zeros((n_filters, nfft // 2 + 1))
    fft_freqs = np.arange(nfft // 2 + 1)
    
    for i in range(n_filters):
        left, center, right = bins[i:i+3]

        left_mask = (left <= fft_freqs) & (fft_freqs < center)
        if center != left:
            filter_banks[i, left_mask] = (fft_freqs[left_mask] - left) / (center - left)

        right_mask = (center <= fft_freqs) & (fft_freqs < right)
        if right != center:
            filter_banks[i, right_mask] = (right - fft_freqs[right_mask]) / (right - center)
    
    return filter_banks

def get_fbank(frame_pow: np.ndarray, filter_banks: np.ndarray) -> np.ndarray:
    return np.dot(frame_pow, filter_banks.T)

In [8]:
from torch.utils.data import DataLoader, Dataset
from tokenizer.tokenizer import Tokenizer
import torch
import random
import os
from utils.utils import collate_with_PAD
import librosa

def extract_audio_features(wav_file:str)->torch.Tensor:

    def calc_fbank(x: np.ndarray, sr: int = 16000, n_filters: int = 40, nfft: int = 512) -> np.ndarray:
        x = pre_emphasis(x)
        frames = framing(x, sr)
        frames = add_window(frames, sr)
        frame_mag, frame_pow = stft(frames, nfft)
        filter_banks = get_filter_banks(sr, n_filters, nfft)
        fbank = get_fbank(frame_pow, filter_banks)
        return fbank

    if not isinstance(wav_file, str):
        raise TypeError(f"Expected string for wav_file")

    y, sr = librosa.load(wav_file, sr=None)
    fbank = calc_fbank(y, sr=sr, n_filters=80, nfft=512)

    res = torch.from_numpy(fbank).float()

    if not isinstance(res, torch.Tensor):
        raise TypeError("Return value must be torch.Tensor")
    return res


class BZNSYP(Dataset):
    def __init__(self, wav_file, text_file, tokenizer):
        self.tokenizer = tokenizer
        self.wav2path = {}
        self.wav2text = {}
        self.ids = []

        with open(wav_file, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t", 1)
                if len(parts) == 2:
                    id = parts[0]
                    self.ids.append(id)
                    path = "./dataset/" + parts[1]
                    self.wav2path[id] = path
                else:
                    raise ValueError(f"Invalid line format: {line}")

        with open(text_file, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.strip().split("\t", 1)
                if len(parts) == 2:
                    id = parts[0]
                    pinyin_list = parts[1].split(" ")
                    self.wav2text[id] = self.tokenizer(["<sos>"]+pinyin_list+["<eos>"])
                else:
                    raise ValueError(f"Invalid line format: {line}")
    
    def __len__(self):
        return len(self.wav2path)
    
    def __getitem__(self, index):
        id = list(self.wav2path.keys())[index]
        path = self.wav2path[id]
        text = self.wav2text[id]
        return id, extract_audio_features(path), text
    

def get_dataloader(wav_file, text_file, batch_size, tokenizer, shuffle=True):
    dataset = BZNSYP(wav_file, text_file, tokenizer)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_with_PAD
    )
    return dataloader

In [9]:
tokenizer = Tokenizer()
dataloader = get_dataloader("./dataset/split/train/wav.scp", "./dataset/split/train/pinyin", 3, tokenizer, shuffle = False)
input = None
for batch in dataloader:
    input = batch
    break

audio_lens = input["audio_lens"]
audios = input["audios"]
texts = input["texts"]
text_lens = input["text_lens"]

In [10]:
print(input.keys())

dict_keys(['ids', 'audios', 'audio_lens', 'texts', 'text_lens'])


In [11]:
# audio
print(audio_lens)
print(audios[0, : , :])

tensor([265, 285, 439], dtype=torch.int32)
tensor([[0.0000e+00, 5.4336e-10, 0.0000e+00,  ..., 8.3642e-09, 3.3812e-08,
         9.6364e-08],
        [0.0000e+00, 7.9692e-10, 0.0000e+00,  ..., 9.7250e-09, 1.6037e-08,
         4.6284e-08],
        [0.0000e+00, 1.0147e-09, 0.0000e+00,  ..., 8.5312e-09, 3.1178e-08,
         7.5750e-08],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]])


In [12]:
# text
texts = texts.tolist()

for text in texts:
    print(tokenizer.decode(text))
    print(tokenizer.decode(text, ignore_special=False))

['ka', 'e', 'er', 'pu', 'pei', 'wai', 'sun', 'wan', 'hua', 'ti']
['<sos>', 'ka', 'e', 'er', 'pu', 'pei', 'wai', 'sun', 'wan', 'hua', 'ti', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']
['jia', 'yu', 'cun', 'yan', 'bie', 'zai', 'yong', 'bao', 'wo']
['<sos>', 'jia', 'yu', 'cun', 'yan', 'bie', 'zai', 'yong', 'bao', 'wo', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['bao', 'ma', 'pei', 'gua', 'bo', 'luo', 'an', 'diao', 'chan', 'yuan', 'zhen', 'dong', 'weng', 'ta']
['<sos>', 'bao', 'ma', 'pei', 'gua', 'bo', 'luo', 'an', 'diao', 'chan', 'yuan', 'zhen', 'dong', 'weng', 'ta', '<eos>']
