### 01 安装需要的依赖和部署预训练模型

In [None]:
# Install the GPU build of PaddlePaddle (pinned to 2.4.1.post112) from the
# wheel index given by -f.
!python -m pip install paddlepaddle-gpu==2.4.1.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
# Python packages used by the later cells. NOTE(review): these three are not
# version-pinned, so a fresh runtime may resolve different releases.
!python -m pip install ppasr
!pip install pypinyin
!python -m pip install paddlespeech-ctcdecoders
# Fetch the PPASR fork and install it from source. The %cd below assumes the
# clone was made while the working directory was /content.
!git clone https://github.com/AlexandaJerry/PPASR.git
%cd /content/PPASR
!python setup.py install

In [5]:
# Download PPASR.zip from Google Drive into the current directory. The inner
# wget saves the session cookies and sed extracts the "confirm" token from the
# response; the outer wget then requests the file with that token, and the
# cookie file is removed afterwards.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1e7t3lOPj0PAgQHWVot0dRw8_NRoP27IN' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1e7t3lOPj0PAgQHWVot0dRw8_NRoP27IN" -O "PPASR.zip" && rm -rf /tmp/cookies.txt
# The archive is read from /content/PPASR, so the download above must have run
# with /content/PPASR as the working directory.
!unzip /content/PPASR/PPASR.zip -d /content
# Copy the extracted configs, dataset files and model weights into the
# matching directories of the PPASR checkout.
!cp -RT /content/PPASR_V2-conformer_online-fbank-超大数据集/configs/ /content/PPASR/configs/
!cp -RT /content/PPASR_V2-conformer_online-fbank-超大数据集/dataset/ /content/PPASR/dataset/
!cp -RT /content/PPASR_V2-conformer_online-fbank-超大数据集/models/ /content/PPASR/models/
%cd /content/PPASR
# Run the repository's export script on the copied best_model checkpoint.
# NOTE(review): presumably this writes the inference model used by the
# transcription step -- confirm against export_model.py.
!python export_model.py --resume_model=models/conformer_online_fbank/best_model/

Archive:  /content/PPASR/PPASR.zip
   creating: /content/PPASR_V2-conformer_online-fbank-超大数据集/
   creating: /content/PPASR_V2-conformer_online-fbank-超大数据集/configs/
  inflating: /content/PPASR_V2-conformer_online-fbank-超大数据集/configs/conformer_online_zh.yml  
   creating: /content/PPASR_V2-conformer_online-fbank-超大数据集/dataset/
  inflating: /content/PPASR_V2-conformer_online-fbank-超大数据集/dataset/mean_istd.json  
  inflating: /content/PPASR_V2-conformer_online-fbank-超大数据集/dataset/vocabulary.txt  
   creating: /content/PPASR_V2-conformer_online-fbank-超大数据集/models/
   creating: /content/PPASR_V2-conformer_online-fbank-超大数据集/models/conformer_online_fbank/
   creating: /content/PPASR_V2-conformer_online-fbank-超大数据集/models/conformer_online_fbank/best_model/
  inflating: /content/PPASR_V2-conformer_online-fbank-超大数据集/models/conformer_online_fbank/best_model/model.pdparams  




---



---



### 02 导入谷歌云盘中的音频压缩文件

In [8]:
# Mount Google Drive at /content/drive so later cells can read the audio
# archive from it and copy the results back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
wav_file = 'data.zip'#@param {type:"string"}
wav_path = "/content/drive/MyDrive/" + wav_file
!unzip wav_path -d /content/PPASR/raw_wav



---



---



### 03 长音频自动切片为短音频

In [None]:
import glob
import os
import shutil
import wave

import librosa
import soundfile

def length(src: str):
    """Return the total duration, in hours, of the WAV audio found at ``src``.

    ``src`` may be a single file or a directory. A file whose name ends in
    ``.wav`` contributes ``frames / frame rate / 3600``; a directory
    contributes the sum over all of its entries, visited recursively.
    Anything else (other file types, missing paths) contributes 0.
    """
    if os.path.isfile(src) and src.endswith('.wav'):
        with wave.open(src, 'r') as reader:
            frame_count = reader.getnframes()
            frame_rate = reader.getframerate()
        return frame_count / frame_rate / 3600
    if os.path.isdir(src):
        # Recurse into every entry; non-WAV files simply add 0.
        return sum(length(os.path.join(src, entry)) for entry in os.listdir(src))
    return 0


print('Environment initialized successfully.')

# Configuration for data paths
raw_path = '/content/PPASR/raw_wav'  # Path to your raw, unsliced recordings

########################################

assert os.path.exists(raw_path) and os.path.isdir(raw_path), 'The chosen path does not exist or is not a directory.'
print('Raw recording path:', raw_path)
print()
print('===== Recording List =====')
# recursive=True only takes effect when the pattern contains '**'. Without it,
# recordings extracted into a sub-folder were counted by length() below but
# never listed or sliced. Sorting keeps the preview order deterministic.
# NOTE: sliced segments are named after the recording's base name, so
# recordings in different sub-folders should not share a file name.
raw_filelist = sorted(glob.glob(os.path.join(raw_path, '**', '*.wav'), recursive=True))
raw_length = length(raw_path)
# Preview at most five paths to keep the cell output short.
if len(raw_filelist) > 5:
    print('\n'.join(raw_filelist[:5] + [f'... ({len(raw_filelist) - 5} more)']))
else:
    print('\n'.join(raw_filelist))
print()
print(f'Found {len(raw_filelist)} valid recordings with total length of {round(raw_length, 2)} hours.')

sliced_path = '/content/PPASR/sliced_wav'  # Path to hold the sliced segments of your recordings

# Slicer arguments, passed unchanged to Slicer(...) in slice_one().
# They are declared as numeric form fields: with type "string" the Colab form
# would store an edited value as a quoted string instead of a number.
db_threshold_ = -40. #@param {type:"number"}
min_length_ = 15000 #@param {type:"integer"}
win_l_ = 800 #@param {type:"integer"}
win_s_ = 40 #@param {type:"integer"}
max_silence_kept_ = 1000 #@param {type:"integer"}

# Number of threads (based on your CPU kernels)
num_workers = 5

########################################

assert 'raw_path' in locals().keys(), 'Raw path of your recordings has not been specified.'
assert not os.path.exists(sliced_path) or os.path.isdir(sliced_path), 'The chosen path is not a directory.'
os.makedirs(sliced_path, exist_ok=True)
print('Sliced recording path:', sliced_path)

from slicer import Slicer
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

def slice_one(in_audio):
    """Slice one recording and write each resulting chunk into ``sliced_path``.

    The recording is loaded with ``sr=None`` and handed to a ``Slicer``
    configured from the module-level slicer arguments. Every chunk is saved as
    ``<recording name without extension>_slice_<4-digit index>.wav`` using the
    sampling rate returned by the loader.
    """
    samples, sample_rate = librosa.load(in_audio, sr=None)
    slicer_options = dict(
        sr=sample_rate,
        db_threshold=db_threshold_,
        min_length=min_length_,
        win_l=win_l_,
        win_s=win_s_,
        max_silence_kept=max_silence_kept_,
    )
    segments = Slicer(**slicer_options).slice(samples)
    # Base name of the input with only its last extension removed.
    stem = os.path.basename(in_audio).rsplit('.', maxsplit=1)[0]
    for index, segment in enumerate(segments):
        out_name = '%s_slice_%04d.wav' % (stem, index)
        soundfile.write(os.path.join(sliced_path, out_name), segment, sample_rate)


print('Slicing your recordings may take several minutes. Please wait.')
# The context manager shuts the pool down once all tasks are finished, so no
# worker threads are left behind when the cell is re-run.
with ThreadPoolExecutor(max_workers=num_workers) as thread_pool:
    tasks = [thread_pool.submit(slice_one, file) for file in raw_filelist]
    wait(tasks, return_when=ALL_COMPLETED)
# wait() never re-raises exceptions from the workers; report them explicitly
# so a recording that failed to slice does not go unnoticed.
for file, task in zip(raw_filelist, tasks):
    error = task.exception()
    if error is not None:
        print(f'Failed to slice {file}: {error!r}')
print()
print('===== Segment List =====')
# All segments are written directly into sliced_path, so a flat pattern is enough.
sliced_filelist = sorted(glob.glob(f'{sliced_path}/*.wav'))
sliced_length = length(sliced_path)
# Preview at most five paths to keep the cell output short.
if len(sliced_filelist) > 5:
    print('\n'.join(sliced_filelist[:5] + [f'... ({len(sliced_filelist) - 5} more)']))
else:
    print('\n'.join(sliced_filelist))
print()
print(f'Sliced your recordings into {len(sliced_filelist)} segments with total length of {round(sliced_length, 2)} hours.')



---



---



### 04 切片后的短音频自动转写

In [20]:
%%shell
cd /content/PPASR

# Run the recognizer once per sliced segment. The glob only covers the top
# level of ./sliced_wav; it does not descend into sub-directories.
for file in ./sliced_wav/*.wav; do
  # When nothing matches, the shell leaves the pattern as a literal string; skip it.
  [ -e "$file" ] || continue
  # "$file" is the path of the segment; the quotes keep names with spaces intact.
  python infer_path.py --wav_path="$file"
done

初始化解码器...
language model: model path = lm/zh_giga.no_cna_cmn.prune01244.klm, is_character_based = True, max_order = 5, dict_size = 0
初始化解码器完成!
[[32m2022-12-20 15:23:51.423210[0m [1m[37mINFO   [0m] [36minference_predictor[0m:[36m__init__[0m:[36m71[0m - [1m[37m已加载模型：models/conformer_online_fbank/infer/[0m
消耗时间：654ms, 识别结果: 朋友是律师爸爸喜欢看书哥哥是医生我喜欢看电视, 得分: -3
初始化解码器...
language model: model path = lm/zh_giga.no_cna_cmn.prune01244.klm, is_character_based = True, max_order = 5, dict_size = 0
初始化解码器完成!
[[32m2022-12-20 15:24:11.416380[0m [1m[37mINFO   [0m] [36minference_predictor[0m:[36m__init__[0m:[36m71[0m - [1m[37m已加载模型：models/conformer_online_fbank/infer/[0m
消耗时间：892ms, 识别结果: 他们是护士高鹏和徐涛是邻居他不叫高鹏, 得分: 2
executing 'slice' costed 47.126s
初始化解码器...
language model: model path = lm/zh_giga.no_cna_cmn.prune01244.klm, is_character_based = True, max_order = 5, dict_size = 0
初始化解码器完成!
executing 'slice' costed 49.326s
[[32m2022-12-20 15:24:30.897274[0m [1m[37mINFO   [0m] 



In [21]:
import os
import sys
import numpy as np
from pypinyin import pinyin, lazy_pinyin, Style
import re

root_dir = "/content/PPASR/sliced_wav"
pattern = re.compile(r'(.*)\.txt$')
# Characters removed from each transcript before conversion. The hyphen is
# escaped so it is matched literally: unescaped, "+-=" is a character range
# that also strips digits, ':', ';' and '<'.
r = r"[_.!+\-=——,$%^,。?、~@#￥%……&*《》<>「」{}【】()（）/''\n ]"

# Replace the content of every .txt transcript under root_dir with its
# space-separated, tone-numbered pinyin.
# NOTE: each file is overwritten in place, so this cell is meant to run once
# per transcription; a second run would process the pinyin text itself.
for root, _dirs, files in os.walk(root_dir):
    for filename in files:
        output = pattern.match(filename)
        if output is None:
            continue
        print(root, filename)
        transcript_path = root + "/" + filename
        # Context managers close the handles even if the conversion fails.
        with open(transcript_path, encoding="utf-8") as text_file:
            line = text_file.read().strip()
        line = line.replace("，", "")
        line = re.sub(r, "", line)
        # A separate name avoids shadowing the imported pypinyin.pinyin function.
        syllables = lazy_pinyin(line, style=Style.TONE3, errors='default', strict=False, v_to_u=False, neutral_tone_with_five=True,tone_sandhi=True)
        pinyinline = ' '.join(syllables)
        print(line)
        with open(root + "/" + output.group(1) + ".txt", "w", encoding="utf-8") as target_text_file:
            target_text_file.write(pinyinline)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pypinyin
  Downloading pypinyin-0.47.1-py2.py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 31.8 MB/s 
[?25hInstalling collected packages: pypinyin
Successfully installed pypinyin-0.47.1
/content/PPASR/sliced_wav alv0805_slice_0007.txt
弟弟喜欢跳舞许涛没有女儿爸爸喜欢看书哥哥是医生
/content/PPASR/sliced_wav lj1216_slice_0001.txt
他们是护士高鹏和徐涛是邻居他不叫高鹏
/content/PPASR/sliced_wav hp0317_slice_0003.txt
高鹏和许涛是邻居我七点一刻去见律师他们是护士打球可是我不喜欢打球
/content/PPASR/sliced_wav aam0511_slice_0009.txt
我叫木有房高峰是律师高峰住在二十六
/content/PPASR/sliced_wav aam0511_slice_0010.txt
我我也在二楼住我是一个中文的学生我觉得学学中文还有一次
/content/PPASR/sliced_wav alv0805_slice_0002.txt
弟弟喜欢跳舞打球可是我不喜欢打球才没有女人
/content/PPASR/sliced_wav lj1216_slice_0007.txt
哥哥是医生我喜欢看电视八月八号是星期五朋友是律师他叫高鹏
/content/PPASR/sliced_wav aam0511_slice_0018.txt
十一三十在二十六现在五十五十三十
/content/PPASR/sliced_wav alv0805_slice_0013.txt
二十五现在六点半我们去听音乐会吧好吗
/content/PPASR/sliced_wav al

In [None]:
# Optional spot check (disabled): play one sliced segment and print its
# transcript. Uncomment every line below to enable it. The print line is now
# commented out as well: left active on its own it was an indented statement
# with no enclosing block, so the cell failed with an IndentationError.
#from IPython.display import Audio
#wn = Audio('/content/PPASR/sliced_wav/aam0511_slice_0006.wav', autoplay=True)
#with open('/content/PPASR/sliced_wav/aam0511_slice_0006.txt', 'r') as f:
#    print(f.read())
#display(wn)

### 05 保存短音频和对应转写到谷歌云盘

In [24]:
import os, tarfile
import os
from google.colab import files
save_as = 'mfa_prepare.tar'#@param {type:"string"}
def make_targz_one_by_one(output_filename, source_dir):
  """Pack every file found under ``source_dir`` into a tar archive.

  The directory tree is walked recursively and each regular file is added
  under the path it was found at. Despite the function name, the archive is
  opened in mode "w" and is therefore an uncompressed tar.

  Args:
    output_filename: Path of the tar archive to create (overwritten if present).
    source_dir: Directory whose files are added to the archive.
  """
  # The context manager closes the archive even if adding a file fails.
  with tarfile.open(output_filename, "w") as tar:
    for root, dir_name, files_list in os.walk(source_dir):
      for file in files_list:
        tar.add(os.path.join(root, file))

  # files.download(output_filename)
 
make_targz_one_by_one(save_as , '/content/PPASR/sliced_wav')
savename = "/content/PPASR" + save_as
!cp -r savename /content/drive/MyDrive