In [2]:
task = "automatic-speech-recognition"
language = "zh"

model_name = "openai/whisper-base"
model_dir = "../models/"

dataset_name = "mozilla-foundation/common_voice_17_0"
dataset_dir = "../data"

In [3]:
from datasets import load_dataset
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(model_name, cache_dir="../model/")
cv_zh_train = load_dataset(dataset_name, "zh-CN", split="train", streaming=True, cache_dir=dataset_dir)

ModuleNotFoundError: No module named 'datasets'

In [3]:
from datasets import Audio

cv_zh_train = cv_zh_train.cast_column("audio", Audio(sampling_rate=16_000))

In [4]:
sample = next(iter(cv_zh_train))

Reading metadata...: 29406it [00:02, 11835.90it/s]


In [5]:
sample

{'client_id': 'fa2b87636bc8f776d7f8e1c4fcf51892f5f5cb7556e489f2fa81d91728cac50d5525856f59cc5a7d68fc0cf7265ad89af0f7aa97d3662b3c7586197ea5f92fd8',
 'path': 'zh-CN_train_0/common_voice_zh-CN_19428636.mp3',
 'audio': {'path': 'zh-CN_train_0/common_voice_zh-CN_19428636.mp3',
  'array': array([ 3.10862447e-14, -1.06581410e-14, -1.59872116e-14, ...,
          9.38428229e-07,  2.90993967e-07, -5.94695848e-07]),
  'sampling_rate': 16000},
 'sentence': '沃内贝尔格是德国巴伐利亚州的一个市镇。',
 'up_votes': 2,
 'down_votes': 0,
 'age': 'thirties',
 'gender': 'male_masculine',
 'accent': '出生地：23 黑龙江省',
 'locale': 'zh-CN',
 'segment': '',
 'variant': ''}

In [6]:
from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE

chinese_code = TO_LANGUAGE_CODE["chinese"]
japanese_code = TO_LANGUAGE_CODE["japanese"]
french_code = TO_LANGUAGE_CODE["french"]

In [7]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained(model_name, language='zh', task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
def prepare_dataset(sample):
    audio = sample["audio"]

    sample = processor(
        audio=audio["array"],
        sampling_rate=audio["sampling_rate"],
        text=sample["sentence"],
    )

    # compute input length of audio sample in seconds
    sample["input_length"] = len(audio["array"]) / audio["sampling_rate"]

    return sample

In [9]:
sample = prepare_dataset(sample.copy())

In [10]:
sample

{'input_features': array([[[-0.67743826, -0.67743826, -0.67743826, ..., -0.67743826,
         -0.67743826, -0.67743826],
        [-0.67743826, -0.67743826, -0.67743826, ..., -0.67743826,
         -0.67743826, -0.67743826],
        [-0.67743826, -0.67743826, -0.67743826, ..., -0.67743826,
         -0.67743826, -0.67743826],
        ...,
        [-0.67743826, -0.67743826, -0.67743826, ..., -0.67743826,
         -0.67743826, -0.67743826],
        [-0.67743826, -0.67743826, -0.67743826, ..., -0.67743826,
         -0.67743826, -0.67743826],
        [-0.67743826, -0.67743826, -0.67743826, ..., -0.67743826,
         -0.67743826, -0.67743826]]], dtype=float32), 'labels': [50258, 50260, 50359, 50363, 3308, 225, 34742, 18464, 251, 1530, 242, 30921, 1541, 35898, 16086, 38325, 7384, 238, 23700, 1369, 248, 46974, 1546, 20182, 27261, 12373, 229, 1543, 50257], 'input_length': 9.336}

In [11]:
import torch

# Turn np.array to torch.tensor
input_features = sample['input_features']
input_features_tensor = torch.tensor(input_features, dtype=torch.float32)

with torch.no_grad():
    prediction = model.generate(input_features=input_features_tensor)

transcription = processor.decode(*prediction)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


In [14]:
type(transcription)

str

In [15]:
import re

def extract_chinese(text):
    chinese_characters = re.findall(r'[\u4e00-\u9fff]+', text)
    return ''.join(chinese_characters)

transcription = extract_chinese(transcription)
transcription

'莫內辯格式德國巴哈利亞州的一個市政'

In [16]:
import opencc
converter = opencc.OpenCC('t2s.json')
converter.convert(transcription) 

'莫内辩格式德国巴哈利亚州的一个市政'