## 1 配置

In [1]:
import os

# Send both HTTP and HTTPS traffic through the same local proxy.
os.environ['http_proxy'] = os.environ['https_proxy'] = 'http://127.0.0.1:1087'
# Expose only GPU index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Dataset and task selection.
dataset_name = "mozilla-foundation/common_voice_11_0"
task = "transcribe"

# Language settings.
# NOTE(review): language_decode is not referenced by any later visible cell.
language = "Chinese (China)"
language_abbr = "zh-CN"
language_decode = "chinese"

# Model locations: model_dir holds the PEFT adapter that is evaluated below.
# NOTE(review): model_name_or_path is an absolute local path and is not referenced by any
# later visible cell (the base model is taken from the adapter config instead).
model_name_or_path = "/home/cc/models/asr/whisper-large-v2"
model_dir = "models/whisper-large-v2-asr-int8"


## 2 导入模型和测试数据集

In [2]:
from transformers import AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoProcessor
from peft import PeftConfig, PeftModel
import torch 


# Read the adapter configuration stored in model_dir; it names the base model to load.
peft_config = PeftConfig.from_pretrained(model_dir)

# Load the base speech seq2seq model named by the adapter config, without 8-bit loading.
base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    peft_config.base_model_name_or_path,
    load_in_8bit=False,
    device_map="auto",
)

# Wrap the base model with the adapter saved in model_dir.
peft_model = PeftModel.from_pretrained(base_model, model_dir)

In [None]:
from datasets import load_dataset

# Load only the first 1000 samples of the test split (split slicing syntax).
common_voice_test = load_dataset(
    path=dataset_name,
    name=language_abbr,
    split="test[:1000]",
    trust_remote_code=True,
)

In [None]:
# Shuffle with a fixed seed, then keep the first 10 rows of the shuffled set.
common_voice_test = common_voice_test.shuffle(seed=16)
common_voice_test = common_voice_test.select(range(10))

In [4]:
# Display the dataset summary (feature names and row count) as the cell output.
common_voice_test

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 10
})

## 3 处理测试数据集

In [5]:
# Drop the metadata columns; the later cells only read "audio" and "sentence".
common_voice_test = common_voice_test.remove_columns(
    [
        "accent",
        "age",
        "client_id",
        "down_votes",
        "gender",
        "locale",
        "path",
        "segment",
        "up_votes",
    ]
)

In [6]:
# Load the tokenizer and processor of the base model named in the adapter config,
# both configured with the notebook's language and task.
tokenizer = AutoTokenizer.from_pretrained(
    peft_config.base_model_name_or_path,
    language=language,
    task=task,
)
processor = AutoProcessor.from_pretrained(
    peft_config.base_model_name_or_path,
    language=language,
    task=task,
)
# The feature extractor is taken from the processor and used by the preprocessing cell.
feature_extractor = processor.feature_extractor

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Audio is required by the cast below and is not imported by any earlier cell;
# importing it here keeps the cell runnable on a fresh kernel.
from datasets import Audio

# Resample the audio column to 16 kHz.
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16000))


# Preprocessing
def prepare_test_dataset(batch):
    """Add an ``input_features`` entry to one dataset example.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries), passes them to
    the module-level ``feature_extractor`` and stores the first element of the returned
    ``input_features`` under ``batch["input_features"]``.

    Returns:
        The same ``batch`` object with the added ``"input_features"`` entry.
    """
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    return batch


# Apply the preprocessing to every example of the test subset.
test_dataset = common_voice_test.map(prepare_test_dataset)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

## 4 评估函数
评估指标：
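 - wer: 0.5，表示词错误率（Word Error Rate）为0.5（即50%）。注意：中文文本的词与词之间没有空格，而 jiwer 默认按空格切分单词，因此在未分词的情况下每个句子会被当作一个“词”，该数值实际上等于 1 - sentence_accuracy；评估中文识别效果时应主要参考 cer。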
 - cer: 0.10256410256410256，表示字错误率（Character Error Rate）约为10.26%
 - sentence_accuracy: 0.5，表示预测文本与参考文本完全一致的句子所占比例（本例 10 句中有 5 句完全一致）
 - predictions: 一个包含10个预测字符串的列表
 - references: 一个包含10个参考字符串（真实值）的列表

In [40]:

from transformers import WhisperProcessor
import jiwer 
import torch
import re

def quick_evaluate(model, test_dataset, processor, term_list=None, batch_size=2, max_new_tokens=255):
    """Transcribe ``test_dataset`` in batches and score the transcripts against the references.

    Args:
        model: Object exposing ``generate(input_features=..., max_new_tokens=...)`` and a
            ``device`` attribute.
        test_dataset: Sized, sliceable collection; ``test_dataset[i:j]`` must return a mapping
            with ``"input_features"`` and ``"sentence"`` entries.
        processor: Object whose ``batch_decode(ids, skip_special_tokens=True)`` returns the
            decoded predictions for a batch of generated ids.
        term_list: Accepted for backward compatibility; not used by the evaluation.
        batch_size: Number of samples sent to ``model.generate`` per step; must be >= 1.
        max_new_tokens: Generation length limit forwarded to ``model.generate``.

    Returns:
        dict with keys ``"wer"``, ``"cer"``, ``"sentence_accuracy"`` (fraction of predictions
        that are string-equal to their reference), ``"predictions"`` and ``"references"``.
        When no samples were evaluated, the three metrics are ``0.0`` and both lists are empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    NOTE(review): ``jiwer.wer`` is applied to the raw strings. For text written without spaces
    between words (e.g. Chinese) this presumably counts each sentence as a single word, so the
    value behaves like a sentence error rate — confirm against jiwer's default transform.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total = len(test_dataset)
    all_predictions = []
    all_references = []

    for i in range(0, total, batch_size):
        batch = test_dataset[i:i+batch_size]

        with torch.no_grad():
            # Generate token ids for the whole batch on the model's device.
            generated_ids = model.generate(
                input_features=torch.tensor(batch["input_features"]).to(model.device),
                max_new_tokens=max_new_tokens,
            )

        # Decode the predictions and collect them next to the reference sentences.
        all_predictions.extend(processor.batch_decode(generated_ids, skip_special_tokens=True))
        all_references.extend(batch["sentence"])
        print(f"已处理 {min(i+batch_size, total)}/{total} 条样本")

    # With nothing to score, skip jiwer (which cannot score an empty reference list) and
    # return the same 0.0 fallback the sentence accuracy already used for this case.
    if not all_references:
        return {
            "wer": 0.0,
            "cer": 0.0,
            "sentence_accuracy": 0.0,
            "predictions": all_predictions,
            "references": all_references,
        }

    # Metric computation.
    wer = jiwer.wer(all_references, all_predictions)
    cer = jiwer.cer(all_references, all_predictions)
    correct_sentences = sum(pred == ref for pred, ref in zip(all_predictions, all_references))
    sentence_accuracy = correct_sentences / len(all_references)

    return {
        "wer": wer,
        "cer": cer,
        "sentence_accuracy": sentence_accuracy,
        "predictions": all_predictions,
        "references": all_references,
    }


In [31]:
# Place the model on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    peft_model = peft_model.to("cuda")
else:
    peft_model = peft_model.to("cpu")
# Switch to evaluation mode; as the last expression, the returned module is shown as output.
peft_model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 1280)
          (layers): ModuleList(
            (0-31): 32 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1280, out_features=1280, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1280, out_features=4, bias=False)
                  )
                  (lora_B): Mo

## 5 评估

In [41]:
# Evaluate the adapter model on the preprocessed test subset with the default batch size.
evaluation_results = quick_evaluate(peft_model, test_dataset, processor)

已处理 2/10 条样本
已处理 4/10 条样本
已处理 6/10 条样本
已处理 8/10 条样本
已处理 10/10 条样本


In [42]:
# Display the full result dict (metrics, predictions and references) as the cell output.
evaluation_results

{'wer': 0.5,
 'cer': 0.10256410256410256,
 'sentence_accuracy': 0.5,
 'predictions': ['通过可以三人居住地到达现在居住。',
  '曾祖父刘文。',
  '莱斯特广场的广场。',
  '阿布罗维亚。',
  '齐。',
  '正方走出家门，看见一个有刺青的流氓在家门口前在乱丢垃圾，却对他感怒不敢言。',
  '妻子为香港行政会议成员胡红玉。',
  '西本正的摄影工作而闻名。',
  '它们还可以做早餐。',
  '近几年获奖无数。'],
 'references': ['通过科伊桑人居住地到达现在居处。',
  '曾祖父刘文。',
  '莱斯特广场的广场。',
  '阿布洛维尔。',
  '七',
  '正方走出家门，看见一个有刺青的流氓在家门口前在乱丢垃圾，却对他敢怒不敢言。',
  '妻子为香港行政会议成员胡红玉。',
  '西本正的摄影工作而闻名。',
  '他们可以做早餐',
  '近几年获奖无数。']}

## 6 预测与真实结果对比

In [18]:
# Print each prediction next to its reference; the trailing '\n' item plus print's own
# line ending leave two blank lines between pairs.
for hypothesis, target in zip(evaluation_results['predictions'], evaluation_results['references']):
    print('预测值：' + hypothesis, '真实值：' + target, '\n', sep='\n')

预测值：通过可以三人居住地到达现在居住。
真实值：通过科伊桑人居住地到达现在居处。


预测值：曾祖父刘文。
真实值：曾祖父刘文。


预测值：莱斯特广场的广场。
真实值：莱斯特广场的广场。


预测值：阿布罗维亚。
真实值：阿布洛维尔。


预测值：齐。
真实值：七


预测值：正方走出家门，看见一个有刺青的流氓在家门口前在乱丢垃圾，却对他感怒不敢言。
真实值：正方走出家门，看见一个有刺青的流氓在家门口前在乱丢垃圾，却对他敢怒不敢言。


预测值：妻子为香港行政会议成员胡红玉。
真实值：妻子为香港行政会议成员胡红玉。


预测值：西本正的摄影工作而闻名。
真实值：西本正的摄影工作而闻名。


预测值：它们还可以做早餐。
真实值：他们可以做早餐


预测值：近几年获奖无数。
真实值：近几年获奖无数。




从上面的对比可以看出以下几类问题：

- 专业名词识别问题（科伊桑人→可以三人）
- 同音/近音字处理缺陷（七→齐，敢→感）
- 冗余生成倾向（添加"还"和句号）