## 1 配置

In [1]:
import os

os.environ['http_proxy'] = 'http://127.0.0.1:1087'
os.environ['https_proxy'] = 'http://127.0.0.1:1087'
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

model_name_or_path = "/home/cc/models/asr/whisper-large-v2"
model_dir = "models/whisper-large-v2-asr-int8-fi"

language = "fi"
language_abbr = "fi"
language_decode = "fi"

task = "transcribe"
dataset_name = "mozilla-foundation/common_voice_11_0"


## 2 导入模型和测试数据集

In [2]:
from transformers import AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoProcessor
from peft import PeftConfig, PeftModel
from datasets import Audio
import torch 


peft_config = PeftConfig.from_pretrained(model_dir)

base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=False, device_map="auto"
)

peft_model = PeftModel.from_pretrained(base_model, model_dir)

In [3]:
from datasets import load_dataset

# 加载前 1000 条样本
common_voice_test = load_dataset(
    dataset_name,
    language_abbr,
    split="test",  
    trust_remote_code=True
)

In [4]:
common_voice_test = common_voice_test.shuffle(seed=16).select(range(100))

In [5]:
common_voice_test

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
    num_rows: 100
})

## 3 处理测试数据集

In [6]:
common_voice_test = common_voice_test.remove_columns(
    ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
)

In [7]:
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# 降到16kHz
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16000))
# 预处理
def prepare_test_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    return batch
test_dataset = common_voice_test.map(prepare_test_dataset)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## 4 评估函数
评估指标：
 - wer: 0.5，表示词错误率（Word Error Rate）为0.5（即50%）
 - cer: 0.10256410256410256，表示字错误率（Character Error Rate）约为10.26%
 - sentence_accuracy: 0.5,表示整个句子相同的概率
 - predictions: 一个包含10个预测字符串的列表
 - references: 一个包含10个参考字符串（真实值）的列表

In [12]:

from transformers import WhisperProcessor
import jiwer 
import torch
import re

def quick_evaluate(model, test_dataset, processor, term_list=None, batch_size=20):
    all_predictions = []
    all_references = []
    term_list = term_list if term_list is not None else []  

    for i in range(0, len(test_dataset), batch_size):
        batch = test_dataset[i:i+batch_size]
        inputs = {"input_features": batch["input_features"]}
        
        with torch.no_grad():
            # 得到预测结果
            generated_ids = model.generate(
                input_features=torch.tensor(inputs["input_features"]).to(model.device),
                max_new_tokens=255
            )
        
        # 解码预测结果和参考文本
        predictions = processor.batch_decode(generated_ids, skip_special_tokens=True)
        references = [sentence for sentence in batch["sentence"]]
        
        all_predictions.extend(predictions)
        all_references.extend(references)
        print(f"已处理 {min(i+batch_size, len(test_dataset))}/{len(test_dataset)} 条样本")
    
    # 指标计算
    wer = jiwer.wer(all_references, all_predictions)
    cer = jiwer.cer(all_references, all_predictions)
    correct_sentences = sum(pred == ref for pred, ref in zip(all_predictions, all_references))
    total_sentences = len(all_references)
    sentence_accuracy = correct_sentences / total_sentences if total_sentences > 0 else 0.0
    
    return {
        "wer": wer,
        "cer": cer,
        "sentence_accuracy": sentence_accuracy,
        "predictions": all_predictions,
        "references": all_references
    }


In [13]:
peft_model = peft_model.to("cuda" if torch.cuda.is_available() else "cpu")
peft_model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 1280)
          (layers): ModuleList(
            (0-31): 32 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1280, out_features=1280, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1280, out_features=4, bias=False)
                  )
                  (lora_B): Mo

## 5 评估

In [14]:
evaluation_results = quick_evaluate(
    model=peft_model,
    test_dataset=test_dataset,
    processor=processor
)

已处理 20/100 条样本
已处理 40/100 条样本
已处理 60/100 条样本
已处理 80/100 条样本
已处理 100/100 条样本


In [16]:
print('wer: ' + str(evaluation_results['wer']))
print('cer: '+ str(evaluation_results['cer']))
print('sentence_accuracy: ' + str(evaluation_results['sentence_accuracy']))

wer: 0.1831831831831832
cer: 0.031862745098039214
sentence_accuracy: 0.38


## 6 预测与真实结果对比

In [17]:
for pred, ref in zip(evaluation_results['predictions'][:10], evaluation_results['references'][:10]):
    print('预测值：' + pred)
    print('真实值：' + ref)
    print('\n')

预测值：Kulin naoparin oven ohimyhäilen, sillä heidän aamujotlauksensa eivät enää pian häiritse sekätään.
真实值：Kuljin naapurin oven ohi myhäillen, sillä heidän aamujotlauksensa eivät enää pian häiritsisi ketään.


预测值：Kaikki hänet tuntevat.
真实值：Kaikki hänet tuntevat.


预测值： Takapäin tulleen hampojan luodeista ensimmäinen osui pääministeriä selkeään haavoittaina häntä vaikeasti.
真实值：Takaapäin tulleen ampujan luodeista ensimmäinen osui pääministeriä selkään haavoittaen häntä vaikeasti


预测值：Rosina juoksee sinne.
真实值：Rosina juoksee sinne.


预测值：Ei mitään mielenkiintoista.
真实值：Ei mitään mielenkiintoista.


预测值：Mihin sitä olinkoainen pääni taas pistänyt?
真实值：Mihin sitä olinkaan pääni taas pistänyt.


预测值：Hän alkoi haukkamaan henkeään, vaan happea ei enää ollut saatavilla.
真实值：Hän alkoi haukkomaan henkeään, vaan happea ei enää ollut saatavilla.


预测值：Tämä on olennaista yhteisöllisyyden kannalta.
真实值：Tämä on olennaista yhteisöllisyyden kannalta.


预测值：Kyyneleet valuvat pientä nenää pitkin.
真实值：Kyy

## 7 评价：  
    WER	0.183 (18.3%)	词错误率较高，平均每5-6个词出现1个错误  
    CER	0.032 (3.2%)	字错误率优秀，字符级别准确率96.8%  
    句子准确率	0.38 (38%)	10句中有3.8句完全正确（实际4句完全正确）  
    简单陈述句处理优秀（句2,4,5,8,9完全正确）  
    短句识别准确率高（平均长度<5词的句子100%正确）  
    基础词汇识别稳健（如"juoksee", "mielenkiintoista"）    
    
后续改进需聚焦于语法建模与上下文理解