## 全局参数设置

In [1]:
# Local path of the pretrained Whisper checkpoint used as the base model.
model_name_or_path = "/home/cc/models/asr/whisper-large-v2"
# Directory used as the trainer's output_dir and as the save/load location of the fine-tuned model.
model_dir = "models/whisper-large-v2-asr-int8-fi"

# Language handed to the tokenizer/processor loaders.
language = "fi"
# NOTE(review): not referenced by any visible cell — presumably the dataset language config; confirm.
language_abbr = "fi"


# Task handed to the tokenizer/processor loaders.
task = "transcribe"
# NOTE(review): not referenced by any visible cell — presumably used by the preprocessing notebook; confirm.
dataset_name = "mozilla-foundation/common_voice_11_0"

# Per-device batch size, used for both training and evaluation.
batch_size=4
# Directory of the preprocessed dataset that load_from_disk reads.
save_path = "/home/cc/projects/my_tokenized_datasets/common_voice/fi"

### 加载数据集

数据集的 tokenize 处理已提前完成并保存到磁盘（参考 save_dataset.ipynb），方便后续调试；因此这里只需加载处理好的数据集。

In [2]:
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoProcessor

# Load the feature extractor, tokenizer and processor from the same base checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path) 
# language/task are forwarded to the tokenizer and processor loaders.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, language=language, task=task)  
processor = AutoProcessor.from_pretrained(model_name_or_path, language=language, task=task)  

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
from datasets import load_from_disk

# Load the already-preprocessed dataset from save_path instead of re-running preprocessing.
tokenized_common_voice = load_from_disk(save_path)

In [4]:
# Display the dataset splits, their columns and row counts.
tokenized_common_voice

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 2165
    })
    validation: Dataset({
        features: ['audio', 'sentence', 'input_features', 'labels'],
        num_rows: 1650
    })
})

In [5]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech seq2seq examples into one padded batch.

    Audio features are padded by ``processor.feature_extractor`` and label ids
    by ``processor.tokenizer``. Padded label positions are replaced with -100.
    """

    # Object exposing ``feature_extractor`` and ``tokenizer`` attributes, each with a ``pad`` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from examples carrying ``input_features`` and ``labels``.

        Args:
            features: examples, each a mapping with "input_features" and "labels".

        Returns:
            The padded feature batch with an added "labels" tensor.
        """
        # Audio side: wrap each example's features and let the extractor pad them.
        audio_examples = []
        for example in features:
            audio_examples.append({"input_features": example["input_features"]})
        batch = self.processor.feature_extractor.pad(audio_examples, return_tensors="pt")

        # Label side: the tokenizer pads under the "input_ids" key.
        label_examples = []
        for example in features:
            label_examples.append({"input_ids": example["labels"]})
        padded_labels = self.processor.tokenizer.pad(label_examples, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; overwrite them with -100.
        padding_positions = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(padding_positions, -100)

        # Drop the first column only when every row starts with the BOS token.
        first_is_bos = labels[:, 0] == self.processor.tokenizer.bos_token_id
        if first_is_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [6]:
# Build the collator around the processor loaded earlier; it is handed to the trainer below.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## 模型准备

### 加载预训练模型（int8 精度）

In [7]:
from transformers import AutoModelForSpeechSeq2Seq

# Load the base checkpoint with 8-bit loading requested (load_in_8bit=True);
# device_map="auto" leaves device placement to the loader.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, load_in_8bit=True, device_map="auto")

In [8]:
# Clear any forced decoder prompt ids stored on the model config.
model.config.forced_decoder_ids = None  

# Reset the suppressed-token list on the model config to empty.
model.config.suppress_tokens = []  

### PEFT 微调前的模型处理


In [9]:
# `prepare_model_for_int8_training` is deprecated and absent from newer PEFT
# releases; `prepare_model_for_kbit_training` is its replacement. Fall back to
# the old name only when the installed PEFT predates the new helper.
try:
    from peft import prepare_model_for_kbit_training
except ImportError:
    from peft import prepare_model_for_int8_training as prepare_model_for_kbit_training

# Prepare the 8-bit model for training before LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)



### LoRA Adapter 配置

In [10]:
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

# LoRA adapter configuration passed to get_peft_model below.
config = LoraConfig(
    r=4,  # LoRA rank; determines the size of the LoRA matrices
    lora_alpha=64,  # scaling factor applied to the LoRA update
    target_modules=["q_proj", "v_proj"],  # module names that receive LoRA adapters
    lora_dropout=0.05,  # dropout rate used inside the LoRA modules
    bias="none",  # bias handling mode; "none" is selected here
)

### 使用get_peft_model函数和给定的配置来获取一个PEFT模型

In [11]:
# Wrap the prepared base model with LoRA adapters according to `config`.
peft_model = get_peft_model(model, config)

### 打印 LoRA 微调训练的模型参数

In [12]:
# Print trainable vs. total parameter counts of the wrapped model.
peft_model.print_trainable_parameters()

trainable params: 1,966,080 || all params: 1,545,271,040 || trainable%: 0.12723204856023188


## 模型训练

#### Seq2SeqTrainingArguments 训练参数

**关于设置训练步数和评估步数**

基于 epochs 设置：

```python
    num_train_epochs=3,  # 训练的总轮数
    evaluation_strategy="epoch",  # 设置评估策略，这里是在每个epoch结束时进行评估
    warmup_steps=50,  # 在训练初期增加学习率的步数，有助于稳定训练
```

基于 steps 设置：

```python
    max_steps=100, # 训练总步数
    evaluation_strategy="steps", 
    eval_steps=25, # 评估步数
```

In [13]:
from transformers import Seq2SeqTrainingArguments

# Training hyper-parameters; epoch-based evaluation and checkpointing.
training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,  # checkpoints are written under model_dir
    per_device_train_batch_size=batch_size,  
    per_device_eval_batch_size=batch_size, 
    
    learning_rate=1e-3,
    num_train_epochs=3,
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    generation_max_length=96,  
    logging_steps=50,
    remove_unused_columns=False,  # do not drop extra dataset columns before collation
    label_names=["labels"],
    
    gradient_accumulation_steps=4,  # accumulate 4 steps per optimizer update; effective batch size = batch_size (4) x 4 = 16
    
    # Mixed-precision settings
    fp16=True,
    fp16_opt_level="O1",  # author's note: O1 chosen as more stable than O2 under tight GPU memory — TODO confirm it applies to the AMP backend in use
    
    save_strategy="epoch",  # save once per epoch to avoid frequent checkpointing
    load_best_model_at_end=True,  # reload the best checkpoint when training finishes
)

### 实例化 Seq2SeqTrainer 训练器

In [14]:
from transformers import Seq2SeqTrainer

# Assemble the trainer from the LoRA-wrapped model, both dataset splits and the collator.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=peft_model,
    train_dataset=tokenized_common_voice["train"],
    eval_dataset=tokenized_common_voice["validation"],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,  # the feature extractor is passed in the trainer's `tokenizer` slot
)
# Turn off the use_cache flag on the model config — presumably because caching is not useful during training; confirm.
peft_model.config.use_cache = False

In [15]:
# Run fine-tuning with the arguments configured above.
trainer.train()



Epoch,Training Loss,Validation Loss
0,0.0397,0.047375
2,0.0112,0.045726




TrainOutput(global_step=405, training_loss=0.18507922877684052, metrics={'train_runtime': 2275.8925, 'train_samples_per_second': 2.854, 'train_steps_per_second': 0.178, 'total_flos': 1.3763738087424e+19, 'train_loss': 0.18507922877684052, 'epoch': 2.99})

cc@js:~$ nvidia-smi
Fri Aug 15 17:02:52 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.230.02             Driver Version: 535.230.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:01:00.0  On |                  Off |
| 62%   51C    P2             152W / 450W |  24144MiB / 24564MiB |     62%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+


训练损失快速收敛

### 保存 LoRA 模型(Adapter)

In [16]:
# Save the trained model to model_dir; the inference section loads it from the same directory.
trainer.save_model(model_dir)

In [17]:
# Switch the model to evaluation mode; the cell output is the module tree.
peft_model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 1280)
          (layers): ModuleList(
            (0-31): 32 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear8bitLt(in_features=1280, out_features=1280, bias=False)
                (v_proj): lora.Linear8bitLt(
                  (base_layer): Linear8bitLt(in_features=1280, out_features=1280, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1280, out_features=4, bias=False)
                  )
            

## 模型推理（可能需要重启 Notebook）

**再次加载模型会额外占用显存，如果显存已经达到上限，建议重启 Notebook 后再进行以下操作**


In [2]:


# Settings redefined here so the inference section can run after a kernel restart.
language = "fi"
# NOTE(review): not referenced by the visible inference cells; confirm whether it is still needed.
language_abbr = "fi"
# Language used when building the decoder prompt ids; reassigned in a later cell before use.
language_decode = "fi"
task = "transcribe"

# Directory the fine-tuned model was saved to.
model_dir = "models/whisper-large-v2-asr-int8-fi"




### 使用 `PeftModel` 加载 LoRA 微调后 Whisper 模型

使用 `PeftConfig` 加载 LoRA Adapter 配置参数，使用 `PeftModel` 加载微调后 Whisper 模型

In [3]:
from transformers import AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoProcessor
from peft import PeftConfig, PeftModel

# Read the adapter configuration saved in model_dir.
peft_config = PeftConfig.from_pretrained(model_dir)

# Reload the base model named in the adapter config, again requesting 8-bit loading.
base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=True, device_map="auto"
)

# Attach the fine-tuned weights stored in model_dir to the base model.
peft_model = PeftModel.from_pretrained(base_model, model_dir)

In [None]:
# Load tokenizer and processor from the base checkpoint recorded in the adapter config.
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
# Reuse the processor's own feature extractor instead of loading a separate one.
feature_extractor = processor.feature_extractor

### 使用 Pipeline API 部署微调后 Whisper 实现中文语音识别任务

In [20]:
# Audio file to transcribe.
test_audio = "test_zh.flac"
# Overrides the earlier language_decode value; written as the language name Whisper expects.
language_decode = "chinese"

In [21]:
from transformers import AutomaticSpeechRecognitionPipeline

# Build an ASR pipeline around the fine-tuned model, tokenizer and feature extractor.
pipeline = AutomaticSpeechRecognitionPipeline(model=peft_model, tokenizer=tokenizer, feature_extractor=feature_extractor)

# Decoder prompt ids for the chosen decode language and task.
forced_decoder_ids = processor.get_decoder_prompt_ids(language=language_decode, task=task)

In [22]:
import torch

# Run transcription under CUDA autocast (mixed precision).
with torch.cuda.amp.autocast():
    # Forward the decoder prompt ids computed in the previous cell; without
    # them the selected decode language/task never reaches generation.
    text = pipeline(
        test_audio,
        generate_kwargs={"forced_decoder_ids": forced_decoder_ids},
        max_new_tokens=255,
    )["text"]



In [23]:
# Display the transcription returned by the pipeline.
text

' 大家好，今天给大家带来一款重磅产品，性能提升了八十，但是价格只要有商的一半，这不仅仅是一个产品，更是一个精品，让我问问大家，这个价格你们觉得怎么样？'

## Homework

1. 使用完整的数据集训练，对比 Train Loss 和 Validation Loss 变化。训练完成后，使用测试集进行模型评估。
2. [Optional] 使用其他语种（如：德语、法语等）的数据集进行微调训练，并进行模型评估。