pre：  

配置proxy环境

In [None]:
import os

os.environ['http_proxy'] = 'http://127.0.0.1:1087'
os.environ['https_proxy'] = 'http://127.0.0.1:1087'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

任务类型：中文转录任务

In [None]:
language = "Chinese (China)"
language_abbr = "zh-CN"
task = "transcribe"  # 转录任务

本次微调使用的数据库和模型：  

mozilla-foundation/common_voice_11_0数据库：   
Common Voice 11.0 是 Mozilla 于 2022 年 9 月发布的第 11 版开源语音数据集。它是全球最大的多语言语音数据库之一，包含 24,210 小时的语音录音，覆盖 100 种语言（新增 4 种语言），其中 33 种语言的语音数据超过 100 小时。  
特点：  

    多语言支持：涵盖主流语言（如英语、中文、西班牙语）和小众语言（如阿塞拜疆语、科西嘉语），尤其注重低资源语言的覆盖。
    标注与质量：包含文本转录和发音评分，数据经过社区验证，适合训练高精度的语音识别模型。
    应用场景：常用于语音助手开发、多语言交互系统、无障碍技术（如实时字幕）等领域。


whisper-large-v2模型：
Whisper-large-v2 是 OpenAI 于 2022 年 12 月发布的自动语音识别（ASR）模型，基于 Transformer 架构。
特点：

    多语言支持：原生支持 90 余种语言，包括低资源语言（如斯瓦希里语、泰米尔语），并可通过微调进一步优化特定语言。
    多任务能力：
        语音识别：直接转录音频为同语言文本（如英语转英语）。
        语音翻译：将语音翻译成目标语言（如西班牙语转英语），零样本翻译准确率较高。
    长音频处理：通过分块算法支持任意长度音频的转录，适合处理讲座、播客等长内容。


In [None]:
dataset_name = "mozilla-foundation/common_voice_11_0"
model_name_or_path = "/home/cc/models/asr/whisper-large-v2"

微调后模型的存储路径：

In [1]:
model_dir = "/home/cc/models/finetuned-models/whisper-large-v2-finetuned-1"

1. 下载并处理数据集

In [1]:
from datasets import load_dataset, DatasetDict

common_voice = DatasetDict()

common_voice["train"] = load_dataset(dataset_name, language_abbr, split="train", trust_remote_code=True)
common_voice["validation"] = load_dataset(dataset_name, language_abbr, split="validation", trust_remote_code=True)

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 29056it [00:00, 237945.42it/s][A


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 10581it [00:00, 460635.54it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 10581it [00:00, 473840.81it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 45203it [00:00, 452017.50it/s][A
Reading metadata...: 90405it [00:00, 448757.26it/s][A
Reading metadata...: 135379it [00:00, 449191.76it/s][A
Reading metadata...: 180300it [00:00, 441458.30it/s][A
Reading metadata...: 224808it [00:00, 442743.47it/s][A
Reading metadata...: 269534it [00:00, 444257.81it/s][A
Reading metadata...: 314163it [00:00, 444913.39it/s][A
Reading metadata...: 358662it [00:00, 436641.93it/s][A
Reading metadata...: 403582it [00:00, 440503.02it/s][A
Reading metadata...: 447803it [00:01, 441019.30it/s][A
Reading metadata...: 491926it [00:01, 440976.44it/s][A
Reading metadata...: 536039it [00:01, 439787.98it/s][A
Reading metadata...: 580028it [00:01, 439648.03it/s][A
Reading metadata...: 624311it [00:01, 440600.10it/s][A
Reading metadata...: 698486it [00:01, 442036.14it/s][A


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 21302it [00:00, 479190.92it/s]


In [2]:
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))  # 转为16000hz
common_voice = common_voice.remove_columns(
    ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
)  # 移除数据集中不必要的字段
common_voice["train"] = common_voice["train"].shuffle(seed=16)
common_voice["validation"] = common_voice["validation"].shuffle(seed=16)

In [7]:
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoProcessor

feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path) 
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, language=language, task=task)  
processor = AutoProcessor.from_pretrained(model_name_or_path, language=language, task=task)  

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
def prepare_dataset(batch):
    audio = batch["audio"]
    # 截断音频（例如限制最长30秒，根据采样率计算点数：30s * 16000Hz = 480000）
    max_audio_length = 480000  # 需与feature_extractor的采样率匹配
    audio_array = audio["array"][:max_audio_length]  # 截断音频
    # 提取特征时指定最大长度
    batch["input_features"] = feature_extractor(
        audio_array, 
        sampling_rate=audio["sampling_rate"],
        max_length=max_audio_length,  # 强制音频特征长度
        truncation=True  # 超过则截断
    ).input_features[0]
    
    # 截断文本（例如限制最长128个token）
    max_text_length = 128
    batch["labels"] = tokenizer(
        batch["sentence"],
        max_length=max_text_length,
        truncation=True,  # 超过则截断
        padding="max_length"  # 不足则填充
    ).input_ids
    return batch

# 处理数据集
tokenized_common_voice = common_voice.map(prepare_dataset)

# 定义保存路径
save_path = "./tokenized_common_voice"

# 创建保存目录（如果不存在）
os.makedirs(save_path, exist_ok=True)

# 保存数据集
tokenized_common_voice.save_to_disk(save_path)
print(f"数据集已保存到 {save_path}")


Map:   0%|          | 0/29056 [00:00<?, ? examples/s]

Map:   0%|          | 0/10581 [00:00<?, ? examples/s]

Saving the dataset (0/66 shards):   0%|          | 0/29056 [00:00<?, ? examples/s]

Saving the dataset (0/24 shards):   0%|          | 0/10581 [00:00<?, ? examples/s]

数据集已保存到 ./tokenized_common_voice


2. 加载数据集

In [2]:
from datasets import load_from_disk

okenized_common_voice = load_from_disk(save_path)

In [4]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

# 定义一个针对语音到文本任务的数据整理器类
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any  # 处理器结合了特征提取器和分词器

    # 整理器函数，将特征列表处理成一个批次
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # 从特征列表中提取输入特征，并填充以使它们具有相同的形状
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # 从特征列表中提取标签特征（文本令牌），并进行填充
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # 使用-100替换标签中的填充区域，-100通常用于在损失计算中忽略填充令牌
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # 如果批次中的所有序列都以句子开始令牌开头，则移除它
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        # 将处理过的标签添加到批次中
        batch["labels"] = labels

        return batch  # 返回最终的批次，准备好进行训练或评估
# 用给定的处理器实例化数据整理器
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [5]:
from transformers import AutoModelForSpeechSeq2Seq

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, load_in_8bit=True, device_map="auto", torch_dtype=torch.float16)
# 设置模型配置中的forced_decoder_ids属性为None
model.config.forced_decoder_ids = None  # 这通常用于指定在解码（生成文本）过程中必须使用的特定token的ID，设置为None表示没有这样的强制要求

# 设置模型配置中的suppress_tokens列表为空
model.config.suppress_tokens = []  # 这用于指定在生成过程中应被抑制（不生成）的token的列表，设置为空列表表示没有要抑制的token


from peft import prepare_model_for_int8_training

model = prepare_model_for_int8_training(model)

from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

# 创建一个LoraConfig对象，用于设置LoRA（Low-Rank Adaptation）的配置参数
config = LoraConfig(
    r=4,  # LoRA的秩，影响LoRA矩阵的大小
    lora_alpha=64,  # LoRA适应的比例因子
    # 指定将LoRA应用到的模型模块，通常是attention和全连接层的投影。
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,  # 在LoRA模块中使用的dropout率
    bias="none",  # 设置bias的使用方式，这里没有使用bias
)
peft_model = get_peft_model(model, config)
peft_model.gradient_checkpointing_disable()  # 禁用梯度检查点



In [6]:
peft_model.print_trainable_parameters()

trainable params: 1,966,080 || all params: 1,545,271,040 || trainable%: 0.12723204856023188


In [7]:
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    per_device_train_batch_size=2,  # 根据显存调整
    per_device_eval_batch_size=4,
    learning_rate=2e-3,
    num_train_epochs=3,
    bf16=True,
    bf16_full_eval=True,
    warmup_steps=100,
    evaluation_strategy="epoch",
    generation_max_length=128,
    logging_steps=500,
    remove_unused_columns=False,
    label_names=["labels"],
    gradient_accumulation_steps=2,  # 梯度累积缓解显存压力
)


In [8]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=peft_model,
    train_dataset=tokenized_common_voice["train"],
    eval_dataset=tokenized_common_voice["validation"],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
)
peft_model.config.use_cache = False

第一次训练时候，出现了断点，从断点的前一个检查点继续训练

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.92,0.895715
2,0.9089,0.86993




从检查点继续训练

In [9]:
trainer.train(resume_from_checkpoint="/home/cc/models/finetuned-models/whisper-large-v2-finetuned-1/checkpoint-20500")



Epoch,Training Loss,Validation Loss
3,0.8693,0.852651




TrainOutput(global_step=21792, training_loss=0.05177815055006926, metrics={'train_runtime': 2493.2015, 'train_samples_per_second': 34.962, 'train_steps_per_second': 8.741, 'total_flos': 1.85319357677568e+20, 'train_loss': 0.05177815055006926, 'epoch': 3.0})

Epoch 	Training Loss 	Validation Loss
1 	0.920000 	0.895715
2 	0.908900 	0.869930
3 	0.869300 	0.852651
Training Loss训练损失趋势：
从第1轮到第3轮，训练损失从0.920000逐步下降到0.869300，呈持续降低趋势。
这说明模型在不断 “学习” 训练数据中的规律，对训练集的拟合程度在提升。

Validation Loss验证损失趋势：
从第1轮到第3轮，验证损失从 0.895715 逐步下降到 0.852651，同样呈持续降低趋势。
这是一个积极信号，说明模型不仅在训练集上表现更好，对未见过的验证数据的预测能力也在提升（泛化能力增强）。

In [10]:
trainer.save_model(model_dir)

In [3]:
peft_model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 1280)
          (layers): ModuleList(
            (0-31): 32 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear8bitLt(in_features=1280, out_features=1280, bias=False)
                (v_proj): lora.Linear8bitLt(
                  (base_layer): Linear8bitLt(in_features=1280, out_features=1280, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1280, out_features=4, bias=False)
                  )
            