In [1]:
import os

# Send HTTP and HTTPS traffic through the same local proxy endpoint and
# set CUDA_VISIBLE_DEVICES to "0" before any GPU library is imported.
local_proxy = 'http://127.0.0.1:1087'
os.environ.update(
    {
        'http_proxy': local_proxy,
        'https_proxy': local_proxy,
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

In [2]:
# --- Run configuration -------------------------------------------------------
# Language passed to the tokenizer/processor when they are loaded.
# (The original literal was missing its closing quote, which made this whole
# cell a SyntaxError.)
# NOTE(review): the Whisper tokenizer is expected to accept either a language
# code ("fi") or a language name ("finnish") — confirm for the installed
# transformers version.
language = "fi"
language_abbr = "fi"  # Finnish: dataset configuration name passed to load_dataset
task = "transcribe"  # transcription task
dataset_name = "mozilla-foundation/common_voice_11_0"

# Local path of the base checkpoint that the processors are loaded from.
model_name_or_path = "/home/cc/models/asr/whisper-large-v2"
# Local directory path for the fine-tuned model (not used in the cells shown here).
model_dir = "/home/cc/models/finetuned-models/whisper-large-v2-finetuned-fi"

In [3]:
from datasets import load_dataset, DatasetDict

# Load the "train" and "validation" splits of the configured dataset and
# collect them in a single DatasetDict keyed by split name.
common_voice = DatasetDict(
    {
        split_name: load_dataset(
            dataset_name,
            language_abbr,
            split=split_name,
            trust_remote_code=True,
        )
        for split_name in ("train", "validation")
    }
)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/65.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/48.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/55.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/163M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.75M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/505k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/385k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/389k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.1k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 2165it [00:00, 369162.87it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 1650it [00:00, 415651.75it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 1704it [00:00, 426259.56it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 5779it [00:00, 425027.32it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 197it [00:00, 271533.98it/s]


In [4]:
from datasets import Audio

# Cast the audio column to a 16000 Hz sampling rate, then drop the metadata
# fields that are not needed for the rest of the notebook.
common_voice = common_voice.cast_column(
    "audio", Audio(sampling_rate=16000)
).remove_columns(
    [
        "accent",
        "age",
        "client_id",
        "down_votes",
        "gender",
        "locale",
        "path",
        "segment",
        "up_votes",
    ]
)

# Shuffle each split with the same fixed seed so the order is reproducible.
for split_name in ("train", "validation"):
    common_voice[split_name] = common_voice[split_name].shuffle(seed=16)

In [5]:
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoProcessor
# All three objects are loaded from the same local checkpoint. The tokenizer
# and the processor additionally receive the configured language and task.
language_task_kwargs = {"language": language, "task": task}

feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **language_task_kwargs)
processor = AutoProcessor.from_pretrained(model_name_or_path, **language_task_kwargs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
def prepare_dataset(batch, max_audio_length=480000, max_text_length=128):
    """Turn one dataset example into model inputs and label ids.

    Reads the module-level ``feature_extractor`` and ``tokenizer``.

    Args:
        batch: A single example with an ``"audio"`` entry (a mapping with
            ``"array"`` and ``"sampling_rate"``) and a ``"sentence"`` string.
        max_audio_length: Maximum number of audio samples kept. The default
            corresponds to 30 s * 16000 Hz = 480000 samples, so it must match
            the sampling rate the audio was cast to.
        max_text_length: Maximum number of label tokens; longer transcripts
            are truncated and shorter ones are padded up to this length.

    Returns:
        The same ``batch`` object, with ``"input_features"`` and ``"labels"``
        added.
    """
    audio = batch["audio"]

    # Trim the raw waveform first so over-long clips never reach the extractor.
    waveform = audio["array"][:max_audio_length]

    # The extractor is given the same limit and told to truncate beyond it;
    # only the first (and only) feature entry is kept.
    batch["input_features"] = feature_extractor(
        waveform,
        sampling_rate=audio["sampling_rate"],
        max_length=max_audio_length,
        truncation=True,
    ).input_features[0]

    # Tokenize the transcript to a fixed length: truncate if longer, pad if
    # shorter.
    # NOTE(review): the padded positions are stored as ordinary token ids —
    # confirm that the training data collator masks them out of the loss.
    batch["labels"] = tokenizer(
        batch["sentence"],
        max_length=max_text_length,
        truncation=True,
        padding="max_length",
    ).input_ids
    return batch

# Destination directory for the preprocessed dataset.
save_path = "/home/cc/projects/my_tokenized_datasets/common_voice/fi"

# Apply prepare_dataset across the dataset.
tokenized_common_voice = common_voice.map(prepare_dataset)

# Make sure the destination directory exists, then write the dataset to disk
# and report where it was saved.
os.makedirs(save_path, exist_ok=True)
tokenized_common_voice.save_to_disk(save_path)
print(f"数据集已保存到 {save_path}")


Map:   0%|          | 0/2165 [00:00<?, ? examples/s]

Map:   0%|          | 0/1650 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/2165 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/1650 [00:00<?, ? examples/s]

数据集已保存到 /home/cc/projects/my_tokenized_datasets/common_voice/fi
