<a href="https://colab.research.google.com/github/CityHuman/Auto-GPT/blob/master/ats%E5%BE%AE%E8%B0%831.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install transformers datasets torch torchaudio librosa evaluate scikit-learn accelerate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
import torch
import numpy as np
import warnings
from datasets import load_dataset, Audio
from transformers import ASTFeatureExtractor, ASTForAudioClassification, TrainingArguments, Trainer
import evaluate

# 忽略一些不必要的警告
warnings.filterwarnings("ignore")

# 核心优化策略
# 增大 Batch Size (4 -> 64/96): AST 的输入序列长度约为 512（patch数量），在 80G 显存上，你可以轻松将 Batch Size 开到 64 甚至 96（AST-Base 模型）。这能让 GPU 核心跑满。
# 移除梯度累积: 既然 Batch Size 足够大，就不需要 Gradient Accumulation 了，直接每一步更新权重，速度更快。
# 多进程数据加载 (Workers 0 -> 16): 你有 160G 内存，应该利用多核 CPU 快速将数据送入 GPU，消除 I/O 瓶颈。
# 开启 TF32 / BF16: 如果你是 A100/H100，使用 bf16（BFloat16）通常比 fp16 更快且更稳定。同时开启 TF32 加速矩阵运算。
# 多进程预处理: dataset.map 开启多核处理，瞬间完成特征提取。

# -----------------------------------------------------------------------------
# 1. 配置参数
# -----------------------------------------------------------------------------
MODEL_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593"
BATCH_SIZE = 64
GRADIENT_ACCUMULATION_STEPS = 8
LEARNING_RATE = 5e-5
NUM_EPOCHS = 5
MAX_AUDIO_LENGTH = 1024  # AST 默认的 input_values 长度 (10.24秒)

# -----------------------------------------------------------------------------
# 2. 加载数据 (ESC-50)
# -----------------------------------------------------------------------------
print(">> 正在加载 ESC-50 数据集...")
# 使用 ashraq/esc50，这是一个结构清晰的 ESC-50 版本
dataset = load_dataset("ashraq/esc50", split="train")

# 划分训练集和测试集 (80% 训练, 20% 测试)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# -----------------------------------------------------------------------------
# 3. 标签处理 (Label Mapping)
# -----------------------------------------------------------------------------
print(">> 正在处理标签...")
# 获取所有唯一的类别名称
labels_list = dataset["train"].unique("category")
labels_list.sort()  # 排序以保证顺序一致
num_labels = len(labels_list)

# 创建映射字典
label2id = {label: i for i, label in enumerate(labels_list)}
id2label = {i: label for i, label in enumerate(labels_list)}

print(f"   检测到 {num_labels} 个类别。")

# 将 category 转换为数字 ID 并存为 'labels' 列
def encode_labels(example):
    example["labels"] = label2id[example["category"]]
    return example

dataset = dataset.map(encode_labels)

# -----------------------------------------------------------------------------
# 4. 音频预处理 (Feature Extraction)
# -----------------------------------------------------------------------------
print(">> 正在预处理音频数据...")

# 初始化特征提取器
feature_extractor = ASTFeatureExtractor.from_pretrained(MODEL_CHECKPOINT)
target_sampling_rate = feature_extractor.sampling_rate # 通常是 16000

# 这一步非常关键：使用 datasets 库的 Audio 功能自动重采样
dataset = dataset.cast_column("audio", Audio(sampling_rate=target_sampling_rate))

def preprocess_function(examples):
    # 提取音频数据 (list of numpy arrays)
    audio_arrays = [x["array"] for x in examples["audio"]]

    # 使用 ASTFeatureExtractor 处理
    # max_length=1024 对应 AST 的标准输入 (约10.24秒)
    # ESC-50 音频只有5秒，feature_extractor 会自动补零 (padding)
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=target_sampling_rate,
        max_length=MAX_AUDIO_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="np" # 返回 numpy 数组，让 Trainer 的 Collator 转换为 Tensor
    )

    # inputs 是一个字典，包含 'input_values'
    return inputs

# 执行预处理 map
# 注意：这里我们移除了原始的 'audio' 和其他不需要的文本列，保留 'labels'
cols_to_remove = [col for col in dataset["train"].column_names if col not in ["labels"]]

encoded_dataset = dataset.map(
    preprocess_function,
    remove_columns=cols_to_remove, # 移除除了 labels 和 input_values 之外的所有列
    batched=True,
    batch_size=10,
    num_proc=1, # 如果报错，可以设为 1；通常 4 更快
    desc="Preprocessing audio"
)

# 检查一下数据格式是否正确
print(">> 数据预处理完成。样本格式示例：")
print(encoded_dataset["train"][0].keys()) # 应该包含 'labels' 和 'input_values'

# -----------------------------------------------------------------------------
# 5. 加载模型
# -----------------------------------------------------------------------------
print(">> 正在加载预训练模型...")
model = ASTForAudioClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True # 忽略预训练模型分类头尺寸不匹配的警告
)

# -----------------------------------------------------------------------------
# 6. 设置评估指标
# -----------------------------------------------------------------------------
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    # eval_pred 包含 (logits, labels)
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

# -----------------------------------------------------------------------------
# 7. 训练设置
# -----------------------------------------------------------------------------
training_args = TrainingArguments(
    output_dir="./ast_esc50_result",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=NUM_EPOCHS,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True if torch.cuda.is_available() else False, # GPU上开启混合精度
    save_total_limit=1, # 只保留最好的模型
    remove_unused_columns=False, # 防止 Trainer 错误地移除 input_values
    # dataloader_num_workers=0 # Windows下设为0，Linux下可设为4
    dataloader_num_workers=16
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    compute_metrics=compute_metrics,
)

# -----------------------------------------------------------------------------
# 8. 开始训练
# -----------------------------------------------------------------------------
print(">> 开始训练...")
trainer.train()

# -----------------------------------------------------------------------------
# 9. 保存模型
# -----------------------------------------------------------------------------
print(">> 保存模型中...")
trainer.save_model("./ast_esc50_finetuned_final")
feature_extractor.save_pretrained("./ast_esc50_finetuned_final")
print(">> 全部完成！")

>> 正在加载 ESC-50 数据集...


README.md:   0%|          | 0.00/345 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


dataset_infos.json: 0.00B [00:00, ?B/s]

data/train-00000-of-00002-2f1ab7b824ec75(…):   0%|          | 0.00/387M [00:00<?, ?B/s]

data/train-00001-of-00002-27425e5c1846b4(…):   0%|          | 0.00/387M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2000 [00:00<?, ? examples/s]

>> 正在处理标签...


Flattening the indices:   0%|          | 0/1600 [00:00<?, ? examples/s]

   检测到 50 个类别。


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

>> 正在预处理音频数据...


preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Preprocessing audio:   0%|          | 0/1600 [00:00<?, ? examples/s]

Preprocessing audio:   0%|          | 0/400 [00:00<?, ? examples/s]

>> 数据预处理完成。样本格式示例：
dict_keys(['labels', 'input_values'])
>> 正在加载预训练模型...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/203 [00:00<?, ?it/s]

ASTForAudioClassification LOAD REPORT from: MIT/ast-finetuned-audioset-10-10-0.4593
Key                     | Status   |                                                                                        
------------------------+----------+----------------------------------------------------------------------------------------
classifier.dense.weight | MISMATCH | Reinit due to size mismatch ckpt: torch.Size([527, 768]) vs model:torch.Size([50, 768])
classifier.dense.bias   | MISMATCH | Reinit due to size mismatch ckpt: torch.Size([527]) vs model:torch.Size([50])          

Notes:
- MISMATCH	:ckpt weights were loaded, but they did not match the original empty weight shapes.


Downloading builder script: 0.00B [00:00, ?B/s]

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


>> 开始训练...


Epoch,Training Loss,Validation Loss,Accuracy
1,1.957085,0.604851,0.92
2,0.128496,0.209351,0.95
3,0.036101,0.157143,0.97
4,0.012786,0.130016,0.9725
5,0.006619,0.125276,0.97


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

>> 保存模型中...


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

>> 全部完成！
