In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requi

In [2]:
import logging
import os  # 用于创建目录
import evaluate  # Hugging Face 评估库
import numpy as np
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DebertaV2Config,
    DebertaV2Model,
    TrainingArguments,  # 用于教师训练
    Trainer,  # 用于教师训练
    DataCollatorWithPadding
)
from transformers.models.deberta_v2.modeling_deberta_v2 import DebertaV2Encoder


2025-04-26 07:35:27.476518: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745652927.665901      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745652927.721566      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# 配置日志
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        # 明确指定输出到 sys.stdout 或 sys.stderr，有时更可靠
        logging.StreamHandler(sys.stdout)
    ],
    force=True  # <--- 添加这个参数
)

# 获取 logger 实例
logger = logging.getLogger(__name__)

# 检查是否有 GPU 可用，后续训练会用到
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 创建结果目录
os.makedirs("./result", exist_ok=True)
os.makedirs("./logs", exist_ok=True)
os.makedirs("./checkpoint", exist_ok=True) # Trainer 会用到


Using device: cuda


In [4]:
# %% [markdown]
# ## 第一步：加载和预处理 IMDB 数据集
# 1. 加载 IMDB 数据集
logger.info("Loading IMDB dataset...")
imdb_dataset = load_dataset("imdb")

2025-04-26 07:35:44,073 [INFO] Loading IMDB dataset...


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [6]:
# 2. 加载教师模型的分词器 (后续学生也用同一个)
teacher_model_id = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(teacher_model_id)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [7]:
# 3. 定义分词函数
def tokenize_function(examples):
    # 对 "text" 列进行分词，进行填充和截断
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512) # DeBERTa 通常最大长度为 512


In [8]:
# 4. 对整个数据集应用分词函数 (使用 batched=True 加速)
logger.info("Tokenizing dataset...")
tokenized_datasets = imdb_dataset.map(tokenize_function, batched=True)


2025-04-26 07:35:52,476 [INFO] Tokenizing dataset...


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [9]:
# 5. 准备数据整理器 (Data Collator)
# 它会在每个批次内部动态填充到该批次最长序列的长度
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 6. 准备评估指标计算函数
accuracy_metric = evaluate.load("accuracy") # 加载准确率指标

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # predictions 是 logits，需要先找到概率最高的索引
    predictions = np.argmax(predictions, axis=1)
    # 使用加载的指标计算准确率
    return accuracy_metric.compute(predictions=predictions, references=labels)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [10]:
# 7. 划分验证集和测试集
# IMDB 默认只有 train/test，我们从 test 中分出一部分做验证
# 例如，使用前 12500 个样本作为验证集，后 12500 个作为测试集
tokenized_train = tokenized_datasets["train"] # .shuffle(seed=42).select(range(1000)) # DEBUG: 可选，使用少量数据快速测试
tokenized_val = tokenized_datasets["test"].shard(num_shards=2, index=0) # 前一半测试集做验证
tokenized_test = tokenized_datasets["test"].shard(num_shards=2, index=1) # 后一半测试集做最终测试

logger.info(f"Train dataset size: {len(tokenized_train)}")
logger.info(f"Validation dataset size: {len(tokenized_val)}")
logger.info(f"Test dataset size: {len(tokenized_test)}")


2025-04-26 07:36:54,292 [INFO] Train dataset size: 25000
2025-04-26 07:36:54,292 [INFO] Validation dataset size: 12500
2025-04-26 07:36:54,293 [INFO] Test dataset size: 12500


In [11]:

#第二步：微调教师模型 (Fine-tuning the Teacher Model) 或者加载已经训练好的教师模型
teacher_model_finetuned_path = '/kaggle/input/deberta-v3-base-finetuned-imdb/deberta-v3-base-finetuned-imdb'

In [12]:
# ## 第三步：创建并初始化学生模型
#
# 使用与文档中相同的逻辑，根据微调好的教师模型创建层数减半的学生模型。

# %%
# 定义权重拷贝函数 (与文档中相同)
def copy_deberta_weights(teacher, student):
    if isinstance(teacher, DebertaV2Model) or type(teacher).__name__.startswith('DebertaV2For'):
        for teacher_part, student_part in zip(teacher.children(), student.children()):
            copy_deberta_weights(teacher_part, student_part)
    elif isinstance(teacher, DebertaV2Encoder):
        teacher_encoding_layers = [layer for layer in next(teacher.children())]
        student_encoding_layers = [layer for layer in next(student.children())]
        for i in range(len(student_encoding_layers)):
            student_encoding_layers[i].load_state_dict(teacher_encoding_layers[2*i].state_dict())
    else:
        student.load_state_dict(teacher.state_dict())

# 定义创建学生模型的函数 (与文档中相同)
def create_student(teacher_model):
    configuration = teacher_model.config.to_dict()
    original_num_layers = configuration["num_hidden_layers"]
    configuration["num_hidden_layers"] //= 2
    logger.info(f"Teacher layers: {original_num_layers}, Student layers: {configuration['num_hidden_layers']}")
    student_config = DebertaV2Config.from_dict(configuration)
    # 确保学生模型也知道分类任务的标签数量
    student_config.num_labels = teacher_model.config.num_labels
    student_model = type(teacher_model)(config=student_config)
    copy_deberta_weights(teacher_model, student_model)
    return student_model

In [13]:
# %%
# 1. 加载微调好的教师模型
logger.info(f"Loading fine-tuned teacher model from {teacher_model_finetuned_path}")
teacher_model_distill = AutoModelForSequenceClassification.from_pretrained(teacher_model_finetuned_path, num_labels=2)

# 2. 创建学生模型
logger.info("Creating and initializing student model...")
student_model = create_student(teacher_model_distill)

# 3. 定义学生模型保存路径
student_model_init_path = 'deberta-v3-student-init-imdb'

# 4. 保存初始化后的学生模
logger.info(f"Saving initialized student model to {student_model_init_path}")
student_model.save_pretrained(student_model_init_path)
# 分词器也保存一份到学生目录，虽然内容和教师的一样
tokenizer.save_pretrained(student_model_init_path)

2025-04-26 07:36:54,346 [INFO] Loading fine-tuned teacher model from /kaggle/input/deberta-v3-base-finetuned-imdb/deberta-v3-base-finetuned-imdb
2025-04-26 07:36:54,977 [INFO] Creating and initializing student model...
2025-04-26 07:36:54,978 [INFO] Teacher layers: 12, Student layers: 6
2025-04-26 07:37:00,135 [INFO] Saving initialized student model to deberta-v3-student-init-imdb


('deberta-v3-student-init-imdb/tokenizer_config.json',
 'deberta-v3-student-init-imdb/special_tokens_map.json',
 'deberta-v3-student-init-imdb/spm.model',
 'deberta-v3-student-init-imdb/added_tokens.json',
 'deberta-v3-student-init-imdb/tokenizer.json')

In [14]:
# 清理教师模型显存 (如果需要)
del teacher_model_distill
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [15]:
# ## 第四步：设置并运行知识蒸馏训练
# 定义蒸馏训练参数类 (与文档中相同)
class DistillationTrainingArguments(TrainingArguments):
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.temperature = temperature

# 定义自定义蒸馏训练器 (与文档中相同)
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # 确保教师模型和学生模型在同一个设备上
        self._move_model_to_device(self.teacher, self.model.device)
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False,num_items_in_batch=None):
        outputs_student = model(**inputs)
        student_loss = outputs_student.loss
        logits_student = outputs_student.logits

        with torch.no_grad():
            outputs_teacher = self.teacher(**inputs)
        logits_teacher = outputs_teacher.logits

        temperature = self.args.temperature
        alpha = self.args.alpha

        loss_fct = nn.KLDivLoss(reduction="batchmean")
        distillation_loss = loss_fct(
            F.log_softmax(logits_student / temperature, dim=-1),
            F.softmax(logits_teacher / temperature, dim=-1)
        ) * (temperature ** 2)

        loss = alpha * student_loss + (1.0 - alpha) * distillation_loss
        return (loss, outputs_student) if return_outputs else loss


In [16]:
# 1. 加载教师模型 (用于蒸馏指导) 和学生模型 (待训练)
logger.info(f"Loading teacher model for distillation from {teacher_model_finetuned_path}")
teacher_model_for_distill = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_finetuned_path, num_labels=2
).to(device) # 移动到 GPU

logger.info(f"Loading initialized student model from {student_model_init_path}")
student_model_for_distill = AutoModelForSequenceClassification.from_pretrained(
    student_model_init_path, num_labels=2
).to(device) # 移动到 GPU


2025-04-26 07:37:01,183 [INFO] Loading teacher model for distillation from /kaggle/input/deberta-v3-base-finetuned-imdb/deberta-v3-base-finetuned-imdb
2025-04-26 07:37:02,902 [INFO] Loading initialized student model from deberta-v3-student-init-imdb


In [17]:
# 2. 加载分词器 (再次加载以确保一致性)
tokenizer = AutoTokenizer.from_pretrained(student_model_init_path) # 从学生或教师目录加载都行

# 3. 配置蒸馏训练参数
distill_output_dir = "./distill_checkpoints"
distill_logging_dir = './distill_logs'

distillation_args = DistillationTrainingArguments(
    output_dir=distill_output_dir,
    # max_steps=1,#为了快速演示,实际使用epoch
    warmup_ratio=0.1,                   # 学习率预热
    num_train_epochs=3,                 # **注意: 蒸馏 epochs，可调整**
    per_device_train_batch_size=8,      # 根据显存调整
    per_device_eval_batch_size=16,     # 根据显存调整
    gradient_accumulation_steps=2,
    learning_rate=3e-5,                 # **注意: 学习率，可调整**
    weight_decay=0.01,
    logging_dir=distill_logging_dir,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",              # 每轮保存，以便后续选择最佳
    load_best_model_at_end=True,        # 结束时加载验证集上最好的模型
    metric_for_best_model="accuracy",
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
    # --- 蒸馏特定参数 ---
    alpha=0.5,                          # 硬/软损失权重，可调整 (例如 0.2 硬, 0.8 软)
    temperature=4.0                     # 温度，可调整 (例如 2.0, 3.0, 4.0)
)


In [18]:
# 4. 创建 DistillationTrainer 实例
distill_trainer = DistillationTrainer(
    model=student_model_for_distill,    # 学生模型
    teacher_model=teacher_model_for_distill, # 教师模型
    args=distillation_args,             # 蒸馏训练参数
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,         # 使用验证集选择最佳模型
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  super().__init__(*args, **kwargs)


In [19]:
# 5. 开始蒸馏训练
logger.info("Starting knowledge distillation training...")
distill_trainer.train()
logger.info("Knowledge distillation training finished.")


2025-04-26 07:37:03,750 [INFO] Starting knowledge distillation training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5612,0.34451,0.91344
2,0.2947,0.305363,0.91288


2025-04-26 09:22:07,931 [INFO] Knowledge distillation training finished.


In [20]:
# 6. 保存最终训练好的 (最佳) 学生模型
final_student_model_path = "deberta-v3-student-distilled-imdb"
logger.info(f"Saving final distilled student model to {final_student_model_path}")
distill_trainer.save_model(final_student_model_path)
tokenizer.save_pretrained(final_student_model_path) # 分词器也保存


2025-04-26 09:22:07,950 [INFO] Saving final distilled student model to deberta-v3-student-distilled-imdb


('deberta-v3-student-distilled-imdb/tokenizer_config.json',
 'deberta-v3-student-distilled-imdb/special_tokens_map.json',
 'deberta-v3-student-distilled-imdb/spm.model',
 'deberta-v3-student-distilled-imdb/added_tokens.json',
 'deberta-v3-student-distilled-imdb/tokenizer.json')

In [21]:
# 清理显存
del teacher_model_for_distill
del student_model_for_distill
del distill_trainer
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [22]:
# ## 第五步：评估蒸馏后的学生模型

# %%
# 1. 加载最终蒸馏好的学生模型
logger.info(f"Loading final distilled student model from {final_student_model_path}")
final_student_model = AutoModelForSequenceClassification.from_pretrained(
    final_student_model_path, num_labels=2
).to(device) # 加载到 GPU 进行评估

# 2. 创建一个新的 Trainer 用于评估 (不需要教师模型了)
#    使用简单的 TrainingArguments，因为蒸馏参数不再需要
eval_args = TrainingArguments(
    output_dir='./eval_output', # 评估输出目录 (预测结果等会存这里)
    per_device_eval_batch_size=32, # 评估时可以用更大的批次
    do_train=False, # 不训练
    do_eval=True,  # 只评估
    report_to="none" # 不需要报告给 tensorboard
)

eval_trainer = Trainer(
    model=final_student_model,
    args=eval_args,
    eval_dataset=tokenized_test, # **在最终测试集上评估**
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

2025-04-26 09:22:09,660 [INFO] Loading final distilled student model from deberta-v3-student-distilled-imdb


  eval_trainer = Trainer(


In [23]:
# 3. 在测试集上进行预测和评估
logger.info("Evaluating the final distilled student model on the test set...")
evaluation_results = eval_trainer.evaluate() # evaluate 会调用 compute_metrics

logger.info("Final Student Model Evaluation Results on Test Set:")
print(evaluation_results) # 打印评估结果 (例如 {'eval_loss': ..., 'eval_accuracy': ...})

# (可选) 获取详细预测结果并保存
# prediction_outputs = eval_trainer.predict(tokenized_test)
# test_pred_labels = np.argmax(prediction_outputs.predictions, axis=-1)
# print("Sample predictions:", test_pred_labels[:20])
#
# # 保存预测结果 (IMDB 没有 ID，可以只保存预测标签)
# output_df = pd.DataFrame({'predictions': test_pred_labels})
# output_csv_path = "./result/distilled_student_predictions_imdb.csv"
# output_df.to_csv(output_csv_path, index=False)
# logger.info(f"Predictions saved to {output_csv_path}")


2025-04-26 09:22:09,931 [INFO] Evaluating the final distilled student model on the test set...


2025-04-26 09:24:49,893 [INFO] Final Student Model Evaluation Results on Test Set:
{'eval_loss': 0.4851151704788208, 'eval_model_preparation_time': 0.0017, 'eval_accuracy': 0.83632, 'eval_runtime': 159.9553, 'eval_samples_per_second': 78.147, 'eval_steps_per_second': 2.444}
