In [None]:
import logging
import os
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# --- Basic setup ---

# Create every directory the notebook writes to *before* configuring logging:
# logging.FileHandler opens its file as soon as it is constructed, so
# ./logs must already exist or the cell fails on a fresh checkout.
os.makedirs("./result", exist_ok=True)
os.makedirs("./logs", exist_ok=True)
os.makedirs("./teacher_checkpoints", exist_ok=True)  # teacher training checkpoints
os.makedirs("./teacher_logs", exist_ok=True)         # teacher training logs

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("./logs/training.log"),  # persist log records to a file
        logging.StreamHandler()                      # also echo them to the console
    ]
)
logger = logging.getLogger(__name__)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


In [None]:
# --- Data loading and preprocessing ---
logger.info("Loading IMDB dataset...")
imdb_dataset = load_dataset("imdb")

# Hub ID of the teacher model
teacher_model_id = 'microsoft/deberta-v3-base'

logger.info(f"Loading tokenizer for {teacher_model_id}...")
tokenizer = AutoTokenizer.from_pretrained(teacher_model_id)


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch, truncating to at most 512 tokens.

    No padding is applied here on purpose: padding is left to
    ``data_collator`` (a ``DataCollatorWithPadding``), which pads each batch
    to the length of its longest member. Padding every example to 512 tokens
    up front would make that collator a no-op and waste compute and memory on
    pad tokens.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; only the
            ``"text"`` entry is read.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)


logger.info("Tokenizing dataset...")
tokenized_datasets = imdb_dataset.map(tokenize_function, batched=True)

# Data collator: pads the examples of each batch at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# Evaluation metric: classification accuracy.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` is unpacked into a ``(predictions, labels)`` pair. The
    arg-max over axis 1 of the predictions is used as the predicted class id
    and compared against the labels.

    Returns:
        Whatever ``accuracy_metric.compute`` returns for those predictions
        and references.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_metric.compute(predictions=predicted_ids, references=references)


In [None]:
# Split the data for teacher training: the full train split is used for
# training and one half of the test split for validation.
tokenized_train = tokenized_datasets["train"]

# Take every second test example (a strided shard) rather than a contiguous
# first half. `contiguous` is passed explicitly because the default of
# `Dataset.shard` differs between datasets releases, and a contiguous half of
# a label-sorted split would hold a single class.
# NOTE(review): IMDB's test split appears to be stored grouped by label on the
# Hub - confirm by inspecting the label counts of `tokenized_val`.
tokenized_val = tokenized_datasets["test"].shard(num_shards=2, index=0, contiguous=False)

logger.info(f"Train dataset size: {len(tokenized_train)}")
logger.info(f"Validation dataset size: {len(tokenized_val)}")

In [None]:
# --- Teacher model fine-tuning and saving ---

# Directory that receives the final fine-tuned teacher model and its tokenizer.
teacher_model_finetuned_path = 'deberta-v3-base-finetuned-imdb'

# Training always runs; there is no check for a previously saved model.
logger.info("Starting teacher model fine-tuning...")

# Step 1: load the pretrained checkpoint with a 2-label classification head,
# then move it onto the selected device.
teacher_model_for_finetune = AutoModelForSequenceClassification.from_pretrained(
    teacher_model_id, num_labels=2
)
teacher_model_for_finetune = teacher_model_for_finetune.to(device)

# Step 2: training configuration.
teacher_training_args = TrainingArguments(
    # Where checkpoints and logs go, and how often/where to report.
    output_dir='./teacher_checkpoints',
    logging_dir='./teacher_logs',
    logging_steps=10,
    report_to="tensorboard",
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Batch sizes; gradients are accumulated over 2 steps per optimizer update.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    # Evaluate and checkpoint once per epoch, and reload the checkpoint with
    # the best accuracy when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Mixed precision is enabled only when CUDA is available.
    fp16=torch.cuda.is_available(),
)

# Step 3: wire model, data, collator and metric into a Trainer.
teacher_trainer = Trainer(
    model=teacher_model_for_finetune,
    args=teacher_training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Step 4: run fine-tuning.
teacher_trainer.train()

# Step 5: save the resulting model and the tokenizer to the same directory.
logger.info(f"Saving fine-tuned teacher model to {teacher_model_finetuned_path}")
teacher_trainer.save_model(teacher_model_finetuned_path)
tokenizer.save_pretrained(teacher_model_finetuned_path)

print(f"Teacher model training process complete. Final model saved at: {teacher_model_finetuned_path}")