In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding,EarlyStoppingCallback
import evaluate
import pandas as pd
import zipfile


In [2]:
# Fix the PyTorch RNG seed so repeated runs start from the same random state
torch.manual_seed(seed=42)

<torch._C.Generator at 0x7c9a1a2fea90>

GLUE（General Language Understanding Evaluation）是一个常用的自然语言处理（NLP）基准数据集集合。其中的 "sst2" 是 GLUE 的一个子集，全称是 Stanford Sentiment Treebank v2（SST-2），用于情感分析任务。它包含句子及其对应的情感标签（正面或负面）。

In [3]:
# Load the GLUE SST-2 splits: only the first 16 training rows, plus the full
# validation and test splits
train_dataset = load_dataset(path="glue", name="sst2", split="train[:16]")
val_dataset = load_dataset(path="glue", name="sst2", split="validation")
test_dataset = load_dataset(path="glue", name="sst2", split="test")

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

[GLUE数据集的详细介绍](https://mp.weixin.qq.com/s?__biz=MzUzMDk1MjUzMQ==&mid=2247483653&idx=1&sn=cae5edc4fb48bc668325d4197e92b3c8&chksm=fbc2783f20713519586dd6b0aab98e52fe89ff344417d00a352164c316fef4f3adf4ea5e9687#rd)

In [4]:
# Load the DeBERTa tokenizer and a sequence-classification model with two output labels
model_name = "microsoft/deberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Tokenize without padding: padding is left to the DataCollatorWithPadding used
# by the Trainer, so each training/eval batch is padded only to its own longest
# sequence instead of to the longest sequence of a whole map() batch.
def tokenize_function(examples):
    """Tokenize the ``sentence`` field of a batch of examples.

    No padding is applied here on purpose. When this function is used with
    ``Dataset.map(..., batched=True)``, padding at this stage would pad every
    example to the longest sentence in the map batch and store that padding in
    the dataset, which defeats per-batch dynamic padding by the data collator.

    Args:
        examples: Mapping with a ``"sentence"`` entry holding the input texts.

    Returns:
        The tokenizer's output for those sentences (unpadded, truncated to at
        most 512 tokens).
    """
    return tokenizer(
        examples["sentence"],
        truncation=True,    # never exceed max_length
        max_length=512,
    )


# Apply the tokenizer to the training and validation splits, one batch of rows at a time
tokenized_train = train_dataset.map(function=tokenize_function, batched=True)
tokenized_val = val_dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [6]:
# Collator that pads each batch to the length of its longest sequence
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Load the accuracy metric from the `evaluate` library
metric = evaluate.load(path="accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [8]:
def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the module-level ``metric``.

    The predicted class for each row is the index of its largest logit along
    the last axis; those predictions are compared against the reference labels.
    """
    scores, gold_labels = eval_pred
    return metric.compute(
        references=gold_labels,
        predictions=scores.argmax(axis=-1),  # logits -> predicted label ids
    )

datasets 库默认返回的数据格式是 Python 的字典或列表。如果你手动编写 PyTorch 训练循环，需要先将数据转换为 PyTorch 张量（torch.Tensor），因为 PyTorch 模型和优化器只能处理张量。如果你使用 transformers.Trainer，则不需要手动执行 set_format：Trainer 会通过数据整理器（data collator）自动完成数据格式转换。

In [9]:
# Configure the training run.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",           # directory for checkpoints and outputs
    evaluation_strategy="steps",      # evaluate on a step schedule
    eval_steps=1,                     # evaluate after every training step
    learning_rate=1e-5,               # learning rate
    per_device_train_batch_size=16,   # training batch size per device
    per_device_eval_batch_size=256,   # evaluation batch size per device
    num_train_epochs=3,               # number of training epochs
    weight_decay=0.01,                # weight decay
    save_strategy="steps",            # checkpoint on a step schedule (must match evaluation)
    # Checkpoint at the same cadence as evaluation. Without this the library's
    # default save interval applies, which is far longer than this 3-step run:
    # no checkpoint would ever be written and load_best_model_at_end would have
    # nothing to restore.
    save_steps=1,
    save_total_limit=5,               # keep at most five checkpoints on disk
    logging_dir="./logs",             # log directory
    logging_steps=1,                  # log after every training step
    gradient_accumulation_steps=1,    # no gradient accumulation
    dataloader_num_workers=4,         # worker processes for data loading
    report_to="none",                 # do not report to external trackers (e.g. Wandb)
    metric_for_best_model="eval_loss",  # checkpoints are ranked by validation loss
    load_best_model_at_end=True,      # reload the best checkpoint when training ends
)





In [10]:
# Early stopping: halt once the monitored metric has gone 10 evaluations without improving
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=10)

# Wire the model, data, metric function, collator and callback into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,          # pads each batch dynamically
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    callbacks=[early_stopping_callback],  # enable early stopping
    compute_metrics=compute_metrics,      # accuracy on the eval set
)

# Launch fine-tuning; the returned TrainOutput is shown as the cell result
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
1,0.7448,0.695856,0.509174
2,0.7581,0.695481,0.509174
3,0.7362,0.695294,0.509174


TrainOutput(global_step=3, training_loss=0.7463723421096802, metrics={'train_runtime': 11.7024, 'train_samples_per_second': 4.102, 'train_steps_per_second': 0.256, 'total_flos': 948536322624.0, 'train_loss': 0.7463723421096802, 'epoch': 3.0})

In [11]:
# Apply the same tokenizer to the test split, one batch of rows at a time
tokenized_test = test_dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [12]:
# Run inference on the test split.
# The GLUE test split does not ship real labels (the hub dataset exposes a
# placeholder value instead — confirm on the dataset card). Feeding that column
# to the Trainer would make it compute a meaningless loss/accuracy, and only
# works with models that tolerate such labels, so the column is dropped first.
if "label" in tokenized_test.column_names:
    unlabeled_test = tokenized_test.remove_columns("label")
else:
    unlabeled_test = tokenized_test
predictions = trainer.predict(unlabeled_test)

# Convert logits to predicted label ids (index of the largest logit per row)
predicted_labels = predictions.predictions.argmax(axis=-1)

In [13]:
# Build the submission table: one row per test example, pairing the dataset's
# `idx` column with the predicted label.
# NOTE(review): the column headers expected by the scoring server are not
# visible here — confirm "id"/"label" against the submission instructions.
submission = pd.DataFrame({
    "id": test_dataset["idx"],   # example index from the test split
    "label": predicted_labels,   # predicted label ids
})

# Write the table as a tab-separated file without the DataFrame index
submission.to_csv("SST-2.tsv", sep="\t", index=False)

# Pack the TSV into a ZIP archive. ZipFile stores members uncompressed unless a
# compression method is given, so DEFLATE is requested explicitly.
with zipfile.ZipFile("submission.zip", "w", compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write("SST-2.tsv", arcname="SST-2.tsv")

print("提交文件已生成并压缩为 submission.zip")

提交文件已生成并压缩为 submission.zip
