In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import evaluate

# 1. 加载数据
df = pd.read_csv("../DATASETS/work_arrangements_development_set.csv")

# 2. 映射标签
label2id = {"Remote": 0, "OnSite": 1, "Hybrid": 2}
id2label = {v: k for k, v in label2id.items()}
df["label"] = df["y_true"].map(label2id)

# 3. 转换为 HuggingFace 数据集格式
dataset = Dataset.from_pandas(df[["job_ad", "label"]].rename(columns={"job_ad": "text"}))


# 4. Tokenization
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
def preprocess(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length")
dataset = dataset.map(preprocess, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
# 用 train_test_split 进行切分
dataset = dataset.train_test_split(test_size=0.1, seed=60)


# 你现在可以这样用：
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# 5. 加载模型
model_name = "microsoft/deberta-v3-base"  # 使用 DeBERTa 模型

# 加载 tokenizer 和 model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,  
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

# 6. 准备训练参数
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,  # 较小的学习率
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

# 7. 指标评估
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(-1)
    return accuracy.compute(predictions=preds, references=labels)

# 8. 训练器
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 9. 开始训练
trainer.train()

# 10. 保存模型
trainer.save_model("./deberta-wa-classifier")

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0929,1.032068,0.5
2,1.0689,1.042497,0.5
3,1.1561,1.031065,0.5
4,1.0353,1.029095,0.5
5,1.0701,1.044209,0.5
6,1.0593,1.050876,0.5
7,1.1059,1.049693,0.5
8,1.0671,1.031708,0.5
9,1.0781,1.034598,0.5
10,1.0771,1.066519,0.3


