In [28]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import load_dataset


# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(device)

# Load the data: each CSV file becomes one named split of the result.
# Note that the 'validation' split is read from test.csv.
dataset = load_dataset(
  'csv',
  data_files=dict(
    train='../datasets/train.csv',
    validation='../datasets/test.csv',
  ),
)
print(dataset)

cpu
DatasetDict({
    train: Dataset({
        features: ['comment_sum', 'content', 'create_time', 'dun_num', 'hot', 'jinghua', 'jubao_count', 'topic_id', 'zan_sum', 'label'],
        num_rows: 10124
    })
    validation: Dataset({
        features: ['comment_sum', 'content', 'create_time', 'dun_num', 'hot', 'jinghua', 'jubao_count', 'topic_id', 'zan_sum', 'label'],
        num_rows: 2532
    })
})


In [29]:
# Checkpoint name shared by the tokenizer and the classifier.
MODEL_NAME = 'hfl/chinese-roberta-wwm-ext'

# Load the pretrained tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Load the pretrained model, configured for 7 labels in total
model = BertForSequenceClassification.from_pretrained(
  MODEL_NAME,
  num_labels=7,
)

# Kept as the final bare expression so the notebook still displays the module summary.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [30]:
# Data preprocessing
def process(examples):
  """Tokenize the ``content`` entry of ``examples`` with the module-level tokenizer.

  Sequences are truncated to at most 128 tokens and padded up to that same
  length (``padding='max_length'``), so every encoded row has a fixed size.

  Args:
    examples: Mapping with a ``content`` entry holding the text to encode.

  Returns:
    The encoding object produced by ``tokenizer`` for that text.
  """
  return tokenizer(
    examples['content'],
    max_length=128,
    truncation=True,
    padding='max_length',
  )

# Encode the data: apply `process` batch-wise and drop the raw text column afterwards.
encoded_dataset = dataset.map(
  process,
  batched=True,
  remove_columns=['content'],
)

In [31]:
# Split the data: shuffle the training split with a fixed seed; validation keeps its order.
train_dataset = encoded_dataset["train"].shuffle(seed=42)
eval_dataset = encoded_dataset["validation"]


def compute_accuracy(eval_pred):
    """Return the fraction of evaluation examples whose top-scoring class matches the label.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, class axis
            last) and ``label_ids`` (reference class ids), as passed by ``Trainer``.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    predicted_ids = eval_pred.predictions.argmax(-1)
    return {"accuracy": (predicted_ids == eval_pred.label_ids).mean()}


# Training arguments.
# NOTE: using Trainer with PyTorch requires `accelerate>=0.26.0`; without it this
# cell stops with an ImportError. Install it with `pip install transformers[torch]`.
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",  # avoid unnecessary external logging
)

# Train with Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_accuracy,
)

# Model training
trainer.train()
# Model evaluation
results = trainer.evaluate()
print(results)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [None]:
import pandas as pd
from dataclasses import dataclass
from tqdm import tqdm


@dataclass
class Result:
  """One prediction record: a source row's identifying fields plus its predicted label."""

  # Identifier of the source row.
  pid: int
  # Creation time carried over from the source row.
  create_time: int
  # Text that was classified.
  content: str
  # Predicted class index.
  label: int

# Model inference (about 120k rows)
total_df = pd.read_csv('../datasets/filter_data.csv')
results = []

# Switch to inference mode so dropout is disabled while predicting.
model.eval()
# Gradients are never needed for prediction; turning autograd off saves memory and time.
with torch.no_grad():
  # Iterating a DataFrame directly yields its column labels, not its rows, so the
  # rows must come from iterrows(). That is a generator without a length, hence
  # the explicit `total` so tqdm can still show overall progress.
  for idx, row in tqdm(total_df.iterrows(), total=len(total_df)):
    pid = row['pid']
    create_time = row['create_time']
    # str() guards against non-string cells (e.g. NaN for an empty field).
    content = str(row['content'])
    inputs = tokenizer(content, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # Move the input tensors to the same device as the model (GPU/CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Forward pass; the predicted label is the index of the largest logit.
    logits = model(**inputs).logits
    label = int(torch.argmax(logits, dim=-1).item())
    results.append(Result(pid, create_time, content, label))

# One DataFrame row per Result, with columns named after the dataclass fields.
results_df = pd.DataFrame(results)
results_df.to_csv('results_no_metadata.csv', index=False)