## 全局参数

In [1]:
MAX_LENGTH = 128  # 每条数据最大长度
BATCH_SIZE = 8  # 批处理大小
NUM_LABELS = 3  # NER标记数量 (e.g., B-LOC, I-LOC, O, etc.)
MODEL_NAME = 'bert-base-chinese'  # 模型名称
# MODEL_PATH = 'model/'  # 模型路径
MODEL_PATH = r'E:/JupyterLab//LLM//Large-Model//bert//'  # 模型路径
LABEL_DATA_PATH = 'data/label_data.json'  # 标注数据路径
OUT_DIR = 'model/'  # 输出路径
LOG_DIR = 'log/'  # 日志路径

In [2]:
label_list = ['O','B-PLACE','I-PLACE']  # 根据你自己的标记集合进行修改
id2label = {
    i: label for i,label in enumerate(label_list)
}
label2id = {
    label: i for i,label in enumerate(label_list)
}

In [3]:
id2label

{0: 'O', 1: 'B-PLACE', 2: 'I-PLACE'}

In [4]:
label2id

{'O': 0, 'B-PLACE': 1, 'I-PLACE': 2}

## 数据处理

In [5]:
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [6]:
# 来自标注好的JSON文件
with open(LABEL_DATA_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

texts = []
labels = []

for entry in data:
    text = entry['content']
    label_sequence = ['O'] * len(text)  # 初始化所有字符的标签为 'O'

    for tag in entry['tags']:
        if tag['name'] == 'PLACE':
            start = tag['start']
            end = tag['end']

            # 将开始位置标记为 'B-PLACE'
            label_sequence[start] = 'B-PLACE'

            # 将后续位置标记为 'I-PLACE'
            for i in range(start + 1, end):
                label_sequence[i] = 'I-PLACE'

    # 将标签转换为标签索引
    label_indices = [label2id[label] for label in label_sequence]

    texts.append(text)
    labels.append(label_indices)

# 检查转换后的格式
print("Texts:", texts[-2:])
print("Labels:", labels[-2:])

Texts: ['拜登总统对国会两院联席会议发表讲话', '中国政府中东问题特使翟隽出席金砖国家中东事务副外长/特使磋商']
Labels: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 2, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [7]:
# 划分数据集--训练测试和验证
texts_train, texts_temp, labels_train, labels_temp = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
texts_val, texts_test, labels_val, labels_test = train_test_split(
    texts_temp, labels_temp, test_size=0.5, random_state=42
)

In [8]:
# 构造字典形式的数据
def create_dataset(texts, labels):
    ids = list(range(len(texts)))
    tokens_list = [list(text) for text in texts]
    return {'id': ids, 'tokens': tokens_list, 'ner_tags': labels}

train_data = create_dataset(texts_train, labels_train)
val_data = create_dataset(texts_val, labels_val)
test_data = create_dataset(texts_test, labels_test)

In [9]:
# 创建 Dataset 和 DatasetDict
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)
test_dataset = Dataset.from_dict(test_data)

ner_data = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [10]:
ner_data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 305
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 38
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 39
    })
})

In [11]:
ner_data['train'][0]

{'id': 0,
 'tokens': ['美',
  '国',
  '宣',
  '布',
  '向',
  '加',
  '沙',
  '及',
  '该',
  '地',
  '区',
  '的',
  '巴',
  '勒',
  '斯',
  '坦',
  '平',
  '民',
  '提',
  '供',
  '更',
  '多',
  '人',
  '道',
  '主',
  '义',
  '援',
  '助'],
 'ner_tags': [1,
  2,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

## 编码

In [12]:
from transformers import BertTokenizerFast

In [13]:
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH+MODEL_NAME)  # 自己下载的中文 BERT 模型
tokenizer

BertTokenizerFast(name_or_path='E:/JupyterLab//LLM//Large-Model//bert//bert-base-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [14]:
def preprocess_function(examples):
    # 对输入的 tokens 进行分词和编码
    tokenized_inputs = tokenizer(
        examples['tokens'], 
        truncation=True, 
        padding=True, 
        is_split_into_words=True
    )

    # 处理 NER 标签与分词后的对齐问题
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # 获取分词后单词的索引
        previous_word_idx = None  # 上一个单词的索引
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:  # 如果是特殊标记（如[CLS], [SEP], [PAD]）
                label_ids.append(-100)  # 设置为忽略值
            elif word_idx != previous_word_idx:  # 如果是新单词
                label_ids.append(label[word_idx])  # 使用原始标签
            else:  # 如果是同一单词的子词
                label_ids.append(-100)  # 设置为忽略值
            previous_word_idx = word_idx  # 更新上一个单词索引

        labels.append(label_ids)
    
    # 将处理后的标签添加到分词结果中
    tokenized_inputs['labels'] = labels

    return tokenized_inputs

# 应用预处理函数到数据集
tokenized_datasets = ner_data.map(preprocess_function, batched=True)


Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

In [15]:
tokenized_datasets['train'][0]

{'id': 0,
 'tokens': ['美',
  '国',
  '宣',
  '布',
  '向',
  '加',
  '沙',
  '及',
  '该',
  '地',
  '区',
  '的',
  '巴',
  '勒',
  '斯',
  '坦',
  '平',
  '民',
  '提',
  '供',
  '更',
  '多',
  '人',
  '道',
  '主',
  '义',
  '援',
  '助'],
 'ner_tags': [1,
  2,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_ids': [101,
  5401,
  1744,
  2146,
  2357,
  1403,
  1217,
  3763,
  1350,
  6421,
  1765,
  1277,
  4638,
  2349,
  1239,
  3172,
  1788,
  2398,
  3696,
  2990,
  897,
  3291,
  1914,
  782,
  6887,
  712,
  721,
  3001,
  1221,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,

## 定义模型

In [16]:
import torch
import torch.nn as nn
from transformers import BertModel, BertPreTrainedModel
from crf import CRF

In [17]:
class BERTCRF(BertPreTrainedModel):
    def __init__(self, config, num_tags=None):
        super(BERTCRF, self).__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(p=0.3)

        self.word_embeds = 768  # BERT hidden size
        self.num_tags = num_tags

        self.classifier = nn.Linear(self.word_embeds, self.num_tags)
        self.crf = CRF(num_tags=self.num_tags, batch_first=True)

        # 使用BERT的权重初始化方法
        self.init_weights()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        # BERT模型输出
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs[0]  # [batch_size, seq_len, hidden_size]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        outputs = (logits,)

        # 如果提供了标签，计算CRF的损失
        if labels is not None:
            loss = self.crf(emissions=logits, tags=labels, mask=attention_mask)
            outputs = (-1*loss,) + outputs  # 损失是负的，因为我们希望最小化损失

        return outputs  # 返回（损失，logits）

In [18]:
# 初始化模型
model = BERTCRF.from_pretrained(MODEL_PATH+MODEL_NAME, NUM_LABELS)

Some weights of BERTCRF were not initialized from the model checkpoint at E:/JupyterLab//LLM//Large-Model//bert//bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight', 'crf.end_transitions', 'crf.start_transitions', 'crf.transitions']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 训练

In [19]:
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification
from torch.optim import AdamW

In [20]:
def calculate_ner_metrics(true_labels, pred_labels):
    """
    自定义评估函数，输入为二维列表，输出为各指标
    """
    assert len(true_labels) == len(pred_labels), "true_labels 和 pred_labels 的长度必须一致"
    
    # 初始化统计变量
    total_true = 0  # 总的真实实体数
    total_pred = 0  # 总的预测实体数
    total_correct = 0  # 预测正确的实体数
    total_tokens = 0  # 总的标注的token数
    correct_tokens = 0  # 预测正确的token数
    
    # 遍历每个序列
    for true_seq, pred_seq in zip(true_labels, pred_labels):
        assert len(true_seq) == len(pred_seq), "每个序列的长度必须一致"
        
        for true, pred in zip(true_seq, pred_seq):
            # 统计 token-level 准确性
            total_tokens += 1
            if true == pred:
                correct_tokens += 1
            
            # 如果是实体标签，更新统计
            if true != "O":  # 真实标签为实体
                total_true += 1
                if true == pred:  # 预测正确的实体
                    total_correct += 1
            
            if pred != "O":  # 预测标签为实体
                total_pred += 1
    
    # 计算指标
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0.0
    precision = total_correct / total_pred if total_pred > 0 else 0.0
    recall = total_correct / total_true if total_true > 0 else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    
    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }
    return metrics

def compute_metrics(pred):
    pred_logits, labels = pred
    pred_logits = pred_logits.argmax(-1)
    # 取去除 padding 的部分
    predictions = [
        [id2label[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
        [id2label[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
   ]
    result = calculate_ner_metrics(
        true_labels,
        predictions
    )
    return result

In [21]:
# 重写 Trainer 类
class CustomTrainer(Trainer):
    def create_optimizer(self):
        if self.optimizer is None:
            # 获取模型参数
            decay_parameters = [
                p for n, p in self.model.named_parameters() if n.endswith("weight")
            ]
            no_decay_parameters = [
                p for n, p in self.model.named_parameters() if n.endswith("bias")
            ]
            # 将参数分组
            optimizer_grouped_parameters = [
                {"params": decay_parameters, "weight_decay": self.args.weight_decay},
                {"params": no_decay_parameters, "weight_decay": 0.0},
            ]
            # 使用 AdamW 作为优化器
            self.optimizer = AdamW(
                optimizer_grouped_parameters, lr=self.args.learning_rate
            )
        return self.optimizer


# 创建训练参数
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir=LOG_DIR,
    save_total_limit=1,
)

In [22]:
# 数据收集器，用于将数据转换为ner可接受的格式
data_collator = DataCollatorForTokenClassification(tokenizer)  

In [23]:
# 定义 Trainer
trainer = CustomTrainer(
    model=model,  # 替换为你的模型
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [24]:
model

BERTCRF(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [25]:
# 训练 model
trainer.train()

  0%|          | 0/117 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  score = torch.where(mask[i].unsqueeze(1), next_score, score)


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 1.0242048501968384, 'eval_accuracy': 0.990990990990991, 'eval_precision': 0.9900497512437811, 'eval_recall': 0.9660194174757282, 'eval_f1_score': 0.977886977886978, 'eval_runtime': 0.2226, 'eval_samples_per_second': 170.739, 'eval_steps_per_second': 22.466, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.4407471716403961, 'eval_accuracy': 0.991991991991992, 'eval_precision': 0.9712918660287081, 'eval_recall': 0.9854368932038835, 'eval_f1_score': 0.9783132530120482, 'eval_runtime': 0.2009, 'eval_samples_per_second': 189.123, 'eval_steps_per_second': 24.885, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.7782583832740784, 'eval_accuracy': 0.992992992992993, 'eval_precision': 0.9805825242718447, 'eval_recall': 0.9805825242718447, 'eval_f1_score': 0.9805825242718447, 'eval_runtime': 0.2191, 'eval_samples_per_second': 173.408, 'eval_steps_per_second': 22.817, 'epoch': 3.0}
{'train_runtime': 28.6383, 'train_samples_per_second': 31.95, 'train_steps_per_second': 4.085, 'train_loss': 2.547114967280983, 'epoch': 3.0}


TrainOutput(global_step=117, training_loss=2.547114967280983, metrics={'train_runtime': 28.6383, 'train_samples_per_second': 31.95, 'train_steps_per_second': 4.085, 'total_flos': 31034027498760.0, 'train_loss': 2.547114967280983, 'epoch': 3.0})

In [26]:
best_ckpt_path = trainer.state.best_model_checkpoint
best_ckpt_path

'model/checkpoint-78'

## 评估

In [27]:
trainer.evaluate(eval_dataset=tokenized_datasets['test'])

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.4621560573577881,
 'eval_accuracy': 0.9962013295346629,
 'eval_precision': 0.9880952380952381,
 'eval_recall': 0.9880952380952381,
 'eval_f1_score': 0.9880952380952381,
 'eval_runtime': 0.1972,
 'eval_samples_per_second': 197.767,
 'eval_steps_per_second': 25.355,
 'epoch': 3.0}

## 测试

In [28]:
# 测试文本
input_text = "今天，美利坚合众国国防部发言人乔治说中华人民共和国的歼20战机很优秀。"
encoding = tokenizer(input_text, return_tensors="pt", is_split_into_words=False, truncation=True)
encoding = {k: v.to(model.device) for k, v in encoding.items()}
input_text

'今天，美利坚合众国国防部发言人乔治说中华人民共和国的歼20战机很优秀。'

In [29]:
# 模型预测
with torch.no_grad():
    outputs = model(**encoding)  # 获得模型的输出
    logits = outputs[0]  # 获取logits，维度: [batch_size, seq_len, num_tags]

    # 通过 CRF 层来获取最佳路径标签（最可能的标签序列）
    predicted_class_ids = model.crf.decode(logits, mask=encoding['attention_mask'])  # 预测标签
    predicted_class_ids = predicted_class_ids.squeeze().tolist()  # 把预测的 class ids 转换成列表


# 将预测结果映射为标签，并将标签与原始文本对应起来
predicted_labels = [id2label[class_id] for class_id in predicted_class_ids]
tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze().tolist())
results = list(zip(tokens, predicted_labels))

# 打印预测结果
print("输入文本:", input_text)
print("预测结果:")
for token, label in results:
    print(f"{token:15} -> {label}")

输入文本: 今天，美利坚合众国国防部发言人乔治说中华人民共和国的歼20战机很优秀。
预测结果:
[CLS]           -> O
今               -> O
天               -> O
，               -> O
美               -> B-PLACE
利               -> I-PLACE
坚               -> I-PLACE
合               -> I-PLACE
众               -> I-PLACE
国               -> I-PLACE
国               -> O
防               -> O
部               -> O
发               -> O
言               -> O
人               -> O
乔               -> O
治               -> O
说               -> O
中               -> B-PLACE
华               -> I-PLACE
人               -> I-PLACE
民               -> I-PLACE
共               -> I-PLACE
和               -> I-PLACE
国               -> I-PLACE
的               -> O
歼               -> O
20              -> O
战               -> O
机               -> O
很               -> O
优               -> O
秀               -> O
。               -> O
[SEP]           -> O


## 使用Fine-tuning模型预测

In [30]:
import torch.nn as nn
from transformers import BertModel, BertTokenizer, BertPreTrainedModel
from crf import CRF

import pandas as pd

In [31]:
class BERTCRF(BertPreTrainedModel):
    def __init__(self, config, num_tags=None):
        super(BERTCRF, self).__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(p=0.3)

        self.word_embeds = 768  # BERT hidden size
        self.num_tags = num_tags

        self.classifier = nn.Linear(self.word_embeds, self.num_tags)
        self.crf = CRF(num_tags=self.num_tags, batch_first=True)

        # 使用BERT的权重初始化方法
        self.init_weights()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        # BERT模型输出
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs[0]  # [batch_size, seq_len, hidden_size]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        outputs = (logits,)

        # 如果提供了标签，计算CRF的损失
        if labels is not None:
            loss = self.crf(emissions=logits, tags=labels, mask=attention_mask)
            outputs = (-loss,) + outputs  # 损失是负的，因为我们希望最小化损失

        return outputs  # 返回（损失，logits）

In [33]:
# 记载训练好的模型
best_ckpt_path = 'model/checkpoint-78'
model = BERTCRF.from_pretrained(best_ckpt_path, num_tags=3)  # 需要自己输出标签数量
tokenizer = BertTokenizer.from_pretrained(best_ckpt_path)  # 使用与训练时相同的 tokenizer

In [39]:
# 假设需要预测的文本在一个 DataFrame 中
texts = [
    '中国政府中东问题特使翟隽会见伊拉克驻华大使赛义德',
    '外交部阿富汗事务特使岳晓勇会见联合国秘书长阿富汗问题独立评估特别协调员',
    '中国政府中东问题特使翟隽会见巴林外交次大臣阿卜杜拉',
    '多哥总理多贝会见中国外交部非洲司司长吴鹏',
    '国务卿安东尼·布林肯（Antony J. Blinken）在切萨皮克湾基金会（Chesapeake Bay Foundation）发表讲话： “应对危机，抓住时机：发挥美国在全球气候问题上的领导作用”',
    '外交部亚非司司长王镝访问中东六国并赴俄罗斯举行中俄中东事务司局级磋商',
    '美国对印太地区的持久承诺：本届政府印太战略发布两周年',
    '关于被非法关押的美国人和俄罗斯政治犯获释的声明',
    '中国政府非洲事务特别代表许镜湖访问纳米比亚',
]
df = pd.DataFrame(data={'text':texts})

In [40]:
id2label = {
    0: 'O',
    1: 'B-PLACE',
    2: 'I-PLACE'
}

def predict(texts):
    model.eval()  
    predictions = []

    for input_text in texts:
        encoding = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        encoding = {k: v.to(model.device) for k, v in encoding.items()}

        # 模型预测
        with torch.no_grad():
            outputs = model(**encoding)  # 获得模型的输出
            logits = outputs[0]  # 获取logits，维度: [batch_size, seq_len, num_tags]

            # 通过 CRF 层来获取最佳路径标签（最可能的标签序列）
            predicted_class_ids = model.crf.decode(logits, mask=encoding['attention_mask'])  # 预测标签
            predicted_class_ids = predicted_class_ids.squeeze().tolist()  # 把预测的 class ids 转换成列表

        # 将预测结果映射为标签
        predicted_labels = [id2label[class_id] for class_id in predicted_class_ids[1:-1]]  # 去掉 [CLS] 和 [SEP] 标签
        predictions.append(predicted_labels)

    return predictions

# 执行预测
df['predictions'] = predict(df['text'].tolist())

In [41]:
df

Unnamed: 0,text,predictions
0,中国政府中东问题特使翟隽会见伊拉克驻华大使赛义德,"[B-PLACE, I-PLACE, O, O, B-PLACE, I-PLACE, O, ..."
1,外交部阿富汗事务特使岳晓勇会见联合国秘书长阿富汗问题独立评估特别协调员,"[O, O, O, B-PLACE, I-PLACE, I-PLACE, O, O, O, ..."
2,中国政府中东问题特使翟隽会见巴林外交次大臣阿卜杜拉,"[B-PLACE, I-PLACE, O, O, B-PLACE, I-PLACE, O, ..."
3,多哥总理多贝会见中国外交部非洲司司长吴鹏,"[B-PLACE, I-PLACE, O, O, O, O, O, O, B-PLACE, ..."
4,国务卿安东尼·布林肯（Antony J. Blinken）在切萨皮克湾基金会（Chesape...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
5,外交部亚非司司长王镝访问中东六国并赴俄罗斯举行中俄中东事务司局级磋商,"[O, O, O, B-PLACE, I-PLACE, O, O, O, O, O, O, ..."
6,美国对印太地区的持久承诺：本届政府印太战略发布两周年,"[B-PLACE, I-PLACE, O, B-PLACE, O, O, O, O, O, ..."
7,关于被非法关押的美国人和俄罗斯政治犯获释的声明,"[O, O, O, O, O, O, O, O, B-PLACE, I-PLACE, O, ..."
8,中国政府非洲事务特别代表许镜湖访问纳米比亚,"[B-PLACE, I-PLACE, O, O, B-PLACE, I-PLACE, O, ..."


In [45]:
# 打印预测结果
for i, text in enumerate(df['text']):
    print(f"Text: {text}")
    print("Predicted Labels:")
    for token, label in zip(tokenizer.tokenize(text), df.loc[i,'predictions']):
        print(f"{token:15} -> {label}")
    print("\n")

Text: 中国政府中东问题特使翟隽会见伊拉克驻华大使赛义德
Predicted Labels:
中               -> B-PLACE
国               -> I-PLACE
政               -> O
府               -> O
中               -> B-PLACE
东               -> I-PLACE
问               -> O
题               -> O
特               -> O
使               -> O
翟               -> O
隽               -> O
会               -> O
见               -> O
伊               -> B-PLACE
拉               -> I-PLACE
克               -> I-PLACE
驻               -> O
华               -> O
大               -> O
使               -> O
赛               -> O
义               -> O
德               -> O


Text: 外交部阿富汗事务特使岳晓勇会见联合国秘书长阿富汗问题独立评估特别协调员
Predicted Labels:
外               -> O
交               -> O
部               -> O
阿               -> B-PLACE
富               -> I-PLACE
汗               -> I-PLACE
事               -> O
务               -> O
特               -> O
使               -> O
岳               -> O
晓               -> O
勇               -> O
会               -> O
见               -> O
联         