参考：https://github.com/vaibhavdangar09/NER-WITH-BERT/blob/main/NER_WITH_BERT.ipynb

## 提取数据

In [1]:
import pandas as pd
from sqlalchemy import create_engine, text

dbname = 'other' 
engine = create_engine(f'postgresql://postgres:123@localhost:5432/{dbname}')

In [2]:
try:
    sql = """
    SELECT text FROM mfa_cn
    UNION
    SELECT title as text FROM mfa_usa;
    """
    df = pd.read_sql(text(sql), con=engine)
except:
    df = pd.read_csv('data/data.csv')

- 将数据写入 txt
- 利用标注工具标注实体
- 使用[MarkStudio](https://github.com/cuiwang/MarkStudio)进行标注

In [3]:
# 将每一行数据写入txt文件
txt_file = 'data/ner_label_in.txt'
with open(txt_file, 'w', encoding='utf-8') as f:
    for index, row in df.iterrows():
        f.write(row['text'] + '\n')  #

print(f"数据已成功写入 {txt_file} 文件！")

数据已成功写入 data/ner_label_in.txt 文件！


In [4]:
MAX_LENGTH = 128  # 每条数据最大长度
BATCH_SIZE = 8  # 批处理大小
NUM_LABELS = 3  # NER标记数量 (e.g., B-LOC, I-LOC, O, etc.)
MODEL_NAME = 'bert-base-chinese'  # 模型名称
# MODEL_PATH = 'model/'  # 模型路径
MODEL_PATH = r'E:/JupyterLab//LLM//Large-Model//bert//'  # 模型路径
LABEL_DATA_PATH = 'data/label_data.json'  # 标注数据路径
OUT_DIR = 'model/'  # 输出路径
LOG_DIR = 'log/'  # 日志路径

In [5]:
label_list = ['O','B-PLACE','I-PLACE']  # 根据你自己的标记集合进行修改
id2label = {
    i: label for i,label in enumerate(label_list)
}
label2id = {
    label: i for i,label in enumerate(label_list)
}

In [6]:
id2label

{0: 'O', 1: 'B-PLACE', 2: 'I-PLACE'}

In [7]:
label2id

{'O': 0, 'B-PLACE': 1, 'I-PLACE': 2}

## 数据处理

- MarkStudio输出的格式为JOSN文件
- 我们预取的数据格式为：

    ```python
    text: ['外交部中阿合作论坛事务大使李琛访问卡塔尔']
    label: [0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2]
    ```
- 我们使用 [BIO](https://blog.csdn.net/HappyRocking/article/details/79716212) 法则标注数据
- 0,1,2分别表示非实体，实体开始，实体中间

In [8]:
import json
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [9]:
# 来自标注好的JSON文件
with open(LABEL_DATA_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

texts = []
labels = []

for entry in data:
    text = entry['content']
    label_sequence = ['O'] * len(text)  # 初始化所有字符的标签为 'O'

    for tag in entry['tags']:
        if tag['name'] == 'PLACE':
            start = tag['start']
            end = tag['end']

            # 将开始位置标记为 'B-PLACE'
            label_sequence[start] = 'B-PLACE'

            # 将后续位置标记为 'I-PLACE'
            for i in range(start + 1, end):
                label_sequence[i] = 'I-PLACE'

    # 将标签转换为标签索引
    label_indices = [label2id[label] for label in label_sequence]

    texts.append(text)
    labels.append(label_indices)

# 检查转换后的格式
print("Texts:", texts[-2:])
print("Labels:", labels[-2:])

Texts: ['拜登总统对国会两院联席会议发表讲话', '中国政府中东问题特使翟隽出席金砖国家中东事务副外长/特使磋商']
Labels: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 2, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


- 还需要对数据进行处理
- 首先是进行数据集划分
- 然后构建为以下字典形式：
```python
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 305
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 38
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 39
    })
})
```

In [10]:
# 划分数据集--训练测试和验证
texts_train, texts_temp, labels_train, labels_temp = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
texts_val, texts_test, labels_val, labels_test = train_test_split(
    texts_temp, labels_temp, test_size=0.5, random_state=42
)

In [11]:
# 构造字典形式的数据
def create_dataset(texts, labels):
    ids = list(range(len(texts)))
    tokens_list = [list(text) for text in texts]
    return {'id': ids, 'tokens': tokens_list, 'ner_tags': labels}

train_data = create_dataset(texts_train, labels_train)
val_data = create_dataset(texts_val, labels_val)
test_data = create_dataset(texts_test, labels_test)

In [12]:
# 创建 Dataset 和 DatasetDict
train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)
test_dataset = Dataset.from_dict(test_data)

ner_data = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [13]:
ner_data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 305
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 38
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 39
    })
})

In [14]:
ner_data['train'][0]

{'id': 0,
 'tokens': ['美',
  '国',
  '宣',
  '布',
  '向',
  '加',
  '沙',
  '及',
  '该',
  '地',
  '区',
  '的',
  '巴',
  '勒',
  '斯',
  '坦',
  '平',
  '民',
  '提',
  '供',
  '更',
  '多',
  '人',
  '道',
  '主',
  '义',
  '援',
  '助'],
 'ner_tags': [1,
  2,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

## 编码

In [15]:
from transformers import BertTokenizerFast

In [16]:
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH+MODEL_NAME)  # 自己下载的中文 BERT 模型
tokenizer

BertTokenizerFast(name_or_path='E:/JupyterLab//LLM//Large-Model//bert//bert-base-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [17]:
example_text = ner_data['train'][0]
tokenized_input = tokenizer(example_text['tokens'],is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
word_ids = tokenized_input.word_ids()

In [18]:
print(tokenized_input)
print("\n")
print(tokens)
print("\n")
print(word_ids)

{'input_ids': [101, 5401, 1744, 2146, 2357, 1403, 1217, 3763, 1350, 6421, 1765, 1277, 4638, 2349, 1239, 3172, 1788, 2398, 3696, 2990, 897, 3291, 1914, 782, 6887, 712, 721, 3001, 1221, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


['[CLS]', '美', '国', '宣', '布', '向', '加', '沙', '及', '该', '地', '区', '的', '巴', '勒', '斯', '坦', '平', '民', '提', '供', '更', '多', '人', '道', '主', '义', '援', '助', '[SEP]']


[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, None]


In [19]:
print(f'Length of the tokens is : {len(tokens)}')
print(f'Length of the ner tags is: {len(ner_data["train"][0]["ner_tags"])}')

Length of the tokens is : 30
Length of the ner tags is: 28


- 在这里，文字标记和token的尺寸是不同的，所以为了使代币和文字标记的尺寸相同，我们在文字标记的第一个和最后一个位置加上 -100。
- 在训练过程中，BERT 模型避开了 -100。

In [20]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token.
        previous_word_idx = None
        label_ids = []
        # Special tokens like `` and `<\s>` are originally mapped to None
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids:
            if word_idx is None:
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token
                label_ids.append(label[word_idx])
            else:
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100)
                # mask the subword representations after the first subword

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [21]:
example_text = tokenize_and_align_labels(ner_data['train'][1:2])
example_text

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [[101, 1744, 2157, 2128, 1059, 7560, 7309, 3345, 1046, 185, 3763, 1164, 3152, 1068, 754, 915, 5384, 3172, 1415, 1104, 5468, 1394, 1744, 2128, 4415, 833, 517, 1912, 2231, 4958, 7313, 3340, 5276, 518, 1104, 6379, 4638, 1898, 3209, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}

In [22]:
for token, label in zip(tokenizer.convert_ids_to_tokens(example_text["input_ids"][0]), example_text["labels"][0]):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
国_______________________________________ 0
家_______________________________________ 0
安_______________________________________ 0
全_______________________________________ 0
顾_______________________________________ 0
问_______________________________________ 0
杰_______________________________________ 0
克_______________________________________ 0
·_______________________________________ 0
沙_______________________________________ 0
利_______________________________________ 0
文_______________________________________ 0
关_______________________________________ 0
于_______________________________________ 0
俄_______________________________________ 1
罗_______________________________________ 2
斯_______________________________________ 2
否_______________________________________ 0
决_______________________________________ 0
联_______________________________________ 0
合_______________________________________ 0
国_______________________________________ 0
安_______

In [23]:
# 应用于整个数据
tokenized_datasets = ner_data.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

In [24]:
tokenized_datasets['train'][0]

{'id': 0,
 'tokens': ['美',
  '国',
  '宣',
  '布',
  '向',
  '加',
  '沙',
  '及',
  '该',
  '地',
  '区',
  '的',
  '巴',
  '勒',
  '斯',
  '坦',
  '平',
  '民',
  '提',
  '供',
  '更',
  '多',
  '人',
  '道',
  '主',
  '义',
  '援',
  '助'],
 'ner_tags': [1,
  2,
  0,
  0,
  0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  1,
  2,
  2,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_ids': [101,
  5401,
  1744,
  2146,
  2357,
  1403,
  1217,
  3763,
  1350,
  6421,
  1765,
  1277,
  4638,
  2349,
  1239,
  3172,
  1788,
  2398,
  3696,
  2990,
  897,
  3291,
  1914,
  782,
  6887,
  712,
  721,
  3001,
  1221,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  1,
  2,
  0,
  0,
  0,
 

## 定义模型

In [25]:
import torch
from transformers import AutoModelForTokenClassification

In [26]:
# 初始化模型
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH+MODEL_NAME, num_labels=NUM_LABELS)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at E:/JupyterLab//LLM//Large-Model//bert//bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 训练

In [27]:
from transformers import Trainer, TrainingArguments, AdamW
from transformers import DataCollatorForTokenClassification

In [28]:
def calculate_ner_metrics(true_labels, pred_labels):
    """
    自定义评估函数，输入为二维列表，输出为各指标
    """
    assert len(true_labels) == len(pred_labels), "true_labels 和 pred_labels 的长度必须一致"
    
    # 初始化统计变量
    total_true = 0  # 总的真实实体数
    total_pred = 0  # 总的预测实体数
    total_correct = 0  # 预测正确的实体数
    total_tokens = 0  # 总的标注的token数
    correct_tokens = 0  # 预测正确的token数
    
    # 遍历每个序列
    for true_seq, pred_seq in zip(true_labels, pred_labels):
        assert len(true_seq) == len(pred_seq), "每个序列的长度必须一致"
        
        for true, pred in zip(true_seq, pred_seq):
            # 统计 token-level 准确性
            total_tokens += 1
            if true == pred:
                correct_tokens += 1
            
            # 如果是实体标签，更新统计
            if true != "O":  # 真实标签为实体
                total_true += 1
                if true == pred:  # 预测正确的实体
                    total_correct += 1
            
            if pred != "O":  # 预测标签为实体
                total_pred += 1
    
    # 计算指标
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0.0
    precision = total_correct / total_pred if total_pred > 0 else 0.0
    recall = total_correct / total_true if total_true > 0 else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    
    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }
    return metrics

def compute_metrics(pred):
    pred_logits, labels = pred
    pred_logits = pred_logits.argmax(-1)
    # 取去除 padding 的部分
    predictions = [
        [id2label[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
        [id2label[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
   ]
    result = calculate_ner_metrics(
        true_labels,
        predictions
    )
    return result

In [29]:
# 重写 Trainer 类
class CustomTrainer(Trainer):
    def create_optimizer(self):
        if self.optimizer is None:
            # 获取模型参数
            decay_parameters = [
                p for n, p in self.model.named_parameters() if n.endswith("weight")
            ]
            no_decay_parameters = [
                p for n, p in self.model.named_parameters() if n.endswith("bias")
            ]
            # 将参数分组
            optimizer_grouped_parameters = [
                {"params": decay_parameters, "weight_decay": self.args.weight_decay},
                {"params": no_decay_parameters, "weight_decay": 0.0},
            ]
            # 使用 AdamW 作为优化器
            self.optimizer = AdamW(
                optimizer_grouped_parameters, lr=self.args.learning_rate
            )
        return self.optimizer


# 创建训练参数
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir=LOG_DIR,
    save_total_limit=1,
)



In [30]:
# 数据收集器，用于将数据转换为模型可接受的格式
data_collator = DataCollatorForTokenClassification(tokenizer)  

In [31]:
# 定义 Trainer
trainer = CustomTrainer(
    model=model,  # 替换为你的模型
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [32]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [33]:
# 训练 model
trainer.train()



  0%|          | 0/117 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.029824865981936455, 'eval_accuracy': 0.993993993993994, 'eval_precision': 0.9854368932038835, 'eval_recall': 0.9854368932038835, 'eval_f1_score': 0.9854368932038835, 'eval_runtime': 0.1217, 'eval_samples_per_second': 312.205, 'eval_steps_per_second': 41.08, 'epoch': 1.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.01185962837189436, 'eval_accuracy': 0.998998998998999, 'eval_precision': 0.9951690821256038, 'eval_recall': 1.0, 'eval_f1_score': 0.9975786924939467, 'eval_runtime': 0.1151, 'eval_samples_per_second': 330.147, 'eval_steps_per_second': 43.44, 'epoch': 2.0}


  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.012030001729726791, 'eval_accuracy': 0.995995995995996, 'eval_precision': 0.9950980392156863, 'eval_recall': 0.9854368932038835, 'eval_f1_score': 0.9902439024390244, 'eval_runtime': 0.1166, 'eval_samples_per_second': 325.959, 'eval_steps_per_second': 42.889, 'epoch': 3.0}
{'train_runtime': 19.4364, 'train_samples_per_second': 47.077, 'train_steps_per_second': 6.02, 'train_loss': 0.10041725941193409, 'epoch': 3.0}


TrainOutput(global_step=117, training_loss=0.10041725941193409, metrics={'train_runtime': 19.4364, 'train_samples_per_second': 47.077, 'train_steps_per_second': 6.02, 'total_flos': 19394825045526.0, 'train_loss': 0.10041725941193409, 'epoch': 3.0})

In [34]:
best_ckpt_path = trainer.state.best_model_checkpoint
best_ckpt_path

'model/checkpoint-78'

## 评估

In [35]:
trainer.evaluate(eval_dataset=tokenized_datasets['test'])

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.0154577000066638,
 'eval_accuracy': 0.9952516619183286,
 'eval_precision': 0.9822485207100592,
 'eval_recall': 0.9880952380952381,
 'eval_f1_score': 0.9851632047477745,
 'eval_runtime': 0.1442,
 'eval_samples_per_second': 270.544,
 'eval_steps_per_second': 34.685,
 'epoch': 3.0}

## 测试

In [36]:
# 测试文本
input_text = "今天，美利坚合众国国防部发言人乔治表示中华人民共和国的歼20战机很优秀。"
encoding = tokenizer(input_text, return_tensors="pt", is_split_into_words=False, truncation=True)
encoding = {k: v.to(model.device) for k, v in encoding.items()}
input_text

'今天，美利坚合众国国防部发言人乔治表示中华人民共和国的歼20战机很优秀。'

In [37]:
# 模型预测
with torch.no_grad():
    outputs = model(**encoding)

# 提取预测结果
logits = outputs.logits
predicted_class_ids = torch.argmax(logits, dim=-1).squeeze().tolist()

# 将预测结果映射为标签，并将标签与原始文本对应起来
predicted_labels = [id2label[class_id] for class_id in predicted_class_ids]
tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze().tolist())
results = list(zip(tokens, predicted_labels))

# 打印预测结果
print("输入文本:", input_text)
print("预测结果:")
for token, label in results:
    print(f"{token:15} -> {label}")

输入文本: 今天，美利坚合众国国防部发言人乔治表示中华人民共和国的歼20战机很优秀。
预测结果:
[CLS]           -> O
今               -> O
天               -> O
，               -> O
美               -> B-PLACE
利               -> I-PLACE
坚               -> I-PLACE
合               -> I-PLACE
众               -> I-PLACE
国               -> I-PLACE
国               -> O
防               -> O
部               -> O
发               -> O
言               -> O
人               -> O
乔               -> O
治               -> O
表               -> O
示               -> O
中               -> B-PLACE
华               -> I-PLACE
人               -> I-PLACE
民               -> I-PLACE
共               -> I-PLACE
和               -> I-PLACE
国               -> I-PLACE
的               -> O
歼               -> O
20              -> O
战               -> O
机               -> O
很               -> O
优               -> O
秀               -> O
。               -> O
[SEP]           -> O


## 预测

In [38]:
import torch
from transformers import AutoModelForTokenClassification
from transformers import BertTokenizerFast

import pandas as pd


In [39]:
# 假设需要预测的文本在一个 DataFrame 中
texts = [
    "日本首相菅义伟说日本将继续与中国合作。",
    "今天，美利坚合众国国防部发言人乔治说中华人民共和国的歼20战机很优秀。",
    "美国总统拜登说美国将继续支持乌克兰。",
    "中国国家主席习近平说中国将继续推进全球化。",
    "俄罗斯总统普京说俄罗斯将继续支持叙利亚。",
]
df = pd.DataFrame(data={'text':texts})

In [40]:
# 加载模型
best_ckpt_path = 'model/checkpoint-78'  # 自己切换为训练得到的最佳模型路径
model = AutoModelForTokenClassification.from_pretrained(best_ckpt_path)
tokenizer = BertTokenizerFast.from_pretrained(best_ckpt_path) 

In [41]:
# 用于预测的函数
def predict(texts):
    # 对每个文本进行tokenization
    encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=MAX_LENGTH)
    
    # 将模型放到评估模式
    model.eval()
    
    with torch.no_grad():
        # 获取模型输出 (logits)
        outputs = model(**encodings)
        
    # 获取预测结果
    logits = outputs.logits
    
    # 根据最大概率选择预测的标签
    predictions = torch.argmax(logits, dim=-1)
    
    # 移除 [CLS] 和 [SEP] token 的预测结果
    # [CLS] 是第一个token， [SEP] 是最后一个token
    # 因此在进行预测后，需要去掉第一个和最后一个预测
    input_ids = encodings['input_ids']
    filtered_predictions = []
    
    for i, input_id in enumerate(input_ids):
        # 获取当前句子的实际token部分（去掉[CLS]和[SEP]）
        valid_pred = predictions[i][1:-1]  # 移除第一个和最后一个预测
        filtered_predictions.append(valid_pred)
    
    return filtered_predictions

# 执行预测
df['predictions'] = predict(df['text'].tolist())

In [42]:
# 输出结果 (根据模型标签数量，可以进一步解码为标签名称)
for i, text in enumerate(df['text']):
    print(f"Text: {text}")
    print(f"Predicted Labels: {df.loc[i,'predictions'].tolist()}")

Text: 日本首相菅义伟说日本将继续与中国合作。
Predicted Labels: [1, 2, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Text: 今天，美利坚合众国国防部发言人乔治说中华人民共和国的歼20战机很优秀。
Predicted Labels: [0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Text: 美国总统拜登说美国将继续支持乌克兰。
Predicted Labels: [1, 2, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]
Text: 中国国家主席习近平说中国将继续推进全球化。
Predicted Labels: [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Text: 俄罗斯总统普京说俄罗斯将继续支持叙利亚。
Predicted Labels: [1, 2, 2, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0]


In [43]:
id2label = {
    0: 'O',
    1: 'B-PLACE',
    2: 'I-PLACE'
}

# 打印预测结果
for i, text in enumerate(df['text']):
    print(f"Text: {text}")
    print("Predicted Labels:")
    for token, label_id in zip(tokenizer.tokenize(text), df.loc[i,'predictions']):
        label = id2label[label_id.item()]
        print(f"{token:15} -> {label}")
    print("\n")

Text: 日本首相菅义伟说日本将继续与中国合作。
Predicted Labels:
日               -> B-PLACE
本               -> I-PLACE
首               -> O
相               -> O
菅               -> O
义               -> O
伟               -> O
说               -> O
日               -> B-PLACE
本               -> I-PLACE
将               -> O
继               -> O
续               -> O
与               -> O
中               -> B-PLACE
国               -> I-PLACE
合               -> O
作               -> O
。               -> O


Text: 今天，美利坚合众国国防部发言人乔治说中华人民共和国的歼20战机很优秀。
Predicted Labels:
今               -> O
天               -> O
，               -> O
美               -> B-PLACE
利               -> I-PLACE
坚               -> I-PLACE
合               -> I-PLACE
众               -> I-PLACE
国               -> I-PLACE
国               -> O
防               -> O
部               -> O
发               -> O
言               -> O
人               -> O
乔               -> O
治               -> O
说               -> O
中               -> B-PLACE
华               ->

In [44]:
# 转换df['predictions']为标签
df['labels'] = df['predictions'].apply(lambda x: [id2label[i.item()] for i in x])
df

Unnamed: 0,text,predictions,labels
0,日本首相菅义伟说日本将继续与中国合作。,"[tensor(1), tensor(2), tensor(0), tensor(0), t...","[B-PLACE, I-PLACE, O, O, O, O, O, O, B-PLACE, ..."
1,今天，美利坚合众国国防部发言人乔治说中华人民共和国的歼20战机很优秀。,"[tensor(0), tensor(0), tensor(0), tensor(1), t...","[O, O, O, B-PLACE, I-PLACE, I-PLACE, I-PLACE, ..."
2,美国总统拜登说美国将继续支持乌克兰。,"[tensor(1), tensor(2), tensor(0), tensor(0), t...","[B-PLACE, I-PLACE, O, O, O, O, O, B-PLACE, I-P..."
3,中国国家主席习近平说中国将继续推进全球化。,"[tensor(1), tensor(2), tensor(0), tensor(0), t...","[B-PLACE, I-PLACE, O, O, O, O, O, O, O, O, B-P..."
4,俄罗斯总统普京说俄罗斯将继续支持叙利亚。,"[tensor(1), tensor(2), tensor(2), tensor(0), t...","[B-PLACE, I-PLACE, I-PLACE, O, O, O, O, O, B-P..."


## 使用 pipeline 预测

In [45]:
from transformers import pipeline
from transformers import AutoModelForTokenClassification, AutoTokenizer

device = 0 if torch.cuda.is_available() else 'cpu'


In [46]:
model = AutoModelForTokenClassification.from_pretrained(best_ckpt_path)
tokenizer = AutoTokenizer.from_pretrained(best_ckpt_path) 

In [50]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=device)
text = "今天，美利坚合众国国防部发言人乔治表示中华人民共和国的歼20战机很优秀。"
ner_results = nlp(text)
print(ner_results)

[{'entity': 'LABEL_0', 'score': 0.97941077, 'index': 1, 'word': '今', 'start': 0, 'end': 1}, {'entity': 'LABEL_0', 'score': 0.99748343, 'index': 2, 'word': '天', 'start': 1, 'end': 2}, {'entity': 'LABEL_0', 'score': 0.99748224, 'index': 3, 'word': '，', 'start': 2, 'end': 3}, {'entity': 'LABEL_1', 'score': 0.9746602, 'index': 4, 'word': '美', 'start': 3, 'end': 4}, {'entity': 'LABEL_2', 'score': 0.9916592, 'index': 5, 'word': '利', 'start': 4, 'end': 5}, {'entity': 'LABEL_2', 'score': 0.9965677, 'index': 6, 'word': '坚', 'start': 5, 'end': 6}, {'entity': 'LABEL_2', 'score': 0.906812, 'index': 7, 'word': '合', 'start': 6, 'end': 7}, {'entity': 'LABEL_2', 'score': 0.993753, 'index': 8, 'word': '众', 'start': 7, 'end': 8}, {'entity': 'LABEL_2', 'score': 0.9981445, 'index': 9, 'word': '国', 'start': 8, 'end': 9}, {'entity': 'LABEL_0', 'score': 0.9979563, 'index': 10, 'word': '国', 'start': 9, 'end': 10}, {'entity': 'LABEL_0', 'score': 0.9983614, 'index': 11, 'word': '防', 'start': 10, 'end': 11}, {'e