In [None]:
# pip install evaluate
# pip install seqeval

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import evaluate
from datasets import load_dataset
import numpy as np

In [None]:
# Load the NER dataset by its identifier and display its split summary.
DATASET_ID = 'doushabao4766/msra_ner_k_V3'
ds = load_dataset(DATASET_ID)
ds

In [None]:
# Peek at the first training example to see the raw `tokens` / `ner_tags` fields.
first_example = next(iter(ds['train']), None)
if first_example is not None:
    print(f"tokens: {first_example['tokens']}, ner_tags: {first_example['ner_tags']}")

In [None]:
# Tokenizer for the 'bert-base-chinese' checkpoint; reused by the preprocessing
# function and the data collator further down.
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

In [None]:
# Sanity check: collect the distinct tag ids that occur in the training split.
tags_id = {tag_id for example in ds['train'] for tag_id in example['ner_tags']}
tags_id

In [None]:
# Build the tag vocabulary: the position of a tag in `tags` is the label id
# used in `ner_tags`.
#
# Two fixes compared to the previous version:
#   * The entity types are kept in an ordered list. Iterating a set of strings
#     has no stable order across interpreter sessions, so the id -> tag mapping
#     could change from one kernel start to the next.
#   * The "outside" tag is the letter 'O' (not the digit '0'); seqeval treats
#     every tag other than 'O' as part of an entity.
#
# NOTE(review): the order PER, ORG, LOC follows the label ids of the upstream
# MSRA NER dataset (O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC) - confirm it
# against ds['train'].features['ner_tags'] for this dataset variant.
entities = ['O', 'PER', 'ORG', 'LOC']
tags = ['O']
for entity in entities[1:]:
    # Each entity type contributes a begin tag followed by an inside tag,
    # so B- tags land on odd ids and the matching I- tag on the next even id.
    tags.append('B-' + entity.upper())
    tags.append('I-' + entity.upper())
entity_idx = {entity: i for i, entity in enumerate(entities)}
print(tags)
print(entity_idx)

In [None]:
def data_input_proc(item):
    """Tokenize a batch of pre-split sentences and align the NER labels to the tokens.

    Args:
        item: a batch with ``item['tokens']`` (one list of words/characters per
            sentence) and ``item['ner_tags']`` (one list of label ids per
            sentence, parallel to ``tokens``).

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding, for every
        sentence, exactly one label id per produced token.

    The labels are aligned through ``word_ids()`` instead of being sliced to a
    fixed length: a source word may produce several sub-tokens or none at all,
    and truncation may drop trailing words, so slicing the label list does not
    guarantee that ``labels`` and ``input_ids`` line up.
    """
    # The text is already split into characters, so the tokenizer is told to
    # treat every list element as one word (is_split_into_words=True).
    input_data = tokenizer(item['tokens'],
                           truncation=True,
                           add_special_tokens=False,
                           max_length=512,
                           is_split_into_words=True)

    labels = []
    for batch_index, word_labels in enumerate(item['ner_tags']):
        aligned = []
        previous_word = None
        for word_id in input_data.word_ids(batch_index=batch_index):
            if word_id is None:
                # Token without a source word (e.g. a special token): ignored by the loss.
                aligned.append(-100)
            else:
                label = word_labels[word_id]
                # Continuation piece of the same word: a B- tag (odd id in the
                # B-/I- alternating tag layout built above) becomes the matching I- tag.
                if word_id == previous_word and label % 2 == 1:
                    label += 1
                aligned.append(label)
            previous_word = word_id
        labels.append(aligned)

    input_data['labels'] = labels
    return input_data
    
# Run the preprocessing function over the dataset in batched mode, so that
# data_input_proc receives lists of sentences rather than single examples.
ds1 = ds.map(data_input_proc, batched = True)
print(ds1)

In [None]:
# Expose only the model inputs as torch tensors, then show one formatted example.
MODEL_COLUMNS = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
ds1.set_format(type='torch', columns=MODEL_COLUMNS)
first_formatted = next(iter(ds1['train']), None)
if first_formatted is not None:
    print(first_formatted)

In [None]:
# Model setup: id <-> tag lookup tables plus a token-classification head
# sized to the tag vocabulary.
id2lbl = dict(enumerate(tags))
lbl2id = {tag_name: tag_id for tag_id, tag_name in id2lbl.items()}

model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=len(tags),
    id2label=id2lbl,
    label2id=lbl2id,
)
model

In [None]:
# Training hyper-parameters, collected in one mapping and handed to TrainingArguments.
training_options = dict(
    output_dir='msr_ner_train',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    report_to='tensorboard',
    eval_strategy='epoch',
)
args = TrainingArguments(**training_options)

In [None]:
def compute_metric(result):
    """Compute seqeval metrics for one evaluation pass.

    Args:
        result: a pair ``(predictions, labels)``. ``predictions`` holds per-token
            scores with the label dimension on axis 2 (argmax is taken over that
            axis); ``labels`` holds the reference label ids, where ``-100`` marks
            positions to ignore.

    Returns:
        The dictionary returned by ``seqeval.compute``.
    """
    # Load the metric implementation.
    seqeval = evaluate.load('seqeval')
    logits, label_ids = result
    predicted_ids = np.argmax(logits, axis=2)

    # Convert ids to tag strings, dropping every position whose reference label is -100.
    predictions = [
        [tags[p] for p, l in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(predicted_ids, label_ids)
    ]
    # The references are built from the raw label ids only. Previously they were
    # zipped against the already-filtered prediction lists, which truncated the
    # label rows and produced wrong references whenever a -100 was not trailing.
    references = [
        [tags[l] for l in label_row if l != -100]
        for label_row in label_ids
    ]
    return seqeval.compute(predictions=predictions, references=references)

In [None]:
# Collator used by the Trainer to assemble batches; built around the same tokenizer
# as the preprocessing step, with padding enabled.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding = True)

In [None]:
# Wire model, hyper-parameters, data splits, collator and metric function together.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=ds1['train'],
    eval_dataset=ds1['test'],
    compute_metrics=compute_metric,
)

In [None]:
# Start fine-tuning with the settings in `args`; output goes to the configured output_dir.
trainer.train()

In [None]:
from transformers import pipeline

# Checkpoint written during training. The step number in the directory name
# depends on dataset size, batch size and device count, so adjust it to the
# checkpoint that actually exists under the output directory.
CHECKPOINT_DIR = 'msr_ner_train/checkpoint-2112'

# Outside-of-entity labels to drop from the output: the letter 'O' is the usual
# outside tag, the digit '0' is kept for checkpoints trained with that label name.
OUTSIDE_LABELS = {'0', 'O'}

# The instance gets its own name so the imported `pipeline` factory is not shadowed.
# The tokenizer is passed explicitly: the Trainer above was not given the tokenizer,
# so the checkpoint directory may not contain tokenizer files.
ner_pipeline = pipeline('token-classification', model=CHECKPOINT_DIR, tokenizer=tokenizer)
words_result = ner_pipeline('双方确定了今后发展中美关系的指导方针')

# Keep only tokens that were tagged as part of an entity.
entity_result = [result for result in words_result if result['entity'] not in OUTSIDE_LABELS]
print(entity_result)