# 数据预处理

## 数据加载

In [61]:
import pandas as pd

In [62]:
with open('D:/model/web/nlp02/data/train.txt', 'r', encoding='utf-8') as f:
    txt = f.read().split('\n\n')
    # 先使用\n\n划分出每个地址，然后在每个地址里再划分每个字

In [63]:
# txt

In [64]:
train = pd.DataFrame([[t.split()[::2], t.split()[1::2]] for t in txt])
train.columns = ['loc', 'label']

In [65]:
train.head()

Unnamed: 0,loc,label
0,"[浙, 江, 杭, 州, 市, 江, 干, 区, 九, 堡, 镇, 三, 村, 村, 一, 区]","[B-prov, E-prov, B-city, I-city, E-city, B-dis..."
1,"[浙, 江, 省, 温, 州, 市, 平, 阳, 县, 海, 西, 镇, 宋, 埠, 公, ...","[B-prov, I-prov, E-prov, B-city, I-city, E-cit..."
2,"[浙, 江, 省, 余, 姚, 市, 模, 具, 城, 金, 型, 路, 0, 0, 0, ...","[B-prov, I-prov, E-prov, B-district, I-distric..."
3,"[浙, 江, 省, 杭, 州, 市, 江, 干, 区, 白, 杨, 街, 道, 下, 沙, ...","[B-prov, I-prov, E-prov, B-city, I-city, E-cit..."
4,"[秋, 菱, 路, 浙, 江, 兰, 溪, 金, 立, 达, 框, 业, 有, 限, 公, 司]","[B-road, I-road, E-road, B-poi, I-poi, I-poi, ..."


In [66]:
with open('D:/model/web/nlp02/data/dev.txt', 'r', encoding='utf-8') as f:
    txt = f.read().split('\n\n')
dev = pd.DataFrame([[t.split()[::2], t.split()[1::2]] for t in txt[:-1]])
dev.columns = ['loc', 'label']

In [67]:
dev.head()

Unnamed: 0,loc,label
0,"[杭, 州, 五, 洲, 国, 际]","[B-city, E-city, B-poi, I-poi, I-poi, E-poi]"
1,"[浙, 江, 省, 杭, 州, 市, 余, 杭, 乔, 司, 街, 道, 博, 卡, 路, ...","[B-prov, I-prov, E-prov, B-city, I-city, E-cit..."
2,"[浙, 江, 诸, 暨, 市, 暨, 阳, 八, 一, 新, 村, 0, 0, 幢]","[B-prov, E-prov, B-district, I-district, E-dis..."
3,"[杭, 州, 市, 武, 林, 广, 场, 杭, 州, 大, 厦, 商, 城, A, 座, ...","[B-city, I-city, E-city, B-poi, I-poi, I-poi, ..."
4,"[浙, 江, 省, 杭, 州, 市, 拱, 墅, 区, 登, 云, 路, 0, 0, 0, ...","[B-prov, I-prov, E-prov, B-city, I-city, E-cit..."


## 数据准备

In [68]:
# 加载分词器
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('D:/model/web/nlp02/bert-base-chinese/')

In [69]:
# 简单的示例
test = '浙 江 省 温 州 市 平 阳 县 海 西 镇 宋 埠 公 园 南 路 0 0 0 0 号' 
# '浙江省温州市平阳县海西镇宋埠公园南路0000号' # ''.join(train[0][0])
print(tokenizer.tokenize(test))

['浙', '江', '省', '温', '州', '市', '平', '阳', '县', '海', '西', '镇', '宋', '埠', '公', '园', '南', '路', '0', '0', '0', '0', '号']


In [70]:
tokenizer(test)

{'input_ids': [101, 3851, 3736, 4689, 3946, 2336, 2356, 2398, 7345, 1344, 3862, 6205, 7252, 2129, 1819, 1062, 1736, 1298, 6662, 121, 121, 121, 121, 1384, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [71]:
print(tokenizer.decode(tokenizer(test)['input_ids']))

[CLS] 浙 江 省 温 州 市 平 阳 县 海 西 镇 宋 埠 公 园 南 路 0 0 0 0 号 [SEP]


In [72]:
# 查看标签数量
from tkinter import _flatten
total_labels = set(_flatten(list(train['label'])+ list(dev['label'])))

In [73]:
len(total_labels) 

57

In [74]:
label2id = {j:i for i, j in enumerate(total_labels)}
label2id['<UNK>'] = len(label2id)  # 验证集中存在训练集没有出现的标识
id2label = dict(zip(label2id.values(), label2id.keys()))

In [75]:
total_labels 

{'B-assist',
 'B-cellno',
 'B-city',
 'B-community',
 'B-devzone',
 'B-distance',
 'B-district',
 'B-floorno',
 'B-houseno',
 'B-intersection',
 'B-poi',
 'B-prov',
 'B-road',
 'B-roadno',
 'B-subpoi',
 'B-town',
 'B-village_group',
 'E-assist',
 'E-cellno',
 'E-city',
 'E-community',
 'E-devzone',
 'E-distance',
 'E-district',
 'E-floorno',
 'E-houseno',
 'E-intersection',
 'E-poi',
 'E-prov',
 'E-road',
 'E-roadno',
 'E-subpoi',
 'E-town',
 'E-village_group',
 'I-assist',
 'I-cellno',
 'I-city',
 'I-community',
 'I-devzone',
 'I-distance',
 'I-district',
 'I-floorno',
 'I-houseno',
 'I-intersection',
 'I-poi',
 'I-prov',
 'I-road',
 'I-roadno',
 'I-subpoi',
 'I-town',
 'I-village_group',
 'O',
 'S-assist',
 'S-community',
 'S-district',
 'S-intersection',
 'S-poi'}

In [76]:
# 数据异常检测，验证集中存在一个训练集里没有的标签
a = set(_flatten(list(train['label'])))
b = set(_flatten(list(dev['label'])))
print(len(a), len(b), len(a.union(b)), len(a.difference(b)), a.difference(b))

55 56 57 1 {'S-poi'}


## 转化数据为id向量

In [77]:
def get_data(sentences, labels):
    result = tokenizer(sentences, is_split_into_words=True)
    result['labels'] = [[label2id.get(i, label2id['<UNK>']) for i in label] for label in labels]
    return result

In [78]:
train_inputs = get_data(list(train['loc']), train['label'])
dev_inputs = get_data(list(dev['loc']), dev['label'])

## 封装成数据集类

In [79]:
from datasets import Dataset
train_ds = Dataset.from_dict(train_inputs)
dev_ds = Dataset.from_dict(dev_inputs)

In [80]:
train_ds

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 8856
})

# 模型加载

最常见的token级别分类任务:

- NER (Named-entity recognition 名词-实体识别) 分辨出文本中的名词和实体 (person人名, organization组织机构名, location地点名...).
- POS (Part-of-speech tagging词性标注) 根据语法对token进行词性标注 (noun名词, verb动词, adjective形容词...)
- Chunk (Chunking短语组块) 将同一个短语的tokens组块放在一起。

对于以上任务，我们将展示如何使用简单的加载数据集，同时针对相应的任务使用transformer中的`Trainer`接口对模型进行微调。

只要预训练的transformer模型最顶层有一个token分类的神经网络层（由于transformer的tokenizer新特性，还需要对应的预训练模型有fast tokenizer，参考[这个表](https://huggingface.co/transformers/index.html#bigtable)），那么本notebook理论上可以使用各种各样的transformer模型（[模型面板](https://huggingface.co/models)），解决任何token级别的分类任务。

In [81]:
task = "ner" #需要是"ner", "pos" 或者 "chunk"
model_checkpoint = "D:/model/web/nlp02/bert-base-chinese/"
batch_size = 32

In [82]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [83]:
num_labels = len(total_labels)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at D:/model/web/nlp02/bert-base-chinese/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

## 模型微调

In [85]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"D:/model/web/nlp02/test-{task}",
    eval_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
)

In [86]:
from transformers import DataCollatorForTokenClassification # 数据收集器

data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding='longest', # 默认，按照最大的进行填充
    label_pad_token_id=-100,
)

In [87]:
from evaluate import load
# pip install seqeval
metric = load("D:/model/web/nlp02/seqeval")

In [88]:
# 评估指标例子
predictions = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
references = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
metric.compute(predictions=predictions, references=references)
# 没去细究这些指标，但是总体的p,r,f1,acc还是知道的

{'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 0.5,
 'overall_f1': 0.5,
 'overall_accuracy': 0.8}

In [89]:
# 处理预测结果
# 1.选择预测分类最大概率的下标 2.将下标转化为label 3.忽略-100即padding所在的地方
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # 去掉特殊tokens的下标
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }  # 我们计算所有类别总的precision/recall/f1，所以会扔掉单个类别的precision/recall/f1 

In [90]:
# 微调
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [91]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.313058,0.8207,0.862864,0.841254,0.913028
2,0.496300,0.266648,0.865612,0.896339,0.880708,0.926529
3,0.496300,0.281541,0.869766,0.896946,0.883147,0.928035
4,0.175100,0.285197,0.880884,0.90271,0.891664,0.931501
5,0.175100,0.280257,0.884987,0.907362,0.896035,0.934816
6,0.124400,0.295663,0.885293,0.904632,0.894858,0.932917
7,0.124400,0.314006,0.882673,0.903115,0.892777,0.931531
8,0.092800,0.318059,0.886716,0.907969,0.897217,0.933309
9,0.092800,0.338088,0.888801,0.906958,0.897788,0.931802
10,0.073300,0.346988,0.886792,0.903115,0.894879,0.930808


TrainOutput(global_step=2770, training_loss=0.18008959319186985, metrics={'train_runtime': 494.2504, 'train_samples_per_second': 179.18, 'train_steps_per_second': 5.604, 'total_flos': 1550149964384112.0, 'train_loss': 0.18008959319186985, 'epoch': 10.0})

# 模型推理

In [92]:
model_path = 'D:/model/web/nlp02/test-ner'

In [93]:
model_test = AutoModelForTokenClassification.from_pretrained(model_path + '/checkpoint-2770/')
tokenizer_test = AutoTokenizer.from_pretrained(model_path + '/checkpoint-2770/')

In [94]:
test_sample = '朝阳区小关北里000-0号'
test_inputs = tokenizer_test(list(test_sample), is_split_into_words=True)
print(test_inputs, len(test_inputs['input_ids']))

{'input_ids': [101, 3308, 7345, 1277, 2207, 1068, 1266, 7027, 121, 121, 121, 118, 121, 1384, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 15


In [95]:
tokenizer_test.decode(test_inputs['input_ids'])

'[CLS] 朝 阳 区 小 关 北 里 0 0 0 - 0 号 [SEP]'

In [96]:
import torch
model.to('cpu')
inputs = tokenizer_test(list(test_sample), is_split_into_words=True, return_tensors="pt")
with torch.no_grad():
    logits = model_test(**inputs).logits
predictions = torch.argmax(logits, dim=2)
predicted_token_class = [id2label[t.item()] for t in predictions[0]]
print(len(predicted_token_class), predicted_token_class)

15 ['B-district', 'I-district', 'E-district', 'B-road', 'I-road', 'I-road', 'E-road', 'B-roadno', 'I-roadno', 'I-roadno', 'I-roadno', 'I-roadno', 'E-roadno', 'O', 'I-houseno']


In [100]:
# 根据预测结果进行数据划分（predicted_token_class, test_sample）
res = []
lab = []
for i, char in enumerate(test_sample):
    sign = predicted_token_class[i]
    if 'B' in sign:
        word = char
    elif 'E' in sign:
        word = word + char
        res.append(word)
        lab.append(sign.split('-')[-1])
    elif 'I' in sign:
        word = word + char
    elif 'S' in sign:
        res.append(char)
        lab.append(sign.split('-')[-1])
        
print(test_sample, '\n',
      res, '\n',
      lab)

朝阳区小关北里000-0号 
 ['朝阳区', '小关北里', '000-0号'] 
 ['district', 'road', 'roadno']


## 具体化输出结果，把模型推理封装成函数 

In [101]:
lab2zh = {'prov': '省级行政区', 'city': '地级行政区', 'district': '县级行政区', 'devzone': '广义的上的开发区', 
 'town': '乡级行政区', 'community': '村/社区', 
 'village_group': '组/队/社', 'road': '道路', 'roadno': '路号', 'poi': '兴趣点', 'subpoi': '子兴趣点', 'houseno': '楼栋号', 
 'cellno': '单元号', 'floorno': '楼层号', 'roomno': '房间号/户号', 
 'detail': 'poi内部的四层关系（house,cell,floor,room）没明确是哪一层，如 xx-xx-x-x，则整体标注 detail', 
 'assist': '普通辅助定位词', 'distance': '距离辅助定位词', 
 'intersection': '道路口，口、交叉口、道路（高速）出入口，一定与 road 同时出现，注意：小区出入口和车库出入口为 poi，“与”“和”两条路中间的修饰词为 redundant', 
 'redundant': '非地址元素', 'O': '以上标签未覆盖的情况', '<UNK>': '未知', }

In [102]:
def get_spilt_location(test_sample):
    inputs = tokenizer_test(list(test_sample), is_split_into_words=True, return_tensors="pt")
    with torch.no_grad():
        logits = model_test(**inputs).logits
    predictions = torch.argmax(logits, dim=2)
    predicted_token_class = [id2label[t.item()] for t in predictions[0]]
#     print(predicted_token_class)
    res = []
    for i in range(len(test_sample)):
        sign, char = predicted_token_class[i],  test_sample[i]
        if 'B' in sign:
            word = char
        elif 'E' in sign:
            word = word + char
            res.append([lab2zh[sign.split('-')[-1]], word])
        elif 'I' in sign:
            word = word + char
        elif 'S' in sign:
            res.append([lab2zh[sign.split('-')[-1]], char])
        elif '<UNK>' == sign:
            pass
        elif 'O' == sign:
            if predicted_token_class[i-1] != 'O':
                word = char
            else:
                word = word + char
            if (predicted_token_class[i+1] != 'O') or (i == len(test_sample)-1):
                res.append([lab2zh['O'], word])
    return res

In [103]:
get_spilt_location('北京市丰台区草桥欣园四区0号楼底商一层')

[['地级行政区', '北京市'],
 ['县级行政区', '丰台区'],
 ['兴趣点', '草桥欣园'],
 ['子兴趣点', '四区'],
 ['楼栋号', '0号楼'],
 ['以上标签未覆盖的情况', '底商'],
 ['楼层号', '一层']]

In [104]:
get_spilt_location('北京市朝阳区广渠路00号院甲000（珠江帝景东北角底商）')

[['地级行政区', '北京市'],
 ['县级行政区', '朝阳区'],
 ['道路', '广渠路'],
 ['兴趣点', '00号院'],
 ['楼层号', '甲000'],
 ['以上标签未覆盖的情况', '（'],
 ['兴趣点', '珠江帝景'],
 ['普通辅助定位词', '东北角'],
 ['以上标签未覆盖的情况', '商）']]