In [1]:

from paddlenlp.datasets import load_dataset
train_ds,dev_ds,test_ds =load_dataset(
    'msra_ner',splits=('train','test','test'),lazy=False
)

In [2]:
label_vocab={label:ind for ind,label in enumerate(train_ds.label_list)}

In [3]:
train_ds.label_list

['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O']

In [4]:
label_vocab

{'B-PER': 0,
 'I-PER': 1,
 'B-ORG': 2,
 'I-ORG': 3,
 'B-LOC': 4,
 'I-LOC': 5,
 'O': 6}

In [5]:
words=set()
word_vocab=[]
for item in train_ds:
    # print(item)
    word_vocab+=item['tokens']
print(len(list(word_vocab)),len(set(word_vocab)))
word_vocab={k:v+2 for v,k in enumerate(set(word_vocab))}
word_vocab['PAD']=0
word_vocab['OOV']=1

2171516 4790


In [6]:
len(train_ds),len(list(word_vocab))

(45000, 4792)

In [7]:
def convert_tokens_to_ids(tokens,vocab,oov_token='OOV'):
    token_ids=[]
    oov_id=vocab.get(oov_token) if oov_token else None
    for token in tokens:
        token_id=vocab.get(token,oov_id)
        token_ids.append(token_id)
    return token_ids

def convert_example(example):
    tokens,labels=example['tokens'],example['labels']
    token_ids=convert_tokens_to_ids(tokens,word_vocab,"OOV")
    label_ids=labels
    return token_ids,len(token_ids),label_ids



In [8]:
train_ds.map(convert_example)
dev_ds.map(convert_example)
test_ds.map(convert_example)

<paddlenlp.datasets.dataset.MapDataset at 0x7f14b04d8c90>

In [9]:
list(train_ds)[:10]

[([3079,
   1127,
   209,
   1133,
   2737,
   851,
   667,
   4212,
   4608,
   2800,
   4596,
   1696,
   3035,
   4359,
   104,
   775,
   2075,
   1281,
   672,
   192,
   3583,
   693,
   2529,
   3035,
   3869,
   1375,
   2075,
   737,
   457,
   4103,
   1249,
   15,
   2113,
   1742,
   4212,
   4305,
   571,
   1729,
   812,
   2075,
   709,
   1787,
   2236,
   2110,
   571,
   522,
   1118,
   3079,
   3498,
   3458],
  50,
  [6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6,
   6]),
 ([15,
   4305,
   881,
   775,
   2236,
   4764,
   4106,
   4103,
   2912,
   3905,
   1249,
   15,
   2185,
   3760,
   2622,
   4212,
   3564,
   4646,
   3689,
   2044,
   2075,
   533,
   4764,
   963,
   3193,
   2242,
   1882,
   4253,
   618,
   421

In [10]:
import paddle
from paddlenlp.data import Tuple,Stack,Pad


batchify_fn=lambda samples,fn=Tuple(
    Pad(axis=0,pad_val=word_vocab.get('OOV')),
    Stack(),
    Pad(axis=0,pad_val=label_vocab.get("O"))
):fn(samples)


train_loader=paddle.io.DataLoader(
    dataset=train_ds,batch_size=32,shuffle=True,drop_last=True,collate_fn=batchify_fn
)

dev_loader=paddle.io.DataLoader(
    dataset=dev_ds,batch_size=32,shuffle=True,drop_last=True,collate_fn=batchify_fn
)

test_loader=paddle.io.DataLoader(
    dataset=test_ds,batch_size=32,shuffle=True,drop_last=True,collate_fn=batchify_fn
)

In [11]:
import paddle.nn as nn
from paddlenlp.layers import LinearChainCrf,ViterbiDecoder,LinearChainCrfLoss
from paddlenlp.metrics import ChunkEvaluator
class BiLSTMWithCRF(nn.Layer):
    def __init__(self,emb_size,hidden_size,word_num,label_num,use_w2v_emb=False):
        super(BiLSTMWithCRF,self).__init__()
        self.word_emb=nn.Embedding(word_num,emb_size)
        self.lstm=nn.LSTM(emb_size,hidden_size,num_layers=2,direction='bidirectional')
        self.fc=nn.Linear(hidden_size*2,label_num+2) #BOS EOS
        self.crf=LinearChainCrf(label_num)
        self.decoder=ViterbiDecoder(self.crf.transitions)
    
    def forward(self,x,lens):
        embs=self.word_emb(x)
        output,_=self.lstm(embs)
        output=self.fc(output)
        _,pred=self.decoder(output,lens)

        return output,lens,pred


# 训练

In [12]:
network=BiLSTMWithCRF(300,300,len(word_vocab),len(label_vocab))

W0801 06:45:43.317390   870 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 10.1
W0801 06:45:43.323509   870 gpu_resources.cc:91] device: 0, cuDNN Version: 7.6.


In [13]:
model=paddle.Model(network)

In [14]:
optimizer=paddle.optimizer.Adam(learning_rate=0.001,parameters=model.parameters())
crf_loss=LinearChainCrfLoss(network.crf)
chunk_evaluator=ChunkEvaluator(label_list=label_vocab.keys(),suffix=True)


In [15]:
model.prepare(optimizer,crf_loss,chunk_evaluator)
model.fit(train_data=train_loader,eval_data=dev_loader,epochs=10,save_dir='./results',log_freq=100)

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/10




step  100/1406 - loss: 15.6692 - precision: 0.3257 - recall: 0.1225 - f1: 0.1780 - 439ms/step
step  200/1406 - loss: 42.9053 - precision: 0.5106 - recall: 0.3059 - f1: 0.3826 - 425ms/step
step  300/1406 - loss: 4.7607 - precision: 0.5801 - recall: 0.4049 - f1: 0.4769 - 440ms/step
step  400/1406 - loss: 0.0000e+00 - precision: 0.6189 - recall: 0.4667 - f1: 0.5321 - 469ms/step
step  500/1406 - loss: 0.0828 - precision: 0.6484 - recall: 0.5142 - f1: 0.5735 - 472ms/step
step  600/1406 - loss: 0.0000e+00 - precision: 0.6709 - recall: 0.5494 - f1: 0.6041 - 470ms/step
step  700/1406 - loss: 2.4946 - precision: 0.6878 - recall: 0.5760 - f1: 0.6270 - 472ms/step
step  800/1406 - loss: 0.0000e+00 - precision: 0.7020 - recall: 0.5986 - f1: 0.6462 - 470ms/step
step  900/1406 - loss: 0.2968 - precision: 0.7146 - recall: 0.6188 - f1: 0.6633 - 475ms/step
step 1000/1406 - loss: 1.9320 - precision: 0.7236 - recall: 0.6351 - f1: 0.6765 - 479ms/step
step 1100/1406 - loss: 2.5171 - precision: 0.7336 - reca

# 恢复模型

In [16]:
# model=paddle.Model(network)

In [17]:
# import paddle
# optimizer=paddle.optimizer.Adam(learning_rate=0.001,parameters=model.parameters())
# crf_loss=LinearChainCrfLoss(network.crf)
# chunk_evaluator=ChunkEvaluator(label_list=label_vocab.keys(),suffix=True)

In [18]:
# model.load('results/final')

In [19]:
# model.prepare(optimizer,crf_loss,chunk_evaluator)

In [36]:
model.evaluate(eval_data=test_loader,log_freq=10)

Eval begin...
step  10/107 - loss: 0.0000e+00 - precision: 0.8016 - recall: 0.8741 - f1: 0.8363 - 469ms/step
step  20/107 - loss: 0.0000e+00 - precision: 0.8239 - recall: 0.8626 - f1: 0.8428 - 414ms/step
step  30/107 - loss: 0.0000e+00 - precision: 0.8222 - recall: 0.8229 - f1: 0.8226 - 576ms/step
step  40/107 - loss: 0.0000e+00 - precision: 0.8240 - recall: 0.8263 - f1: 0.8252 - 529ms/step
step  50/107 - loss: 0.0000e+00 - precision: 0.8281 - recall: 0.8307 - f1: 0.8294 - 532ms/step
step  60/107 - loss: 0.0000e+00 - precision: 0.8271 - recall: 0.8342 - f1: 0.8306 - 524ms/step
step  70/107 - loss: 0.3066 - precision: 0.8253 - recall: 0.8364 - f1: 0.8308 - 503ms/step
step  80/107 - loss: 0.0000e+00 - precision: 0.8274 - recall: 0.8382 - f1: 0.8328 - 494ms/step
step  90/107 - loss: 0.0000e+00 - precision: 0.8280 - recall: 0.8394 - f1: 0.8336 - 481ms/step
step 100/107 - loss: 0.0000e+00 - precision: 0.8292 - recall: 0.8411 - f1: 0.8351 - 468ms/step
step 107/107 - loss: 0.0000e+00 - precis

{'loss': [0.0],
 'precision': 0.8312115335975044,
 'recall': 0.8434425528274446,
 'f1': 0.8372823779193206}

# 预测

In [22]:
outputs,lens,decodes=model.predict(test_data=test_loader)

Predict begin...
Predict samples: 3424


In [25]:
# print(decodes)
# print(len(decodes))

In [30]:
def parse_decode(ds,decodes,lens,label_vocab):
    decodes=[x for batch in decodes for x in batch]
    lens=[x for batch in lens for x in batch]
    print(len(decodes),len(lens))
    id_label=dict(zip(label_vocab.values(),label_vocab.keys()))
    outputs=[]
    i=0
    for idx,end in enumerate(lens):
        sent=ds.data[idx]['tokens'][:end]
        tags=[id_label[x] for x in decodes[idx][:end]]
        sent_out=[]
        tags_out=[]
        words=""
        for s,t in zip(sent,tags):
            if t.startswith('B-') or t=="O":
                if len(words):
                    sent_out.append(words)
                tags_out.append(t.split('-')[-1])
                words=s
            else:
                words+=s

        if(len(sent_out)<len(tags_out)):
            sent_out.append(words)

        if len(sent_out)!=len(tags_out):
            print(len(sent_out),len(tags_out))
            continue

        cs=[str((s,t)) for s,t in zip(sent_out,tags_out)]
        ss=''.join(cs) 
        i+=1
        outputs.append(ss) 
    return outputs      



In [31]:
pred=parse_decode(test_ds,decodes,lens,label_vocab)


3424 3424


In [39]:
pred[2003]

"('有', 'O')('人', 'O')('总结其过', 'ORG')('程', 'O')('曰', 'O')('：', 'O')('“看', 'LOC')('不', 'O')('惯', 'O')('；', 'O')('边', 'O')('上站', 'LOC')('；', 'O')('试试', 'ORG')('看', 'O')('；死了', 'ORG')('算', 'O')('！', 'O')"