## 使用`AlBert`进行命名实体识别
***
***
Time: 2020-09-21
Author: dsy
***

模块库导入

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud # 自定义数据集 
import math
import pandas as pd
from model.torchcrf import CRF
import os
from model.albert_pytorch.modeling_albert import AlbertConfig, AlbertForPreTraining,AlbertModel

常量定义

In [2]:
max_seq_length = 202
BATCH_SIZE = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
config_file = "model/albert_pytorch/prev_trained_model/albert_base_v2/config.json"
config_file

'model/albert_pytorch/prev_trained_model/albert_base_v2/config.json'

In [3]:

# config = AlbertConfig.from_pretrained(config_file)
# # print("Building PyTorch model from configuration: {}".format(str(config)))
# model = AlbertForPreTraining(config)
# Load weights from tf checkpoint

# Save pytorch-model
# print("Save PyTorch model to {}".format(pytorch_dump_path))
# torch.save(model.state_dict(), pytorch_dump_path)

数据处理

In [4]:
# 创建vocab
def readVocab(filepath:str="data/vocab.txt"):
    vocab = []
    with open(filepath,"r",encoding="utf-8") as f:
        for line in f:
            vocab.append(line.strip())
            
    with open("data/train.txt","r",encoding="utf-8") as fp:
        for line in fp:
            if '' != line.strip():
                vocab.append(line.strip().split(" ")[0])
            
    vocab = set(vocab)        
    vocab2id = { data:1+i for i,data in enumerate(vocab)}
    id2vocab = {1+i:data for i,data in enumerate(vocab)}
    return (vocab,vocab2id,id2vocab,len(vocab))

In [5]:
def handle(filepath:str="data/train.txt"):
    words = []
    labels = []
    vocab = []
    label_list = set()
    label_list.add("[CLS]")
    label_list.add("[SEP]")
    with open(filepath,"r",encoding="utf-8") as fp:
        word = []
        label = []
        for index,line in enumerate( fp):
            
            if '' == line.strip():
                words.append(word)
                labels.append(label)
#                 print("words:\n",pd.DataFrame(words))
#                 print("labels:\n",pd.DataFrame(labels))
                word = []
                label = []
            else:
                linesplit = line.strip().split(" ")
                word.append(linesplit[0])
                label.append(linesplit[1])
                label_list.add(linesplit[1])
                
    label2id = {data:i+1 for i,data in enumerate(label_list)}
    id2label = {1+i:data for i,data in enumerate(label_list)}
    
    input_ids = []
    for i in words:
        input_id = []
    #     break
        for j in i:
            input_id.append(vocab2id[j])
    #         break

        if len(input_id) > ( max_seq_length - 2):
            input_id = input_id[:max_seq_length - 2]
        input_id.insert(0,vocab2id["[CLS]"])   
        while len(input_id) < (max_seq_length - 1):
            input_id.append(0)
        input_id.append(vocab2id["[SEP]"])

        input_ids.append(input_id)
        
    label_ids = []
    for i in labels:
        label_id = []
    #     break
        for j in i:
            label_id.append(label2id[j])
    #         break

        if len(label_id) > ( max_seq_length - 2):
            label_id = label_id[:max_seq_length - 2]
        label_id.insert(0,label2id["[CLS]"])   
        while len(label_id) < (max_seq_length - 1):
            label_id.append(0)
        label_id.append(label2id["[SEP]"])

        label_ids.append(label_id)

    return (words,labels,label_list,label2id,id2label,len(label_list),input_ids,label_ids)

In [6]:
(vocab,vocab2id,id2vocab,vocab_size)= readVocab()

In [7]:
words,labels,label_list,label2id,id2label,label_size,input_ids,label_ids = handle()

In [8]:
input_ids = torch.Tensor(input_ids).long()
label_ids = torch.Tensor(label_ids).long()

自定义数据集

In [9]:
class BertNerDataset(tud.Dataset): 
    def __init__(self,input_ids,label_ids): 
        super(BertNerDataset,self).__init__() 
        self.label_ids = label_ids
        self.input_ids = input_ids
       
    def __len__(self): 
        return len(self.input_ids) 
    def __getitem__(self, index): 
        return (self.input_ids[index,:],self.label_ids[index,:])

dataset = BertNerDataset(input_ids[:8000],label_ids[:8000]) 
dataloader = tud.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True,num_workers=0)

In [10]:
class AlBertCRFNer(nn.Module):
    '''
    AlBert + CRF
    '''
    def __init__(self,vocab_size,config_file,num_tags=768):
        
        super(AlBertCRFNer,self).__init__()
        config = AlbertConfig.from_pretrained(config_file)
        self.albert = AlbertModel(config)
        self.crf = CRF(num_tags)

    def forward(self,x,tags=None):
        y,_= self.albert(x) # 8,202,768
        if tags is None:
            output = y.permute(1,0,2)
            return self.crf.decode(output) # 8 202
        else:
            return -self.crf(y,tags,reduction='mean') # (seq_length, batch_size, num_tags)   (seq_length, batch_size)--> (batch_size,).sum() 

In [11]:
class AlBertBiLSTMCRFNer(nn.Module):
    '''
    AlBert + BiLSTM + CRF
    '''
    def __init__(self,vocab_size,config_file,num_tags=768):
        super(AlBertBiLSTMCRFNer,self).__init__()
        config = AlbertConfig.from_pretrained(config_file)
        self.albert = AlbertModel(config)
        self.bilstm = nn.LSTM(input_size = 768,hidden_size =768 //2, bidirectional =True)
        self.crf = CRF(num_tags)
        
    def forward(self,x,tags=None):
        y,_=  self.albert(x)
        output,_ = self.bilstm(y) # 8 202 768
        
        if tags is None:
            output = output.permute(1,0,2)
            return self.crf.decode(output) # 8 202
        else:
            return -self.crf(output,tags,reduction='mean') # (seq_length, batch_size, num_tags)   (seq_length, batch_size)--> (batch_size,).sum()


In [12]:
class AlBertBiGRUCRFNer(nn.Module):
    '''
    AlBert + BiGRU + CRF
    '''
    def __init__(self,vocab_size,config_file,num_tags=768):
        super(AlBertBiGRUCRFNer,self).__init__()
        config = AlbertConfig.from_pretrained(config_file)
        self.albert = AlbertModel(config)
        self.bigru = nn.GRU(input_size=768,hidden_size=768//2,bidirectional=True)
        self.crf = CRF(num_tags)

    def forward(self,x,tags=None):
            
        y,_ = self.albert(x)
        output,_ = self.bigru(y)  
        
        if tags is None:
            output = output.permute(1,0,2) 
            return self.crf.decode(output)
        else:
            return -self.crf(output,tags,reduction='mean')


In [13]:
# 自注意模型
from  torch.nn.parameter import Parameter
class SelfAttention(nn.Module):
    def __init__(self,embed_dim):
        super(SelfAttention,self).__init__()
        self.embed_dim = embed_dim
        self.selfattention = nn.MultiheadAttention(embed_dim, num_heads=1, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None)
        
    def forward(self,x):
        L,N,E = x.shape
        W1 = Parameter(torch.empty((L,N,N)))
        W2 = Parameter(torch.empty((L,N,N)))
        W3 = Parameter(torch.empty((L,N,N)))
        
        std = 1./ math.sqrt(self.embed_dim)
        
        nn.init.uniform_(W1,-std,std)
        nn.init.uniform_(W2,-std,std)
        nn.init.uniform_(W3,-std,std)
        
        query = W1.matmul(x) # (L, N, E)
        key = W2.matmul(x) # (S,N,E)
        value = W3.matmul(x) # (S,N,E)
        attn_output,_ = self.selfattention(query, key, value) # (L,N,E)
        return attn_output
    
class AlBertBiGRUSelfAttentionCRFNer(nn.Module):
    '''
    AlBert + BiGRU + self-attention + CRF
    '''
    def __init__(self,vocab_size,config_file,embed_dim=768,num_tags=768):
        super(AlBertBiGRUSelfAttentionCRFNer,self).__init__()
        config = AlbertConfig.from_pretrained(config_file)
        self.albert = AlbertModel(config)
        self.bigru = nn.GRU(input_size=768,hidden_size=768//2,bidirectional=True)
        self.selfattention = SelfAttention(embed_dim)
        self.crf = CRF(num_tags)

    def forward(self,x,tags=None):
        y = self.albert(x)
        output,_ = self.bigru(y)
        output = self.selfattention(output) # 8 202 768
        if tags is None:
            output = output.permute(1,0,2) 
            return self.crf.decode(output)
        else:
            return -self.crf(output,tags,reduction='mean')


模型训练

In [14]:
EPOCH = 1

In [15]:
nameList = [
    'AlBert-CRF',
    'AlBert-BiLSTM-CRF',
    'AlBert-BiGRU-CRF', 
    'AlBert-BiGRU-selfattentin-CRF'
]
modelList = [
    AlBertCRFNer(vocab_size,config_file) ,
    AlBertBiLSTMCRFNer(vocab_size,config_file),
    AlBertBiGRUCRFNer(vocab_size,config_file),
    AlBertBiGRUSelfAttentionCRFNer(vocab_size,config_file)
]

In [None]:
for index,model in enumerate(modelList):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.05)   # optimize all parameters
    lossDataFrame = pd.DataFrame()
# loss_func = nn.CrossEntropyLoss()   # the target label is not one-hotted
    for epoch in range(EPOCH):
        for step,(input_ids,label_ids) in enumerate(dataloader):
            #x,tags=None,segment_ids=None
    #         y_pred = bertcrfner(input_ids,label_ids,torch.zeros(input_ids.size()).long())
    #         print("y_pred",y_pred.shape)
    #         print("label_ids",label_ids.shape)
    #         loss = loss_func(y_pred,label_ids)

            loss = model(input_ids,label_ids)
            if 0 == (step+1) % 10 :
                lossDataFrame.loc[(epoch + 1) * ((step + 1)//10),nameList[index]] = loss.item()
                print("model name:",nameList[index],"epoch:",epoch,"step:",step+1,",loss:",loss.item())

            optimizer.zero_grad()           # clear gradients for this training step
            loss.backward()                 # backpropagation, compute gradients
            optimizer.step()                # apply gradient
#             break
#     break
    torch.save(model.state_dict(), nameList[index]+".pt")

    lossDataFrame.to_csv(nameList[index]+"_loss_of_ner.csv",index=False)

model name: AlBert-CRF epoch: 0 step: 10 ,loss: 29.66684913635254
model name: AlBert-CRF epoch: 0 step: 20 ,loss: 4.569178581237793
model name: AlBert-CRF epoch: 0 step: 30 ,loss: 5.719108581542969
model name: AlBert-CRF epoch: 0 step: 40 ,loss: 4.686387062072754
model name: AlBert-CRF epoch: 0 step: 50 ,loss: 3.9020111560821533
model name: AlBert-CRF epoch: 0 step: 60 ,loss: 4.2586140632629395
model name: AlBert-CRF epoch: 0 step: 70 ,loss: 4.523194313049316
model name: AlBert-CRF epoch: 0 step: 80 ,loss: 2.929595947265625
model name: AlBert-CRF epoch: 0 step: 90 ,loss: 3.9714808464050293
model name: AlBert-CRF epoch: 0 step: 100 ,loss: 6.160427570343018
model name: AlBert-CRF epoch: 0 step: 110 ,loss: 5.508391857147217
model name: AlBert-CRF epoch: 0 step: 120 ,loss: 4.375628471374512
model name: AlBert-CRF epoch: 0 step: 130 ,loss: 3.947162389755249
model name: AlBert-CRF epoch: 0 step: 140 ,loss: 3.766493320465088
model name: AlBert-CRF epoch: 0 step: 150 ,loss: 3.021801233291626
m