# Reference, install and import
* bilstm_crf: [here](https://pypi.org/project/bi-lstm-crf/)
* Before BERT, LSTM+CRF was the usual architecture for sequence labeling
* After BERT, BERT+Linear (BertForTokenClassification) is usually enough
* BERT+LSTM+CRF does not seem to be any more powerful, so do not add an LSTM layer!
* Sometimes BERT+CRF or BERT+LSTM+CRF is still OK
* A detailed comparison and discussion is [here](https://posts.careerengine.us/p/60763f6009edcc27e3dfd854)


In [None]:
!pip install pytorch-crf
!pip install datasets
!pip install transformers
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torchcrf import CRF
# Shared cased BERT tokenizer; read_original and create_padding both call it.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [75]:
#for reading original ids and labels
def read_original(x):
  """Load one CoNLL-2003 split and prepare per-sentence inputs and labels.

  Args:
    x: split name forwarded to ``load_dataset`` (the notebook passes
      'train', 'validation' and 'test').

  Returns:
    A tuple ``(data, text, ids, labels)`` where
      data   -- the dataset object returned by ``load_dataset``;
      text   -- one string per sentence, its words joined by single spaces;
      ids    -- one ``input_ids`` tensor per sentence, exactly as returned by
                the tokenizer with ``return_tensors="pt"``;
      labels -- one tensor of ``ner_tags`` per sentence (one tag per word).
  """
  data = load_dataset('conll2003', split=x)
  # Materialise the split once. The former data[0:-1] slice silently dropped
  # the last sentence of every split and was evaluated twice.
  columns = data[:]
  text = [' '.join(words) for words in columns['tokens']]
  input_ids = [tokenizer(sentence, return_tensors="pt")['input_ids'] for sentence in text]
  labels = [torch.tensor(tags) for tags in columns['ner_tags']]
  return data, text, input_ids, labels

# Load each split once; every call yields (dataset, sentences, token ids, word-level labels).
train_data, train_text, train_data_id, train_labels = read_original('train')
val_data, val_text, val_data_id, val_labels = read_original('validation')
test_data, test_text, test_data_id, test_labels = read_original('test')

Reusing dataset conll2003 (/root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee)
Reusing dataset conll2003 (/root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee)
Reusing dataset conll2003 (/root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee)


In [76]:
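#build, for every sentence, a 0/1 mask over its BERT tokens: 1 marks the first word piece of each word, 0 marks [CLS], [SEP] and continuation pieces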
def create_padding(x):
  """Build a first-word-piece mask for every sentence in ``x``.

  Each sentence is split on single spaces and every word is encoded on its
  own. The word contributes a 1 followed by one 0 per additional word piece;
  a 0 is placed before the first word and after the last one (the slots of
  the two special tokens -- assumes ``tokenizer.encode`` adds exactly two,
  [CLS] and [SEP]).

  Args:
    x: iterable of sentences (space-separated words).

  Returns:
    A list with one list of 0/1 ints per sentence.

  Raises:
    IndexError: if a word encodes to no word pieces at all.
  """
  paddings = []
  for sentence in x:
    mask = [0]  # slot of the leading special token
    for word in sentence.split(' '):
      # encode() wraps the word pieces in two special tokens; discount them.
      n_pieces = len(tokenizer.encode(word)) - 2
      word_mask = [0] * n_pieces
      word_mask[0] = 1  # only the first piece of a word is flagged
      mask.extend(word_mask)
    mask.append(0)  # slot of the trailing special token
    paddings.append(mask)
  return paddings

# One first-word-piece mask per sentence, for each split.
train_padding = create_padding(train_text)
val_padding = create_padding(val_text)
test_padding = create_padding(test_text)

In [77]:
#make the length of the label equal to the bert_padding by adding label -1
#get the new labels
def get_new_labels(labels,padding):
  """Stretch word-level labels to the length of their token-level masks.

  For every sentence, walk its mask: a 0 entry yields the filler label -1,
  any other entry consumes the next word label of that sentence.

  Args:
    labels: sequence of 1-D label tensors, one per sentence.
    padding: sequence of masks (as produced by ``create_padding``), indexed
      in parallel with ``labels``.

  Returns:
    A list with one tensor per sentence, each as long as that sentence's mask.
  """
  new_labels = []
  for sentence_idx, word_labels in enumerate(labels):
    cursor = 0  # index of the next unused word label
    aligned = []
    for flag in padding[sentence_idx]:
      if flag == 0:
        aligned.append(-1)
      else:
        aligned.append(word_labels[cursor].item())
        cursor += 1
    new_labels.append(torch.tensor(aligned))
  return new_labels

# Token-level labels for each split (-1 wherever the mask is 0).
new_train_labels = get_new_labels(train_labels, train_padding)
new_val_labels = get_new_labels(val_labels, val_padding)
new_test_labels = get_new_labels(test_labels, test_padding)

At this point the useful data are data_id and new_labels, which have the same length for every sentence.

In [117]:
# Pack every sentence as [token ids, labels, mask]. Labels and mask get a
# leading axis via unsqueeze(0) and the mask is cast to bool.
# Note: this rebinds train_data / val_data / test_data from the loaded
# datasets to plain Python lists.
train_data = [
    [ids, tags.unsqueeze(0), torch.tensor(mask).unsqueeze(0).bool()]
    for ids, tags, mask in zip(train_data_id, new_train_labels, train_padding)
]
val_data = [
    [ids, tags.unsqueeze(0), torch.tensor(mask).unsqueeze(0).bool()]
    for ids, tags, mask in zip(val_data_id, new_val_labels, val_padding)
]
test_data = [
    [ids, tags.unsqueeze(0), torch.tensor(mask).unsqueeze(0).bool()]
    for ids, tags, mask in zip(test_data_id, new_test_labels, test_padding)
]

# Model

In [191]:
class BertCRF(torch.nn.Module):
    """Encoder -> linear projection -> CRF, returning the CRF loss.

    Args:
        bertmodel: encoder module; ``bertmodel(x)[0]`` must be the per-token
            hidden states.
        CRFmodel: CRF layer called as ``CRFmodel(emissions, tags, mask,
            reduction=...)``; its result is negated to obtain the loss.

    The linear layer maps the encoder's hidden size to the CRF's number of
    tags. Both sizes are read from the wrapped modules when they expose them
    (``bertmodel.config.hidden_size`` and ``CRFmodel.num_tags``) and fall back
    to the former hard-coded 768 and 9 otherwise.
    """

    def __init__(self, bertmodel, CRFmodel):
        super().__init__()
        hidden_size = getattr(getattr(bertmodel, 'config', None), 'hidden_size', 768)
        num_tags = getattr(CRFmodel, 'num_tags', 9)
        # Keep the original registration order (encoder, linear, crf) so that
        # parameter order and random initialisation are unchanged.
        self.bertmodel = bertmodel
        self.linear = torch.nn.Linear(hidden_size, num_tags, bias=True)
        self.crf = CRFmodel

    def forward(self, x, y, z):
        """Return the negated CRF score of tags ``y`` for inputs ``x``.

        Args:
            x: token ids, [m, seq].
            y: tag ids, [m, seq].
            z: mask passed to the CRF, [m, seq].
        """
        hidden = self.bertmodel(x)[0]      # m, seq, hidden
        emissions = self.linear(hidden)    # m, seq, num_tags
        # Call the module itself rather than .forward so registered hooks run.
        return -self.crf(emissions, y, z, reduction='token_mean')
# Use the GPU when one is available, otherwise fall back to the CPU.
device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bertmodel=BertModel.from_pretrained('bert-base-cased').to(device)
# Freeze the encoder: only BertCRF's linear layer and the CRF stay trainable.
for param in bertmodel.parameters():
    param.requires_grad = False
# Re-seed before each construction so the randomly initialised layers are
# reproducible from run to run.
torch.manual_seed(0)
# 9 tags, matching the output size of the linear layer inside BertCRF.
CRFmodel = CRF(9, batch_first=True).to(device)
torch.manual_seed(0)
model=BertCRF(bertmodel, CRFmodel).to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [195]:
def _evaluate_accuracy(model, dataloader):
    """Return token-level accuracy of CRF-decoded tags, or None if nothing was scored."""
    model.eval()
    accuracy = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            # Squeeze out the singleton axis and drop the first position of
            # ids, labels and mask alike.
            x = batch[0].squeeze(1)[:, 1:].to(device).long()  # m,seq
            y = batch[1].squeeze(1)[:, 1:].to(device)         # m,seq
            z = batch[2].squeeze(1)[:, 1:].to(device)
            emissions = model.linear(model.bertmodel(x)[0])   # m,seq,num_tags
            decoded = torch.tensor(model.crf.decode(emissions, mask=z))
            predict_label = decoded.view(-1).to('cpu')
            targets = y.view(-1).to('cpu')
            targets = targets[targets != -1]  # keep only positions with a real label
            n_tokens = decoded.shape[1]
            # Weight each batch's accuracy by its token count.
            accuracy += accuracy_score(targets, predict_label) * n_tokens
            total += n_tokens
    return accuracy / total if total else None


def train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer, num_batch, val_dataloader, len_val):
    """Train ``model`` with gradient accumulation and print validation accuracy per epoch.

    Args:
        train_dataloader: yields batches ``[ids, labels, mask]``.
        model: module whose call returns the loss and which exposes
            ``bertmodel``, ``linear`` and ``crf`` for evaluation.
        batchsize_grad: number of batches whose gradients are accumulated
            before each optimizer step; the loss is divided by it.
        epochs: number of passes over ``train_dataloader``.
        scheduler: optional LR scheduler, stepped once per optimizer step;
            pass None to disable.
        optimizer: optimizer over the trainable parameters.
        num_batch: unused; kept for interface compatibility.
        val_dataloader: batches evaluated after every epoch.
        len_val: unused; kept for interface compatibility.

    NOTE(review): the first position is sliced off the ids *before* they are
    fed to the encoder, so the encoder never sees that token -- confirm this
    is intended rather than slicing the emissions afterwards.
    """
    accumulating_batch_count = 0
    for epoch in range(epochs):
        print(f"Training epoch {epoch+1}")
        model.train()
        for batch in train_dataloader:
            x = batch[0].squeeze(1)[:, 1:].to(device).long()  # m,seq
            y = batch[1].squeeze(1)[:, 1:].to(device)
            z = batch[2].squeeze(1)[:, 1:].to(device)
            # Scale so the accumulated gradient is the mean over the group.
            loss = model(x, y, z) / batchsize_grad
            loss.backward()  # gradients accumulate until the optimizer steps

            # Count this batch first, so a step is taken only after a full
            # group of batchsize_grad batches (previously the very first
            # batch triggered a step on its own).
            accumulating_batch_count += 1
            if accumulating_batch_count % batchsize_grad == 0:
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

        # Evaluate after every epoch.
        accuracy = _evaluate_accuracy(model, val_dataloader)
        if accuracy is None:
            print("no validation tokens to score")
        else:
            print(accuracy)

In [None]:
# Build the data loaders from the first 100 training examples.
# NOTE(review): `input` shadows the Python builtin of the same name.
input=train_data[0:100]
batch_size=1
torch.manual_seed(0)
train_dataloader = DataLoader(input, batch_size=batch_size, shuffle=True)
torch.manual_seed(0)
# NOTE(review): the "validation" loader is built from the same `input` as the
# training loader, so the accuracy printed by train() is measured on training
# examples -- confirm this is intended.
val_dataloader = DataLoader(input, batch_size=1, shuffle=True)
len_val=len(val_dataloader)

# Hyperparameters
batchsize_grad=1
epochs=4 #simple model uses more epochs
lr=0.08 #simple models uses larger lr
# NOTE(review): computed from the full train_data rather than from `input`.
num_batch=round(len(train_data)/batch_size)-1
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=epochs*(len(train_dataloader)/batchsize_grad))
# The scheduler built above is discarded here; train() receives None.
scheduler=None

train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer, num_batch,val_dataloader, len_val)

# Evaluation

In [None]:
path="drive/MyDrive/0414.pt"
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU machine also loads on a CPU-only runtime.
model.load_state_dict(torch.load(path, map_location=device))

In [None]:
# Score the model on the first 100 test examples, one per batch, in order.
input = test_data
val_dataloader = DataLoader(input[0:100], batch_size=1, shuffle=False)

model.eval()
accuracy = 0
total = 0
with torch.no_grad():
  for i, batch in enumerate(val_dataloader):
    # Squeeze out the singleton axis and drop the first position of each field.
    token_ids = batch[0].squeeze(1)[:, 1:].to(device).long()  # m,seq
    gold = batch[1].squeeze(1)[:, 1:].to(device)              # m,seq
    mask = batch[2].squeeze(1)[:, 1:].to(device)
    hidden = model.bertmodel(token_ids)[0]                    # m,seq,768
    emissions = model.linear(hidden)                          # m,seq,9
    decoded = torch.tensor(model.crf.decode(emissions, mask=mask))
    predict_label = decoded.view(-1).to('cpu')
    targets = gold.view(-1).to('cpu')
    targets = targets[targets != -1]  # keep only positions with a real label
    n_tokens = decoded.shape[1]
    # Weight each batch's accuracy by its number of decoded tokens.
    accuracy += accuracy_score(targets, predict_label) * n_tokens
    total += n_tokens
print(accuracy / total)