# Reference, install and import


In [None]:
!pip install pytorch-crf
!pip install datasets
!pip install transformers
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torchcrf import CRF
# Module-level BERT tokenizer ('bert-base-cased' vocabulary); the preprocessing
# functions in this file read it as a global.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
#for reading original ids and labels
def read_original(x):
    """Load one CoNLL-2003 split and tokenize each sentence.

    Args:
        x: Split name handed to ``load_dataset`` (the file uses 'train',
            'validation' and 'test').

    Returns:
        A 4-tuple ``(data, text, input_ids, labels)``:
        the loaded dataset object, the list of sentences (tokens joined by a
        single space), one ``input_ids`` tensor per sentence as produced by
        the tokenizer with ``return_tensors="pt"``, and one tensor of
        ``ner_tags`` per sentence.
    """
    data = load_dataset('conll2003', split=x)
    # A full slice returns every row as a dict of columns. The previous
    # ``data[0:-1]`` silently dropped the last sentence of each split.
    rows = data[:]
    text = [' '.join(tokens) for tokens in rows['tokens']]
    input_ids = [tokenizer(sentence, return_tensors="pt")['input_ids'] for sentence in text]
    labels = [torch.tensor(tags) for tags in rows['ner_tags']]
    return data, text, input_ids, labels

# Load and tokenize the three dataset splits.
train_data, train_text, train_data_id, train_labels = read_original('train')
val_data, val_text, val_data_id, val_labels = read_original('validation')
test_data, test_text, test_data_id, test_labels = read_original('test')

In [None]:
#create paddings for bert tokenizer
def create_padding(x):
    """Build a 0/1 mask per sentence marking the first sub-token of each word.

    Each sentence is split on single spaces and every word is encoded on its
    own. The code assumes each encoding carries two extra special tokens
    (it subtracts 2 from the length), so a word contributes
    ``len(encoding) - 2`` mask entries: a 1 for its first sub-token followed
    by 0s. A 0 is added at the front and at the end of every sentence mask.

    Args:
        x: Iterable of sentences (strings with words separated by ' ').

    Returns:
        A list with one mask (list of ints) per sentence.

    Raises:
        IndexError: if a word's encoding has no entries besides the two
            assumed special tokens.
    """
    paddings = []
    for sentence in x:
        encoded_words = [tokenizer.encode(word) for word in sentence.split(' ')]
        mask = [0]  # leading slot
        for encoded in encoded_words:
            word_mask = (len(encoded) - 2) * [0]
            word_mask[0] = 1  # flag the first sub-token of the word
            mask.extend(word_mask)
        mask.append(0)  # trailing slot
        paddings.append(mask)
    return paddings

# First-sub-token masks for every split.
train_padding = create_padding(train_text)
val_padding = create_padding(val_text)
test_padding = create_padding(test_text)

In [None]:
#make the length of the label equal to the bert_padding by adding label -1
#get the new labels
def get_new_labels(labels, padding):
    """Expand word-level labels to the length of their sub-token masks.

    For every sentence, walk its mask: a 0 entry yields the label -1, any
    non-zero entry consumes the next word label of that sentence.

    Args:
        labels: Sequence of 1-D tensors, one per sentence, holding the
            word-level labels.
        padding: Sequence of masks (lists of ints), one per sentence.

    Returns:
        A list of tensors, one per sentence, each as long as its mask.
    """
    aligned = []
    for idx, word_labels in enumerate(labels):
        cursor = 0  # index of the next unused word label
        row = []
        for flag in padding[idx]:
            if flag == 0:
                row.append(-1)
            else:
                row.append(word_labels[cursor].item())
                cursor += 1
        aligned.append(torch.tensor(row))
    return aligned

# Expand the word-level tags of every split to match its mask.
new_train_labels = get_new_labels(train_labels, train_padding)
new_val_labels = get_new_labels(val_labels, val_padding)
new_test_labels = get_new_labels(test_labels, test_padding)

At this point, the data used downstream are the pairs [data_id, new_labels]; for each sentence the two have the same length.

In [None]:
# Pair each sentence's token ids with its aligned labels. unsqueeze(0) gives
# the labels a leading dimension (original note: [m,seq][m,seq]).
train_data = [[ids, tags.unsqueeze(0)] for ids, tags in zip(train_data_id, new_train_labels)]
val_data = [[ids, tags.unsqueeze(0)] for ids, tags in zip(val_data_id, new_val_labels)]
test_data = [[ids, tags.unsqueeze(0)] for ids, tags in zip(test_data_id, new_test_labels)]

# Model

In [None]:
class BertLinear(torch.nn.Module):
    """Encoder followed by a linear layer applied to every position.

    The wrapped encoder is called on the input and the first element of its
    output is fed through a 768 -> 9 linear layer.
    """

    def __init__(self, bertmodel):
        """Store the encoder and create the 768 -> 9 output layer."""
        super().__init__()
        self.bertmodel = bertmodel
        self.linear = torch.nn.Linear(in_features=768, out_features=9, bias=True)

    def forward(self, x):
        """Return per-position scores for ``x`` (noted as [m, seq] in the file)."""
        hidden_states = self.bertmodel(x)[0]  # m, seq, 768
        return self.linear(hidden_states)  # m, seq, 9
# Use the GPU when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bertmodel = BertModel.from_pretrained('bert-base-cased').to(device)
# Leave every encoder weight trainable.
for param in bertmodel.parameters():
    param.requires_grad = True
# Seed before building the model so the linear layer's initial weights are reproducible.
torch.manual_seed(0)
model = BertLinear(bertmodel).to(device)

In [None]:
def train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer, num_batch, val_dataloader, len_val, criterion):

    accumulating_batch_count = 0
    for epoch in range(epochs):
      print(f"Training epoch {epoch+1}")
      model.train()
      for i, batch in enumerate(train_dataloader):
          x=batch[0].squeeze(1).to(device).long()#m,seq
          logits=model(x) #m,seq,9
          y=batch[1].squeeze(1).to(device)
          logits=logits.view(-1,9).float()
          y=y.view(-1).long()
          loss=criterion(logits,y)/batchsize_grad
          loss.backward() #The gradients are computed when we call loss. backward() and are stored by PyTorch until we call optimizer.

          if accumulating_batch_count % batchsize_grad == 0: #when accumulated batch=16, we do optimizer after 16 batches of gradients are accumulated
              optimizer.step()
              #scheduler.step()
              optimizer.zero_grad()
              model.zero_grad()
              #print (i+1,'loss',loss.item())
          accumulating_batch_count += 1
      
      #eval the model
      model.eval()
      accuracy=0
      total=0
      for i, batch in enumerate(val_dataloader):
        x=batch[0].squeeze(1).to(device).long()#m,seq
        y=batch[1].to(device)#m,seq
        with torch.no_grad():
          x=model(x) #m,seq,9
        x=torch.softmax(x,-1) #m,seq,9
        x=torch.argmax(x,-1)#m,seq
        preds=x.view(-1).to('cpu') #m*seq
        y=y.view(-1).to('cpu') #m*seq
        new_label=[]
        new_pred=[]
        for j in range (y.shape[-1]):
          if y[j]!=-1:
            new_label.append(y[j])
            new_pred.append(preds[j])
        #print (new_label)
        #print (new_pred)
        accuracy+=accuracy_score(new_label,new_pred)*len(new_pred)
        total+=len(new_pred)
      print ('accuracy', accuracy/total)
      #save the best model
      #if accuracy/len_val>0.82: path="best_model.pt"; torch.save(model.state_dict(), path) 

In [None]:
#hyperparameter
# Defined first: batch_size is needed to build train_dataloader below
# (it used to be assigned only after its first use, a NameError on a fresh run).
batch_size=1
batchsize_grad=8
epochs=1 #simple model uses more epochs
lr=0.00005 #simple models uses larger lr

#create training data and val data
# NOTE(review): `input` shadows the builtin; the name is kept in case other cells read it.
input=train_data[0:100]
torch.manual_seed(0)
train_dataloader = DataLoader(input, batch_size=batch_size, shuffle=True)
torch.manual_seed(0)
# NOTE(review): validation iterates the same 100 training examples, not val_data — confirm this is intended.
val_dataloader = DataLoader(input, batch_size=1, shuffle=True)
len_val=len(val_dataloader)

num_batch=round(len(train_data)/batch_size)-1
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1)  # positions labelled -1 do not contribute to the loss
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=epochs*(len(train_dataloader)/batchsize_grad))
# The schedule built above is discarded: training runs with a constant learning rate.
scheduler=None

train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer, num_batch,val_dataloader, len_val, criterion)

In [None]:
path="drive/MyDrive/0414.pt"
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU also loads on a CPU-only runtime.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load(path, map_location=device))

# Evaluation

In [None]:
path="drive/MyDrive/0414.pt"
# map_location remaps the saved tensors onto the current device, so a
# checkpoint written on a GPU also loads on a CPU-only runtime.
# NOTE: torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [None]:
# Score the model on the first 100 test examples, one sentence per batch.
input = test_data
val_dataloader = DataLoader(input[0:100], batch_size=1, shuffle=False)

model.eval()
accuracy = 0
total = 0
for batch in val_dataloader:
    token_ids = batch[0].squeeze(1).to(device).long()  # m,seq
    gold = batch[1].to(device).view(-1).to('cpu')  # m*seq
    with torch.no_grad():
        scores = model(token_ids)  # m,seq,9
    preds = torch.argmax(torch.softmax(scores, -1), -1).view(-1).to('cpu')  # m*seq
    # Keep only the positions whose label is not -1.
    kept = [pos for pos in range(gold.shape[-1]) if gold[pos] != -1]
    new_label = [gold[pos] for pos in kept]
    new_pred = [preds[pos] for pos in kept]
    # accuracy_score is a fraction; multiply by the count to accumulate correct predictions.
    accuracy += accuracy_score(new_label, new_pred) * len(new_pred)
    total += len(new_pred)
print('accuracy', accuracy / total)

accuracy 0.8593530239099859
