# Install and import

In [None]:
!pip install pytorch-crf
!pip install datasets
!pip install transformers
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torchcrf import CRF
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Data preparation

In [None]:
#read one CoNLL-2003 split and return its sentences, token ids and word labels
def read_original(x):
  """Load a CoNLL-2003 split and tokenize every sentence with the BERT tokenizer.

  Args:
    x: split name forwarded to `load_dataset` (this notebook passes 'train',
      'validation' and 'test').

  Returns:
    A tuple `(data, text, ids, labels)`:
      data:   the dataset object returned by `load_dataset`.
      text:   one string per sentence, its tokens joined by single spaces.
      ids:    one `input_ids` tensor per sentence, as produced by
              `tokenizer(sentence, return_tensors="pt")`.
      labels: one tensor of word-level `ner_tags` per sentence.
  """
  data = load_dataset('conll2003', split=x)
  # Take every row. The previous `data[0:-1]` slice silently dropped the last
  # sentence, leaving text/ids/labels one example shorter than `data`.
  columns = data[:]
  text = [' '.join(tokens) for tokens in columns['tokens']]
  input_ids = [tokenizer(sentence, return_tensors="pt")['input_ids'] for sentence in text]
  labels = [torch.tensor(tags) for tags in columns['ner_tags']]
  return data, text, input_ids, labels

# Load and tokenize the three CoNLL-2003 splits.
train_data, train_text, train_data_id, train_labels = read_original('train')
val_data, val_text, val_data_id, val_labels = read_original('validation')
test_data, test_text, test_data_id, test_labels = read_original('test')

In [3]:
#build, per sentence, a 0/1 mask marking the first sub-token of every word
def create_padding(x):
  """Return one 0/1 mask per sentence in `x`.

  Every sentence is split on single spaces and each word is encoded on its own
  with `tokenizer.encode`. A word whose encoding has length n contributes n-2
  mask entries (the two extra ids are presumably the special tokens added by
  the tokenizer): a 1 followed by zeros. A 0 is placed before the first word
  and after the last one.

  Args:
    x: iterable of sentences (strings).

  Returns:
    A list with one list of ints (the mask) per sentence.
  """
  masks = []
  for sentence in x:
    mask = [0]  # leading 0 for the position before the first word
    for word in sentence.split(' '):
      word_mask = [0] * (len(tokenizer.encode(word)) - 2)
      word_mask[0] = 1  # only the first sub-token of the word is marked
      mask.extend(word_mask)
    mask.append(0)  # trailing 0 for the position after the last word
    masks.append(mask)
  return masks

# First-sub-token masks for every split.
train_padding = create_padding(train_text)
val_padding = create_padding(val_text)
test_padding = create_padding(test_text)

In [4]:
#stretch the word-level labels to the length of the mask, filling unmarked positions with -1
def get_new_labels(labels,padding):
  """Expand word-level labels to mask length.

  For sentence k, every 0 in `padding[k]` yields -1 and every non-zero entry
  consumes the next label of `labels[k]`, in order.

  Args:
    labels: list of 1-D label tensors, one per sentence.
    padding: list of 0/1 masks, indexed in parallel with `labels`.

  Returns:
    A list of tensors, one per sentence, each as long as its mask.
  """
  aligned = []
  for idx, word_labels in enumerate(labels):
    cursor = 0  # index of the next unused word label
    expanded = []
    for flag in padding[idx]:
      if flag == 0:
        expanded.append(-1)
      else:
        expanded.append(word_labels[cursor].item())
        cursor += 1
    aligned.append(torch.tensor(expanded))
  return aligned

# Sub-token-aligned labels for every split.
new_train_labels = get_new_labels(train_labels, train_padding)
new_val_labels = get_new_labels(val_labels, val_padding)
new_test_labels = get_new_labels(test_labels, test_padding)

In [5]:
#each example is [input ids, sub-token-aligned labels, word-level labels]
train_data = [
    [ids.squeeze(0), aligned, words]
    for ids, aligned, words in zip(train_data_id, new_train_labels, train_labels)
]
val_data = [
    [ids.squeeze(0), aligned, words]
    for ids, aligned, words in zip(val_data_id, new_val_labels, val_labels)
]
test_data = [
    [ids.squeeze(0), aligned, words]
    for ids, aligned, words in zip(test_data_id, new_test_labels, test_labels)
]

# LSTM model for obtaining label embeddings
* Autoencoder idea: an input such as "you are" → two 768-d embeddings → LSTM → two 768-d outputs → predictions, which are compared with the original input "you are". Alternatively, a language-modelling objective could be used to predict the next word.
* Teacher-forcing idea.

In [6]:
class LSTM(torch.nn.Module):
    """Label autoencoder: embed label ids, run an LSTM, project back to label logits.

    Args:
        hidden_size: width of the label embedding and of the LSTM hidden state.
            (It was previously accepted but ignored in favour of a hard-coded 768.)
        num_labels: number of distinct label ids, i.e. embedding rows and output
            logits. Defaults to 9, the value that used to be hard-coded.
    """

    def __init__(self, hidden_size, num_labels=9):
        super().__init__()
        self.embedding = torch.nn.Embedding(num_labels, hidden_size)
        self.LSTM = torch.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.linear = torch.nn.Linear(in_features=hidden_size, out_features=num_labels)

    def forward(self, x):
        """Map label ids `x` of shape (..., seq) to logits of shape (..., seq, num_labels)."""
        x = self.embedding(x)   # (..., seq, hidden_size)
        x, _ = self.LSTM(x)     # per-position LSTM outputs; final states are not needed
        return self.linear(x)   # (..., seq, num_labels)
# Pick the GPU when one is available, then build the label LSTM with a fixed seed.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(0)
model1 = LSTM(768).to(device)

In [7]:
def test(val_dataloader, model):
      model.eval()
      accuracy=0
      total=0
      for i, batch in enumerate(val_dataloader):
        x=batch[0].to(device).long()#m,seq
        y=batch[0].to(device)
        with torch.no_grad():
          x=model(x) #m,seq,9
        x=torch.softmax(x,-1) #m,seq,9
        x=torch.argmax(x,-1)#m,seq
        preds=x.view(-1).to('cpu') #m*seq
        y=y.view(-1).to('cpu') #m*seq
        new_label=[]
        new_pred=[]
        for j in range (y.shape[-1]):
          if y[j]!=-1:
            new_label.append(y[j])
            new_pred.append(preds[j])
        accuracy+=accuracy_score(new_label,new_pred)*len(new_pred)
        total+=len(new_pred)
      return accuracy/total

In [8]:
def train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer, num_batch, val_dataloader, len_val, criterion):
    """Train the label autoencoder with gradient accumulation.

    Args:
        train_dataloader: iterable of batches; `batch[0]` is used both as the
            model input and as the target of `criterion`.
        model: module called as `model(x)`.
        batchsize_grad: number of batches whose gradients are accumulated
            before each optimizer step (the loss is divided by it).
        epochs: number of passes over `train_dataloader`.
        scheduler: accepted for signature compatibility; it is not stepped.
        optimizer: optimizer stepped every `batchsize_grad` batches.
        num_batch, len_val: unused.
        val_dataloader: passed to `test` after every epoch.
        criterion: loss called as `criterion(logits, x)`.
    """
    accumulating_batch_count = 0
    for epoch in range(epochs):
      print(f"Training epoch {epoch+1}")
      model.train()
      for i, batch in enumerate(train_dataloader):
          x = batch[0].to(device)
          logits = model(x)
          loss = criterion(logits, x) / batchsize_grad
          loss.backward()  # gradients accumulate until the next optimizer step
          # Count the batch BEFORE checking: the old order tested the counter
          # while it was still 0, so the first update fired after one batch.
          accumulating_batch_count += 1
          if accumulating_batch_count % batchsize_grad == 0:
              optimizer.step()
              optimizer.zero_grad()
              model.zero_grad()
              print (i+1,'loss',loss.item())
      print ('test accuracy', test(val_dataloader, model))

In [None]:
# Train the label LSTM on the first 200 training label sequences.
# (The subsets were previously bound to `input`, shadowing the builtin.)
train_label_subset=train_labels[0:200]
batch_size=1
torch.manual_seed(0)
train_dataloader = DataLoader(train_label_subset, batch_size=batch_size, shuffle=True)

# Accuracy is reported on the first 200 test label sequences.
test_label_subset=test_labels[0:200]
torch.manual_seed(0)
val_dataloader = DataLoader(test_label_subset, batch_size=1, shuffle=True)
len_val=len(val_dataloader)

criterion = torch.nn.CrossEntropyLoss()
batchsize_grad=4
epochs=5 #simple model uses more epochs
lr=0.03 #simple models uses larger lr
num_batch=round(len(train_data)/batch_size)-1
optimizer = torch.optim.Adam(model1.parameters(), lr=lr)
# num_training_steps is a step count, so keep it an integer.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=epochs*len(train_dataloader)//batchsize_grad)
train(train_dataloader,model1,batchsize_grad,epochs,scheduler,optimizer, num_batch,val_dataloader, len_val,criterion)

# Construct a model that combines LSTM embedding and bert embedding for prediction
* LSTM embedding of previous labels + Bert embedding of current text to predict the current label
* If it is the first word in the sentence, the LSTM embedding is zeros because no previous label exists.

In [None]:
class BERT(torch.nn.Module):
    """Tagger that concatenates BERT word states with LSTM states of the previous labels.

    Args:
        model: module exposing `.embedding` and `.LSTM` (the label LSTM); both
            sub-modules are reused here.
        bertmodel: encoder called as `bertmodel(x)`; element 0 of its result is
            used as the per-token hidden states.
    """

    def __init__(self, model, bertmodel):
        super().__init__()
        self.bert = bertmodel
        self.embed = model.embedding
        self.lstm = model.LSTM
        self.linear = torch.nn.Linear(in_features=768*2, out_features=9)

    def forward(self, x, y, z):
        """Return logits of shape (words, 9) for a single sentence.

        Args:
            x: token ids, shape (1, seq).
            y: sub-token-aligned labels, shape (1, seq); positions equal to -1
                are dropped from the BERT output.
            z: gold word-level labels, shape (1, words), fed to the label LSTM
                (teacher forcing).
        """
        hidden = self.bert(x)[0]               # (1, seq, 768)
        # Boolean mask instead of a per-token torch.cat loop: same rows, stays
        # on the input's device and avoids quadratic copying.
        word_states = hidden[0][y[0] != -1]    # (words, 768)

        label_states = self.lstm(self.embed(z.squeeze(0)))[0]   # (words, 768)
        # Shift right by one so word t only sees labels < t; the first word gets
        # zeros. new_zeros keeps device/dtype aligned with the LSTM output (the
        # old torch.zeros lived on the CPU and broke torch.cat on CUDA).
        start = label_states.new_zeros((1, label_states.shape[-1]))
        prev_states = torch.cat((start, label_states[:-1]), 0)  # (words, 768)

        features = torch.cat((word_states, prev_states), -1)    # (words, 768*2)
        return self.linear(features)                            # (words, 9)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bertmodel = BertModel.from_pretrained('bert-base-cased').to(device)

# BERT is fine-tuned ...
for param in bertmodel.parameters():
    param.requires_grad = True
# ... while the label LSTM trained above is frozen.
for param in model1.parameters():
    param.requires_grad = False

torch.manual_seed(0)
model2 = BERT(model1, bertmodel).to(device)

In [124]:
def teacher_test(val_dataloader, model):
    """Word-level accuracy of `model` when it is fed the gold labels (teacher forcing).

    Each batch supplies `(x, y, z)`. The model is called as `model(x, y, z)` and
    its arg-max predictions are compared with `z`.

    Returns:
        Accuracy over all predictions, rounded to 3 decimals.
    """
    model.eval()
    weighted_hits = 0
    n_preds = 0
    for batch in val_dataloader:
        x, y, z = (batch[k].to(device) for k in range(3))
        with torch.no_grad():
            logits = model(x, y, z)
        probs = torch.softmax(logits, -1)
        preds = torch.argmax(probs, -1).view(-1).to('cpu')
        gold = z.view(-1).to('cpu')
        weighted_hits += accuracy_score(gold, preds) * len(preds)
        n_preds += len(preds)
    return round(weighted_hits / n_preds, 3)

In [125]:
def train(train_dataloader,model,batchsize_grad,epochs,scheduler,optimizer, num_batch, val_dataloader, len_val, criterion):
    """Fine-tune the combined BERT + label-LSTM tagger with gradient accumulation.

    Args:
        train_dataloader: iterable of batches `(x, y, z)`; `z` holds the
            word-level gold labels used as the loss target.
        model: module called as `model(x, y, z)`; its output is reshaped to (-1, 9).
        batchsize_grad: number of batches whose gradients are accumulated
            before each optimizer step (the loss is divided by it).
        epochs: number of passes over `train_dataloader`.
        scheduler: accepted for signature compatibility; it is not stepped.
        optimizer: optimizer stepped every `batchsize_grad` batches.
        num_batch, len_val: unused.
        val_dataloader: passed to `teacher_test` after every epoch.
        criterion: loss called as `criterion(logits, z.view(-1))`.
    """
    accumulating_batch_count = 0
    for epoch in range(epochs):
      print(f"Training epoch {epoch+1}")
      model.train()
      for i, batch in enumerate(train_dataloader):
          x = batch[0].to(device)
          y = batch[1].to(device)
          z = batch[2].to(device)
          logits = model(x, y, z).view(-1, 9)
          loss = criterion(logits, z.view(-1)) / batchsize_grad
          loss.backward()  # gradients are stored and summed until optimizer.step()
          # Count the batch BEFORE checking: the old order tested the counter
          # while it was still 0, so the first update fired after one batch
          # instead of after `batchsize_grad` accumulated batches.
          accumulating_batch_count += 1
          if accumulating_batch_count % batchsize_grad == 0:
              optimizer.step()
              optimizer.zero_grad()
              model.zero_grad()
      print ('teacher_test',teacher_test(val_dataloader,model))

In [126]:
#hyperparameter
batch_size=1
batchsize_grad=2
epochs=4
lr=0.00005 #small learning rate for fine-tuning BERT

#create training data
train_input=[[i[0].squeeze(0),i[1].squeeze(0),j] for i, j in zip(train_data, train_labels)]
# Seed the global RNG explicitly. The old `worker_init_fn=torch.manual_seed(0)`
# only seeded as a side effect of evaluating the argument and handed the
# returned Generator to DataLoader as if it were a worker callback.
torch.manual_seed(0)
train_dataloader = DataLoader(train_input[0:100], batch_size=batch_size, shuffle=True)

#create val data from the validation split (it used to reuse the training examples)
val_input=[[i[0].squeeze(0),i[1].squeeze(0),j] for i, j in zip(val_data, val_labels)]
val_dataloader = DataLoader(val_input[0:100], batch_size=1, shuffle=True)
len_val=len(val_dataloader)

#configuration
num_batch = round(len(train_data)/batch_size)-1
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model2.parameters(), lr=lr)
# num_training_steps is a step count, so keep it an integer.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=epochs*len(train_dataloader)//batchsize_grad)

In [None]:
# Fine-tune for 2 epochs (passed literally rather than via `epochs`).
train(
    train_dataloader, model2, batchsize_grad, 2, scheduler, optimizer,
    num_batch, val_dataloader, len_val, criterion,
)

# Evaluate the model
* For testing, labels are sampled step by step (each prediction is fed back as the previous label) instead of using teacher forcing.

In [128]:
def sampling_test(val_dataloader, model):
    """Word-level accuracy with step-by-step decoding (no teacher forcing).

    For each sentence, every word is classified from its BERT state
    concatenated with the label-LSTM output for the labels predicted so far
    (zeros for the first word). Each prediction is then fed back into the LSTM.

    Args:
        val_dataloader: iterable of single-sentence batches `(x, y, z)`:
            token ids, sub-token-aligned labels (-1 marks positions to drop)
            and gold word-level labels.
        model: module exposing `.bert`, `.embed`, `.lstm` and `.linear`.

    Returns:
        Accuracy over all words, rounded to 3 decimals.
    """
    model.eval()
    accuracy = 0
    total = 0
    for batch in val_dataloader:
        x = batch[0].to(device)  # (1, seq)
        y = batch[1].to(device)  # (1, seq)
        z = batch[2].to(device)  # (1, words)

        # All inference runs under no_grad so no autograd graph is built.
        with torch.no_grad():
            hidden = model.bert(x)[0]          # (1, seq, hidden)
            # Keep only positions that carry a label; the mask lives on the
            # same device as `hidden` (the old CPU torch.zeros/torch.cat loop
            # failed on CUDA).
            words = hidden[0][y[0] != -1]      # (words, hidden)

            # The first word has no previous label, so it gets a zero summary.
            prev_summary = words.new_zeros(model.lstm.hidden_size)
            lstm_state = None
            predicts = []
            for j in range(words.shape[0]):
                logits = model.linear(torch.cat((words[j], prev_summary), 0))
                predict = torch.argmax(logits, -1)  # softmax is monotonic, so it is not needed
                predicts.append(predict)
                # Advance the LSTM by one label, carrying its state forward,
                # instead of re-running it over the whole prefix every step.
                out, lstm_state = model.lstm(model.embed(predict.view(1)), lstm_state)
                prev_summary = out[-1]

        # Plain Python ints on the host for scikit-learn.
        predicted = [int(p) for p in predicts]
        gold = z.squeeze(0).cpu().tolist()
        accuracy += accuracy_score(gold, predicted) * len(predicted)
        total += len(predicted)
    return round(accuracy / total, 3)

In [None]:
train_input=[[i[0].squeeze(0),i[1].squeeze(0),j] for i, j in zip(train_data, train_labels)]
test_input=[[i[0].squeeze(0),i[1].squeeze(0),j] for i, j in zip(test_data, test_labels)]
# Seed explicitly. `worker_init_fn=torch.manual_seed(0)` only seeded as a side
# effect and passed the returned Generator where DataLoader expects a callable.
torch.manual_seed(0)
test_dataloader = DataLoader(test_input[0:100], batch_size=1, shuffle=True)
sampling_test(test_dataloader, model2)