<span style="font-family:Papyrus; font-size:3em;">**BERT IMPLIMENTATION**</span>

In [1]:
import math
import numpy as np
from seqeval.metrics import classification_report,accuracy_score,f1_score
import torch.nn.functional as F
import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import BertTokenizer, BertConfig
from pytorch_transformers import BertForTokenClassification, AdamW



Using TensorFlow backend.


In [2]:
path = os.path.join(os.path.expanduser('~'), 'Documents', 'AIT 726','HW3')


def readfile(path):
    f = open(path)
    data = []
    sentence = []
    label = []
    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=='\n':
            if len(sentence) > 0:
                data.append((sentence,label))
                sentence = []
                label = []
            continue
        splits = line.split(' ')
        sentence.append(splits[0])
        label.append(splits[-1][:-1])

    if len(sentence) > 0:
        data.append((sentence, label))
        sentence = []
        label = []
    return data



train = readfile(path + '/train.txt')
val = readfile(path+'/valid.txt')
test = readfile(path+'/test.txt')




In [3]:
#Special Tags for BERT X, [CLS], and [SEP]
tags = ('O', 'I-LOC', 'B-PER', 'I-PER', 'I-ORG','I-MISC','B-MISC', 'B-LOC', 'B-ORG', 'X', '[CLS]','[SEP]')

tag2idx = {tag: idx for idx, tag in enumerate(tags)}
idx2tag = {idx: tag for idx, tag in enumerate(tags)}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#set max length for bert
max_len  = 45

#Load Pretrained models tokenizer
tokenizer=BertTokenizer.from_pretrained('bert-base-cased')


tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list,label in train+val+test:
    temp_lable = []
    temp_token = []
    
    # Add [CLS] at the front 
    temp_lable.append('[CLS]')
    temp_token.append('[CLS]')
    
    for word,lab in zip(word_list,label):
        token_list = tokenizer.tokenize(word)
        for m,token in enumerate(token_list):
            temp_token.append(token)
            if m==0:
                temp_lable.append(lab)
            else:
                temp_lable.append('X')  
                
    # Add [SEP] at the end
    temp_lable.append('[SEP]')
    temp_token.append('[SEP]')
    
    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_lable)
    
    if 5 > i_inc:
        print("No.%d,len:%d"%(i_inc,len(temp_token)))
        print("texts:%s"%(" ".join(temp_token)))
        print("No.%d,len:%d"%(i_inc,len(temp_lable)))
        print("lables:%s"%(" ".join(temp_lable)))
    i_inc +=1

tokenizer.convert_ids_to_tokens(101)
# Make text token into id
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype="long", truncating="post", padding="post")
print(input_ids[0])


tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels],
                     maxlen=max_len, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")
print(tags[0])



No.0,len:12
texts:[CLS] EU rejects German call to boycott British la ##mb . [SEP]
No.0,len:12
lables:[CLS] B-ORG O B-MISC O O O B-MISC O X O [SEP]
No.1,len:4
texts:[CLS] Peter Blackburn [SEP]
No.1,len:4
lables:[CLS] B-PER I-PER [SEP]
No.2,len:11
texts:[CLS] BR ##US ##SE ##LS 1996 - 08 - 22 [SEP]
No.2,len:11
lables:[CLS] B-LOC X X X O X X X X [SEP]
No.3,len:34
texts:[CLS] The European Commission said on Thursday it disagreed with German advice to consumers to s ##hun British la ##mb until scientists determine whether mad cow disease can be transmitted to sheep . [SEP]
No.3,len:34
lables:[CLS] O B-ORG I-ORG O O O O O O B-MISC O O O O O X B-MISC O X O O O O O O O O O O O O O [SEP]
No.4,len:39
texts:[CLS] Germany ' s representative to the European Union ' s veterinary committee Werner Z ##wing ##mann said on Wednesday consumers should buy sheep ##me ##at from countries other than Britain until the scientific advice was clearer . [SEP]
No.4,len:39
lables:[CLS] B-LOC O X O O O B-ORG I-ORG O 

In [4]:
#Attention Masks
attention_masks = [[int(i>0) for i in ii] for ii in input_ids]

segment_ids = [[0] * len(input_id) for input_id in input_ids]

#Train Val Test Split post encoding
tr_inputs = input_ids[:14041]
tr_tags = tags[:14041]
tr_masks = attention_masks[:14041]
tr_segs = segment_ids[:14041]
tr_inputs = torch.tensor(tr_inputs)
tr_tags = torch.tensor(tr_tags)
tr_masks = torch.tensor(tr_masks)
tr_segs = torch.tensor(tr_segs)


val_inputs = input_ids[14041:17291]
val_tags = tags[14041:17291]
val_masks = attention_masks[14041:17291]
val_segs = segment_ids[14041:17291]
val_inputs = torch.tensor(val_inputs)
val_tags = torch.tensor(val_tags)
val_masks = torch.tensor(val_masks)
val_segs = torch.tensor(val_segs)

test_inputs = input_ids[17291:]
test_tags = tags[17291:]
test_masks = attention_masks[17291:]
test_segs = segment_ids[17291:]
test_inputs = torch.tensor(test_inputs)
test_tags = torch.tensor(test_tags)
test_masks = torch.tensor(test_masks)
test_segs = torch.tensor(test_segs)

#Mini Batch Size
batch_num = 32

# Only set token embedding, attention embedding, no segment embedding
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num,drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_num)


In [5]:
#Model Prep
epochs = 4
max_grad_norm = 1.0
#Load Pretrained Model
model = BertForTokenClassification.from_pretrained('bert-base-cased',num_labels=len(tag2idx))

# Set model to GPU,if you are using GPU machine
model.cuda()


num_train_optimization_steps = int( math.ceil(len(tr_inputs) / batch_num) / 1) * epochs
#Fine tuning the whole model
FULL_FINETUNING = True
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)


In [6]:
#Training model 
model.train()

print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        # forward pass
        outputs = model(b_input_ids, token_type_ids=None,
        attention_mask=b_input_mask, labels=b_labels)
        loss, scores = outputs[:2]
        
        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        optimizer.zero_grad()
        
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
#model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

#output_model_file = os.path.join(path, "Bert.pt")
#output_config_file = os.path.join(path, "config.json")

#torch.save(model_to_save.state_dict(), output_model_file)
#model_to_save.config.to_json_file(output_config_file)
#tokenizer.save_vocabulary(path)


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 14041
  Batch size = 32
  Num steps = 1756


Epoch:  25%|██▌       | 1/4 [01:36<04:49, 96.49s/it]

Train loss: 0.16124260212936903


Epoch:  50%|█████     | 2/4 [03:12<03:12, 96.32s/it]

Train loss: 0.027790374092260147


Epoch:  75%|███████▌  | 3/4 [04:48<01:36, 96.17s/it]

Train loss: 0.01559402239025192


Epoch: 100%|██████████| 4/4 [06:25<00:00, 96.42s/it]

Train loss: 0.009839807140941787





In [7]:
#Validate Model
model.eval();

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch
    
#     if step > 2:
#         break
    
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None,
        attention_mask=input_mask,)
        # For eval mode, the first result of outputs is logits
        logits = outputs[0] 
    
    # Get NER predict result
    logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
    logits = logits.detach().cpu().numpy()
    
    #inputs to numpy
    #input_ids = input_ids.to('cpu').numpy()
    
    # Get NER true result
    label_ids = label_ids.to('cpu').numpy()
    
    
    # Only predict the real word, mark=0, will not calculate
    input_mask = input_mask.to('cpu').numpy()
    
    # Compare the valuable predict result
    for i,mask in enumerate(input_mask):
        # Real one
        temp_1 = []
        # Predict one
        temp_2 = []
        

        
        for j, m in enumerate(mask):
            # Mark=0, meaning its a pad word, dont compare
            if m:
                if idx2tag[label_ids[i][j]] != "X" and idx2tag[label_ids[i][j]] != "[CLS]" and idx2tag[label_ids[i][j]] != "[SEP]" : # Exclude the X label
                    temp_1.append(idx2tag[label_ids[i][j]])
                    temp_2.append(idx2tag[logits[i][j]])
            else:
                break
        
            
        y_true.append(temp_1)
        y_pred.append(temp_2)
        

print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))

# Get acc , recall, F1 result report
report = classification_report(y_true, y_pred,digits=4)
print("***** Eval results *****")
print("\n%s"%(report))
print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))


***** Running evaluation *****
  Num examples =3250
  Batch size = 32
f1 socre: 0.936082
Accuracy score: 0.988380
***** Eval results *****

           precision    recall  f1-score   support

      ORG     0.8898    0.9349    0.9118      1321
      LOC     0.9742    0.9497    0.9618      1789
      PER     0.9616    0.9700    0.9658      1731
     MISC     0.8847    0.8983    0.8914       905

micro avg     0.9280    0.9443    0.9361      5746
macro avg     0.9369    0.9443    0.9404      5746

f1 socre: 0.936082
Accuracy score: 0.988380


In [8]:
#Test Model
#model = BertForTokenClassification.from_pretrained('bert-base-cased',num_labels=len(tag2idx))
#model.load_state_dict(torch.load(path+'/Bert.bin'))
model.eval()


eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(test_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch
    
#     if step > 2:
#         break
    
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None,
        attention_mask=input_mask,)
        # For eval mode, the first result of outputs is logits
        logits = outputs[0] 
    
    # Get NER predict result
    logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
    logits = logits.detach().cpu().numpy()
    
    #inputs to numpy
    #input_ids = input_ids.to('cpu').numpy()
    
    # Get NER true result
    label_ids = label_ids.to('cpu').numpy()
    
    
    # Only predict the real word, mark=0, will not calculate
    input_mask = input_mask.to('cpu').numpy()
    
    # Compare the valuable predict result
    for i,mask in enumerate(input_mask):
        # Real one
        temp_1 = []
        # Predict one
        temp_2 = []
        
        
        for j, m in enumerate(mask):
            # Mark=0, meaning its a pad word, dont compare
            if m:
                if idx2tag[label_ids[i][j]] != "X" and idx2tag[label_ids[i][j]] != "[CLS]" and idx2tag[label_ids[i][j]] != "[SEP]" : # Exclude the X label
                    temp_1.append(idx2tag[label_ids[i][j]])
                    temp_2.append(idx2tag[logits[i][j]])
            else:
                break
        
            
        y_true.append(temp_1)
        y_pred.append(temp_2)
        

print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))

# Get acc , recall, F1 result report
report = classification_report(y_true, y_pred,digits=4)
print("***** Eval results *****")
print("\n%s"%(report))
print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))


***** Running evaluation *****
  Num examples =3453
  Batch size = 32
f1 socre: 0.895597
Accuracy score: 0.978935
***** Eval results *****

           precision    recall  f1-score   support

     MISC     0.7216    0.8188    0.7671       690
      LOC     0.9278    0.9204    0.9241      1620
      PER     0.9555    0.9524    0.9540      1534
      ORG     0.8675    0.8978    0.8824      1634

micro avg     0.8818    0.9098    0.8956      5478
macro avg     0.8916    0.9098    0.9003      5478

f1 socre: 0.895597
Accuracy score: 0.978935
