## Import packages

In [1]:
import joblib
import json
import os
import time
from pathlib import Path

import cv2
import numpy as np
import torch
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import confusion_matrix
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, BertTokenizerFast, BertForTokenClassification, get_linear_schedule_with_warmup, RobertaForTokenClassification, RobertaTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


## Define constants

In [2]:
# Path to the preprocessed dataset (for token classification task)
DATASET_PATH = Path("../data/SROIE/layoutlm_data")

TRAIN_DATASET_PATH = DATASET_PATH / "train.txt"
TEST_DATASET_PATH = DATASET_PATH / "test.txt"

# Hyperparameters for the BERT algorithm
BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 1e-5
MAX_SEQ_LEN = 128

## Prepare dataset

In [3]:
label_map = {
    "CLS": 0,
    "O": 1,
    "S-ADDRESS": 2,
    "S-COMPANY": 3,
    "S-DATE": 4,
    "S-TOTAL": 5
}

reverse_label_map = {
    0: "CLS",
    1: "O",
    2: "S-ADDRESS",
    3: "S-COMPANY",
    4: "S-DATE",
    5: "S-TOTAL"
}

In [4]:
class SROIEDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_seq_len, label_map):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.label_map = label_map
        
        self.labels = []
        self.sentences = []
        
        with open(file_path, 'r') as f:
            sentence = []
            labels = []
            
            for line in f:
                line = line.strip()
                
                if not line:
                    if sentence and labels:
                        self.sentences.append(' '.join(sentence))
                        self.labels.append(labels)
                    
                    sentence = []
                    labels = []
                else:
                    word, label = line.split("\t")
                    sentence.append(word)
                    labels.append(label)
                    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        labels = self.labels[idx]
        
        inputs = self.tokenizer.encode_plus(sentence, truncation=True, padding="max_length", max_length=self.max_seq_len)
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        
        label_ids = [self.label_map[l] for l in labels]
        label_ids = label_ids[:self.max_seq_len - 2]  # truncate if necessary
        label_ids = [0] + label_ids + [0]  # add special tokens
        label_ids = label_ids + [0]*(self.max_seq_len - len(label_ids))  # pad if necessary

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(label_ids, dtype=torch.long)
        }

In [5]:
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

train_dataset = SROIEDataset(TRAIN_DATASET_PATH, tokenizer, MAX_SEQ_LEN, label_map)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = SROIEDataset(TEST_DATASET_PATH, tokenizer, MAX_SEQ_LEN, label_map)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Train

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Use either BERT or RoBERTa model
# model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=6)
model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=6)
model = model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able

In [7]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=len(train_dataloader) * EPOCHS
)

loss_values = []



In [8]:
times = []

for epoch in range(EPOCHS):
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHS))

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()
    
    start = time.time()

    # For each batch of training data...
    for batch in train_dataloader:
        # add batch to GPU
        batch = {k: v.to(device) for k, v in batch.items()}

        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        outputs = model(**batch)

        # The call to model will return the loss (because we provided labels) and the "logits"--the model outputs prior to activation.
        loss = outputs.loss
        logits = outputs.logits

        # Accumulate the training loss over all of the batches so that we can calculate the average loss at the end.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0 to prevent "exploding gradients".
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()
        
    end = time.time()
    times.append(end-start)

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average training loss: {0:.2f}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    
print("Average elasped time for 1 epoch (in seconds):", np.average(times))


Average training loss: 0.82

Average training loss: 0.40

Average training loss: 0.34

Average training loss: 0.30

Average training loss: 0.29

Average training loss: 0.27

Average training loss: 0.25

Average training loss: 0.24

Average training loss: 0.22

Average training loss: 0.22
Average elasped time for 1 epoch (in seconds): 241.80076909065247


In [None]:
model_in_mem = joblib.dump(model, "./bert.joblib")
size_in_bytes = os.path.getsize("./bert.joblib")
print(f"Model size: {size_in_bytes / (1024 * 1024)} MB")

## Evaluate

In [10]:
def evaluate(model, dataloader, device):
    model.eval()

    predictions , true_labels = [], []
    times = []

    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        start = time.time()

        with torch.no_grad():
            outputs = model(**batch)
            
        end = time.time()
        times.append(end-start)

        logits = outputs.logits
        logits = logits.detach().cpu().numpy()

        label_ids = batch["labels"].cpu().numpy()

        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    pred_tags = [[reverse_label_map[p_i] for p_i in p] for p in predictions]
    valid_tags = [[reverse_label_map[l_ii] for l_ii in l_i] for l_i in true_labels]
    
    print("F1-Score: ", f1_score(valid_tags, pred_tags))
    print("Precision: ", precision_score(valid_tags, pred_tags))
    print("Recall: ", recall_score(valid_tags, pred_tags))
    print(classification_report(valid_tags, pred_tags))
    print("Elapsed time for inference (in seconds):", sum(times))

In [11]:
evaluate(model, test_dataloader, device)

F1-Score:  0.8224739603132365
Precision:  0.8366589327146172
Recall:  0.808761961722488
           precision    recall  f1-score   support

  ADDRESS       0.90      0.92      0.91      3806
  COMPANY       0.85      0.94      0.89      1457
      CLS       0.51      0.63      0.56       694
    TOTAL       0.00      0.00      0.00       331
     DATE       0.75      0.21      0.33       400

micro avg       0.84      0.81      0.82      6688
macro avg       0.80      0.81      0.79      6688

Elapsed time for inference (in seconds): 34.13233304023743
