# Fine-Tuning *RoBERTa-small-bulgarian* For Named-Entity Recognition

In [1]:
%%capture

!pip install transformers==3.0.2

In [2]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [3]:
# Get the dataset

!git clone https://github.com/usmiva/bg-ner

Cloning into 'bg-ner'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 8 (delta 1), reused 4 (delta 0), pack-reused 0[K
Unpacking objects: 100% (8/8), done.


## Data Preprocessing

In [4]:
from transformers import RobertaTokenizerFast
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import string
import re

MODEL = "iarfmoose/roberta-small-bulgarian"
MAX_LEN = 128
BATCH_SIZE = 16

tokenizer = RobertaTokenizerFast.from_pretrained(MODEL, max_len=MAX_LEN)

tag_to_id = {
    'O': 0,
    'I-PRO': 1,
    'I-PER': 2,
    'I-ORG': 3,
    'I-LOC': 4,
    'I-EVT': 5,
    'B-PRO': 6,
    'B-PER': 7,
    'B-ORG': 8,
    'B-LOC': 9,
    'B-EVT': 10
}

id_to_tag = {tag_to_id[tag]: tag for tag in tag_to_id}

def preprocess_data(filepath):
    sentences, ner_tags = parse_dataset(filepath)

    error_count = 0
    data = []
    for row in zip(sentences, ner_tags):
        encoding = encode_sentence(row[0], row[1])
        if encoding:
            data.append(encoding)
        else:
            error_count += 1
    if error_count > 0:
        print('Was unable to encode {} examples'.format(error_count))
    return data

def parse_dataset(filepath):
    with open(filepath, encoding='utf-8') as file:
        text = file.readlines()

    text = [line.replace('\n', '') for line in text]
    text = [line for line in text if len(line) > 0]
    word_list = [line.split('\t')[0] for line in text]
    label_list = [line.split('\t')[1] for line in text]

    sentences = []
    tags = []
    current_sentence = []
    current_tags = []
    for item in zip(word_list, label_list):
        current_sentence.append(item[0])
        current_tags.append(item[1])
        if item[0] == '.':
            sentences.append(' '.join(current_sentence))
            tags.append(current_tags)
            current_sentence = []
            current_tags = []
    
    return sentences, tags

def encode_sentence(sentence, ner_tags):
    sentence = preprocess_punctuation(sentence)
    encoded_sentence = tokenizer(
        sentence, 
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        add_special_tokens=True,
        return_offsets_mapping=True,
        return_tensors='pt'
    )

    encoded_labels = encode_tags_last(ner_tags, encoded_sentence.offset_mapping)

    if encoded_labels is not None:
        return {
            'input_ids': torch.squeeze(encoded_sentence.input_ids),
            'attention_mask': torch.squeeze(encoded_sentence.attention_mask),
            'labels': encoded_labels
        }
    else:
        return None
    
def preprocess_punctuation(text):
    text = text.replace('©', '-')
    return text

# encodes labels in the first token position of each word
def encode_tags_first(ner_tags, offset_mapping):

    offset_mapping = torch.squeeze(offset_mapping)

    labels = [tag_to_id[tag] for tag in ner_tags]
    encoded_labels = np.ones(len(offset_mapping), dtype=int) * -100

    for i in range(1, len(offset_mapping)):
        if ignore_mapping(offset_mapping[i-1]) or offset_mapping[i-1][-1] != offset_mapping[i][0]:
            if not ignore_mapping(offset_mapping[i]):
                try:
                    encoded_labels[i] = labels.pop(0)
                except(IndexError):
                    return None
    
    if len(labels) > 0:
        return None

    return torch.tensor(encoded_labels)

# encodes labels in the last token position of each word
def encode_tags_last(ner_tags, offset_mapping):

    offset_mapping = torch.squeeze(offset_mapping)

    labels = [tag_to_id[tag] for tag in ner_tags]
    encoded_labels = np.ones(len(offset_mapping), dtype=int) * -100

    for i in range(1, len(offset_mapping) - 1):

        if offset_mapping[i][1] != offset_mapping[i+1][0]:
            if not ignore_mapping(offset_mapping[i]):
                try:
                    encoded_labels[i] = labels.pop(0)
                except(IndexError):
                    return None
    
    if len(labels) > 0:
        return None

    return torch.tensor(encoded_labels)

def ignore_mapping(mapping):
    return mapping[0] == 0 and mapping[1] == 0


class NERDataset(Dataset):

    def __init__(self, data):
        self.data = data

    def __getitem__(self, index):
        item = self.data[index]
        item['input_ids'] = item['input_ids'].to(device)
        item['attention_mask'] = item['attention_mask'].to(device)
        item['labels'] = item['labels'].to(device)
        return item
    
    def __len__(self):
        return len(self.data)

train_data = preprocess_data('bg-ner/train.txt')
train_set = NERDataset(train_data)
test_data = preprocess_data('bg-ner/test.txt')

dev_data, test_data = train_test_split(test_data, train_size=0.5, test_size=0.5)
dev_set = NERDataset(dev_data)
test_set = NERDataset(test_data)

train_loader = DataLoader(train_set, shuffle=True, batch_size=BATCH_SIZE)
dev_loader = DataLoader(dev_set, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_set, shuffle=False, batch_size=BATCH_SIZE)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1790545.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1436710.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=239.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=78.0, style=ProgressStyle(description_w…


Was unable to encode 23 examples
Was unable to encode 3 examples


In [5]:
#           01234567890123456789
sentence = 'Кучето ми е гладно .'
encoded_sentence = tokenizer(
    sentence, 
    add_special_tokens=True,
    return_offsets_mapping=True,
)
print(encoded_sentence['offset_mapping'])

[(0, 0), (0, 2), (2, 6), (7, 9), (10, 11), (12, 15), (15, 18), (19, 19), (19, 20), (0, 0)]


## Model

In [6]:
from transformers import RobertaForTokenClassification

learning_rate = 1e-5

model = RobertaForTokenClassification.from_pretrained(
    MODEL, 
    num_labels=len(tag_to_id)
)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=515.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=336426471.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at iarfmoose/roberta-small-bulgarian were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at iarfmoose/roberta-small-bulgarian and are newly initialized: ['classifier.weight', 'classifier.bias']
You

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

## Training

In [7]:
LOG_INTERVAL = round(len(train_loader) / 10)

def train(epoch):
    model.train()
    total_loss = 0

    for batch_index, batch in enumerate(train_loader):
        model.zero_grad()
        output = model(**batch)
        loss = output[0]
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if batch_index % LOG_INTERVAL == 0 and batch_index > 0:
            current_loss = total_loss / LOG_INTERVAL
            print('| epoch {:3d} | ' 
                  '{:5d}/{:5d} batches | '
                  'loss {:5.2f}'.format(
                    epoch, 
                    batch_index, len(train_loader), 
                    current_loss))
            total_loss = 0

def test(data_loader):
    model.eval()
    total_score = 0
    total_len = 0

    with torch.no_grad():
        for batch_index, batch in enumerate(data_loader):
            output = model(**batch)
            preds = np.argmax(output[1].cpu(), axis=2)
            preds = preds[(batch['labels'] != -100)]
            labels = batch['labels'][(batch['labels'] != -100)]
            total_score += preds.eq(labels.cpu()).sum()
            total_len += len(labels)
    return (total_score.item() / total_len) * 100

In [8]:
EPOCHS = 5

accuracy = test(dev_loader)
print('| Pretraining Accuracy: {:.2f}%\n'.format(accuracy))

for epoch in range(1, EPOCHS + 1):
    train(epoch)
    accuracy = test(dev_loader)
    print('| epoch   {} |  Accuracy: {:.2f}%\n'.format(epoch, accuracy))

accuracy = test(test_loader)
print('| Accuracy on test set: {:.2f}%'.format(accuracy))

| Pretraining Accuracy: 18.06%

| epoch   1 |    45/  450 batches | loss  0.71
| epoch   1 |    90/  450 batches | loss  0.27
| epoch   1 |   135/  450 batches | loss  0.18
| epoch   1 |   180/  450 batches | loss  0.15
| epoch   1 |   225/  450 batches | loss  0.12
| epoch   1 |   270/  450 batches | loss  0.12
| epoch   1 |   315/  450 batches | loss  0.11
| epoch   1 |   360/  450 batches | loss  0.08
| epoch   1 |   405/  450 batches | loss  0.07
| epoch   1 |  Accuracy: 96.91%

| epoch   2 |    45/  450 batches | loss  0.07
| epoch   2 |    90/  450 batches | loss  0.06
| epoch   2 |   135/  450 batches | loss  0.05
| epoch   2 |   180/  450 batches | loss  0.05
| epoch   2 |   225/  450 batches | loss  0.06
| epoch   2 |   270/  450 batches | loss  0.05
| epoch   2 |   315/  450 batches | loss  0.05
| epoch   2 |   360/  450 batches | loss  0.05
| epoch   2 |   405/  450 batches | loss  0.05
| epoch   2 |  Accuracy: 98.04%

| epoch   3 |    45/  450 batches | loss  0.03
| epoch  