In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 30.2MB/s eta 0:00:01[K     |▉                               | 20kB 4.0MB/s eta 0:00:01[K     |█▎                              | 30kB 5.1MB/s eta 0:00:01[K     |█▊                              | 40kB 5.9MB/s eta 0:00:01[K     |██▏                             | 51kB 4.6MB/s eta 0:00:01[K     |██▋                             | 61kB 5.5MB/s eta 0:00:01[K     |███                             | 71kB 5.9MB/s eta 0:00:01[K     |███▍                            | 81kB 6.7MB/s eta 0:00:01[K     |███▉                            | 92kB 7.0MB/s eta 0:00:01[K     |████▎                           | 102kB 6.0MB/s eta 0:00:01[K     |████▊                           | 112kB 6.0MB/s eta 0:00:01[K     |█████▏                          | 122kB 6.0M

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [None]:
!git clone https://github.com/usmiva/bg-ner

Cloning into 'bg-ner'...
remote: Enumerating objects: 8, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 8 (delta 1), reused 4 (delta 0), pack-reused 0[K
Unpacking objects: 100% (8/8), done.


In [None]:
from transformers import RobertaTokenizerFast
from torch.utils.data import Dataset, DataLoader
import numpy as np
import string
import re

# Checkpoint identifier passed to `from_pretrained` for the tokenizer.
MODEL = "iarfmoose/roberta-small-bulgarian"
# Token length used as the tokenizer's maximum length.
MAX_LEN = 128

# `model_max_length` is the current keyword for what used to be `max_len`.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL, model_max_length=MAX_LEN)

# Tag names listed in id order: position in this tuple is the integer label id.
# 'O' comes first, followed by the I- tags and then the B- tags.
_TAG_ORDER = (
    'O',
    'I-PRO', 'I-PER', 'I-ORG', 'I-LOC', 'I-EVT',
    'B-PRO', 'B-PER', 'B-ORG', 'B-LOC', 'B-EVT',
)

# Forward lookup: tag name -> integer id.
tag_to_id = {tag: index for index, tag in enumerate(_TAG_ORDER)}

# Reverse lookup: integer id -> tag name.
id_to_tag = {index: tag for tag, index in tag_to_id.items()}

class NERDataset(Dataset):
    """Token-classification dataset built from a tab-separated word/tag file.

    Every non-blank line of the file holds a word and its tag separated by a
    tab. Words are grouped into sentences (a '.' token closes a sentence),
    tokenized with the module-level ``tokenizer`` and paired with per-token
    label ids looked up in ``tag_to_id``. Sentences whose tags cannot be
    aligned with the tokens are left out and counted.
    """

    def __init__(self, filepath):
        """Parse ``filepath`` and pre-encode every sentence into ``self.data``.

        Prints the number of sentences that could not be encoded, if any.
        """
        sentences, ner_tags = self.parse_dataset(filepath)

        self.data = []
        error_count = 0
        for sentence, tags in zip(sentences, ner_tags):
            encoding = self.encode_sentence(sentence, tags)
            if encoding is not None:
                self.data.append(encoding)
            else:
                error_count += 1
        if error_count > 0:
            print('Was unable to encode {} examples'.format(error_count))

    def __getitem__(self, index):
        """Return example ``index`` with its tensors on the module-level ``device``.

        The stored example is updated in place with the moved tensors, so the
        same dict object is returned on every access to that index.
        """
        item = self.data[index]
        for key in ('input_ids', 'attention_mask', 'labels'):
            item[key] = item[key].to(device)
        return item

    def __len__(self):
        """Number of successfully encoded sentences."""
        return len(self.data)

    def parse_dataset(self, filepath):
        """Read a word<TAB>tag file and group its rows into sentences.

        Blank (whitespace-only) lines are skipped. A '.' word closes the
        current sentence; words left over after the last '.' are returned as a
        final sentence instead of being discarded.

        Args:
            filepath: path of a UTF-8 text file with one ``word<TAB>tag`` row
                per line.

        Returns:
            A ``(sentences, tags)`` pair: ``sentences`` is a list of
            space-joined word strings and ``tags`` is a parallel list of
            tag-name lists.

        Raises:
            ValueError: if a non-blank line has no tab-separated tag column.
        """
        sentences = []
        tags = []
        current_sentence = []
        current_tags = []

        with open(filepath, encoding='utf-8') as file:
            for line_number, raw_line in enumerate(file, start=1):
                line = raw_line.replace('\n', '')
                if not line.strip():
                    continue

                fields = line.split('\t')
                if len(fields) < 2:
                    raise ValueError(
                        'Line {} of {} has no tab-separated tag: {!r}'.format(
                            line_number, filepath, line))
                word, tag = fields[0], fields[1]

                current_sentence.append(word)
                current_tags.append(tag)
                if word == '.':
                    sentences.append(' '.join(current_sentence))
                    tags.append(current_tags)
                    current_sentence = []
                    current_tags = []

        # Keep a trailing sentence that is not terminated by '.'.
        if current_sentence:
            sentences.append(' '.join(current_sentence))
            tags.append(current_tags)

        return sentences, tags

    def encode_sentence(self, sentence, ner_tags):
        """Tokenize one sentence and align its tags with the tokens.

        Args:
            sentence: space-joined words of a single sentence.
            ner_tags: tag names, one per word of ``sentence``.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``labels``
            tensors, or ``None`` when the tags cannot be aligned (see
            ``encode_tags``).
        """
        sentence = self.preprocess_punctuation(sentence)
        encoded_sentence = tokenizer(
            sentence,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_offsets_mapping=True,
            return_tensors='pt'
        )

        encoded_labels = self.encode_tags(ner_tags, encoded_sentence.offset_mapping)
        if encoded_labels is None:
            return None

        return {
            'input_ids': torch.squeeze(encoded_sentence.input_ids),
            'attention_mask': torch.squeeze(encoded_sentence.attention_mask),
            'labels': encoded_labels
        }

    def preprocess_punctuation(self, text):
        """Replace every '©' character in ``text`` with '-'."""
        text = text.replace('©', '-')
        return text

    # encodes labels in the last token position of each word
    def encode_tags(self, ner_tags, offset_mapping):
        """Place each word's label id on the last token of that word.

        A token is treated as the end of a word when its end offset differs
        from the start offset of the next token and its span is not empty.
        All other positions, including the first and last, are set to -100.

        Args:
            ner_tags: tag names, one per word; each must be a key of
                ``tag_to_id``.
            offset_mapping: ``(start, end)`` character offsets for the tokens
                of a single sequence, either as a sequence of pairs or with a
                leading batch axis of size 1 (list, array or CPU tensor).

        Returns:
            A 1-D integer tensor with one entry per token, or ``None`` when
            the number of word-final tokens does not match the number of tags.
        """
        # Accept both (seq_len, 2) and (1, seq_len, 2) layouts.
        offsets = np.asarray(offset_mapping).reshape(-1, 2)
        # An iterator hands out labels in order without repeated list.pop(0).
        pending_labels = iter([tag_to_id[tag] for tag in ner_tags])
        encoded_labels = np.full(len(offsets), -100, dtype=np.int64)

        for i in range(1, len(offsets) - 1):
            ends_word = offsets[i][1] != offsets[i + 1][0]
            if ends_word and not self.ignore_mapping(offsets[i]):
                label = next(pending_labels, None)
                if label is None:
                    # More word-final tokens than tags.
                    return None
                encoded_labels[i] = label

        # Tags left over: fewer word-final tokens than tags.
        if next(pending_labels, None) is not None:
            return None

        return torch.tensor(encoded_labels)

    def ignore_mapping(self, mapping):
        """Return whether ``mapping`` is an empty span (start equals end)."""
        return mapping[0] == mapping[1]

# Encode the train split first and the test split second, one NERDataset each.
train_set, test_set = (
    NERDataset(split_path)
    for split_path in ('bg-ner/train.txt', 'bg-ner/test.txt')
)

# Only the training batches are shuffled; both loaders use batches of 16.
train_loader = DataLoader(dataset=train_set, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=16, shuffle=False)

Was unable to encode 24 examples
Was unable to encode 3 examples


In [None]:
from transformers import RobertaForTokenClassification

# Step size handed to the Adam optimizer below.
learning_rate = 1e-5

# Load the checkpoint with a token-classification head that has one output
# per entry of tag_to_id.
model = RobertaForTokenClassification.from_pretrained(MODEL, num_labels=len(tag_to_id))

optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# Kept as the cell's last expression, so the model repr is displayed.
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=515.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=336426471.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at iarfmoose/roberta-small-bulgarian were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at iarfmoose/roberta-small-bulgarian and are newly initialized: ['classifier.weight', 'classifier.bias']
You

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

In [None]:
# Number of batches between loss reports: about ten reports per epoch.
# Clamped to at least 1 so a very short loader cannot produce an interval of 0,
# which would make `batch_index % LOG_INTERVAL` raise ZeroDivisionError.
LOG_INTERVAL = max(1, round(len(train_loader) / 10))

def train(epoch):
    """Run one pass over ``train_loader`` and update ``model`` with ``optimizer``.

    Every ``LOG_INTERVAL`` batches the mean loss of the batches seen since the
    previous report is printed.

    Args:
        epoch: epoch number, used only in the progress message.
    """
    model.train()
    window_loss = 0.0
    window_batches = 0
    num_batches = len(train_loader)

    for batch_index, batch in enumerate(train_loader):
        model.zero_grad()
        output = model(**batch)
        # The first element of the output is used as the training loss.
        loss = output[0]
        loss.backward()
        optimizer.step()

        window_loss += loss.item()
        window_batches += 1

        # `LOG_INTERVAL > 0` keeps the modulo safe if the interval is zero.
        if LOG_INTERVAL > 0 and batch_index > 0 and batch_index % LOG_INTERVAL == 0:
            # Divide by the batches actually accumulated: the first window
            # spans indices 0..LOG_INTERVAL, one batch more than later windows.
            current_loss = window_loss / window_batches
            print('| epoch {:3d} | '
                  '{:5d}/{:5d} batches | '
                  'loss {:5.2f}'.format(
                    epoch,
                    batch_index, num_batches,
                    current_loss))
            window_loss = 0.0
            window_batches = 0

def test(data_loader):
    """Compute token-level accuracy of ``model`` over ``data_loader``.

    Positions whose label is -100 are excluded from the score.

    Args:
        data_loader: iterable of batches that can be passed to ``model`` as
            keyword arguments and that contain a ``'labels'`` tensor.

    Returns:
        Percentage (0-100) of scored positions whose predicted class equals
        the label; ``0.0`` when there is no position to score.
    """
    model.eval()
    correct = 0
    scored = 0

    with torch.no_grad():
        for batch in data_loader:
            output = model(**batch)
            labels = batch['labels']
            keep = labels != -100
            # Arg-max over the last axis of the second output element, kept on
            # the same device as the labels so mask and data always agree.
            preds = output[1].argmax(dim=-1)
            correct += (preds[keep] == labels[keep]).sum().item()
            scored += int(keep.sum().item())

    if scored == 0:
        return 0.0
    return (correct / scored) * 100

In [None]:
# Number of full passes over the training data.
EPOCHS = 2

# Score the model before any training step as a baseline.
baseline_accuracy = test(test_loader)
print('| Pretraining Accuracy: {:.2f}%\n'.format(baseline_accuracy))

# Train for one epoch at a time and re-evaluate on the test split afterwards.
for epoch in range(1, EPOCHS + 1):
    train(epoch)
    accuracy = test(test_loader)
    print('| epoch   {} |  Accuracy: {:.2f}%\n'.format(epoch, accuracy))

| Pretraining Accuracy: 9.71%

| epoch   1 |    45/  450 batches | loss  0.75
| epoch   1 |    90/  450 batches | loss  0.29
| epoch   1 |   135/  450 batches | loss  0.21
| epoch   1 |   180/  450 batches | loss  0.15
| epoch   1 |   225/  450 batches | loss  0.12
| epoch   1 |   270/  450 batches | loss  0.13
| epoch   1 |   315/  450 batches | loss  0.11
| epoch   1 |   360/  450 batches | loss  0.10
| epoch   1 |   405/  450 batches | loss  0.08
| epoch   1 |  Accuracy: 96.86%

| epoch   2 |    45/  450 batches | loss  0.07
| epoch   2 |    90/  450 batches | loss  0.06
| epoch   2 |   135/  450 batches | loss  0.06
| epoch   2 |   180/  450 batches | loss  0.06
| epoch   2 |   225/  450 batches | loss  0.05
| epoch   2 |   270/  450 batches | loss  0.06
| epoch   2 |   315/  450 batches | loss  0.06
| epoch   2 |   360/  450 batches | loss  0.05
| epoch   2 |   405/  450 batches | loss  0.05
| epoch   2 |  Accuracy: 97.88%

