In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 3.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 15.8MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 20.4MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K 

In [2]:
!pip install conllu

Collecting conllu
  Downloading https://files.pythonhosted.org/packages/8e/49/eb3c57e95839d89d50cd667af29694543774fee480a52879ef8c689e5d9d/conllu-4.0-py2.py3-none-any.whl
Installing collected packages: conllu
Successfully installed conllu-4.0


In [3]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [4]:
%cd '/content/drive/My Drive/ml_hw/NLP/bulgarian/'

MODEL = "./roberta-base-bg"

from transformers import RobertaTokenizerFast
from torch.utils.data import Dataset, DataLoader
from conllu import parse_incr
import numpy as np
import string
import re

BATCH_SIZE = 16
MAX_LEN = 128

tokenizer = RobertaTokenizerFast.from_pretrained(MODEL, max_len=MAX_LEN)

tag_to_id = {'ADJ': 0, 'ADP': 1, 'PUNCT': 2, 'ADV': 3, 'AUX': 4, 'SYM': 5, 
              'INTJ': 6, 'CCONJ': 7, 'X': 8, 'NOUN': 9, 'DET': 10, 'PROPN': 11, 
              'NUM': 12, 'VERB': 13, 'PART': 14, 'PRON': 15, 'SCONJ': 16}

id_to_tag = {tag_to_id[tag]: tag for tag in tag_to_id}

class POSDataset(Dataset):

    def __init__(self, data_path):
        self.data = []

        data_file = open(data_path, 'r', encoding="utf8")

        failed_count = 0
        for token_list in parse_incr(data_file):

            # first we need to tokenize the text
            text = token_list.metadata['text'] 
            text = self.preprocess_punctuation(text)
            tokenized_text = tokenizer(
                text, 
                max_length=MAX_LEN,
                padding='max_length',
                truncation=True,
                add_special_tokens=True,
                return_offsets_mapping=True,
                return_tensors='pt'
            )

            # next we can get the pos tags and encode them
            tags = [token['upos'] for token in token_list]
            encoded_labels = self.encode_tags(tags, tokenized_text.offset_mapping)
            if encoded_labels:
                self.data.append({
                    'input_ids': torch.squeeze(tokenized_text['input_ids']),
                    'attention_mask': torch.squeeze(tokenized_text['attention_mask']),
                    'labels': torch.tensor(encoded_labels)})
            else:
                failed_count += 1
        print("Unable to process {} examples".format(failed_count))
    
    def encode_tags(self, pos_tags, offset_mapping):
        labels = [tag_to_id[tag] for tag in pos_tags]
        encoded_labels = np.ones(len(offset_mapping), dtype=int) * -100

        for i in range(1, len(offset_mapping)):
            if self.ignore_mapping(offset_mapping[i-1]) or offset_mapping[i-1][-1] != offset_mapping[i][0]:
                if not self.ignore_mapping(offset_mapping[i]):
                    try:
                        encoded_labels[i] = labels.pop(0)
                    except(IndexError):
                        return None
        
        if len(labels) > 0:
            return None

        return encoded_labels.tolist()

    def ignore_mapping(self, mapping):
        return mapping[0] == mapping[1]

    def preprocess_punctuation(self, text):
        text = text.replace('...', '.')
        text = text.replace('..', '.')
        text = re.sub('([,.:;?!\()""''])', r' \1 ', text)
        text = re.sub('\s{2,}', ' ', text)
        return text

    def __getitem__(self, index):
        item = self.data[index]
        item['input_ids'] = item['input_ids'].to(device)
        item['attention_mask'] = item['attention_mask'].to(device)
        item['labels'] = item['labels'].to(device)
        return item

    def __len__(self):
        return len(self.data)


pos_paths = ['conllu/bg_btb-ud-dev.conllu',
             'conllu/bg_btb-ud-test.conllu',
             'conllu/bg_btb-ud-train.conllu']

dev_set, test_set, train_set = [POSDataset(path) for path in pos_paths]
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE)
dev_loader = DataLoader(dev_set, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

/content/drive/My Drive/ml_hw/NLP/bulgarian
Unable to process 89 examples
Unable to process 79 examples
Unable to process 722 examples


In [12]:
from transformers import RobertaForTokenClassification

learning_rate = 1e-4

model = RobertaForTokenClassification.from_pretrained(
    MODEL, 
    num_labels=len(tag_to_id)
)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
model.to(device)

Some weights of the model checkpoint at ./roberta-base-bg were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at ./roberta-base-bg and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this mode

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((7

In [13]:
LOG_INTERVAL = round(len(train_loader) / 10)

def train(epoch):
    model.train()
    total_loss = 0

    for batch_index, batch in enumerate(train_loader):
        model.zero_grad()
        output = model(**batch)
        loss = output[0]
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if batch_index % LOG_INTERVAL == 0 and batch_index > 0:
            current_loss = total_loss / LOG_INTERVAL
            print('| epoch {:3d} | ' 
                  '{:5d}/{:5d} batches | '
                  'loss {:5.2f}'.format(
                    epoch, 
                    batch_index, len(train_loader), 
                    current_loss))
            total_loss = 0

def test(data_loader):
    model.eval()
    total_score = 0
    total_len = 0

    with torch.no_grad():
        for batch_index, batch in enumerate(data_loader):
            output = model(**batch)
            preds = np.argmax(output[1].cpu(), axis=2)
            preds = preds[(batch['labels'] != -100)]
            labels = batch['labels'][(batch['labels'] != -100)]
            total_score += preds.eq(labels.cpu()).sum()
            total_len += len(labels)
    return (total_score.item() / total_len) * 100

In [14]:
EPOCHS = 5

accuracy = test(dev_loader)
print('| Pretraining Accuracy: {:.2f}%\n'.format(accuracy))

for epoch in range(1, EPOCHS + 1):
    train(epoch)
    accuracy = test(dev_loader)
    print('| epoch   {} |  Accuracy: {:.2f}%\n'.format(epoch, accuracy))

accuracy = test(test_loader)
print('\n Final Accuracy: {}%'.format(accuracy))

| Pretraining Accuracy: 8.67%

| epoch   1 |    51/  512 batches | loss  0.71
| epoch   1 |   102/  512 batches | loss  0.20
| epoch   1 |   153/  512 batches | loss  0.13
| epoch   1 |   204/  512 batches | loss  0.16
| epoch   1 |   255/  512 batches | loss  0.14
| epoch   1 |   306/  512 batches | loss  0.11
| epoch   1 |   357/  512 batches | loss  0.11
| epoch   1 |   408/  512 batches | loss  0.11
| epoch   1 |   459/  512 batches | loss  0.11
| epoch   1 |   510/  512 batches | loss  0.09
| epoch   1 |  Accuracy: 97.70%

| epoch   2 |    51/  512 batches | loss  0.13
| epoch   2 |   102/  512 batches | loss  0.07
| epoch   2 |   153/  512 batches | loss  0.06
| epoch   2 |   204/  512 batches | loss  0.07
| epoch   2 |   255/  512 batches | loss  0.06
| epoch   2 |   306/  512 batches | loss  0.04
| epoch   2 |   357/  512 batches | loss  0.05
| epoch   2 |   408/  512 batches | loss  0.04
| epoch   2 |   459/  512 batches | loss  0.05
| epoch   2 |   510/  512 batches | loss  0

In [25]:
torch.save(model.state_dict(), 'roberta-base-bg-pos.pt')

In [32]:
def get_relevant_labels(offset_mapping):
    relevant_labels = np.zeros(len(offset_mapping), dtype=int)

    for i in range(1, len(offset_mapping)):
        if ignore_mapping(offset_mapping[i-1]) or offset_mapping[i-1][-1] != offset_mapping[i][0]:
            if not ignore_mapping(offset_mapping[i]):
                relevant_labels[i] = 1

    return relevant_labels

def ignore_mapping(mapping):
    return mapping[0] == mapping[1]

with torch.no_grad():
    model.eval()
    input = 'Аз сьм мьж.'
    tokenized_text = tokenizer(
        input, 
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        add_special_tokens=True,
        return_offsets_mapping=True,
        return_tensors='pt'
    )
    tokenized_text['input_ids'] = tokenized_text['input_ids'].to(device)
    tokenized_text['attention_mask'] = tokenized_text['attention_mask'].to(device)
    output = model(
        input_ids=tokenized_text['input_ids'], 
        attention_mask=tokenized_text['attention_mask']
    )
    preds = np.argmax(output[0].cpu(), axis=2)
    relevant = get_relevant_labels(tokenized_text['offset_mapping'])
    predicted_labels = preds[0][relevant == True].tolist()
    print("Input sentence: {}".format(input))
    print("Predictions: {}".format([id_to_tag[id] for id in predicted_labels]))

Input sentence: Аз сьм мьж.
Predictions: ['PRON', 'VERB', 'NOUN']


In [34]:
path = './roberta-base-bulgarian-pos'
model.save_pretrained(path)
tokenizer.save_pretrained(path)

('./roberta-base-bulgarian-pos/vocab.json',
 './roberta-base-bulgarian-pos/merges.txt',
 './roberta-base-bulgarian-pos/special_tokens_map.json',
 './roberta-base-bulgarian-pos/added_tokens.json')