In [21]:
!pip install transformers
!pip install datasets
!pip install accelerate -U
!pip install seqeval



In [22]:
import torch

# Ask PyTorch to release GPU memory it is still caching (useful when the notebook is re-run in the same session).
torch.cuda.empty_cache()

This project is about cross-lingual NER. We fine-tune a model on a token classification task in English, then transfer what the model has learned to a dataset in the Albanian language.

## Step 1 - Importing the dataset

In [23]:
from datasets import load_dataset
from collections import defaultdict
from datasets import DatasetDict

In [24]:
langs = ['en', 'sq']

# One DatasetDict per language code, each holding the shuffled splits of that language
pan_dict = defaultdict(DatasetDict)
for lang in langs:
  raw_splits = load_dataset('wikiann', name=lang)
  for split_name, split_data in raw_splits.items():
    # Fixed seed so the shuffled order is the same on every run
    pan_dict[lang][split_name] = split_data.shuffle(seed=42)

In [25]:
# Shuffled English WikiANN splits, as stored in pan_dict by the loading loop
dataset_en = pan_dict['en'] #dataset for the english language

## Step 2 - Creating a separate column called `ner_tags_str`, which holds the string label of each annotated tag: B-ORG, I-ORG, B-PER, O, etc.

We need this because seqeval, which we use as the evaluation measure, expects the string representations of the tags.

In [26]:
# Label feature of the 'ner_tags' column; it converts between tag ids and tag names
tags = dataset_en['train'].features['ner_tags'].feature


def create_tags(batch):
  """Return a ``ner_tags_str`` column with the string form of each example's integer NER tags."""
  tag_names = []
  for example_tags in batch['ner_tags']:
    tag_names.append(tags.int2str(example_tags))
  return {"ner_tags_str": tag_names}


dataset_en = dataset_en.map(create_tags, batched=True)

## Step 3 - Applying tokenization and mapping the tokenization function to our dataset

<b>Reminder</b>: We will be using the XLM-RoBERTa model, which tokenizes text with the SentencePiece algorithm. We will walk through a simple example of how SentencePiece works, but the most important idea is that some of the tokens it produces must receive the label -100 ("IGN"). These are the special tokens `<s>` and `</s>` (marking the beginning and end of a sequence) and the additional subword pieces produced when the tokenizer splits a word it does not recognize as a whole. This way the number of labels equals the number of tokens. Without this step the tokens and the labels to be predicted would differ in number, which would cause errors in later stages of the code.

In [27]:
from transformers import AutoTokenizer

checkpoint = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Toy example: show the subword pieces produced for a short sentence
sentence = ["My name is Gerti"]
inputs = tokenizer(sentence, is_split_into_words=True)
print(inputs.tokens())

# Now the first training example: compare the number of tokens with the number of word-level tags
first_example = dataset_en['train'][0]
inputs = tokenizer(first_example['tokens'],
                   truncation=True,
                   is_split_into_words=True)

print("-------")
print(len(inputs.tokens()))
print(len(first_example['ner_tags'])) # fewer tags than tokens: the extra tokens still need a label

['<s>', '▁My', '▁name', '▁is', '▁Ger', 'ti', '</s>']
-------
12
9


In [28]:
def align_labels(batch):
  """Tokenize a batch of pre-split sentences and align the NER labels with the tokens.

  Special tokens (such as <s> and </s>) and every subword piece after the first
  piece of a word receive the label -100, so that each example ends up with
  exactly one label per token.

  Args:
    batch: mapping with 'tokens' (one list of words per example) and
      'ner_tags' (one list of integer tags per example).

  Returns:
    The tokenizer output for the batch with an added 'labels' entry.
  """
  tokenized_dataset = tokenizer(batch['tokens'],
                                truncation=True,
                                is_split_into_words=True)

  labels = []
  for example_idx, word_labels in enumerate(batch['ner_tags']):
    word_ids = tokenized_dataset.word_ids(example_idx)
    label_ids = []
    previous_word = None

    for word in word_ids:
      # No word id, or the same word id as the previous token: this position is masked with -100
      if word is None or word == previous_word:
        label_ids.append(-100)
      else:
        label_ids.append(word_labels[word])
      previous_word = word

    labels.append(label_ids)

  tokenized_dataset['labels'] = labels
  return tokenized_dataset

encoded_dataset_en = dataset_en.map(align_labels,
                                    batched=True,
                                    batch_size=8)

In [29]:
# Sanity check: after alignment the first example must have one label per input id
first_encoded = encoded_dataset_en['train'][0]
print(len(first_encoded['input_ids']))
print(len(first_encoded['labels']))

12
12


## Step 4 - Evaluation: creating the `compute_metrics` function, which is responsible for calculating the metrics

We will also create a data collator, which pads the sequences of a batch to the length of the longest sequence. The padded label positions are filled with -100 instead of zero.

In [30]:
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report,f1_score

# A small worked example of how seqeval scores predictions.
# The second sentence is predicted exactly; in the first one the predicted MISC
# entity starts one token earlier than in the reference.
person_sentence = ["B-PER", "I-PER", "O"]

y_true = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"], list(person_sentence)]
y_pred = [["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"], list(person_sentence)]

report = classification_report(y_true, y_pred)
print(report)

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2



In [31]:
# Display the columns and the number of rows of the encoded training split
encoded_dataset_en['train']

Dataset({
    features: ['tokens', 'ner_tags', 'langs', 'spans', 'ner_tags_str', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 20000
})

In [32]:
# Data collator built from the tokenizer; it is handed to the Trainer and reused in forward_pass_with_label to pad batches
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [33]:
# Two-way lookup between integer label ids and tag names
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

print(index2tag)
print(tag2index)

{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


In [34]:
import numpy as np

'''
The function evaluate will return predictions and labels, to be calculated in compute_metrics

'''


def evaluate(predictions, labels, id2tag=None):
  """Convert raw model scores and label ids into per-example lists of tag strings.

  Args:
    predictions: array of shape (batch_size, seq_len, num_labels) holding one
      score per label for every token.
    labels: integer array of shape (batch_size, seq_len); positions equal to
      -100 are skipped.
    id2tag: optional mapping from label id to tag string. Defaults to the
      notebook-level ``index2tag``.

  Returns:
    (label_ids, pred_ids): two lists with one list of tag strings per example,
    containing only the positions whose label is not -100.
  """
  if id2tag is None:
    id2tag = index2tag

  labels = np.asarray(labels)
  # numpy's argmax takes `axis`; `dim` is the torch spelling and raises a TypeError here
  preds = np.argmax(predictions, axis=-1)
  # Unpack the 2-D shape of the argmax result, not the 3-D shape of the raw scores
  batch_size, seq_len = preds.shape

  label_ids, pred_ids = [], []
  for batch in range(batch_size):
    example_labels = []
    example_preds = []

    for seq in range(seq_len):
      if labels[batch, seq] != -100:
        example_labels.append(id2tag[int(labels[batch, seq])])
        example_preds.append(id2tag[int(preds[batch, seq])])

    label_ids.append(example_labels)
    pred_ids.append(example_preds)

  return label_ids, pred_ids

def compute_metrics(preds):
  """Compute the seqeval F1 score for a prediction output.

  Args:
    preds: object exposing ``predictions`` and ``label_ids`` attributes.

  Returns:
    A dict with the single key "F1-Score".
  """
  true_tags, predicted_tags = evaluate(preds.predictions, preds.label_ids)
  score = f1_score(true_tags, predicted_tags)
  return {"F1-Score": score}

## Step 5 - Creating the model. We will create our own classification head and load the body from `RobertaModel`.

We will create a model for the token classification task: the pretrained weights are loaded through `RobertaPreTrainedModel` and the body is a `RobertaModel`. The model follows the standard transformer architecture; the head consists of a classification layer of size (embedding_size, number_of_labels), where number_of_labels is the number of available NER tags.<br>

We will also read the values of the model's hyperparameters from the config object.<br>

 In the forward method we return a `TokenClassifierOutput`, the standard container for the model outputs (including the loss).

In [35]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
  """RoBERTa-style encoder body with a dropout + linear head for token classification."""

  # Tie the class to the XLM-R configuration so that from_pretrained resolves
  # the matching config type when no config object is passed explicitly.
  config_class = XLMRobertaConfig

  def __init__(self, config):
    """Build the encoder body and the classification head from ``config``."""
    super().__init__(config)

    # Number of labels the head has to predict
    self.num_labels = config.num_labels
    # Transformer body (the pooling layer is not needed for per-token outputs)
    self.roberta = RobertaModel(config, add_pooling_layer=False)
    # Classification head: dropout followed by one linear layer
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, self.num_labels)

    # Initialize the weights
    self.init_weights()

  def forward(self,
              input_ids = None,
              attention_mask = None,
              labels=None,
              **kwargs
              ):
    """Run the encoder and the head; also compute the loss when ``labels`` is given.

    Args:
      input_ids: token ids passed to the encoder body.
      attention_mask: attention mask passed to the encoder body.
      labels: optional label ids; when provided, a cross-entropy loss is computed.
      **kwargs: forwarded unchanged to the encoder body.

    Returns:
      TokenClassifierOutput with ``loss`` (None when no labels are given),
      ``logits``, ``hidden_states`` and ``attentions``.
    """
    # First element of the encoder output is the sequence of hidden states
    outputs = self.roberta(input_ids, attention_mask=attention_mask, **kwargs)
    seq_output = self.dropout(outputs[0])
    logits = self.classifier(seq_output)

    loss = None
    if labels is not None:
      # CrossEntropyLoss ignores targets equal to -100 by default (ignore_index),
      # so the masked positions do not contribute to the loss.
      cross_entropy = nn.CrossEntropyLoss()
      loss = cross_entropy(logits.view(-1, self.num_labels),
                           labels.view(-1))

    return TokenClassifierOutput(loss=loss,
                                 logits=logits,
                                 hidden_states=outputs.hidden_states,
                                 attentions=outputs.attentions)

## Step 6 - Instantiating our model, the TrainingArguments and the Trainer objects, along with their hyperparameters

In [36]:
from transformers import AutoConfig, TrainingArguments, Trainer
import torch


# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Configuration of the checkpoint, extended with the number of tags and both label mappings
config = AutoConfig.from_pretrained(checkpoint,
                                    num_labels=tags.num_classes,
                                    id2label=index2tag,
                                    label2id=tag2index)

# Instantiate the custom class from the checkpoint, then move it to the selected device
model = XLMRobertaForTokenClassification.from_pretrained(checkpoint, config=config)
model = model.to(device)

In [37]:
# Hyperparameters for the training run

num_epochs = 3
batch_size = 8
model_name = f"{checkpoint}-finetuned-dataset-en"
# Number of full batches in the training split, used as the logging interval
logging_steps = len(encoded_dataset_en["train"]) // batch_size

training_kwargs = dict(
  output_dir=model_name,
  log_level="error",
  num_train_epochs=num_epochs,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  evaluation_strategy="epoch",
  save_steps=1e6,
  weight_decay=0.01,
  disable_tqdm=False,
  logging_steps=logging_steps,
  push_to_hub=True,
)
training_args = TrainingArguments(**training_kwargs)

In [38]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; training_args is created with push_to_hub=True
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [39]:
# Return PyTorch tensors when the dataset is indexed
encoded_dataset_en.set_format("torch")

# remove_columns returns a new DatasetDict instead of modifying this one, so the
# result has to be assigned back. Only columns that are still present are
# dropped, which keeps the cell safe to re-run.
columns_to_drop = [name for name in ['ner_tags', 'langs', 'spans']
                   if name in encoded_dataset_en['train'].column_names]
if columns_to_drop:
  encoded_dataset_en = encoded_dataset_en.remove_columns(columns_to_drop)

encoded_dataset_en

DatasetDict({
    validation: Dataset({
        features: ['tokens', 'ner_tags_str', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags_str', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['tokens', 'ner_tags_str', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20000
    })
})

In [40]:
train_split = encoded_dataset_en['train']
validation_split = encoded_dataset_en['validation']

# NOTE(review): compute_metrics is defined earlier in the notebook but is not
# passed to the Trainer, so only the training and validation loss are reported
# per epoch. Confirm whether that is intended.
trainer = Trainer(model,
                  training_args,
                  data_collator,
                  train_split,
                  validation_split,
                  tokenizer)

trainer.train()

/content/xlm-roberta-base-finetuned-dataset-en is already a clone of https://huggingface.co/Gerti/xlm-roberta-base-finetuned-dataset-en. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [45]:
#Quick check of the label feature and of the aligned labels of the first training example
tags #7 labels (see the index2tag printout); as a bare expression in the middle of a cell this line displays nothing
print(encoded_dataset_en['train'][0]['labels'])

tensor([-100,    0,    0,    0,    0,    0,    1, -100,    2,    2,    2, -100])


So far the results are not bad, but this is not enough. It is better to analyse the loss in more detail, because there are cases where a model seems to perform well but in practice has serious flaws.

In [46]:
def forward_pass_with_label(batch):
  """Run the trained model on a batch and return per-token losses and predicted label ids.

  Args:
    batch: mapping of column name to a list of per-example values; it must
      contain 'input_ids', 'attention_mask' and 'labels'.

  Returns:
    dict with
      "loss": numpy array of shape (batch_size, padded_seq_len) holding one
        cross-entropy value per token,
      "predicted_labels": numpy array of the same shape holding the argmax
        label id of every token.
  """
  # Only these columns go through the collator; any other column in the batch
  # (for example raw token strings) cannot be padded into tensors.
  model_columns = ('input_ids', 'attention_mask', 'labels')
  # Convert dict of lists to list of dicts suitable for data collator
  features = [dict(zip(model_columns, values))
              for values in zip(*(batch[name] for name in model_columns))]
  # Pad inputs and labels
  padded = data_collator(features)

  # Put all tensors on the device
  input_ids = padded['input_ids'].to(device)
  attention_mask = padded['attention_mask'].to(device)
  labels = padded['labels'].to(device)

  with torch.no_grad():
    outputs = trainer.model(input_ids, attention_mask)
    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=-1)

    # reduction="none" keeps one loss value per token; the default ("mean")
    # returns a single scalar that cannot be reshaped to (batch, seq_len).
    cross_entropy = nn.CrossEntropyLoss(reduction="none")
    # Take the number of labels from the logits instead of hard-coding it
    loss = cross_entropy(logits.view(-1, logits.size(-1)),
                         labels.view(-1))

    # Unflatten batch dimension
    loss = loss.view(len(input_ids), -1)

  # Move the results off the device and convert them to numpy arrays before returning them
  return {"loss": loss.cpu().numpy(),
          "predicted_labels": predicted_labels.cpu().numpy()}

In [1]:
# Add the per-token loss and the predicted labels to the test split
test_with_loss = encoded_dataset_en['test'].map(forward_pass_with_label, batched=True)
encoded_dataset_en['test'] = test_with_loss

NameError: ignored