In [1]:
pip install datasets transformers umap-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from datasets import list_datasets, load_dataset, DatasetDict
from collections import defaultdict
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import pprint as pp
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from collections import Counter
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

In [3]:
from datasets import get_dataset_config_names

xtreme_subsets = get_dataset_config_names('xtreme')

print(f"XTREME has {len(xtreme_subsets)} configurations")

XTREME has 183 configurations


In [4]:
panx_subsets = [s for s in xtreme_subsets if s.startswith("PAN")]
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [5]:
langs = ['de', 'fr', 'it', 'en']
fracs = [0.629, 0.229, 0.084, 0.059]

panx_ch = defaultdict(DatasetDict)

panx_ch

defaultdict(datasets.dataset_dict.DatasetDict, {})

In [6]:
for lang, frac in zip(langs, fracs):
  # Fetch the PAN-X corpus for this single language
  ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
  for split in ds:
    # Keep a `frac` share of the rows (rounded down) after a fixed-seed shuffle,
    # so every split is downsampled by the same proportion for this language
    n_keep = int(frac * ds[split].num_rows)
    shuffled = ds[split].shuffle(seed=0)
    panx_ch[lang][split] = shuffled.select(range(n_keep))



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



In [7]:
from datasets import load_dataset
de = load_dataset("xtreme", name="PAN-X.de")

de



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [8]:
panx_ch

defaultdict(datasets.dataset_dict.DatasetDict,
            {'de': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 12580
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 6290
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 6290
                 })
             }),
             'fr': DatasetDict({
                 train: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 4580
                 })
                 validation: Dataset({
                     features: ['tokens', 'ner_tags', 'langs'],
                     num_rows: 2290
                 })
                 test: Dataset({
                     features: ['tokens', 'ner_tags', 'la

In [9]:
import pandas as pd

In [10]:
pd.DataFrame({lang: panx_ch[lang]["train"].num_rows for lang in langs}, index=["Number of training Examples"])

Unnamed: 0,de,fr,it,en
Number of training Examples,12580,4580,1680,1180


In [11]:
element = panx_ch['de']['train'][0]
print(element)
print(type(element))

for key, value in element.items():
  print(f"{key}: {value}")

{'tokens': ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.'], 'ner_tags': [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0], 'langs': ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']}
<class 'dict'>
tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [12]:
panx_ch['de']['train'].features.items()

dict_items([('tokens', Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)), ('ner_tags', Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)), ('langs', Sequence(feature=Value(dtype='string', id=None), length=-1, id=None))])

In [13]:
tags = panx_ch['de']['train'].features['ner_tags'].feature

tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)

In [14]:
def create_name_tags(batch):
  """Add a `ner_tags_str` column holding the string name of every NER tag id.

  Despite the parameter name, this is applied with a non-batched `map`, so
  `batch['ner_tags']` is the list of tag ids of a single example. Each id is
  translated with the module-level `tags` ClassLabel.
  """
  tag_names = list(map(tags.int2str, batch['ner_tags']))
  return {'ner_tags_str': tag_names}

In [15]:
panx_de = panx_ch['de'].map(create_name_tags)
panx_de



DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 12580
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 6290
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs', 'ner_tags_str'],
        num_rows: 6290
    })
})

In [16]:
de_example = panx_de['train'][0]
pd.DataFrame([de_example['tokens'], de_example['ner_tags_str']], index=['Tokens', 'Tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


### Checking for class imbalance in the NER tag frequencies

In [17]:
split2freqs = defaultdict(Counter)

for split, dataset in panx_de.items():
  for row in dataset['ner_tags_str']:
    for tag in row:
      if tag.startswith('B'):
        tag_type = tag.split('-')[1]
        split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient='index')

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


## Creating a Custom Model for Token Classification

In [18]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

In [19]:
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
  """Token-classification head on top of a RoBERTa encoder body.

  The encoder produces one hidden vector per input token; dropout and a single
  linear layer turn each vector into `config.num_labels` logits.
  """
  config_class = XLMRobertaConfig

  def __init__(self, config):
    """Build the encoder body and the classification head from `config`."""
    super().__init__(config)
    # Number of NER tags the head predicts for each token
    self.num_labels = config.num_labels

    # Encoder body. The pooling layer is disabled because we classify every
    # token's hidden state, not a single pooled vector for the sequence.
    self.roberta = RobertaModel(config, add_pooling_layer=False)

    # Classification head: dropout followed by hidden_size -> num_labels
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    # Run the base class's weight initialisation. When the model is created
    # with `from_pretrained`, the checkpoint supplies the body weights while
    # the classifier weights are reported as newly initialised.
    self.init_weights()

  def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, **kwargs):
    """Compute per-token logits and, when `labels` is given, the loss.

    Returns a `TokenClassifierOutput` carrying `loss` (None without labels),
    `logits` of shape [batch size, tokens, num_labels], and the encoder's
    `hidden_states` and `attentions`.
    """
    # Encoder pass; element 0 is the last hidden state with shape
    # [batch size, tokens, embedding size]
    encoder_out = self.roberta(
      input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids,
      **kwargs,
    )

    # Regularise, then project every token representation onto the label space
    token_states = self.dropout(encoder_out[0])
    token_logits = self.classifier(token_states)

    if labels is None:
      token_loss = None
    else:
      # Flatten batch and token dimensions so each token is one sample
      criterion = nn.CrossEntropyLoss()
      token_loss = criterion(
        token_logits.view(-1, self.num_labels), labels.view(-1))

    return TokenClassifierOutput(
      loss=token_loss,
      logits=token_logits,
      hidden_states=encoder_out.hidden_states,
      attentions=encoder_out.attentions,
    )



In [20]:
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)

In [21]:
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

In [22]:
from transformers import AutoTokenizer

bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"

bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)


text = "Jack Sparrow loves New York!"

bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

In [23]:
from transformers import AutoConfig
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
num_labels=tags.num_classes,
id2label=index2tag, label2id=tag2index)

When loading the configuration we can override specific settings through keyword arguments, such as the number of classes or the label names (`num_labels`, `id2label`, `label2id`).

In [24]:
xlmr_config

XLMRobertaConfig {
  "_name_or_path": "xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlmr_model = (XLMRobertaForTokenClassification
.from_pretrained(xlmr_model_name, config=xlmr_config)
.to(device))

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classif

In [26]:
input_ids = xlmr_tokenizer.encode(text, return_tensors='pt')
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], index=['Tokens', 'Input IDs'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,5161,7,2356,5753,38,2


In [27]:
# outputs = xlmr_model(input_ids.to(device)).logits
# predictions = torch.argmax(outputs, dim=-1)
# print(f"Number of tokens in sequence: {len(xlmr_tokens)}")
# print(f"Shape of outputs: {outputs.shape}")

In [28]:
outputs = xlmr_model(input_ids.to(device)).logits
print(outputs.shape)
predictions = torch.argmax(outputs, dim=-1) # taking the max from the last dimension
print(f'Prediction shape: {predictions.shape}')
print(f"Number of tokens in sequence: {len(xlmr_tokens)}")
print(f"Shape of outputs: {outputs.shape}")

torch.Size([1, 10, 7])
Prediction shape: torch.Size([1, 10])
Number of tokens in sequence: 10
Shape of outputs: torch.Size([1, 10, 7])


In [29]:
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_tokens, preds], index=["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Tags,O,I-PER,I-PER,I-PER,I-PER,I-PER,I-PER,I-PER,I-PER,O


In [30]:
def tag_text(text, tags, model, tokenizer):
  """Tag every token of `text` with the model's most likely NER label.

  Args:
    text: Raw input string.
    tags: Object whose `names` sequence maps a class index to its tag name.
    model: Token-classification model; element 0 of its output must be the
      logits with shape [1, tokens, classes].
    tokenizer: Tokenizer used for BOTH the displayed tokens and the input ids,
      so the two rows of the result always line up.

  Returns:
    A DataFrame with a "Tokens" row and a "Tags" row, one column per token
    (special tokens included).
  """
  # Tokenize once with the tokenizer that was passed in; the tokens and the
  # ids then come from the same encoding
  encoding = tokenizer(text, return_tensors="pt")
  tokens = encoding.tokens()
  input_ids = encoding.input_ids

  # Run on whatever device the model lives on instead of relying on a global
  first_param = next(model.parameters(), None)
  if first_param is not None:
    input_ids = input_ids.to(first_param.device)

  # Inference only: no gradients are needed
  with torch.no_grad():
    # Get predictions as distribution over the possible classes
    outputs = model(input_ids)[0]
  # Take argmax over the class axis to get the most likely class per token
  predictions = torch.argmax(outputs, dim=2)
  # Convert to DataFrame
  preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
  return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

In [31]:
de_example

{'tokens': ['2.000',
  'Einwohnern',
  'an',
  'der',
  'Danziger',
  'Bucht',
  'in',
  'der',
  'polnischen',
  'Woiwodschaft',
  'Pommern',
  '.'],
 'ner_tags': [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0],
 'langs': ['de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de',
  'de'],
 'ner_tags_str': ['O',
  'O',
  'O',
  'O',
  'B-LOC',
  'I-LOC',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'I-LOC',
  'O']}

## Tokenizing Text for NER

In [32]:
words, labels = de_example['tokens'], de_example['ner_tags']

In [33]:
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)

In [34]:
tokenized_input

{'input_ids': [0, 70101, 176581, 19, 142, 122, 2290, 708, 1505, 18363, 18, 23, 122, 127474, 15439, 13787, 14, 15263, 18917, 663, 6947, 19, 6, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [35]:
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
pd.DataFrame([tokens], index=["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


In [36]:
# word_ids() gives, for every token, the index of the original word it was split from;
# subword pieces of one word share the same index and special tokens (<s>, </s>) map to None
word_ids = tokenized_input.word_ids()
word_ids

[None,
 0,
 1,
 1,
 2,
 3,
 4,
 4,
 4,
 5,
 5,
 6,
 7,
 8,
 8,
 9,
 9,
 9,
 9,
 10,
 10,
 10,
 11,
 11,
 None]

In [37]:
pd.DataFrame([tokens, word_ids], index=['Tokens', 'Word IDs'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


Now we mask the special start and end tokens, as well as every subword after the first one in each word, because we only want to label the first subword of each word.

In [38]:
previous_word_index = None
label_ids = [] # the list containing id of the labels from ( 0 - 6 and -100 for those special tokens)

In [39]:
# Rebuild the alignment from scratch and read the word-level tag ids straight
# from the example, so this cell gives the same result however often it is run
# (it used to append to an existing list and to read `labels` after having
# overwritten it with strings).
word_level_tags = de_example['ner_tags']
previous_word_index = None
label_ids = []

for word_idx in word_ids:
  # Special tokens (None) and continuation subwords get the ignore id -100;
  # only the first subword of each word carries that word's tag id
  if word_idx is None or word_idx == previous_word_index:
    label_ids.append(-100)
  else:
    label_ids.append(word_level_tags[word_idx])

  previous_word_index = word_idx

# Readable tag names, with "IGN" marking the masked positions
labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]
pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)
  

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label IDs,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
Labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


Why did we choose –100 as the ID to mask subword representations?
The reason is that PyTorch's cross-entropy loss class,
torch.nn.CrossEntropyLoss, has a parameter called ignore_index
whose default value is –100. Targets with this value are ignored when
the loss is computed, so we can use it to exclude the tokens associated
with consecutive subwords from training.


In [40]:
def tokenize_and_align_labels(examples):
  """Tokenize a batch of pre-split sentences and align the NER tags to tokens.

  `examples` is one batch in column form, e.g. for a batch of two:
      {'tokens': [[...], [...]], 'ner_tags': [[...], [...]], ...}

  Each sentence is tokenized with `xlmr_tokenizer` (truncation enabled). The
  first subword of every word receives that word's tag id; special tokens and
  the remaining subwords receive -100. The per-sentence label lists are stored
  under "labels" in the tokenizer output, which is returned.
  """
  encodings = xlmr_tokenizer(
    examples["tokens"], truncation=True, is_split_into_words=True)

  aligned = []
  # One pass per sentence of the batch
  for row, word_tags in enumerate(examples["ner_tags"]):
    row_labels = []
    last_word = None
    for current_word in encodings.word_ids(batch_index=row):
      # A token is labelled only when it opens a new word
      opens_word = current_word is not None and current_word != last_word
      row_labels.append(word_tags[current_word] if opens_word else -100)
      last_word = current_word
    aligned.append(row_labels)

  encodings["labels"] = aligned
  return encodings

In [41]:


def encode_panx_dataset(corpus):
  """Tokenize `corpus` in batches, align its labels, and drop the raw columns.

  The original 'langs', 'ner_tags' and 'tokens' columns are removed, leaving
  only what `tokenize_and_align_labels` produces.
  """
  raw_columns = ['langs', 'ner_tags', 'tokens']
  return corpus.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_columns,
  )

In [42]:
panx_de_encoded = encode_panx_dataset(panx_ch["de"])



Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

In [43]:
panx_de_encoded

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 12580
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6290
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6290
    })
})

In [44]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [45]:
from seqeval.metrics import classification_report


y_true = [["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"],
    ["B-PER", "I-PER", "O"]]
y_pred = [["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"],
    ["B-PER", "I-PER", "O"]]

    
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2



In [46]:
import numpy as np

def align_predictions(predictions, label_ids):
  """Turn raw model scores and label ids into per-example lists of tag names.

  Args:
    predictions: Scores indexed as [example, position, class]; the class with
      the highest score is taken as the prediction.
    label_ids: Label ids indexed as [example, position]; positions holding
      -100 are skipped in both outputs.

  Returns:
    `(preds_list, labels_list)`: two lists with one inner list of tag names
    per example, translated through `index2tag`.
  """
  best_class = np.argmax(predictions, axis=2)
  n_examples, n_positions = best_class.shape

  preds_list, labels_list = [], []
  for row in range(n_examples):
    # Positions that carry a real label (anything but the ignore id -100)
    kept = [pos for pos in range(n_positions) if label_ids[row, pos] != -100]
    labels_list.append([index2tag[label_ids[row][pos]] for pos in kept])
    preds_list.append([index2tag[best_class[row][pos]] for pos in kept])

  return preds_list, labels_list

## Fine Tuning the Model

In [47]:
from transformers import TrainingArguments
num_epochs = 3
batch_size = 24
logging_steps = len(panx_de_encoded["train"]) // batch_size
model_name = f"{xlmr_model_name}-finetuned-panx-de"
training_args = TrainingArguments(
  output_dir=model_name, log_level="error", num_train_epochs=num_epochs,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size, evaluation_strategy="epoch",
  save_steps=1e6, weight_decay=0.01, disable_tqdm=False,
  logging_steps=logging_steps, push_to_hub=False)

In [48]:
from seqeval.metrics import f1_score
def compute_metrics(eval_pred):
  """Return the seqeval F1 score for one set of evaluation predictions.

  `eval_pred` must expose `predictions` and `label_ids`; both are converted to
  lists of tag names by `align_predictions` before scoring.
  """
  predicted_tags, true_tags = align_predictions(
    eval_pred.predictions, eval_pred.label_ids)
  score = f1_score(true_tags, predicted_tags)
  return {"f1": score}

The final step is to define a data collator so we can pad each input sequence to the
largest sequence length in a batch. Transformers provides a dedicated data collator
for token classification that will pad the labels along with the inputs:

In [49]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer) # padding labels is necessary since labels are also sequences. The padded labels are marked with -100

In [50]:
def model_init():
  """Build a fresh token-classification model from the XLM-R checkpoint.

  Each call loads `xlmr_model_name` with `xlmr_config` and moves the new model
  to `device`.
  """
  fresh_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name, config=xlmr_config)
  return fresh_model.to(device)

In [51]:
from transformers import Trainer
trainer = Trainer(model_init=model_init, args=training_args,
          data_collator=data_collator, compute_metrics=compute_metrics,
          train_dataset=panx_de_encoded["train"],
          eval_dataset=panx_de_encoded["validation"],
          tokenizer=xlmr_tokenizer) 

In [52]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1
1,0.2583,0.159615,0.82311
2,0.1262,0.139502,0.846774
3,0.0824,0.133884,0.865335


TrainOutput(global_step=1575, training_loss=0.1555365372460986, metrics={'train_runtime': 492.0817, 'train_samples_per_second': 76.695, 'train_steps_per_second': 3.201, 'total_flos': 863012377186080.0, 'train_loss': 0.1555365372460986, 'epoch': 3.0})

In [53]:
text_de = "Jeff Dean ist ein Informatiker bei Google in Kalifornien"
tag_text(text_de, tags, trainer.model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
Tokens,<s>,▁Jeff,▁De,an,▁ist,▁ein,▁Informati,ker,▁bei,▁Google,▁in,▁Kaliforni,en,</s>
Tags,O,B-PER,I-PER,I-PER,O,O,O,O,O,B-ORG,O,B-LOC,I-LOC,O


## Error Analysis

In [1]:
from torch.nn.functional import cross_entropy

def forward_pass_with_label(batch):
  """Run the trained model on one batch and return per-token loss and prediction.

  Args:
    batch: One batch in column form (a dict of equally long lists), as passed
      by a batched `map`.

  Returns:
    A dict with
      "loss": array of shape [batch_size, padded_length] holding the
        cross-entropy of every token position (unreduced), and
      "predicted_label": array of the same shape holding the index of the
        class with the largest logit.
  """
  # Convert dict of lists to list of dicts suitable for data collator
  features = [dict(zip(batch, t)) for t in zip(*batch.values())]

  # Pad inputs and labels and put all tensors on device
  batch = data_collator(features)
  input_ids = batch["input_ids"].to(device)
  attention_mask = batch["attention_mask"].to(device)
  labels = batch["labels"].to(device)

  with torch.no_grad():
    # Pass data through model
    output = trainer.model(input_ids, attention_mask=attention_mask)
    # logits: [batch_size, sequence_length, classes]
    # Predict class with largest logit value on classes axis
    predicted_label = torch.argmax(output.logits, dim=-1).cpu().numpy()

    # Read the class count from the logits instead of hard-coding it
    num_classes = output.logits.size(-1)
    # Per-token loss after flattening batch and sequence dimensions with view
    loss = cross_entropy(output.logits.view(-1, num_classes),
                         labels.view(-1), reduction="none")

  # Unflatten batch dimension and convert to numpy array
  loss = loss.view(len(input_ids), -1).cpu().numpy()

  return {"loss": loss, "predicted_label": predicted_label}

In [2]:
valid_set = panx_de_encoded["validation"]
valid_set[0]

NameError: ignored

In [None]:
# Take the first 1000 validation rows as a Dataset. Slicing with [:1000] would
# return a plain dict of columns, which has no .map() method.
validation_full = panx_de_encoded["validation"]
valid_set = validation_full.select(range(min(1000, len(validation_full))))
# Attach the per-token loss and predicted label to every row
valid_set = valid_set.map(forward_pass_with_label, batched=True, batch_size=32)
df = valid_set.to_pandas()

In [None]:
# Give the ignore id -100 a readable name; the label lists contain -100, so the
# lookups below would fail on it otherwise.
# NOTE(review): this cell replaces id columns with strings in place, so it looks
# like it cannot be re-run without first recreating `df` — confirm.
index2tag[-100] = "IGN"
# Turn each row's input ids back into token strings
df["input_tokens"] = df["input_ids"].apply(
  lambda x: xlmr_tokenizer.convert_ids_to_tokens(x))
# Replace predicted class indices with tag names
df["predicted_label"] = df["predicted_label"].apply(
  lambda x: [index2tag[i] for i in x])
# Replace label ids with tag names ("IGN" for -100)
df["labels"] = df["labels"].apply(
  lambda x: [index2tag[i] for i in x])
# Cut loss and predictions down to the row's own number of input ids —
# presumably this drops the positions added by batch padding; verify
df['loss'] = df.apply(
  lambda x: x['loss'][:len(x['input_ids'])], axis=1)
df['predicted_label'] = df.apply(
  lambda x: x['predicted_label'][:len(x['input_ids'])], axis=1)
df.head(1)

In [None]:
# One row per token: explode every list-valued column, drop the ignored
# positions, and round the loss. Built as a single chain with .assign so no
# column is written onto a filtered frame (the pattern that triggers
# SettingWithCopyWarning).
df_tokens = (
  df.apply(pd.Series.explode)
  .query("labels != 'IGN'")
  .assign(loss=lambda frame: frame["loss"].astype(float).round(2))
)
df_tokens.head(7)