<a href="https://colab.research.google.com/github/Andrea4-sr/mlnlp_ex5/blob/andrea2/ex05_ner_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
! pip3 install datasets
! pip3 install transformers
! pip3 install sklearn

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, BertForTokenClassification, AdamW, TrainingArguments, Trainer
from random import shuffle
import torch
from sklearn.metrics import f1_score

In [3]:
# Function to generate the desired dataset with the preferred parameters
def preprare_dataset(amount=6000, dataset_name='polyglot_ner', language='nl'):
  split_ = f"train[:{amount}]"
  dataset = load_dataset(dataset_name, language, split=split_)
  dataset = [d for d in dataset]
  return dataset

In [4]:
d = preprare_dataset()

Downloading builder script:   0%|          | 0.00/6.01k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/86.1k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading and preparing dataset polyglot_ner/nl to /root/.cache/huggingface/datasets/polyglot_ner/nl/1.0.0/bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1...


Downloading data:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/520664 [00:00<?, ? examples/s]

Dataset polyglot_ner downloaded and prepared to /root/.cache/huggingface/datasets/polyglot_ner/nl/1.0.0/bb2e45c90cd345c87dfd757c8e2b808b78b0094543b511ac49bc0129699609c1. Subsequent calls will reuse this data.


In [5]:
def dataset_generator(dataset, splits: list):
  shuffle(dataset)  # randomize the order of the examples
  for amount in splits:
    dataset_split = dataset[:amount]
    dataset = dataset[amount:]
    yield dataset_split
  return

In [6]:
# generate and store the actual datasets

train3000, train1000, eval2000 = dataset_generator(d, [3000, 1000, 2000])

In [7]:
# just to check that all datasets have the desired length

len(train3000), len(train1000), len(eval2000)

(3000, 1000, 2000)

In [8]:
# initialize BERT tokenizer and model for Dutch (BERTje)

tokenizer_nl = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
model_nl = BertForTokenClassification.from_pretrained("GroNLP/bert-base-dutch-cased", num_labels=4)

Downloading:   0%|          | 0.00/254 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/437M [00:00<?, ?B/s]

Some weights of the model checkpoint at GroNLP/bert-base-dutch-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased

In [12]:
# todo: real data set!
tryout200 = train3000[:200]

# encode the dataset using the tokenizer

In [9]:
# function to add numerical labels manually (instead of strings!)

def manual_labels(label2ix={"O": 0, "LOC": 1, "PER": 2, "ORG": 3}, labels=[], padding=0):
  # note: label2ix is specific to the dataset we are working with! but it can be swapped with a different dictionary
  if labels:
    new_labels = [label2ix[label] for label in labels]
    if len(new_labels) < padding:  # check if padding is even needed
      missing = [0 for i in range(padding-len(new_labels))]
      return new_labels + missing
  else:
    # might be a useful feature to have
    return label2ix 

In [85]:
def encode_dataset(dataset, train_split=0.8):
  encoded_dataset = [tokenizer_nl(" ".join(item['words']), return_tensors="pt", padding='max_length', truncation=True, max_length=130) for item in dataset]

  # add numerical + padded lables to the encoded dataset
  for enc_item, item in zip(encoded_dataset, dataset):
    padding = len(enc_item['attention_mask'][0])  # use the length of the attention mask as a reference for how big the padding should be
    enc_item['labels'] = torch.LongTensor([manual_labels(labels=item['ner'], padding=padding)])

  # from notebook
  # we don't need the batch dimension when using the trainer
  # because the trainer does batching for us 
  for item in encoded_dataset:
      for key in item:
          item[key] = torch.squeeze(item[key])

  index = round(len(encoded_dataset)*train_split)

  train_set = encoded_dataset[:index]
  test_set = encoded_dataset[index:]

  return train_set, test_set



In [89]:
trymaxdata = train3000[:1842]

In [90]:
#encoded3000_train, encoded3000_test = encode_dataset(train3000, 0.73)
#encoded200_train, encoded200_test = encode_dataset(tryout200, 0.73) 
encodedmaxdata_train, encodedmaxdata_test = encode_dataset(trymaxdata, 0.73) 

In [91]:
# have a small look at the shapes of one example -> make sure that they're all the same!

for k, v in encodedmaxdata_test[0].items():
  print(f'{k}: shape is {v.shape}')

input_ids: shape is torch.Size([130])
token_type_ids: shape is torch.Size([130])
attention_mask: shape is torch.Size([130])
labels: shape is torch.Size([130])


In [87]:

for k, v in encoded200_test[0].items():
  print(f'{k}: shape is {v.shape}')

input_ids: shape is torch.Size([130])
token_type_ids: shape is torch.Size([130])
attention_mask: shape is torch.Size([130])
labels: shape is torch.Size([130])


In [None]:
encoded200_train[5]

In [92]:
def train_model(model_name, dataset, num_epochs, train_batch_size, eval_batch_size, out_dir='results', logging_dir='logs', no_cuda=False, requires_grad=True):

    training_args = TrainingArguments(
        num_train_epochs=num_epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        output_dir=out_dir,
        logging_dir=logging_dir,
        no_cuda=no_cuda
    )

    trainer = Trainer(
        model=model_nl,
        tokenizer=tokenizer_nl,
        args=training_args,
        train_dataset=dataset,
    )
    
    if not requires_grad:
      for param in model_nl.base_model.parameters():
        param.requires_grad = False

    trainer.train()
    
    return model_name, trainer
    

In [88]:
modeltryout = train_model(model_name="Model Tryout",  # takes about 3 mins
                          dataset=encoded200_train,    # todo: change dataset
                          num_epochs=1,
                          train_batch_size=4, 
                          eval_batch_size=4,
                          requires_grad=False
                         )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 146
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 37
  Number of trainable parameters = 3076


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




In [93]:
modelmaxdata = train_model(model_name="Model Maximum Data",  # takes about 3 mins
                          dataset=encodedmaxdata_train,    # todo: change dataset
                          num_epochs=1,
                          train_batch_size=4, 
                          eval_batch_size=4,
                          requires_grad=False
                          )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 1345
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 337
  Number of trainable parameters = 3076


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




In [None]:
model1000 = train_model(model_name="Model1000", 
                          dataset=train1000, 
                          train_batch_size=4, 
                          eval_batch_size=4
                         )

In [72]:
model3000 = train_model(model_name="Model3000", 
                          dataset=train3000, 
                          num_epochs=1,
                          train_batch_size=4, 
                          eval_batch_size=4
                         )

***** Running training *****
  Num examples = 3000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 750
  Number of trainable parameters = 108549892
The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: words, id, lang, ner. If words, id, lang, ner are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: ignored

In [None]:
model3000frozen = train_model(model_name="Model3000frozen",   # todo: frozen embeddings!
                          dataset=train3000, 
                          train_batch_size=4, 
                          eval_batch_size=4,
                          requires_grad=False   # is that right?
                         )

In [1]:
def get_scores(trainer, test_set=eval2000):
    
    preds = trainer[1].predict(test_set)
    
    total_mac = 0
    total_mic = 0
    count = 0

    for i in zip(preds.label_ids, preds.predictions.argmax(-1)):
      f1_mac = f1_score(i[0], i[1], average='macro')
      f1_mic = f1_score(i[0], i[1], average='micro')
      total_mac += f1_mac
      total_mic += f1_mic
      count += 1

    return {'macro': total_mac/count, 'micro': total_mic/count}

NameError: ignored

In [None]:
scorestryout = get_scores(modeltryout, encoded200_test)

***** Running Prediction *****
  Num examples = 146
  Batch size = 4


In [None]:
scores1000 = get_scores(model1000, eval2000)
scores3000 = get_scores(model3000, eval2000)
scores3000frozen = get_scores(model3000frozen, eval2000)

In [None]:
def pretty_print(model, scores):
  print()
  print(f'*** Model: {model[0]} ***\n')
  print('-'*len(f'*** Model: {model[0]} ***'))
  print("F1 Scores")
  print('-'*len(f'*** Model: {model[0]} ***'))
  print(f'F1-Macro\t{scores["macro"]:.4f}')
  print(f'F1-Micro\t{scores["micro"]:.4f}')
  print()

In [None]:
pretty_print(modeltryout, scorestryout)


*** Model: Model Tryout ***

---------------------------
F1 Scores
---------------------------
F1-Macro	0.8026
F1-Micro	0.9910



In [None]:
pretty_print(model1000, scores1000)
pretty_print(model3000, scores3000)
pretty_print(model3000frozen, scores3000frozen)