In [None]:
!pip install transformers

In [1]:
import sys, os
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from tqdm import tqdm
from pathlib import Path

# Put the parent of the current working directory on the import path (once),
# presumably so the project-local modules imported right after can be resolved.
nb_dir = os.path.abspath('')
BASE_DIR = os.path.dirname(nb_dir)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)
    
import preprocess_conll
import conll

2022-07-18 22:45:52.089801: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-18 22:45:52.089837: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# --- Filesystem layout -------------------------------------------------------
DATA_DIR = '/opt/data/sulamerica/leia-ia2-extractor'
LABELSTUDIO_DIR = os.path.join(DATA_DIR, 'labelstudio', 'export')
MODELS_DIR = os.path.join(DATA_DIR, 'models')
labelstudio_filename = 'project-22-at-2022-04-18-12-26-d603fa96.conll'

# --- Run configuration -------------------------------------------------------
# Single dictionary read by the preprocessing, dataset and training cells.
# A non-empty 'checkpoint' makes the training cells load and resume from it.
# Previously used checkpoint, kept for reference:
#   '/opt/data/sulamerica/leia-ia2-extractor/models/neuralmind/bert-base-portuguese-cased-finetuned-07132022_112252/checkpoint-8500'
params = dict(
    batch_size=8,
    max_seq_len=512,
    model_name='neuralmind/bert-base-portuguese-cased',
    overlap=10,
    learning_rate=1e-05,
    max_grad_norm=10,
    epochs=100,
    train_split_size=0.7,
    checkpoint='/opt/data/sulamerica/leia-ia2-extractor/models/neuralmind/bert-base-portuguese-cased-finetuned/checkpoint-500',
)

# Label Studio CoNLL export to train on, and the tokenizer of the base model.
conll_file_path = os.path.join(LABELSTUDIO_DIR, labelstudio_filename)
tokenizer = AutoTokenizer.from_pretrained(params['model_name'])

### Preprocess

In [3]:
def _preprocess_split(split_filename, output_dir):
    """Run `preprocess_conll.main` on one split file.

    The input is `split_filename` inside LABELSTUDIO_DIR; the output is written
    to `output_dir` under the name `<first dot-separated part>_preprocessed.txt`.

    Returns:
        (source_path, preprocessed_path) for the split.
    """
    source_path = os.path.join(LABELSTUDIO_DIR, split_filename)
    stem = split_filename.split('.')[0]
    preprocessed_path = os.path.join(output_dir, f"{stem}_preprocessed.txt")
    cli_args = [
        '--dataset', source_path,
        '--model_name_or_path', params['model_name'],
        '--max_len', str(params['max_seq_len']),
        '--overlap', str(params['overlap']),
        '--output_path', preprocessed_path,
    ]
    preprocess_conll.main(cli_args)
    return source_path, preprocessed_path


preprocessed_dir = os.path.join(DATA_DIR, 'preprocessed')

# Split the export into train/test files; the call returns the two file names.
trainset_filename, testset_filename = preprocess_conll.split_conll(
    conll_file_path, params['train_split_size'], LABELSTUDIO_DIR
)

# Preprocess the train split first, then the test split.
trainset_path, trainset_preprocessed_path = _preprocess_split(trainset_filename, preprocessed_dir)
testset_path, testset_preprocessed_path = _preprocess_split(testset_filename, preprocessed_dir)

### Defining datasets

In [4]:
def _load_split(preprocessed_path):
    """Build a ConllDataset for one preprocessed file with the shared params and tokenizer."""
    return conll.ConllDataset(params, preprocessed_path, tokenizer)


training_set = _load_split(trainset_preprocessed_path)
test_set = _load_split(testset_preprocessed_path)

In [5]:
# Shuffle only the training batches. The evaluation loader keeps a fixed order
# so that per-batch averaged metrics are reproducible from run to run.
training_loader = DataLoader(training_set, shuffle=True, batch_size=params['batch_size'])
test_loader = DataLoader(test_set, shuffle=False, batch_size=params['batch_size'])

# Every label present in the test split must also be present in the training
# split; fail early with the offending labels otherwise.
missing_labels = [lbl for lbl in test_set.unique_labels if lbl not in training_set.unique_labels]
assert len(missing_labels)==0, f"Missing labels in training data: {missing_labels}"

## Train and evaluate model


In [6]:
import functools

from datasets import load_metric


@functools.lru_cache(maxsize=None)
def _get_metric(metric_name):
    """Load the named `datasets` metric once and reuse it on later calls."""
    return load_metric(metric_name)


def compute_metrics(p, metric_name="seqeval"):
    """Compute overall precision, recall, F1 and accuracy for an evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` holds label ids, with ``-100``
            marking positions to ignore.
        metric_name: Name handed to ``datasets.load_metric``.

    Returns:
        Dict with keys ``precision``, ``recall``, ``f1`` and ``accuracy`` taken
        from the metric's ``overall_*`` results.

    Label ids are mapped to label strings through the notebook-level
    ``training_set.id2label``.
    """
    predictions, labels = p
    # Loading the metric is cached: this function runs on every evaluation pass.
    metric = _get_metric(metric_name)
    predictions = np.argmax(predictions, axis=2)
    id2label = training_set.id2label

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[pred_id] for (pred_id, label_id) in zip(pred_row, label_row) if label_id != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[label_id] for (pred_id, label_id) in zip(pred_row, label_row) if label_id != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [7]:
from datetime import datetime
from transformers import AutoConfig, AutoModel
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

if params.get('checkpoint', ''):
    # Resume path: tokenizer, config and weights all come from the checkpoint.
    # NOTE(review): `tokenizer` is rebound here, after the datasets above were
    # built with the base-model tokenizer - confirm the two are identical.
    tokenizer = AutoTokenizer.from_pretrained(params['checkpoint'], model_max_length=params['max_seq_len'])
    config = AutoConfig.from_pretrained(params['checkpoint'])
    model =  AutoModelForTokenClassification.from_pretrained(params['checkpoint'], config=config)
    # The run directory is the checkpoint's parent. Kept as a plain string so
    # `model_dir` has the same type in both branches (TrainingArguments
    # documents `output_dir` as a str).
    model_dir = str(Path(params['checkpoint']).parent)

else:
    # Fresh run: start from the base model with a head sized to the label set,
    # and write into a new timestamped directory under MODELS_DIR.
    model = AutoModelForTokenClassification.from_pretrained(params['model_name'], num_labels=len(training_set.unique_labels), id2label=training_set.id2label)
    model_dir = f"{MODELS_DIR}/{params['model_name']}-finetuned-{datetime.now().strftime('%m%d%Y_%H%M%S')}"
    
# Trainer hyper-parameters. Batch size and epoch count are read from `params`
# so the configuration cell stays the single place to change them.
train_args = TrainingArguments(
    model_dir,
    evaluation_strategy = "epoch",
    # NOTE(review): this differs from params['learning_rate'] (1e-05). It is
    # left unchanged because the intended value is unclear - confirm and unify.
    learning_rate=2e-5,
    per_device_train_batch_size=params['batch_size'],
    per_device_eval_batch_size=params['batch_size'],
    num_train_epochs=params['epochs'],
    weight_decay=0.01,
    logging_steps=1
)

# Collator for token-classification batches, built from the active tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Wire model, arguments, both splits and the metric function into one Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=training_set,
    eval_dataset=test_set,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [8]:
# Start from scratch when no checkpoint is configured; otherwise resume from it.
checkpoint_path = params.get('checkpoint', '')
if not checkpoint_path:
    trainer.train()
else:
    trainer.train(resume_from_checkpoint=checkpoint_path)

Loading model from /opt/data/sulamerica/leia-ia2-extractor/models/neuralmind/bert-base-portuguese-cased-finetuned-07132022_112252/checkpoint-8500).
***** Running training *****
  Num examples = 704
  Num Epochs = 100
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 8800
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 96
  Continuing training from global step 8500
  Will skip the first 96 epochs then the first 52 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/52 [00:00<?, ?it/s]

  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
97,6.9541,7.473933,0.000898,0.02775,0.001739,0.012117
98,3.2074,2.86284,0.000933,0.010314,0.001711,0.512747
99,0.8716,1.734667,0.003933,0.041012,0.007178,0.566573
100,1.1165,1.63258,0.004434,0.047642,0.008113,0.576776


***** Running Evaluation *****
  Num examples = 304
  Batch size = 8
***** Running Evaluation *****
  Num examples = 304
  Batch size = 8
***** Running Evaluation *****
  Num examples = 304
  Batch size = 8
***** Running Evaluation *****
  Num examples = 304
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




In [8]:
# Evaluate on the Trainer's eval_dataset (the test split given at construction);
# the returned metrics dict is shown as the cell output.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 304
  Batch size = 8
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


{'eval_loss': 2.848015069961548,
 'eval_precision': 0.7849954182689672,
 'eval_recall': 0.6620888739158294,
 'eval_f1': 0.7183226786686925,
 'eval_accuracy': 0.6452572471217105,
 'eval_runtime': 129.6061,
 'eval_samples_per_second': 2.346,
 'eval_steps_per_second': 0.293}

In [9]:
# Same evaluation call as the previous cell, re-run to display the metrics dict.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 304
  Batch size = 8


{'eval_loss': 1.6325796842575073,
 'eval_precision': 0.004433981669828354,
 'eval_recall': 0.04764243614931238,
 'eval_f1': 0.008112911657083114,
 'eval_accuracy': 0.5767758018092105,
 'eval_runtime': 85.4543,
 'eval_samples_per_second': 3.557,
 'eval_steps_per_second': 0.445,
 'epoch': 100.0}

In [12]:
# Persist the current model through the Trainer.
# NOTE(review): the target path is hardcoded, is not derived from `model_dir`
# or `params['checkpoint']`, and points at an existing checkpoint directory -
# confirm that overwriting that checkpoint is intended.
trainer.save_model('/opt/data/sulamerica/leia-ia2-extractor/models/neuralmind/bert-base-portuguese-cased-finetuned-07132022_112252/checkpoint-8500')

Saving model checkpoint to /opt/data/sulamerica/leia-ia2-extractor/models/neuralmind/bert-base-portuguese-cased-finetuned-07132022_112252/checkpoint-8500
Configuration saved in /opt/data/sulamerica/leia-ia2-extractor/models/neuralmind/bert-base-portuguese-cased-finetuned-07132022_112252/checkpoint-8500/config.json
Model weights saved in /opt/data/sulamerica/leia-ia2-extractor/models/neuralmind/bert-base-portuguese-cased-finetuned-07132022_112252/checkpoint-8500/pytorch_model.bin
tokenizer config file saved in /opt/data/sulamerica/leia-ia2-extractor/models/neuralmind/bert-base-portuguese-cased-finetuned-07132022_112252/checkpoint-8500/tokenizer_config.json
Special tokens file saved in /opt/data/sulamerica/leia-ia2-extractor/models/neuralmind/bert-base-portuguese-cased-finetuned-07132022_112252/checkpoint-8500/special_tokens_map.json


### Draft

In [8]:
# Quick sanity check: tokenize a short sentence and inspect the encoded fields.
tokenizer('testando meu tokenizador')

{'input_ids': [101, 29284, 10605, 67099, 18436, 18687, 107847, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [81]:
# Draft: build the token-classification model directly from the base model.
# The class import and `device` are defined here so this cell does not depend
# on names left over from earlier kernel state.
from transformers import BertForTokenClassification

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForTokenClassification.from_pretrained(params['model_name'], 
                                                   num_labels=len(training_set.unique_labels),
                                                   id2label=training_set.id2label,
                                                   label2id=training_set.label2id)
model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [84]:
# Adam over all model parameters. Note the two different `params`: the keyword
# is Adam's argument, while `params['learning_rate']` reads the config dict.
optimizer = torch.optim.Adam(params=model.parameters(), lr=params['learning_rate'])

In [86]:
# Training function: one pass over the training split to fine-tune the BERT model
def train(epoch):
    """Run one training epoch over the notebook-level ``training_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``device`` and ``params``.
    For every batch it computes the loss, back-propagates, clips the gradient
    norm to ``params['max_grad_norm']`` and steps the optimizer. It also tracks
    the running loss and a per-batch accuracy over the positions where the
    attention mask equals 1.

    Args:
        epoch: Index of the current epoch. Not used inside the function.

    Returns:
        None. The running loss (every 100 steps) and the epoch averages are
        printed; the model is switched to eval mode before returning.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy (bookkeeping only, so detach from autograd)
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.detach().view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)
        # use the attention mask to pick the positions that are compared
        # (this includes the [CLS] and [SEP] token predictions)
        # NOTE(review): if `targets` marks ignored positions with -100, those
        # positions count as mismatches here - confirm against ConllDataset.
        active_positions = mask.view(-1) == 1 # shape (batch_size * seq_len,)
        batch_targets = torch.masked_select(flattened_targets, active_positions)
        batch_predictions = torch.masked_select(flattened_predictions, active_positions)

        tr_accuracy += accuracy_score(batch_targets.cpu().numpy(), batch_predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping: must run after backward() and before step(), so
        # that it acts on the gradients of this batch
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=params['max_grad_norm']
        )

        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    model.eval()

In [None]:
# Call `train` once per configured epoch, with a tqdm progress bar over epochs.
for epoch in tqdm(range(params['epochs'])):
    train(epoch)

  0%|          | 0/10 [00:00<?, ?it/s]

Training loss per 100 training steps: 2.6322591304779053


 10%|█         | 1/10 [01:45<15:51, 105.69s/it]

Training loss epoch: 2.237698972225189
Training accuracy epoch: 0.42902028851752805
Training loss per 100 training steps: 1.8556643724441528


In [50]:
# Validation function: measures loss and accuracy of the model on a held-out loader
def validate(validation_loader):
    """Evaluate the notebook-level ``model`` on ``validation_loader``.

    Runs the model in eval mode with gradient tracking disabled, and averages
    the loss and a per-batch accuracy (over positions where the attention mask
    equals 1) across all batches.

    Args:
        validation_loader: Iterable of batches, each a mapping with the keys
            ``'ids'``, ``'mask'`` and ``'targets'`` holding tensors.

    Returns:
        None. The average loss and accuracy are printed.
    """
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    # put model in evaluation mode
    model.eval()

    # no gradients are needed for evaluation; skipping them saves memory
    with torch.no_grad():
        for batch in validation_loader:

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits
            eval_loss += loss.item()

            nb_eval_steps += 1

            # compute validation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1) # shape (batch_size * seq_len,)
            # use the attention mask to pick the positions that are compared
            # (this includes the [CLS] and [SEP] token predictions)
            # NOTE(review): if `targets` marks ignored positions with -100, those
            # positions count as mismatches here - confirm against ConllDataset.
            active_positions = mask.view(-1) == 1 # shape (batch_size * seq_len,)
            batch_targets = torch.masked_select(flattened_targets, active_positions)
            batch_predictions = torch.masked_select(flattened_predictions, active_positions)

            eval_accuracy += accuracy_score(batch_targets.cpu().numpy(), batch_predictions.cpu().numpy())

    epoch_loss = eval_loss / nb_eval_steps
    tr_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation loss: {epoch_loss}")
    print(f"Validation accuracy: {tr_accuracy}")

In [51]:
# Print the validation loss and accuracy of the current model on the test loader.
validate(test_loader)

tensor([[  101,   102,     0,  ...,     0,     0,     0],
        [  101, 15022, 13348,  ...,     0,     0,     0],
        [  101, 10794, 28304,  ...,     0,     0,     0],
        [  101, 74004,   117,  ...,     0,     0,     0],
        [  101, 14476,   148,  ..., 10810, 10183,   102],
        [  101, 43964, 15417,  ...,     0,     0,     0]]) tensor([[1, 1, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])
Validation loss: 2.932495355606079
Validation accuracy: 0.034592868547099524
