In [1]:
# Task name: selects the tag column read from the dataset (f"{task}_tags")
# and names the training output directory (f"test-{task}").
task = "ner"
# LaBSE checkpoint id on the Hugging Face Hub, used for both the tokenizer and
# the model. Alternative checkpoint id: "setu4993/LaBSE".
model_checkpoint = 'sentence-transformers/LaBSE'
# Per-device batch size, used for both training and evaluation.
batch_size = 16

### Loading the dataset

In [2]:
from datasets import load_dataset, load_metric

# Load the CoNLL-2003 dataset (downloaded once, then reused from the local
# Hugging Face cache). The "train" and "validation" splits are used below.
datasets = load_dataset("conll2003")


Reusing dataset conll2003 (C:\Users\Bernard\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6)


In [3]:
# Tag names declared by the train split's f"{task}_tags" feature, in id order:
# label_list[i] is the string name of integer tag i.
label_list = datasets["train"].features[f"{task}_tags"].feature.names

# Display the tag names.
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

### Processing the data

In [4]:
from transformers import AutoTokenizer
    
# Tokenizer matching model_checkpoint.
# NOTE(review): tokenize_and_align_labels calls .word_ids() on the encoding,
# which presumably requires a "fast" tokenizer — confirm that is what is loaded.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
# When True, every sub-token of a word receives that word's tag; when False,
# only the first sub-token is labelled and the remaining ones get -100.
label_all_tokens = True


def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and expand word-level tags to token level.

    Args:
        examples: a batch with a "tokens" entry (one list of words per
            example) and an f"{task}_tags" entry (one list of integer tags
            per example, aligned with the words).

    Returns:
        The tokenizer output for the batch (truncated to at most 400 tokens
        per example) with an added "labels" entry holding one label id per
        produced token. Positions whose word id is None get -100.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        max_length=400,
    )

    def _labels_for(batch_index, word_tags):
        # Walk the tokens of one example, mapping each back to its source word.
        aligned = []
        last_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            if word_idx is None:
                # No source word (special token): ignored by the loss.
                aligned.append(-100)
            elif word_idx != last_word or label_all_tokens:
                # First sub-token of a word, or a continuation sub-token while
                # label_all_tokens is enabled: reuse the word's tag.
                aligned.append(word_tags[word_idx])
            else:
                # Continuation sub-token with label_all_tokens disabled.
                aligned.append(-100)
            last_word = word_idx
        return aligned

    tokenized_inputs["labels"] = [
        _labels_for(i, word_tags)
        for i, word_tags in enumerate(examples[f"{task}_tags"])
    ]
    return tokenized_inputs

In [6]:
# Run tokenize_and_align_labels over the loaded dataset in batches
# (batched=True passes a batch of examples per call), producing the
# tokenized inputs and aligned "labels" used for training and evaluation.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

### Fine-tuning

In [14]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Load the pre-trained encoder and attach a token-classification head with one
# output per tag in label_list. The "weights not used / not initialized"
# warning is expected: the head is new and gets trained during fine-tuning.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    # Record the tag names in the model config so saved checkpoints and
    # downstream inference report e.g. "B-PER" instead of a generic "LABEL_1".
    id2label=dict(enumerate(label_list)),
    label2id={name: idx for idx, name in enumerate(label_list)},
)

Some weights of the model checkpoint at sentence-transformers/LaBSE were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialize

In [15]:
# Hyper-parameters for fine-tuning. Evaluation and checkpointing both happen
# once per epoch, so the best checkpoint (highest overall F1) can be tracked
# and restored at the end of training.
args = TrainingArguments(
    output_dir=f"test-{task}",
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy; the default saves every N steps, which current
    # transformers releases reject with a ValueError.
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics; higher is better.
    metric_for_best_model="f1",
    greater_is_better=True,
)

In [16]:
from transformers import DataCollatorForTokenClassification

# Collator that assembles tokenized examples into batches for the Trainer.
# NOTE(review): presumably pads inputs to the longest example in the batch and
# pads "labels" with -100 — confirm against the installed transformers version.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [17]:
# Sequence-labelling metric used by compute_metrics.
# NOTE(review): load_metric is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("seqeval")

In [18]:
import numpy as np

def compute_metrics(p):
    """Turn raw model outputs into seqeval scores.

    Args:
        p: a pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` holds
            the reference label ids, with -100 marking positions to ignore.

    Returns:
        A dict with overall precision, recall, F1 and accuracy, plus the
        F1, precision and recall of the LOC entity type.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions that carry a real label (drop the -100 ones).
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    # Per-entity breakdown for locations only.
    loc_scores = results['LOC']
    summary["LOC-f1"] = loc_scores["f1"]
    summary["LOC-precision"] = loc_scores["precision"]
    summary["LOC-recall"] = loc_scores["recall"]
    return summary

In [19]:
# Bundle the model, hyper-parameters, data splits, collator and metric
# function into a Trainer; the validation split is used for evaluation.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [20]:
# Add early stopping to trainer

from transformers import EarlyStoppingCallback

# Register early stopping with a patience of 3.
# NOTE(review): presumably this stops training once the metric named by
# metric_for_best_model ("f1" in args) fails to improve for 3 consecutive
# evaluations, and relies on load_best_model_at_end=True — confirm in the docs.
trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

In [21]:
# Start fine-tuning.
# NOTE(review): the recorded run of this cell ended in
# `RuntimeError: CUDA out of memory` on an 8 GiB GPU (see the output below).
# The failed 1.44 GiB allocation presumably relates to LaBSE's large embedding
# matrix or its optimizer state — TODO confirm before only lowering batch_size.
trainer.train()

RuntimeError: CUDA out of memory. Tried to allocate 1.44 GiB (GPU 0; 8.00 GiB total capacity; 4.95 GiB already allocated; 1.13 GiB free; 5.06 GiB reserved in total by PyTorch)

In [1]:
import torch
from transformers import BertModel, BertTokenizerFast


# Load the LaBSE tokenizer and encoder and put the encoder in evaluation mode.
# Note: this rebinds the names `tokenizer` and `model`.
tokenizer = BertTokenizerFast.from_pretrained("setu4993/LaBSE")
model = BertModel.from_pretrained("setu4993/LaBSE").eval()

# Three English inputs of increasing length: a word, a short sentence and a
# longer sentence.
english_sentences = [
    "dog",
    "Puppies are nice.",
    "I enjoy taking long walks along the beach with my dog.",
]

# Tokenize as one padded batch of PyTorch tensors.
english_inputs = tokenizer(
    english_sentences,
    return_tensors="pt",
    padding=True,
)

# Forward pass only; gradient tracking is not needed.
with torch.no_grad():
    english_outputs = model(**english_inputs)

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

In [4]:
# Number of entries in the model output (2 in the recorded run).
len(english_outputs)

2

In [5]:
# First output entry: a 3-D tensor in the recorded run.
# NOTE(review): presumably the per-token hidden states (sentence, token, dim) — confirm.
english_outputs[0]

tensor([[[-0.0891, -0.3681,  0.0780,  ..., -0.6703, -1.0931, -0.8783],
         [ 0.4157, -0.4741,  0.8040,  ..., -0.2907, -1.2842, -1.3074],
         [-0.0891, -0.3681,  0.0780,  ..., -0.6703, -1.0931, -0.8783],
         ...,
         [ 0.2710, -0.1676,  0.5026,  ..., -0.0687, -0.8197, -0.9091],
         [ 0.2820, -0.1816,  0.5116,  ...,  0.0051, -0.8328, -0.9173],
         [ 0.2692, -0.1470,  0.5313,  ..., -0.0312, -0.8280, -0.9146]],

        [[ 0.4134,  0.8301,  0.2401,  ..., -0.4857, -0.8877, -0.1453],
         [ 1.0111, -0.0830,  1.0415,  ...,  0.1864, -1.3976, -0.7726],
         [ 0.9079,  0.1531,  0.8516,  ...,  0.1026, -1.0914, -0.3634],
         ...,
         [ 1.0021,  0.2776,  0.5574,  ..., -0.0480, -0.8325, -0.2678],
         [ 0.9893,  0.2820,  0.5472,  ...,  0.0370, -0.8336, -0.2561],
         [ 0.9865,  0.3638,  0.5971,  ..., -0.0057, -0.8459, -0.2536]],

        [[-0.0802,  0.6718,  0.5903,  ..., -1.2023, -1.1710,  0.2188],
         [ 0.5892, -0.4631,  0.4525,  ..., -0

In [11]:
# Second output entry, indexed once per input sentence; each item is a 1-D
# tensor in the recorded run.
# NOTE(review): presumably the pooled sentence-level vector for each input — confirm.
english_outputs[1][0], english_outputs[1][1], english_outputs[1][2]

(tensor([-1.3473e-01, -2.1608e-02, -1.9249e-01, -4.0513e-01, -2.3961e-01,
          2.7830e-01,  5.6244e-01,  3.7427e-02, -8.3857e-01, -7.1555e-01,
         -6.4725e-02,  4.9802e-01,  7.6643e-01, -4.4483e-01, -2.5206e-01,
         -9.1947e-01,  7.0459e-01, -5.7049e-01,  5.4599e-01,  3.7562e-01,
         -1.7535e-01, -7.2055e-01,  3.4746e-01, -6.9417e-01, -6.6054e-01,
         -1.3161e-01,  3.5984e-01,  1.3651e-01, -1.7165e-01, -2.2598e-01,
         -3.8189e-01,  3.7355e-01, -5.1195e-01, -7.3582e-01, -6.6230e-01,
         -2.2338e-01, -5.1664e-01,  3.2047e-01,  4.7127e-01, -8.6406e-01,
          5.4163e-01, -4.6512e-01, -1.0731e-01,  1.8689e-02,  6.0123e-01,
          5.7982e-01,  1.3521e-01,  1.2690e-01, -4.4128e-01, -4.7346e-01,
         -3.9215e-01, -7.7843e-01, -8.2898e-01, -8.2672e-01, -8.5188e-01,
          7.8737e-01, -9.6975e-01, -3.5543e-01, -4.7266e-01, -8.7708e-01,
         -5.5298e-01, -7.2575e-01, -2.5778e-01,  1.1772e-01,  6.2019e-01,
          6.4406e-01, -6.7005e-01, -7.