In [1]:
import csv
import numpy as np
import torch
import transformers

In [2]:
model_name = "distilbert/distilbert-base-cased"

## MultiNERD data

Ce dataset est un text avec des catégories assez fines (dont nom de personne).<br>
Il est disponible [sur ce lien](https://github.com/Babelscape/multinerd)

In [3]:
with open("../data/raw/train_en.tsv") as f:
    rows = list(line.strip().split("\t") for line in f)

rows[:10]

[['0', 'The', 'O'],
 ['1', 'type', 'O'],
 ['2', 'locality', 'O'],
 ['3', 'is', 'O'],
 ['4',
  'Kīlauea',
  'B-LOC',
  'bn:02858748n',
  'Q188698',
  '350666',
  'Kīlauea',
  'Kīlauea is an active shield volcano in the Hawaiian Islands.',
  'https://upload.wikimedia.org/wikipedia/commons/b/b8/Puu_Oo_looking_up_Kilauea_-_edit.jpg'],
 ['5', '.', 'O'],
 [''],
 ['0', 'Common', 'O'],
 ['1', 'components', 'O'],
 ['2', 'of', 'O']]

In [4]:
def make_labelled_sentences(tagged_words):
    # Joining words until we meet a dot
    # Word's label is 1 if 'PER' is in its tag
    X = []
    y = []

    this_word = []
    this_labels = []
    for tagged_word in tagged_words:
        if len(tagged_word) < 3:
            # not a tagged word
            continue
        word = tagged_word[1]
        tag = tagged_word[2]

        if word == '.':
            X.append(this_word)
            y.append(this_labels)

            this_word = []
            this_labels = []
        else:
            this_word.append(word)
            this_labels.append(1 * tag.endswith("PER"))

    return X, y

In [5]:
sentences, labels = make_labelled_sentences(rows[:100_000])

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
sentences_training, sentences_test, labels_training, labels_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [8]:
sentences_train, sentences_dev, labels_train, labels_dev = train_test_split(
    sentences_training,
    labels_training,
    test_size=0.2,
    random_state=42,
)

# Applying Hugging face

In [96]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [10]:
def tokenize_and_align_labels(sentences, ner_tags):
    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(ner_tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [11]:
tokenized_train = tokenize_and_align_labels(sentences_train, labels_train)

In [12]:
tokenized_test = tokenize_and_align_labels(sentences_test, labels_test)

In [13]:
from datasets import Dataset

dataset_train = Dataset.from_dict(tokenized_train)
dataset_test = Dataset.from_dict(tokenized_test)

In [14]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

2024-11-06 17:27:38.100646: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 17:27:38.100720: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 17:27:38.107677: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 17:27:38.716581: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

labels = [0, 1]
label_list = ["0", "1"]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Experiments
## V1: learning only last layer

In [16]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=2
)
model = model.to("cuda")


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
for name, _ in model.base_model.named_parameters():
  print(name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
transformer.layer.0.attention.q_lin.weight
transformer.layer.0.attention.q_lin.bias
transformer.layer.0.attention.k_lin.weight
transformer.layer.0.attention.k_lin.bias
transformer.layer.0.attention.v_lin.weight
transformer.layer.0.attention.v_lin.bias
transformer.layer.0.attention.out_lin.weight
transformer.layer.0.attention.out_lin.bias
transformer.layer.0.sa_layer_norm.weight
transformer.layer.0.sa_layer_norm.bias
transformer.layer.0.ffn.lin1.weight
transformer.layer.0.ffn.lin1.bias
transformer.layer.0.ffn.lin2.weight
transformer.layer.0.ffn.lin2.bias
transformer.layer.0.output_layer_norm.weight
transformer.layer.0.output_layer_norm.bias
transformer.layer.1.attention.q_lin.weight
transformer.layer.1.attention.q_lin.bias
transformer.layer.1.attention.k_lin.weight
transformer.layer.1.attention.k_lin.bias
transformer.layer.1.attention.v_lin.weight
transformer.lay

In [18]:
for name, param in model.base_model.named_parameters():
  param.requires_grad = False

for name, param in model.base_model.named_parameters():
    if (
        any(layer_name in name for layer_name in ["layer.5"])
        and any(layer_type in name for layer_type in ["weight", "bias"])
        and "ffn.lin" in name
    ):
        param.requires_grad = True

In [19]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.051069,0.0,0.0,0.0,0.985967


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.051069,0.0,0.0,0.0,0.985967
2,No log,0.024629,0.0,0.0,0.0,0.993011


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.051069,0.0,0.0,0.0,0.985967
2,No log,0.024629,0.0,0.0,0.0,0.993011
3,0.077000,0.017471,0.0,0.0,0.0,0.995192


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.051069,0.0,0.0,0.0,0.985967
2,No log,0.024629,0.0,0.0,0.0,0.993011
3,0.077000,0.017471,0.0,0.0,0.0,0.995192
4,0.077000,0.014388,0.0,0.0,0.0,0.995919


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.051069,0.0,0.0,0.0,0.985967
2,No log,0.024629,0.0,0.0,0.0,0.993011
3,0.077000,0.017471,0.0,0.0,0.0,0.995192
4,0.077000,0.014388,0.0,0.0,0.0,0.995919
5,0.077000,0.012754,0.0,0.0,0.0,0.996086


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.051069,0.0,0.0,0.0,0.985967
2,No log,0.024629,0.0,0.0,0.0,0.993011
3,0.077000,0.017471,0.0,0.0,0.0,0.995192
4,0.077000,0.014388,0.0,0.0,0.0,0.995919
5,0.077000,0.012754,0.0,0.0,0.0,0.996086
6,0.015000,0.011836,0.0,0.0,0.0,0.996254


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.051069,0.0,0.0,0.0,0.985967
2,No log,0.024629,0.0,0.0,0.0,0.993011
3,0.077000,0.017471,0.0,0.0,0.0,0.995192
4,0.077000,0.014388,0.0,0.0,0.0,0.995919
5,0.077000,0.012754,0.0,0.0,0.0,0.996086
6,0.015000,0.011836,0.0,0.0,0.0,0.996254
7,0.015000,0.011276,0.0,0.0,0.0,0.99631


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.051069,0.0,0.0,0.0,0.985967
2,No log,0.024629,0.0,0.0,0.0,0.993011
3,0.077000,0.017471,0.0,0.0,0.0,0.995192
4,0.077000,0.014388,0.0,0.0,0.0,0.995919
5,0.077000,0.012754,0.0,0.0,0.0,0.996086
6,0.015000,0.011836,0.0,0.0,0.0,0.996254
7,0.015000,0.011276,0.0,0.0,0.0,0.99631
8,0.015000,0.010828,0.0,0.0,0.0,0.996254


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.051069,0.0,0.0,0.0,0.985967
2,No log,0.024629,0.0,0.0,0.0,0.993011
3,0.077000,0.017471,0.0,0.0,0.0,0.995192
4,0.077000,0.014388,0.0,0.0,0.0,0.995919
5,0.077000,0.012754,0.0,0.0,0.0,0.996086
6,0.015000,0.011836,0.0,0.0,0.0,0.996254
7,0.015000,0.011276,0.0,0.0,0.0,0.99631
8,0.015000,0.010828,0.0,0.0,0.0,0.996254
9,0.010800,0.010652,0.0,0.0,0.0,0.996254


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.051069,0.0,0.0,0.0,0.985967
2,No log,0.024629,0.0,0.0,0.0,0.993011
3,0.077000,0.017471,0.0,0.0,0.0,0.995192
4,0.077000,0.014388,0.0,0.0,0.0,0.995919
5,0.077000,0.012754,0.0,0.0,0.0,0.996086
6,0.015000,0.011836,0.0,0.0,0.0,0.996254
7,0.015000,0.011276,0.0,0.0,0.0,0.99631
8,0.015000,0.010828,0.0,0.0,0.0,0.996254
9,0.010800,0.010652,0.0,0.0,0.0,0.996254
10,0.010800,0.010594,0.0,0.0,0.0,0.996422


TrainOutput(global_step=1790, training_loss=0.03040976324560922, metrics={'train_runtime': 151.5114, 'train_samples_per_second': 188.501, 'train_steps_per_second': 11.814, 'total_flos': 416848715652096.0, 'train_loss': 0.03040976324560922, 'epoch': 10.0})

# testing

In [60]:
list(enumerate(sentences_train[:5]))

[(0,
  ['The',
   '347th',
   'TFW',
   'was',
   'stationed',
   'at',
   'Korat',
   'Royal',
   'Thai',
   'Air',
   'Force',
   'Base',
   'from',
   '12',
   'July',
   '1974',
   'until',
   '30',
   'June',
   '1975']),
 (1,
  ['In',
   'his',
   'second',
   'start',
   ',',
   'he',
   'threw',
   'four',
   'touchdown',
   'passes',
   '(',
   'all',
   'to',
   'Irving',
   'Fryar',
   ')',
   'against',
   'the',
   'Miami',
   'Dolphins']),
 (2,
  ['At',
   'the',
   'same',
   'time',
   ',',
   'Filippo',
   'Inzaghi',
   'and',
   'Andriy',
   'Shevchenko',
   'proved',
   'to',
   'be',
   'dominant',
   'and',
   'dynamic',
   'strikers',
   ',',
   'who',
   'were',
   'prolific',
   'in',
   'front',
   'of',
   'goal']),
 (3,
  ['"',
   'Scaptius',
   'ignivena',
   '"',
   'is',
   'a',
   'moth',
   'in',
   'the',
   'family',
   'Erebidae']),
 (4,
  ['It',
   'houses',
   'a',
   'group',
   'of',
   'infection',
   'and',
   'immunology',
   'experts',
   ',',

In [94]:
def test_model(text_list, model, tokenizer):
    inputs = tokenizer(
        text_list,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding=True
    ).to(model.device)
    
    outputs = model(**inputs).logits
    print(outputs)
    predictions = torch.argmax(outputs, dim=-1)[0].cpu().numpy()
    return list(predictions[1:-1])


In [95]:
from transformers import pipeline

text = "the main story was written by Charles".split(' ')
#text = sentences[5]
print(text)

print(test_model(text, model, tokenizer))

['the', 'main', 'story', 'was', 'written', 'by', 'Charles']
tensor([[[ 1.9971, -2.3180],
         [ 3.8959, -3.9072],
         [ 4.1080, -3.9862],
         [ 3.5941, -3.7546],
         [ 4.5217, -5.0845],
         [ 3.7117, -3.8085],
         [ 2.7014, -2.5735],
         [-2.7005,  2.3887],
         [ 0.5380, -0.9269]]], device='cuda:0', grad_fn=<ViewBackward0>)
[0, 0, 0, 0, 0, 0, 1]


{'input_ids': tensor([[ 101, 7504, 1148, 5650,  102]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1]], device='cuda:0')}
tensor([[[ 2.3783, -2.8245],
         [ 2.1961, -2.5802],
         [ 2.8877, -3.1586],
         [ 2.9470, -3.5176],
         [ 3.1414, -3.2952]]], device='cuda:0')


IndexError: invalid index of a 0-dim tensor. Use `tensor.item()` in Python or `tensor.item<T>()` in C++ to convert a 0-dim tensor to a number

TypeError: list indices must be integers or slices, not tuple