In [73]:
import csv
import torch
import evaluate
import accelerate
import transformers
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

Use camembert model for french dataset

In [74]:
model_name = "almanach/camembert-base"

## MultiNERD data

Ce dataset est un text avec des catégories assez fines (dont nom de personne).<br>
Il est disponible [sur ce lien](https://github.com/Babelscape/multinerd)

In [75]:
with open("../data/raw/train_multinerd_fr.tsv") as f:
    rows = list(line.strip().split("\t") for line in f)

rows[:10]

 16%|█▌        | 253/1600 [04:49<25:38,  1.14s/it]


[['0', 'Il', 'O'],
 ['1', 'est', 'O'],
 ['2', 'incarné', 'O'],
 ['3', 'par', 'O'],
 ['4',
  'Austin',
  'B-PER',
  'bn:02525192n',
  'Q4204710',
  '7345300',
  'Austin_Stowell',
  'Austin Stowell est un acteur américain né le 24 décembre 1984 à Kensington dans le Connecticut.',
  'https://upload.wikimedia.org/wikipedia/commons/9/95/Austin_Stowell-DolphinTale.jpg'],
 ['5', 'Stowell', 'I-PER'],
 ['6', '.', 'O'],
 [''],
 ['0', 'c’', 'O'],
 ['1', 'est', 'O']]

In [76]:
def make_labelled_sentences(tagged_words):
    # Joining words until we meet a dot
    # Word's label is 1 if 'PER' is in its tag
    X = []
    y = []

    this_word = []
    this_labels = []
    for tagged_word in tagged_words:
        if len(tagged_word) < 3:
            # not a tagged word
            continue
        word = tagged_word[1]
        tag = tagged_word[2]

        if word == '.':
            X.append(this_word)
            y.append(this_labels)

            this_word = []
            this_labels = []
        else:
            this_word.append(word)
            this_labels.append(1 * tag.endswith("PER"))

    return X, y

In [77]:
sentences, labels = make_labelled_sentences(rows[:100_000])

In [78]:
sentences_training, sentences_test, labels_training, labels_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [79]:
sentences_train, sentences_dev, labels_train, labels_dev = train_test_split(
    sentences_training,
    labels_training,
    test_size=0.2,
    random_state=42,
)

# Applying Hugging face

In [80]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

In [81]:
def tokenize_and_align_labels(sentences, ner_tags):
    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(ner_tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

In [82]:
tokenized_train = tokenize_and_align_labels(sentences_train, labels_train)

In [83]:
tokenized_test = tokenize_and_align_labels(sentences_test, labels_test)

In [84]:
dataset_train = Dataset.from_dict(tokenized_train)
dataset_test = Dataset.from_dict(tokenized_test)

In [85]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [86]:
seqeval = evaluate.load("seqeval")

labels = [0, 1]
label_list = ["0", "1"]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Experiments
## V1: learning only last layer

In [87]:
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=2
)

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at almanach/camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [88]:
for name, _ in model.base_model.named_parameters():
  print(name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

Passage sur le layer 11 pour figer le premier layer
Utilisation de intermediate.dense et output.dense pour récupérer les bons poids et biais

In [89]:
for name, param in model.base_model.named_parameters():
  param.requires_grad = False

for name, param in model.base_model.named_parameters():
    if (
        any(layer_name in name for layer_name in ["layer.11"])
        and any(layer_type in name for layer_type in ["weight", "bias"])
        and ("intermediate.dense" in name
          or "output.dense" in name)
    ):
        param.requires_grad = True

In [90]:
training_args = TrainingArguments(
    output_dir="my_awesome_wnut_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
 10%|▉         | 159/1600 [00:38<06:17,  3.82it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)

[A                                               

                                                  
 10%|█         | 160/1600 [00:48<06:17,  3.82it/s]
[A

{'eval_loss': 0.1878690868616104, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9527625922190966, 'eval_runtime': 10.0097, 'eval_samples_per_second': 79.623, 'eval_steps_per_second': 4.995, 'epoch': 1.0}


 20%|██        | 320/1600 [01:26<04:07,  5.17it/s]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)

[A                                               

                                                  
 20%|██        | 320/1600 [01:36<04:07,  5.17it/s]
[A

{'eval_loss': 0.12461039423942566, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9528687436972559, 'eval_runtime': 9.7091, 'eval_samples_per_second': 82.088, 'eval_steps_per_second': 5.15, 'epoch': 2.0}


 30%|███       | 480/1600 [02:13<04:01,  4.64it/s]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)

[A                                               

                                                  
 30%|███       | 480/1600 [02:23<04:01,  4.64it/s]
[A

{'eval_loss': 0.06502299755811691, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9928878509633247, 'eval_runtime': 9.8383, 'eval_samples_per_second': 81.01, 'eval_steps_per_second': 5.082, 'epoch': 3.0}


 31%|███▏      | 500/1600 [02:28<04:29,  4.09it/s]  
 31%|███▏      | 501/1600 [02:29<04:06,  4.46it/s]

{'loss': 0.1856, 'grad_norm': 0.21932148933410645, 'learning_rate': 1.375e-05, 'epoch': 3.12}


 40%|███▉      | 639/1600 [03:01<03:32,  4.53it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)

[A                                               

                                                  
 40%|████      | 640/1600 [03:11<03:32,  4.53it/s]
[A

{'eval_loss': 0.04388636723160744, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.995063956265591, 'eval_runtime': 9.8137, 'eval_samples_per_second': 81.213, 'eval_steps_per_second': 5.095, 'epoch': 4.0}


 50%|█████     | 800/1600 [03:50<02:50,  4.69it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)

[A                                               

                                                  
 50%|█████     | 800/1600 [03:59<02:50,  4.69it/s]
[A

{'eval_loss': 0.034293241798877716, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9959662438299454, 'eval_runtime': 9.8538, 'eval_samples_per_second': 80.883, 'eval_steps_per_second': 5.074, 'epoch': 5.0}


 60%|██████    | 960/1600 [04:37<02:19,  4.59it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)

[A                                               

                                                  
 60%|██████    | 960/1600 [04:47<02:19,  4.59it/s]
[A

{'eval_loss': 0.0289154052734375, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9962316225253437, 'eval_runtime': 9.7954, 'eval_samples_per_second': 81.364, 'eval_steps_per_second': 5.104, 'epoch': 6.0}


 62%|██████▎   | 1000/1600 [04:57<02:59,  3.33it/s]
 63%|██████▎   | 1001/1600 [04:57<02:33,  3.90it/s]

{'loss': 0.0463, 'grad_norm': 0.13308116793632507, 'learning_rate': 7.500000000000001e-06, 'epoch': 6.25}


 70%|███████   | 1120/1600 [05:25<01:48,  4.44it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)

[A                                               

                                                   
 70%|███████   | 1120/1600 [05:35<01:48,  4.44it/s]
[A

{'eval_loss': 0.025851935148239136, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.996497001220742, 'eval_runtime': 9.8047, 'eval_samples_per_second': 81.287, 'eval_steps_per_second': 5.1, 'epoch': 7.0}


 80%|███████▉  | 1279/1600 [06:12<01:02,  5.14it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)

[A                                               

                                                   
 80%|████████  | 1280/1600 [06:22<01:02,  5.14it/s]
[A

{'eval_loss': 0.02405460551381111, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9965500769598217, 'eval_runtime': 9.7366, 'eval_samples_per_second': 81.856, 'eval_steps_per_second': 5.135, 'epoch': 8.0}


 90%|█████████ | 1440/1600 [07:00<00:28,  5.53it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)

[A                                               

                                                   
 90%|█████████ | 1440/1600 [07:10<00:28,  5.53it/s]
[A

{'eval_loss': 0.023052828386425972, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9966031526989013, 'eval_runtime': 9.8035, 'eval_samples_per_second': 81.298, 'eval_steps_per_second': 5.1, 'epoch': 9.0}


 94%|█████████▍| 1500/1600 [07:24<00:23,  4.26it/s]
 94%|█████████▍| 1500/1600 [07:24<00:23,  4.26it/s]

{'loss': 0.0299, 'grad_norm': 0.09998130798339844, 'learning_rate': 1.25e-06, 'epoch': 9.38}


100%|██████████| 1600/1600 [07:48<00:00,  4.72it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)

[A                                               

                                                   
100%|██████████| 1600/1600 [07:59<00:00,  4.72it/s]
[A

{'eval_loss': 0.02271539717912674, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_accuracy': 0.9966031526989013, 'eval_runtime': 9.8326, 'eval_samples_per_second': 81.057, 'eval_steps_per_second': 5.085, 'epoch': 10.0}



100%|██████████| 1600/1600 [08:00<00:00,  3.33it/s]

{'train_runtime': 480.1805, 'train_samples_per_second': 53.063, 'train_steps_per_second': 3.332, 'train_loss': 0.0835815854370594, 'epoch': 10.0}





TrainOutput(global_step=1600, training_loss=0.0835815854370594, metrics={'train_runtime': 480.1805, 'train_samples_per_second': 53.063, 'train_steps_per_second': 3.332, 'total_flos': 884828390782752.0, 'train_loss': 0.0835815854370594, 'epoch': 10.0})

Force cpu usage if cuda isn't available

In [115]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

CamembertForTokenClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tr

In [156]:
def predict_is_name(text_split_in_words, model, tokenizer):
    inputs = tokenizer([text_split_in_words], is_split_into_words=True)
    inputs["input_ids"] = torch.tensor(inputs["input_ids"])
    inputs["attention_mask"] = torch.tensor(inputs["attention_mask"])
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    word_ids = inputs.word_ids()
    labels = []
    previous_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is not None and word_idx != previous_word_idx:
            labels.append(predictions[idx])
            previous_word_idx = word_idx
    return labels

In [157]:
def predict(texts_split_into_words: list[list[str]]) -> list[list[int]]:
    model = AutoModelForTokenClassification.from_pretrained("ncls-p/esgi-td3-nlp")
    tokenizer = AutoTokenizer.from_pretrained("ncls-p/esgi-td3-nlp")

    labels = []
    for text_split_into_words in texts_split_into_words:
        word_labels = predict_is_name(text_split_into_words, model, tokenizer)
        labels.append(word_labels)

    return labels

Create list of sentences to predict

In [162]:
texts_split_in_words = [
    ["Bonjour", "class", "d'", "ESGI"],
    ["Hier", "j'ai", "assisté", "à", "une", "conférence", "par", "Pierre", "Disco", "sur", "l'intelligence", "artificielle", "à", "l'", "ESGI"],
    ["Le", "développement", "du", "traitement", "du", "langage", "naturel", "est", "essentiel", "pour", "la", "compréhension", "des", "données", "textuelles"],
    ["Les", "étudiants", "présentent", "leurs", "projets", "innovants", "dans", "le", "domaine", "de", "la", "vision", "par", "ordinateur"],
    ["Apprendre", "le", "python", "et", "les", "bibliothèques", "comme", "PyTorch", "peut", "être", "très", "bénéfique", "pour", "votre", "carrière"],
    ["L'analyse", "des", "sentiments", "est", "une", "application", "pratique", "du", "NLP", "dans", "le", "marketing", "digital"],
    ["Le", "chat", "dort", "sur", "le", "canapé"],
    ["Il", "fait", "beau", "aujourd'hui"],
    ["Marie", "et", "Jean", "sont", "allés", "au", "cinéma"],
    ["Nous", "aimons", "la", "musique", "classique"],
    ["Les", "enfants", "jouent", "dans", "le", "parc"],
    ["Lucie", "prépare", "un", "examen"],
    ["La", "nourriture", "est", "prête"],
    ["Paul", "et", "Sophie", "font", "du", "jogging", "le", "matin"],
    ["L'oiseau", "chante", "dans", "l'arbre"],
    ["Demain", "sera", "une", "grande", "journée"]
]

In [163]:
predict(texts_split_in_words)

[[0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0],
 [1, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0],
 [0, 0, 0, 0],
 [1, 0, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0, 0]]