In [1]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
import ast
from datasets import load_dataset, DatasetDict

label_list = ["O", "B-LOC", "I-LOC"]

dataset = load_dataset('csv', data_files='data/dataset_token_classification.csv', delimiter=';')

In [4]:
# Parse the string-encoded `tokens` and `ner_tags` columns into Python lists.
# Both columns are converted inside one `map` call so the dataset is traversed
# (and rewritten) once instead of once per column.
dataset = dataset.map(
    lambda line: {
        'tokens': ast.literal_eval(line['tokens']),
        'ner_tags': ast.literal_eval(line['ner_tags']),
    }
)

In [5]:
# Cast ner_tags to ClassLabel with all labels present in ner_tags
from datasets import ClassLabel, Sequence

# Type each ner_tags entry as a ClassLabel whose names are taken from label_list.
dataset = dataset.cast_column(
    "ner_tags",
    Sequence(
        feature=ClassLabel(names=label_list, num_classes=len(label_list)),
    ),
)

In [6]:
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)}

In [7]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Display a random selection of distinct rows from `dataset` as an HTML table.

    Columns whose feature type is a ClassLabel, or a Sequence of ClassLabel,
    are rendered with their label names instead of the stored ids.

    Args:
        dataset: Object supporting `len()`, indexing with a list of row
            positions, and a `features` mapping of column name to feature type.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If `num_examples` is larger than the number of rows.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct positions in a single call, which replaces
    # the manual draw-and-retry loop and its repeated membership checks.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        # The lambdas are consumed immediately by `transform`, so capturing the
        # loop variable `typ` is safe here.
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])
    display(HTML(df.to_html()))

In [8]:
show_random_elements(dataset["train"])

Unnamed: 0,text,tokens,ner_tags
0,"ça ne vaut pas la peine, j'aime personnellement la phrase que vous rockez, ce qui signifie que vous êtes incroyable.","[ça, ne, vaut, pas, la, peine,, j', aime, personnellement, la, phrase, que, vous, rockez,, ce, qui, signifie, que, vous, êtes, incroyable, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,Je souhaite me rendre de Béning-Lès-Saint-Avold jusqu'à Mouscron / Moeskroen.,"[Je, souhaite, me, rendre, de, Béning-, Lès-, Saint-, Avold, jusqu', à, Mouscron, Moeskroen, .]","[O, O, O, O, O, B-LOC, I-LOC, I-LOC, I-LOC, O, O, B-LOC, I-LOC, O]"
2,Je recherche un itinéraire pour aller à St-Just-Sncf en partant de St-Aignan-Pl-Wilson.,"[Je, recherche, un, itinéraire, pour, aller, à, St-, Just-, Sncf, en, partant, de, St-, Aignan-, Pl-, Wilson, .]","[O, O, O, O, O, O, O, B-LOC, I-LOC, I-LOC, O, O, O, B-LOC, I-LOC, I-LOC, I-LOC, O]"
3,dijon à épierre.,"[dijon, à, épierre, .]","[B-LOC, O, B-LOC, O]"
4,Indique-moi le trajet le plus simple de Liepvre-(-R.-Guth-) vers Menton.,"[Indique-, moi, le, trajet, le, plus, simple, de, Liepvre-, R., Guth-, vers, Menton, .]","[O, O, O, O, O, O, O, O, B-LOC, I-LOC, I-LOC, O, B-LOC, O]"
5,Le voyage de givet à herbitzheim est ce que je recherche.,"[Le, voyage, de, givet, à, herbitzheim, est, ce, que, je, recherche, .]","[O, O, O, B-LOC, O, B-LOC, O, O, O, O, O, O]"
6,J'adorerais faire quelque chose mais je ne peux pas.,"[J', adorerais, faire, quelque, chose, mais, je, ne, peux, pas, .]","[O, O, O, O, O, O, O, O, O, O, O]"
7,Pourrais-tu me donner les indications pour aller de Albertville à Sèlestat-(Schwilgué) ?,"[Pourrais-, tu, me, donner, les, indications, pour, aller, de, Albertville, à, Sèlestat-, Schwilgué), , .]","[O, O, O, O, O, O, O, O, O, B-LOC, O, B-LOC, I-LOC, O, O]"
8,"Je souhaite aller de montargis ot à dives sur mer, s'il te plaît.","[Je, souhaite, aller, de, montargis, ot, à, dives, sur, mer,, s', il, te, plaît, .]","[O, O, O, O, B-LOC, I-LOC, O, B-LOC, I-LOC, I-LOC, O, O, O, O, O]"
9,ça fait longtemps.,"[ça, fait, longtemps, .]","[O, O, O, O]"


In [9]:
# print number of B-LOC and I-LOC in whole dataset
def count_labels(dataset):
    """Print how many tags in `dataset` carry each of the ids 1, 2 and 0.

    Each row is read through its 'ner_tags' entry. Id 1 is reported as
    B-LOC, id 2 as I-LOC and id 0 as O; any other value is not counted.
    """
    totals = {0: 0, 1: 0, 2: 0}
    for row in dataset:
        for tag in row['ner_tags']:
            if tag in totals:
                totals[tag] += 1
    print(f"Number of B-LOC: {totals[1]}")
    print(f"Number of I-LOC: {totals[2]}")
    print(f"Number of O: {totals[0]}")
    
count_labels(dataset["train"])

Number of B-LOC: 357696
Number of I-LOC: 315414
Number of O: 6121436


In [10]:
# Shuffle the train split and keep a random 50% of it.
# Every split below receives an explicit seed: `train_test_split` shuffles on
# its own, so without a seed the partition would differ on each run even though
# the preceding `shuffle` call is seeded.
train_data = dataset['train']
train_data = train_data.shuffle(seed=42)
train_data = train_data.train_test_split(test_size=0.5, seed=42)['train']

# Hold out 20% of the kept rows, then halve that hold-out into
# validation and test (10% of the kept rows each).
train_test_valid = train_data.train_test_split(test_size=0.2, seed=42)
test_valid = train_test_valid['test'].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']
})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 143078
    })
    test: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 17885
    })
    valid: Dataset({
        features: ['text', 'tokens', 'ner_tags'],
        num_rows: 17885
    })
})

In [11]:
# `task` selects which "<task>_tags" column the labelling cells read.
task = "ner" # Should be one of "ner", "pos" or "chunk"
# Name of the pretrained checkpoint loaded for both the tokenizer and the model.
# NOTE(review): the sample rows displayed earlier are French, and the tokenizer
# output shown further down is lowercased and heavily fragmented — presumably
# this uncased checkpoint is not the best fit for accented French text; confirm
# the choice is intentional.
model_checkpoint = "distilbert-base-uncased"

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [13]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [14]:
example = dataset["train"][0]

In [15]:
example["tokens"]

['Parmi',
 'ces',
 'statuts,',
 'les',
 'principaux',
 'sont',
 'La',
 'Wikimedia',
 'Foundation',
 'fournit',
 'des',
 'statistiques',
 'mensuelles',
 'sur',
 'son',
 'site[',
 'z]']

In [16]:
tokenizer(example["tokens"], is_split_into_words=True)

{'input_ids': [101, 11968, 4328, 8292, 2015, 28093, 16446, 1010, 4649, 26927, 12273, 11514, 13754, 2365, 2102, 2474, 15536, 21138, 2098, 2401, 3192, 2176, 3490, 2102, 4078, 28093, 2923, 19516, 2273, 6342, 22869, 7505, 2365, 2609, 1031, 1062, 1033, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'par',
 '##mi',
 'ce',
 '##s',
 'stat',
 '##uts',
 ',',
 'les',
 'pri',
 '##nc',
 '##ip',
 '##aux',
 'son',
 '##t',
 'la',
 'wi',
 '##kim',
 '##ed',
 '##ia',
 'foundation',
 'four',
 '##ni',
 '##t',
 'des',
 'stat',
 '##ist',
 '##iques',
 'men',
 '##su',
 '##elles',
 'sur',
 'son',
 'site',
 '[',
 'z',
 ']',
 '[SEP]']

In [18]:
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example[f"{task}_tags"][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

38 38


In [19]:
label_all_tokens = True

In [20]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach per-token labels.

    Reads `examples["tokens"]` and the `"<task>_tags"` column, and returns the
    tokenizer output with an added "labels" entry holding one label list per
    sentence, aligned with the produced tokens.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, word_positions):
        # Map word-level tags onto the token positions of one sentence.
        aligned = []
        last_position = None
        for position in word_positions:
            if position is None:
                # Special tokens carry no word id; -100 makes the loss ignore them.
                aligned.append(-100)
            elif position == last_position and not label_all_tokens:
                # Continuation piece of a word, and only first pieces are labelled.
                aligned.append(-100)
            else:
                # First piece of a word, or a continuation when every piece is labelled.
                aligned.append(word_tags[position])
            last_position = position
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples[f"{task}_tags"])
    ]
    return encoded

In [21]:
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/143078 [00:00<?, ? examples/s]

Map:   0%|          | 0/17885 [00:00<?, ? examples/s]

Map:   0%|          | 0/17885 [00:00<?, ? examples/s]

In [22]:
# Two-way lookup between integer class ids and the names in label_list.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

In [23]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, 
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
batch_size = 16
epochs = 3
metric_name = "f1"

In [25]:
# Training configuration; the first argument is the output directory.
# Batch size and epoch count come from the constants defined in the previous
# cell so that changing them there actually affects training.
args = TrainingArguments(
    "models/distilbert-finetuned-token-classification-ner-trip",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,  # previously a second hard-coded 3 that ignored `epochs`
    weight_decay=0.01,
    #push_to_hub=True,
)

In [26]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [27]:
from datasets import load_metric

# Metric object whose `compute(predictions=..., references=...)` is called by
# the evaluation cells below.
# NOTE(review): the output under this cell echoes this source line the way a
# Python warning does — presumably a deprecation warning for `load_metric`;
# confirm and plan a migration if so.
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [28]:
labels = [label_list[i] for i in example[f"{task}_tags"]]
metric.compute(predictions=[labels], references=[labels])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 1.0}

In [29]:
import numpy as np

def compute_metrics(p):
    """Turn a (predictions, labels) pair into overall precision/recall/F1/accuracy.

    The class with the highest score along axis 2 is taken as the prediction
    for each position. Positions whose label is -100 are dropped, the remaining
    ids are mapped to names through `label_list`, and both sides are scored
    with `metric.compute`.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (not the ignore index).
        kept = [(guess, truth) for guess, truth in zip(predicted_row, gold_row) if truth != -100]
        true_predictions.append([label_list[guess] for guess, _ in kept])
        true_labels.append([label_list[truth] for _, truth in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [30]:
# Bundle the model, training arguments, tokenized splits, collator and metric
# callback into a single Trainer; the "valid" split is the evaluation set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [31]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=26829, training_loss=0.001011224740532176, metrics={'train_runtime': 1855.1505, 'train_samples_per_second': 231.374, 'train_steps_per_second': 14.462, 'total_flos': 1.05232752194013e+16, 'train_loss': 0.001011224740532176, 'epoch': 3.0})

In [32]:
trainer.save_model("models/distilbert-finetuned-token-classification-ner-trip")

In [33]:
trainer.evaluate()

{'eval_loss': 5.552212201109796e-07,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_accuracy': 1.0,
 'eval_runtime': 25.5932,
 'eval_samples_per_second': 698.82,
 'eval_steps_per_second': 43.684,
 'epoch': 3.0}

In [34]:
# Full seqeval report on the "valid" split.
predictions, labels, _ = trainer.predict(tokenized_datasets["valid"])
predictions = np.argmax(predictions, axis=2)

# Drop positions labelled with the ignore index (-100) and map ids to names.
true_predictions = [
    [label_list[pred_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [label_list[gold_id] for _, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'LOC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 47938},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [35]:
# Score the trained model on the "train" split, then on the "valid" split.
train_metrics, validation_metrics = (
    trainer.evaluate(tokenized_datasets[split]) for split in ("train", "valid")
)

In [59]:
def gather_outputs(outputs: list) -> list:
    """Group consecutive entities that touch each other in the text.

    An entity joins the current group when its 'start' equals the 'end' of the
    group's last entity; otherwise it opens a new group.

    Args:
        outputs: Entity dicts with 'start' and 'end' keys, in reading order.

    Returns:
        A list of groups, each a non-empty list of entity dicts. Empty input
        yields an empty list.
    """
    runs = []
    for span in outputs:
        if runs and span['start'] == runs[-1][-1]['end']:
            # Starts exactly where the previous entity stopped: same run.
            runs[-1].append(span)
        else:
            runs.append([span])
    return runs

def get_locations_from_outputs(sentence: str, outputs: list) -> list:
    """Return the substring of `sentence` covered by each group of entities.

    Entities are grouped with `gather_outputs`; each group contributes the text
    from the 'start' of its first entity to the 'end' of its last one.
    """
    locations = []
    for run in gather_outputs(outputs):
        first, last = run[0], run[-1]
        locations.append(sentence[first["start"]:last["end"]])
    return locations

In [62]:
from transformers import pipeline

# Sentence used to try out the fine-tuned model.
sentence = "Je veux aller de Port-Boulet à Le Havre."
#sentence = "Peux-tu m'aider à trouver mon chemin de Paris à Épierre ?"

# Build a token-classification pipeline from the directory the trainer saved
# the model to earlier in this notebook, then run it on the sentence. The
# result is a list of entity dicts consumed by get_locations_from_outputs.
token_classifier = pipeline("token-classification", model="models/distilbert-finetuned-token-classification-ner-trip", aggregation_strategy="simple")
outputs = token_classifier(sentence)

In [63]:
get_locations_from_outputs(sentence, outputs)

['Port-Boulet', 'Le Havre']