# A jour selon le notebook distilbert_token_classification_v8.ipynb

In [1]:
from transformers import pipeline

# Hub identifier of the fine-tuned checkpoint used for tagging
# (presumably trained to mark trip locations, judging by its name - confirm on the model card).
NER_MODEL_ID = "EliottClavier/distilbert-finetuned-token-classification-ner-trip"

# Build the token-classification pipeline once; the checkpoint is fetched on first use.
# aggregation_strategy="simple" asks the pipeline to return grouped entity spans
# rather than one prediction per token.
token_classifier = pipeline(
    task="token-classification",
    model=NER_MODEL_ID,
    aggregation_strategy="simple",
)

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

In [2]:
def gather_outputs(outputs: list) -> list:
    """Split a list of entities into runs of character-contiguous entities.

    Two consecutive entities belong to the same run when the second one's
    'start' offset equals the first one's 'end' offset (no gap between them).

    Args:
        outputs: Entity dicts, each exposing 'start' and 'end' offsets,
            in the order they should be scanned.

    Returns:
        A list of runs; each run is a non-empty list of the original entity
        dicts. An empty input yields an empty list.
    """
    runs = []
    for item in outputs:
        # Extend the latest run only if this entity begins exactly where
        # the previous one stopped; otherwise open a new run.
        if runs and item['start'] == runs[-1][-1]['end']:
            runs[-1].append(item)
        else:
            runs.append([item])
    return runs

def transform_sentence_from_outputs(sentence: dict, outputs: list) -> dict:
    """Turn the entity predictions for one sentence into a structured result.

    Args:
        sentence: Mapping holding the sentence "id" and the raw "text" that
            was classified.
        outputs: Entity dicts predicted for that text; each must expose
            "start", "end" and "entity_group".

    Returns:
        A dict ``{"id": <id as str>, "locations": [...]}`` where every
        location is ``{"label": ..., "city": ...}``. Locations are ordered by
        label in descending order; ties keep their order of appearance.
    """
    locations = [
        {
            # The first fragment's label stands for the whole contiguous group.
            "label": group[0]["entity_group"],
            # Slice the original text from the first fragment's start to the
            # last fragment's end to recover the full surface form.
            "city": sentence["text"][group[0]["start"]:group[-1]["end"]],
        }
        for group in gather_outputs(outputs)
    ]

    # Stable descending sort on the label. In the sample run this lists the
    # departure before the arrival - presumably because of how the model's
    # label names compare; confirm against the model's label set.
    locations.sort(key=lambda location: location["label"], reverse=True)

    # Build a fresh dict instead of rebinding the `sentence` parameter.
    return {
        "id": str(sentence["id"]),
        "locations": locations,
    }

def format_sentence_output(sentence_output: dict) -> str:
    """Render a structured sentence result as a single comma-separated line.

    Args:
        sentence_output: Dict with an "id" and a "locations" list whose items
            each carry a "city" string.

    Returns:
        ``"<id>,<city>,<city>,..."`` with cities in the order of "locations".
        When there are no locations the result is ``"<id>,"``.
    """
    identifier = sentence_output["id"]
    cities = ",".join(location["city"] for location in sentence_output["locations"])
    return f"{identifier},{cities}"

In [3]:
# Sample requests, each an {"id", "text"} record, chosen to cover tricky
# spellings and phrasings of the two cities.
sentences = [
    # Ends with a period; departure is hyphenated and arrival is made of two words
    {"id": 1, "text": "Je veux aller de Port-Boulet à Le Havre."},
    # Ends with a question mark; arrival comes first and departure follows an elided "d'"
    {"id": 2, "text": "Peux-tu m'aider à trouver mon chemin vers Paris en partant d'Épierre ?"},
    # Ends with a period; departure is in lower case and arrival is two names joined by a slash
    {"id": 3, "text": "Je cherche un moyen d'aller de margny-lès-compiègne à Saarbrücken/Sarrebruck."},
    # Ends with a period; a person's name precedes the cities and arrival is written without a hyphen
    {"id": 4, "text": "Je veux me rendre chez mon ami Etienne à Saint Étienne depuis Nantes."},
    # Ends with a period; departure is introduced by "la ville de" and both cities are in lower case
    {"id": 5, "text": "Je veux aller de la ville de marseille à tours."},
    # Ends with a period; contains a person's name and a lower-case arrival followed by "ville"
    {"id": 6, "text": "Je veux aller voir Emma qui habite à albi ville en partant de Vannes."},
    # No trailing period; contains a person's name and "la ville de" before the departure
    {"id": 7, "text": "Trouve moi le chemin qui me mène chez David à Strasbourg en partant de la ville de Metz"},
    # No trailing period; "entre ... et ..." phrasing with "la ville de" before the first city
    {"id": 8, "text": "Recherche le chemin le plus court entre la ville de Lorient et Paris"},
    # Ends with a period; arrival comes first and departure follows "depuis la ville de"
    {"id": 9, "text": "Trouve un itinéraire pour aller à Besançon depuis la ville de Oyonnax."},
    # Bare "<departure> à <arrival>" with no surrounding words
    {"id": 10, "text": "Nantes à Toulouse"},
]

# Classify every sample sentence and print one "<id>,<city>,<city>" line for each.
for sentence in sentences:
    outputs = token_classifier(sentence["text"])
    structured_result = transform_sentence_from_outputs(sentence, outputs)
    print(format_sentence_output(structured_result))

1,Port-Boulet,Le Havre
2,'Épierre,Paris
3,margny-lès-compiègne,Saarbrücken/Sarrebruck
4,Nantes,Saint Étienne
5,marseille,tours
6,Vannes,albi ville
7,Metz,Strasbourg
8,Lorient,Paris
9,Oyonnax,Besançon
10,Nantes,Toulouse
