In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, pipeline
from datasets import Dataset
from rich.console import Console
from rich.text import Text

# Shared Rich console used for every print further down.
console = Console()

# 1. ENRICHED DATASET (20 examples for more robustness)
# BIO-style tag set with a single entity type, SENS: "O" is outside any span,
# "B-SENS" opens a span and "I-SENS" continues it.
label_list = ["O", "B-SENS", "I-SENS"]
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

# Each entry pairs a pre-split sentence with one label id per token
# (ids are indices into label_list).
raw_data = [
    (["The", "service", "is", "free", "of", "charge", "."], [0, 0, 0, 0, 0, 0, 0]),
    (["We", "share", "your", "data", "with", "advertising", "partners", "."], [0, 1, 2, 2, 2, 2, 2, 0]),
    (["Your", "subscription", "renews", "automatically", "every", "month", "."], [0, 0, 1, 2, 0, 0, 0]),
    (["You", "can", "delete", "your", "account", "at", "any", "time", "."], [0, 0, 0, 0, 0, 0, 0, 0, 0]),
    (["The", "court", "of", "London", "shall", "have", "exclusive", "jurisdiction", "."], [0, 1, 2, 2, 2, 2, 2, 2, 0]),
    (["We", "may", "change", "the", "price", "without", "prior", "notice", "."], [0, 0, 1, 2, 2, 2, 2, 2, 0]),
    (["User", "content", "becomes", "property", "of", "the", "company", "."], [1, 2, 2, 2, 2, 2, 2, 0]),
    (["No", "refunds", "will", "be", "issued", "under", "any", "circumstances", "."], [1, 2, 2, 2, 2, 2, 2, 2, 0]),
    (["This", "app", "requires", "camera", "access", "."], [0, 0, 0, 0, 0, 0]),
    (["Failure", "to", "pay", "results", "in", "immediate", "suspension", "."], [1, 2, 2, 2, 2, 2, 2, 0]),
    (["We", "store", "your", "password", "in", "plain", "text", "."], [0, 1, 2, 2, 2, 2, 2, 0]),
    (["You", "agree", "to", "not", "sue", "the", "developer", "."], [0, 0, 0, 1, 2, 2, 2, 0]),
    (["Cookies", "are", "used", "to", "improve", "experience", "."], [0, 0, 0, 0, 0, 0, 0]),
    (["All", "disputes", "are", "settled", "by", "arbitration", "."], [0, 1, 2, 2, 2, 2, 0]),
    (["We", "track", "your", "location", "even", "when", "offline", "."], [0, 1, 2, 2, 2, 2, 2, 0]),
    (["The", "interface", "is", "available", "in", "English", "."], [0, 0, 0, 0, 0, 0, 0]),
    (["Third", "parties", "may", "access", "your", "contacts", "."], [1, 2, 2, 2, 2, 2, 0]),
    (["This", "agreement", "is", "governed", "by", "French", "law", "."], [0, 0, 0, 0, 0, 0, 0, 0]),
    (["We", "may", "read", "your", "private", "messages", "."], [0, 0, 1, 2, 2, 2, 0]),
    (["The", "app", "is", "compatible", "with", "iOS", "."], [0, 0, 0, 0, 0, 0, 0]),
]

# Transpose the (tokens, tags) pairs into the two columns the dataset expects.
token_column, tag_column = map(list, zip(*raw_data))
dataset = Dataset.from_dict({"tokens": token_column, "ner_tags": tag_column})

# 2. MODEL AND TRAINING (MPS-optimised)
model_id = "microsoft/deberta-v3-xsmall"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# The classification head is sized from the label set, and both id/label
# mappings are handed to the model alongside it.
classifier_head_kwargs = {
    "num_labels": len(label_list),
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForTokenClassification.from_pretrained(model_id, **classifier_head_kwargs)

def _continuation_label_id(label_id):
    """Return the label id to use for a non-initial sub-word piece of a word.

    A "B-X" label becomes the matching "I-X" label when one exists in
    ``label2id``; every other label (including "O" and "I-X") is returned
    unchanged.
    """
    label_name = id2label[label_id]
    if label_name.startswith("B-"):
        return label2id.get("I-" + label_name[2:], label_id)
    return label_id


def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and expand word-level tags to token level.

    Args:
        examples: batch mapping with "tokens" (a list of word lists) and
            "ner_tags" (a list of per-word label-id lists of matching length).

    Returns:
        The tokenizer output for the batch with an added "labels" entry
        holding one label id per produced token:
          * -100 for tokens that belong to no word (special tokens),
          * the word's own label for the first piece of each word,
          * the continuation label for any further piece of the same word,
            so a word tagged "B-SENS" never yields several "B-SENS" targets
            in a row.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # Token not backed by an input word: give it the -100 label.
                aligned.append(-100)
            elif word_idx != previous_word_idx:
                # First piece of a new word keeps the word's label as-is.
                aligned.append(word_labels[word_idx])
            else:
                # Later piece of the same word: must not restart the span.
                aligned.append(_continuation_label_id(word_labels[word_idx]))
            previous_word_idx = word_idx
        labels.append(aligned)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenize_and_align_labels to the whole dataset, batch by batch.
tokenized_ds = dataset.map(tokenize_and_align_labels, batched=True)

training_args = TrainingArguments(
    output_dir="./deberta-legal-rich",
    num_train_epochs=30,  # more epochs so the structures are memorised well
    per_device_train_batch_size=8,
    # No explicit device flag: `use_mps_device` is deprecated, and the Trainer
    # selects an available accelerator (including Apple MPS) on its own.
    logging_steps=10,
    save_strategy="no",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    data_collator=DataCollatorForTokenClassification(tokenizer),
)

trainer.train()

# 3. RICH DISPLAY FUNCTION
def afficher_analyse_rich(phrases):
    """Print each phrase with its detected entity spans highlighted.

    Args:
        phrases: iterable of raw text strings to analyse.

    Returns:
        None. Each phrase is printed through the module-level ``console``,
        followed by a 20-dash separator line.
    """
    # Run inference on whatever device the model currently lives on rather
    # than hard-coding "mps", so the function also works on machines without
    # Apple MPS.
    pipe = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
        device=model.device,
    )

    for phrase in phrases:
        rich_text = Text(phrase)

        # Apply the Rich style to the character range of each detected entity.
        for ent in pipe(phrase):
            if ent['entity_group'] != 'O':
                # Black text on a yellow background for visibility.
                rich_text.stylize("black on yellow", ent['start'], ent['end'])

        console.print(rich_text)
        console.print("-" * 20)

# 4. TEST ON SEVERAL PHRASES
# Sentences that do not appear verbatim in the training data above.
test_phrases = [
    "The service provider can sell your private data to third parties.",
    "You can delete your account whenever you want.",
    "We may change the monthly subscription price without any notice.",
    "This app is governed by the laws of the state of Delaware.",
    "We reserve the right to read your messages for safety reasons.",
]

# Header line (Rich markup), then the highlighted analysis of every phrase.
results_header = "\n[bold cyan]Résultats de l'analyse sémantique :[/bold cyan]\n"
console.print(results_header)
afficher_analyse_rich(test_phrases)

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step,Training Loss
10,0.8299
20,0.5231
30,0.3502
40,0.217
50,0.1417
60,0.1085
70,0.0708
80,0.0646
90,0.0555


Device set to use mps
