<a href="https://colab.research.google.com/github/Ensama-cmd/CivilEngineeringAI/blob/main/Notebooks/04_fine_tuning_NPL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fine-tuning d'un modèle pour l'extraction des paramètres de construction

# Installation des dépendances
!pip install transformers datasets evaluate seqeval accelerate

# Import des bibliothèques
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import Dataset, load_metric
import numpy as np
from collections import defaultdict

# Load the tokenizer that matches the pretrained checkpoint being fine-tuned.
# NOTE(review): the sample sentences below are French; confirm that this
# checkpoint's vocabulary is suitable for French text.
model_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Sample annotated data (to be extended with real data).
# Format: [("text", {"entities": [(start, end, label), ...]}), ...]
# `start`/`end` are 0-based character offsets into the text with `end`
# exclusive, so that text[start:end] is exactly the entity's surface form.
training_data = [
    (
        "Maison individuelle de 120m² avec 2 étages",
        # Spans: "Maison", "120m²", "2"
        {"entities": [(0, 6, "TYPE"), (23, 28, "SURFACE"), (34, 35, "ETAGES")]}
    ),
    (
        "Immeuble de 5 étages en béton de 800m²",
        # Spans: "Immeuble", "5", "béton", "800m²"
        {"entities": [(0, 8, "TYPE"), (12, 13, "ETAGES"), (24, 29, "MATERIAU"), (33, 38, "SURFACE")]}
    ),
    # Add more examples here...
]

# Data preparation for fine-tuning
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to sub-tokens.

    `examples` is a batch with a "tokens" entry (lists of words) and a
    "ner_tags" entry (one integer tag per word). Each word's tag is put on
    the first sub-token of that word; special tokens (word id ``None``) and
    the remaining sub-tokens of a word receive -100.

    Returns the tokenizer output with an added "labels" entry.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            # Special tokens and continuation sub-tokens both get -100.
            if word is None or word == last_word:
                row_labels.append(-100)
            else:
                row_labels.append(word_tags[word])
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

# Mapping from BIO tag names to integer ids, built once at import time.
_TAG_TO_ID = {
    "O": 0,
    "B-TYPE": 1, "I-TYPE": 2,
    "B-SURFACE": 3, "I-SURFACE": 4,
    "B-ETAGES": 5, "I-ETAGES": 6,
    "B-MATERIAU": 7, "I-MATERIAU": 8,
}


def tag_to_id(tag):
    """Return the integer id of a BIO tag; unknown tags map to 0 ("O")."""
    return _TAG_TO_ID.get(tag, 0)


def _word_spans(text):
    """Return (word, start, end) for every whitespace-delimited word of `text`."""
    spans = []
    cursor = 0
    for word in text.split():
        # Only whitespace separates `cursor` from the next word, so the first
        # match at or after `cursor` is that word's true position.
        start = text.index(word, cursor)
        cursor = start + len(word)
        spans.append((word, start, cursor))
    return spans


# Convert the annotated data to the format expected by `datasets`
def convert_to_hf_format(data):
    """Convert character-offset annotations into word-level BIO tag ids.

    Args:
        data: iterable of ``(text, {"entities": [(start, end, label), ...]})``
            where ``start``/``end`` are character offsets into ``text``
            (``end`` exclusive).

    Returns:
        ``{"tokens": [...], "ner_tags": [...]}`` with one list of words and
        one list of integer tag ids per input text.

    A word is tagged when its character span overlaps the entity span: the
    first overlapping word gets ``B-<label>`` and the following ones
    ``I-<label>``. Using the offsets (rather than matching word strings)
    keeps repeated words such as "de" or "5" from being tagged everywhere.
    A span that only partially covers a word still tags the whole word.
    Empty or whitespace-only spans overlap no word and are ignored.
    """
    tokens = []
    ner_tags = []

    for text, annotation in data:
        spans = _word_spans(text)
        tags = ["O"] * len(spans)

        for start, end, label in annotation["entities"]:
            if end <= start:
                # Degenerate span: nothing to tag.
                continue
            inside = False
            for i, (_, w_start, w_end) in enumerate(spans):
                if w_start < end and start < w_end:
                    tags[i] = f"I-{label}" if inside else f"B-{label}"
                    inside = True

        tokens.append([word for word, _, _ in spans])
        ner_tags.append([tag_to_id(tag) for tag in tags])

    return {"tokens": tokens, "ner_tags": ner_tags}

# Convert the annotated samples to word/tag lists and wrap them in a Dataset
hf_data = convert_to_hf_format(training_data)
dataset = Dataset.from_dict(hf_data)

# Tokenize and align labels, batch-wise
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Load the model.
# The ids in id2label/label2id must stay in sync with the mapping used by
# tag_to_id(), since that is what produced the integer labels in the dataset.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=9,  # Number of labels
    id2label={0: "O", 1: "B-TYPE", 2: "I-TYPE", 3: "B-SURFACE", 4: "I-SURFACE",
              5: "B-ETAGES", 6: "I-ETAGES", 7: "B-MATERIAU", 8: "I-MATERIAU"},
    label2id={"O": 0, "B-TYPE": 1, "I-TYPE": 2, "B-SURFACE": 3, "I-SURFACE": 4,
              "B-ETAGES": 5, "I-ETAGES": 6, "B-MATERIAU": 7, "I-MATERIAU": 8}
)

# Training arguments.
# NOTE(review): newer `transformers` releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version.
training_args = TrainingArguments(
    output_dir="civil-engineering-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Evaluation metric.
# NOTE(review): `load_metric` is reported as removed from recent `datasets`
# releases in favour of `evaluate.load("seqeval")` (the `evaluate` package is
# already pip-installed above) — confirm against the installed version.
metric = load_metric("seqeval")

# Tag names indexed by label id. The order must match the id2label mapping
# given to the model and the ids produced by tag_to_id().
label_list = [
    "O",
    "B-TYPE", "I-TYPE",
    "B-SURFACE", "I-SURFACE",
    "B-ETAGES", "I-ETAGES",
    "B-MATERIAU", "I-MATERIAU",
]


def compute_metrics(eval_pred):
    """Compute seqeval precision/recall/F1/accuracy for an evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds
            per-token scores over the label set (arg-maxed on axis 2) and
            ``labels`` holds the reference label ids, with -100 marking
            positions to ignore.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy" taken from the
        overall seqeval results.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        # Drop positions labelled -100 (special tokens / sub-word
        # continuations) before mapping ids back to tag names.
        kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
        true_predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[l] for _, l in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Training setup.
# NOTE: eval_dataset is the same object as train_dataset, so the metrics
# reported at each epoch are measured on the training data itself and do not
# reflect generalisation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run the training loop
trainer.train()

# Save the fine-tuned model and its tokenizer to the same directory
model.save_pretrained("civil-engineering-ner-model")
tokenizer.save_pretrained("civil-engineering-ner-model")