<a href="https://colab.research.google.com/github/Ensama-cmd/CivilEngineeringAI/blob/main/Notebooks/02_extraction_amelior%C3%A9e.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Installation des dépendances supplémentaires
!pip install spacy fuzzywuzzy python-Levenshtein
!python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# Import des bibliothèques
import spacy
from fuzzywuzzy import fuzz, process
import re
import json
from transformers import pipeline

In [3]:
# Chargement des modèles
print("Chargement des modèles...")
nlp_spacy = spacy.load("fr_core_news_sm")
nlp_transformers = pipeline("token-classification", model="dslim/bert-base-NER", aggregation_strategy="simple")

Chargement des modèles...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertFor

In [4]:
# Définition des motifs pour le génie civil
construction_terms = {
    "materials": ["béton", "acier", "bois", "parpaing", "brique", "verre", "métal", "composite"],
    "structure_types": ["maison", "immeuble", "bureau", "entrepôt", "usine", "commerce", "pavillon"],
    "foundation_types": ["semelle", "pieux", "radier", "dalle", "sur pilotis"],
    "wall_types": ["porteur", "cloison", "rideau", "doublage", "isolation"]
}

In [5]:

# Fonction d'extraction améliorée
def extract_construction_params(description):
    doc = nlp_spacy(description)
    transformers_results = nlp_transformers(description)

    params = {
        "type_batiment": None,
        "surface": None,
        "hauteur": None,
        "etages": None,
        "materiaux": [],
        "fondations": None,
        "murs": [],
        "ouvertures": [],
        "contraintes": []
    }

    # Extraction du type de bâtiment
    for token in doc:
        if token.text.lower() in construction_terms["structure_types"]:
            params["type_batiment"] = token.text.lower()

    # Extraction des mesures avec expressions régulières
    # Surface
    surface_matches = re.findall(r'(\d+)\s*m²|\s(\d+)\s*mètres?\s*carrés?', description, re.IGNORECASE)
    if surface_matches:
        for match in surface_matches:
            for value in match:
                if value:
                    params["surface"] = int(value)
                    break
            if params["surface"]:
                break

    # Nombre d'étages
    etages_matches = re.findall(r'(\d+)\s*étages?|(\d+)\s*niveaux?', description, re.IGNORECASE)
    if etages_matches:
        for match in etages_matches:
            for value in match:
                if value:
                    params["etages"] = int(value)
                    break
            if params["etages"]:
                break

    # Matériaux
    for material in construction_terms["materials"]:
        if material in description.lower():
            params["materiaux"].append(material)

    # Fondations
    for foundation in construction_terms["foundation_types"]:
        if foundation in description.lower():
            params["fondations"] = foundation

    return params

In [6]:

# Test avec différentes descriptions
test_descriptions = [
    "Maison individuelle de 120m² avec 2 étages, murs en béton de 20cm, fondations semelles continues",
    "Immeuble de bureaux de 5 étages, 800m² par étage, structure acier, façade rideau en verre",
    "Entrepôt industriel de 2000m², plain-pied, hauteur sous plafond 8m, structure métallique"
]

for desc in test_descriptions:
    result = extract_construction_params(desc)
    print(f"Description: {desc}")
    print(f"Paramètres extraits: {json.dumps(result, indent=2, ensure_ascii=False)}")
    print("-" * 50)

Description: Maison individuelle de 120m² avec 2 étages, murs en béton de 20cm, fondations semelles continues
Paramètres extraits: {
  "type_batiment": "maison",
  "surface": 120,
  "hauteur": null,
  "etages": 2,
  "materiaux": [
    "béton"
  ],
  "fondations": "semelle",
  "murs": [],
  "ouvertures": [],
  "contraintes": []
}
--------------------------------------------------
Description: Immeuble de bureaux de 5 étages, 800m² par étage, structure acier, façade rideau en verre
Paramètres extraits: {
  "type_batiment": "immeuble",
  "surface": 800,
  "hauteur": null,
  "etages": 5,
  "materiaux": [
    "acier",
    "verre"
  ],
  "fondations": null,
  "murs": [],
  "ouvertures": [],
  "contraintes": []
}
--------------------------------------------------
Description: Entrepôt industriel de 2000m², plain-pied, hauteur sous plafond 8m, structure métallique
Paramètres extraits: {
  "type_batiment": "entrepôt",
  "surface": 2000,
  "hauteur": null,
  "etages": null,
  "materiaux": [
    