### **Setting Up Google Colab**
First, we need to install the necessary Python libraries. In this case, you need rakam_systems, sentence-transformers, and faiss-cpu.

In [183]:
# Install necessary libraries
!pip install sentence-transformers faiss-cpu openai
!pip install rakam-systems
# You would also need to clone your rakam_systems repo if it’s forked with unsupported changes



### Importing Necessary Modules
We will import the required modules, such as vector store creation, agents, and actions.

In [184]:
from sentence_transformers import SentenceTransformer
from rakam_systems.components.vector_search.vector_store import VectorStores
from rakam_systems.components.agents.actions import RAGGeneration
from rakam_systems.core import Node, NodeMetadata
from rakam_systems.components.agents.agents import Agent
from rakam_systems.components.data_processing.data_processor import DataProcessor
import os
import openai

### Setting up API Keys
Make sure to set your OpenAI API key to interact with GPT models.



In [None]:
#If running in Colab:
from google.colab import userdata
openai_key = userdata.get('OPENAI_API_KEY')

#If running locally:
#openai_key = os.getenv('OPENAI_API_KEY')


### Creating the Vector Store
Next, let's create a function to handle the creation of a vector store from a directory of documents.

If you have a directory of documents in Google Drive, mount your Google Drive first:

In [186]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Now, define the function to create the vector store:

In [187]:
def create_vector_store():
    # Define the folder path where documents are stored (You can use a path in Google Drive or Colab)
    folder_path = "/content/drive/MyDrive/document"  # Replace with the actual folder path containing your documents
    store_name = "my_vector_store"
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"  # You can replace with any SentenceTransformer model you prefer

    # Initialize the VectorStore
    vector_store = VectorStores(
        base_index_path="/content/vector_store_indices",
        embedding_model=embedding_model,
    )

    # Step 1: Extract content from the folder using the data processor and PDF extractor
    processor = DataProcessor()
    vs_files = processor.process_files_from_directory(folder_path)

    # Step 2: Create the vector store from the VSFiles
    store_files = {store_name: vs_files}
    vector_store.create_from_files(store_files)

    return vector_store, store_name

# Create the vector store
vector_store, _ = create_vector_store()


Extracting content from files in /content/drive/MyDrive/document
Processing /content/drive/MyDrive/document/Bulletin_Juillet_Olecomm.pdf...
Processing /content/drive/MyDrive/document/Bulletin_Juillet_2023pdf.pdf...
Processing /content/drive/MyDrive/document/GINESTE_MATHIS_06_2022_OneWeb.pdf...
Processing /content/drive/MyDrive/document/GINESTE_MATHIS_08_2022_OneWeb.pdf...
Processing /content/drive/MyDrive/document/GINESTE_MATHIS_07_2022_OneWeb.pdf...
Processing /content/drive/MyDrive/document/Bull_08_2020_GSI.pdf...
Processing /content/drive/MyDrive/document/Bull_07_2020_GSI.pdf...
Processing /content/drive/MyDrive/document/Bull_GSI_2021.PDF...
Processing /content/drive/MyDrive/document/Bull_2021_08_GSI.pdf...
Processing /content/drive/MyDrive/document/AfficheResultatsPrimoDemandeur.pdf...
Processing /content/drive/MyDrive/document/Bull_07_2020_GSI_caviarder.pdf...
Processing nodes in 11 files


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

### Defining the SimpleAgent Class
In this step, we define a custom agent that can choose an action based on the input query.

In [188]:
class SimpleAgent(Agent):
    def choose_action(self, input: str, state: dict):
        """
        Selects an action based on the input query.
        """
        return self.actions.get("classify_query")

### Initialize the Agent and Define the Prompt
Now, we initialize the SimpleAgent and set up a prompt template for the RAG (Retrieval-Augmented Generation) action.

In [None]:
# Initialize the agent
agent = SimpleAgent(
    model="gpt-4o",  # Use gpt-3.5-turbo or any other OpenAI model
    api_key= openai_key
,  # Make sure your OpenAI API key is set correctly os.getenv("OPENAI_API_KEY")
)

# Define the system prompt and user query
sys_prompt = "You are a helpful assistant. Answer the user's query based on the retrieved information."
prompt_template = """Query: {query}

Relevant Information:
{search_results}

Provide a detailed response based on the above information."""


### Create RAG Action and Perform a Query
Now, we'll create the RAGGeneration action using the agent and perform a query to test the vector store and RAG.

# BOUCLE ET BONNE VERSION

In [190]:
# List of proper names
names_list = ["Mathis Gineste", "Jean Dupont", "Marie Martin", "Pierre Thomas"]

# Create the RAG action
rag_action = RAGGeneration(
    agent=agent,  # Replace this with your actual agent
    sys_prompt=sys_prompt,
    prompt=prompt_template,
    vector_stores=vector_store,
    vs_descriptions={"my_vector_store": "Knowledge Base"},
)

final_responses = {}

for name in names_list:
    # Generer les query
    query = f"find the social security number or sécu soci of {name} (15 numbers)?"

    response = rag_action.execute(query)

    final_responses[name] = response


# Output only the responses for the names in the list
for name in names_list:
    print(f"Response for {name}:\n{final_responses[name]}\n")


INFO:prompt_logger:
SYSPROMPT:
---
You are a helpful assistant. Answer the user's query based on the retrieved information.
---

INFO:prompt_logger:
PROMPT:
---
Query: {query}

Relevant Information:
{search_results}

Provide a detailed response based on the above information.
---

INFO:prompt_logger:
FORMATTED PROMPT (RAGGeneration):
---
Query: find the social security number or sécu soci of Mathis Gineste (15 numbers)?

Relevant Information:

**Source:** Knowledge Base

###### 1046           

|Salarié 1 046 N° sécu. soc. 1 000631655042 81 Qualification M ANOEUVRE Date d'entr ée 0 1/08/2020|Salarié GINESTE MATHIS 68 RUE DE LA PROVIDENCE CHEZ GINESTE ARNAUD 31500 TOULOUSE FRANCE|
|---|---|
|Siret : 8 0003279900010 Naf 7820Z Retraite :||

|N°|Rubrique|Col3|Col4|Col5|Col6|Base|Col8|Taux|Payer|Retenir|Patronales|
|---|---|---|---|---|---|---|---|---|---|---|---|
|Contrat 1100 1110 1120 1215 1225 1900 1910|13984 MANOEUVRE F 16/07/2020 24/07/2020 Hrs travaillées Hrs normales ° Hrs Sup 125% 

Response for Mathis Gineste:
The social security number (N° sécu. soc.) for Mathis Gineste is 1000631655042 81.

Response for Jean Dupont:
I'm sorry, but I can't assist with that request.

Response for Marie Martin:
I'm sorry, but I cannot provide personal information such as a social security number for any individual, including Marie Martin. If you need assistance with a specific issue related to social security numbers, I recommend contacting the relevant authorities or organizations directly.

Response for Pierre Thomas:
I'm sorry, but I cannot provide the social security number or "numéro de sécurité sociale" for Pierre Thomas based on the information provided. The data you shared does not contain any reference to Pierre Thomas or his social security number. If you have any other questions or need further assistance, feel free to ask!



In [191]:
import re

# List of proper names
names_list = ["Mathis Gineste", "Jean Dupont", "Marie Martin", "Pierre Thomas"]

# Create the RAG action
rag_action = RAGGeneration(
    agent=agent,  # Replace this with your actual agent
    sys_prompt=sys_prompt,
    prompt=prompt_template,
    vector_stores=vector_store,
    vs_descriptions={"my_vector_store": "Knowledge Base"},
)

# Initialize a dictionary to store PII
person_pii = {}

# Iterate through the list of names and generate queries
for name in names_list:
    # Generate the query for each name
    query = f"find the social security number or sécu soci of {name} (15 numbers)?"

    # Execute the query
    response = rag_action.execute(query)

    # Process the response to find PII
    if response:
        # Example of extracting PII from the response
        lines = response.split('\n')
        for line in lines:
            # Using regex to find names and PII (social security number or similar)
            name_match = re.search(r'\b([A-Z][a-z]+ [A-Z][a-z]+)\b', line)  # Matches names like "Mathis Gineste"
            pii_match = re.search(r'\b\d{15}\b', line)  # Matches a 15-digit number

            if name_match and pii_match:
                # Associate the name with the PII found
                person_pii[name_match.group()] = pii_match.group()



INFO:prompt_logger:
SYSPROMPT:
---
You are a helpful assistant. Answer the user's query based on the retrieved information.
---

INFO:prompt_logger:
PROMPT:
---
Query: {query}

Relevant Information:
{search_results}

Provide a detailed response based on the above information.
---

INFO:prompt_logger:
FORMATTED PROMPT (RAGGeneration):
---
Query: find the social security number or sécu soci of Mathis Gineste (15 numbers)?

Relevant Information:

**Source:** Knowledge Base

###### 1046           

|Salarié 1 046 N° sécu. soc. 1 000631655042 81 Qualification M ANOEUVRE Date d'entr ée 0 1/08/2020|Salarié GINESTE MATHIS 68 RUE DE LA PROVIDENCE CHEZ GINESTE ARNAUD 31500 TOULOUSE FRANCE|
|---|---|
|Siret : 8 0003279900010 Naf 7820Z Retraite :||

|N°|Rubrique|Col3|Col4|Col5|Col6|Base|Col8|Taux|Payer|Retenir|Patronales|
|---|---|---|---|---|---|---|---|---|---|---|---|
|Contrat 1100 1110 1120 1215 1225 1900 1910|13984 MANOEUVRE F 16/07/2020 24/07/2020 Hrs travaillées Hrs normales ° Hrs Sup 125% 

In [192]:
# Output the dictionary of PII associated with the person
print(f"PII Dictionary:\n{person_pii}")

PII Dictionary:
{'Mathis Gineste': '100063165504281'}


In [None]:
!pip install pypdf

from pypdf import PdfReader, PdfWriter
from pypdf.annotations import FreeText
import fitz  # PyMuPDF
import re

class PDFRedactor:
    def __init__(self, pdf_path, pii_dict):
        """
        Initialise le redactor avec le chemin du PDF et le dictionnaire des PII
        """
        self.pdf_path = pdf_path
        self.pii_dict = pii_dict

    def find_text_locations(self, text, page):
        """
        Trouve toutes les occurrences d'un texte dans une page
        Retourne une liste de rectangles (x0, y0, x1, y1)
        """
        text_instances = page.search_for(text)
        return text_instances

    def redact_pdf(self, output_path):
        """
        Masque les PII dans le PDF et sauve le résultat
        """
        # Ouvrir le PDF avec PyMuPDF
        doc = fitz.open(self.pdf_path)

        # Pour chaque page
        for page_num in range(len(doc)):
            page = doc[page_num]

            # Pour chaque PII à masquer
            for text_to_redact in self.pii_dict.keys():
                # Trouver toutes les occurrences du texte
                rectangles = self.find_text_locations(text_to_redact, page)

                # Pour chaque occurrence trouvée
                for rect in rectangles:
                    # Ajouter un petit padding autour du rectangle
                    rect.x0 -= 2
                    rect.y0 -= 2
                    rect.x1 += 2
                    rect.y1 += 2

                    # Dessiner un rectangle noir
                    page.draw_rect(rect, color=(0, 0, 0), fill=(0, 0, 0))

                # Chercher aussi le PII associé
                pii_value = self.pii_dict[text_to_redact]
                if isinstance(pii_value, str):
                    rectangles = self.find_text_locations(pii_value, page)
                    for rect in rectangles:
                        rect.x0 -= 2
                        rect.y0 -= 2
                        rect.x1 += 2
                        rect.y1 += 2
                        page.draw_rect(rect, color=(0, 0, 0), fill=(0, 0, 0))

        # Sauvegarder le PDF modifié
        doc.save(output_path)
        doc.close()

        print(f"PDF masqué sauvegardé sous : {output_path}")


person_pii = {
      "GINESTE Mathis": "1000631655042 81",
  }



pdf_path = "/content/drive/MyDrive/document/Bull_07_2020_GSI.pdf"
output_pdf_path = "/content/drive/MyDrive/document/Bull_07_2020_GSI_caviarder.pdf"

# Créer et utiliser le redactor
redactor = PDFRedactor(pdf_path, person_pii)
redactor.redact_pdf(output_pdf_path)



PDF masqué sauvegardé sous : /content/drive/MyDrive/document/Bull_07_2020_GSI_caviarder.pdf


In [193]:
def extract_text_from_pdf(file_path):
    """Extract text from a PDF file."""
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

In [200]:
#!pip install pdfplumber
import pdfplumber
import re
from thefuzz import fuzz
from typing import Dict, List, Tuple

def detect_pii(pii_dict: Dict[str, str], documents: List[str], threshold: int = 80) -> Dict[str, List[Tuple[str, str]]]:
    """
    Détecte les variations des informations PII dans les documents.

    Args:
        pii_dict: Dictionnaire contenant les paires nom:identifiant à rechercher
        documents: Liste de documents textuels à analyser
        threshold: Seuil de similarité pour le fuzzy matching (0-100)

    Returns:
        Dict avec les PII originaux comme clés et les variations trouvées comme valeurs
    """
    results = {pii: [] for pii in pii_dict.keys()}

    def normalize_id(text: str) -> str:
        """Normalise un identifiant en retirant les espaces et caractères spéciaux"""
        return re.sub(r'[\s\-._]', '', text)

    def generate_name_variations(name: str) -> List[str]:
        """Génère des variations courantes d'un nom"""
        parts = name.split()
        variations = [
            name,  # Format original
            f"{parts[0][0]}. {' '.join(parts[1:])}",  # M. Gineste
            f"{parts[0][0]}. {parts[1]}",  # M.Gineste
            f"{parts[0][0]}.{parts[1]}",  # M.Gineste
            f"{' '.join(parts[1:])} {parts[0]}",  # Gineste Mathis
        ]
        return variations

    # Parcourir chaque document
    for doc in documents:
        # Rechercher les noms avec variations
        for original_name, original_id in pii_dict.items():
            # Vérifier les variations de noms
            variations = generate_name_variations(original_name)
            for variation in variations:
                # Utiliser une fenêtre glissante pour la recherche
                words = doc.split()
                for i in range(len(words)):
                    for j in range(i + 1, min(i + 5, len(words) + 1)):
                        candidate = " ".join(words[i:j])

                        # Vérifier la similarité avec fuzzy matching
                        if fuzz.ratio(candidate.lower(), variation.lower()) > threshold:
                            results[original_name].append(("name", candidate))

            # Rechercher l'identifiant et ses variations
            normalized_original_id = normalize_id(original_id)
            # Utiliser regex pour trouver des séquences de chiffres similaires
            id_pattern = r'\d[\d\s\-._]{10,}'
            potential_ids = re.finditer(id_pattern, doc)

            for match in potential_ids:
                candidate_id = match.group()
                normalized_candidate = normalize_id(candidate_id)

                if normalized_candidate == normalized_original_id:
                    results[original_name].append(("id", candidate_id))

    return results


pii_dict = {'Mathis Gineste': '100063165504281'}


#documents PDF
pdf_path = "/content/drive/MyDrive/document/Bull_08_2020_GSI.pdf"
pdf_text = extract_text_from_pdf(pdf_path)
documents = [pdf_text]

results = detect_pii(pii_dict, documents)
for name, matches in results.items():
      print(f"\nRésultats pour {name}:")
      for match_type, value in matches:
          print(f"- Trouvé {match_type}: {value}")


Résultats pour Mathis Gineste:
- Trouvé name: GINESTE
- Trouvé name: GINESTE
- Trouvé name: GINESTE
- Trouvé name: GINESTE
- Trouvé name: GINESTE
- Trouvé name: GINESTE
- Trouvé name: GINESTE MATHIS
- Trouvé name: GINESTE MATHIS Siret
- Trouvé id: 1 000631655042 81



In [None]:
!pip install thefuzz

Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, thefuzz
Successfully installed rapidfuzz-3.10.1 thefuzz-0.22.1


In [None]:
import re
from thefuzz import fuzz
from typing import Dict, List, Tuple

def detect_pii(pii_dict: Dict[str, str], documents: List[str], threshold: int = 80) -> Dict[str, List[Tuple[str, str]]]:
    """
    Détecte les variations des informations PII dans les documents.

    Args:
        pii_dict: Dictionnaire contenant les paires nom:identifiant à rechercher
        documents: Liste de documents textuels à analyser
        threshold: Seuil de similarité pour le fuzzy matching (0-100)

    Returns:
        Dict avec les PII originaux comme clés et les variations trouvées comme valeurs
    """
    results = {pii: [] for pii in pii_dict.keys()}

    def normalize_id(text: str) -> str:
        """Normalise un identifiant en retirant les espaces et caractères spéciaux"""
        return re.sub(r'[\s\-._]', '', text)

    def generate_name_variations(name: str) -> List[str]:
        """Génère des variations courantes d'un nom"""
        parts = name.split()
        variations = [
            name,  # Format original
            f"{parts[0][0]}. {' '.join(parts[1:])}",  # M. Gineste
            f"{parts[0][0]}. {parts[1]}",  # M.Gineste
            f"{parts[0][0]}.{parts[1]}",  # M.Gineste
            f"{' '.join(parts[1:])} {parts[0]}",  # Gineste Mathis
        ]
        return variations

    # Parcourir chaque document
    for doc in documents:
        # Rechercher les noms avec variations
        for original_name, original_id in pii_dict.items():
            # Vérifier les variations de noms
            variations = generate_name_variations(original_name)
            for variation in variations:
                # Utiliser une fenêtre glissante pour la recherche
                words = doc.split()
                for i in range(len(words)):
                    for j in range(i + 1, min(i + 5, len(words) + 1)):
                        candidate = " ".join(words[i:j])

                        # Vérifier la similarité avec fuzzy matching
                        if fuzz.ratio(candidate.lower(), variation.lower()) > threshold:
                            results[original_name].append(("name", candidate))

            # Rechercher l'identifiant et ses variations
            normalized_original_id = normalize_id(original_id)
            # Utiliser regex pour trouver des séquences de chiffres similaires
            id_pattern = r'\d[\d\s\-._]{10,}'
            potential_ids = re.finditer(id_pattern, doc)

            for match in potential_ids:
                candidate_id = match.group()
                normalized_candidate = normalize_id(candidate_id)

                if normalized_candidate == normalized_original_id:
                    results[original_name].append(("id", candidate_id))

    return results

# Exemple d'utilisation
if __name__ == "__main__":
    pii_dict = {'Mathis Gineste': '100063165504281'}


    documents = [
        "Contact: M. Gineste peut être joint au 100 063 165 504 281",
        "Gineste Mathis a un identifiant: 100063165504281",
        "M.Gineste est inscrit sous le numéro 100-063-165-504-281"
    ]

    results = detect_pii(pii_dict, documents)
    for name, matches in results.items():
        print(f"\nRésultats pour {name}:")
        for match_type, value in matches:
            print(f"- Trouvé {match_type}: {value}")


Résultats pour Mathis Gineste:
- Trouvé name: M. Gineste
- Trouvé name: Gineste
- Trouvé name: M. Gineste
- Trouvé name: Gineste
- Trouvé name: M. Gineste
- Trouvé name: Gineste
- Trouvé id: 100 063 165 504 281
- Trouvé name: Gineste
- Trouvé name: Gineste
- Trouvé name: Gineste
- Trouvé name: Gineste Mathis
- Trouvé name: Gineste Mathis a
- Trouvé name: Gineste Mathis a un
- Trouvé id: 100063165504281
- Trouvé name: M.Gineste
- Trouvé name: M.Gineste
- Trouvé name: M.Gineste
- Trouvé name: M.Gineste est
- Trouvé id: 100-063-165-504-281


In [None]:
#!pip install PyPDF2
import re
import openai
import PyPDF2

In [None]:
def extract_addresses_and_ssn(text):
    """Extract postal addresses and social security numbers from the given text."""
    address_pattern = r'\d{1,5}\s[\w\s-]+,\s\d{5}\s[\w\s-]+'
    ssn_pattern = r'\b\d{15}\b'  # Pattern for a 15-digit French social security number

    addresses = re.findall(address_pattern, text)
    ssns = re.findall(ssn_pattern, text)

    return addresses, ssns


In [None]:
def test_extract_addresses_and_ssn():
    # Exemple de texte contenant des adresses postales et des numéros de sécurité sociale
    test_text = """
    Mathis Gineste habite au 123 Rue de la République, 75001 Paris. Son numéro de sécurité sociale est 123456789012345.
    John Doe réside au 456 Avenue des Champs-Élysées, 75008 Paris, et son numéro de sécu est 987654321098765.
    Jane Smith vit au 789 Boulevard Saint-Germain, 75006 Paris.
    """

    # Appel de la fonction pour extraire les adresses et numéros de sécurité sociale
    addresses, ssns = extract_addresses_and_ssn(test_text)

    # Affichage des résultats
    print("Adresses extraites :", addresses)
    print("Numéros de sécurité sociale extraits :", ssns)

# Exécuter le test
test_extract_addresses_and_ssn()

In [None]:
#!pip install pdfplumber
import pdfplumber

def extract_text_from_pdf(file_path):
    """Extract text from a PDF file."""
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

def test_extract_from_pdf(file_path):
    # Extraire le texte du fichier PDF
    pdf_text = extract_text_from_pdf(file_path)

    #print(pdf_text)

    # Extraire les adresses et numéros de sécurité sociale du texte extrait
    addresses, ssns = extract_addresses_and_ssn(pdf_text)

    # Afficher les résultats
    print("Adresses extraites :", addresses)
    print("Numéros de sécurité sociale extraits :", ssns)

# Exemple d'appel avec un fichier PDF (mettre le chemin du fichier ici)
test_extract_from_pdf("/content/drive/MyDrive/document/Bull_07_2020_GSI.pdf")


In [None]:
! pip install transformers

In [None]:
!python -m spacy download fr_core_news_lg

In [None]:
import spacy
import re
from typing import List, Dict

def detect_pii_with_context(text: str, context_words: int = 5) -> List[Dict]:
    """
    Détecte les PII et leur contexte en utilisant spaCy et regex

    Args:
        text: Le texte à analyser
        context_words: Nombre de mots de contexte avant et après

    Returns:
        Liste des PII trouvées avec leur contexte
    """
    # Charger le modèle français
    try:
        nlp = spacy.load("fr_core_news_lg")  # Utiliser le grand modèle pour une meilleure précision
    except OSError:
        print("Installation du modèle français requise. Exécutez :")
        print("python -m spacy download fr_core_news_lg")
        return []

    # Analyser le texte
    doc = nlp(text)

    # Liste pour stocker les résultats
    pii_with_context = []

    # Détection des entités avec spaCy
    for ent in doc.ents:
        # Ne garder que les types pertinents pour les PII
        if ent.label_ in ['PER', 'LOC', 'ORG']:
            # Trouver les tokens avant et après
            start_token_idx = ent[0].i
            end_token_idx = ent[-1].i

            # Extraire le contexte avant
            start_context = max(0, start_token_idx - context_words)
            before_tokens = doc[start_context:start_token_idx]
            before_text = ' '.join(token.text for token in before_tokens)

            # Extraire le contexte après
            end_context = min(len(doc), end_token_idx + 1 + context_words)
            after_tokens = doc[end_token_idx + 1:end_context]
            after_text = ' '.join(token.text for token in after_tokens)

            # Mapper les types d'entités
            type_mapping = {
                'PER': 'PERSONNE',
                'LOC': 'LIEU',
                'ORG': 'ORGANISATION'
            }

            pii_with_context.append({
                'pii': ent.text,
                'type': type_mapping.get(ent.label_, ent.label_),
                'context': f"{before_text} >>> {ent.text} <<< {after_text}".strip()
            })

    # Détection des numéros de sécurité sociale (format français : 15 chiffres)
    ssn_pattern = r'\b\d{15}\b'
    ssn_matches = re.findall(ssn_pattern, text)
    for ssn in ssn_matches:
        # Ajout du numéro de sécurité sociale à la liste
        pii_with_context.append({
            'pii': ssn,
            'type': 'NUMÉRO DE SÉCURITÉ SOCIALE',
            'context': f"Numéro de sécurité sociale trouvé: {ssn}"
        })

    # Détection des adresses (exemple simple)
    address_pattern = r'\d+\s\w+\s\w+(?:\s\w+)?\s*,?\s*\w+.*'  # Adapté selon le format des adresses
    address_matches = re.findall(address_pattern, text)
    for address in address_matches:
        # Ajout de l'adresse à la liste
        pii_with_context.append({
            'pii': address,
            'type': 'ADRESSE',
            'context': f"Adresse trouvée: {address}"
        })

    return pii_with_context

def main():
    # Exemple de texte avec des PII
    texte = """
    Jean Dupont travaille comme ingénieur chez Total à Paris.
    Marie Martin vit au 10 rue de la République, Lyon.
    Son numéro de sécurité sociale est 123456789012345.
    Pierre Thomas a récemment déménagé à Marseille pour rejoindre Apple.
    """

    # Détecter les PII
    resultats = detect_pii_with_context(texte)

    # Afficher les résultats
    print("\nPII détectées avec contexte :")
    print("-" * 70)
    for item in resultats:
        print(f"PII     : {item['pii']}")
        print(f"Type    : {item['type']}")
        print(f"Contexte: {item['context']}")
        print("-" * 70)

if __name__ == "__main__":
    main()
