In [3]:
 ! python -m spacy download sv_core_news_md

C:\Users\felix\Anaconda3\python.exe: No module named spacy


In [5]:
#Import libraries
import os
import re
import spacy
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from dotenv import load_dotenv
from langchain_experimental.data_anonymizer import PresidioAnonymizer
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
from langdetect import detect, LangDetectException
from faker import Faker
from langchain.schema import runnable
import requests
import json



In [3]:
%pip install --upgrade --quiet langdetect

Note: you may need to restart the kernel to use updated packages.


In [9]:
load_dotenv()

OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"]
if OPENROUTER_API_KEY is None:
    raise ValueError("OPENROUTER_API_KEY does not exist, add it to env")

This cell uses the OpenAI GPT-4 API to generate pre-marked flowing-text training data.
The generated texts include dummy data of personal identifiable information (PII). Each instance of sensitive information within the text is clearly marked according to predefined tags.

**Functions of the code:**
- **Folder Creation**: Automatically creates a folder for output if it does not already exist.
- **Data Generation**: Utilizes OpenAI's GPT-4 to craft text strings embedded with marked personal information.
- **File Output**: Saves each generated text string as a `.txt` file within the designated output folder.
- **PII Tagging**: Demonstrates how to programmatically mark personal information within text using specific XML-like tags for different data types.

**Sensitive Data Includes:**
- Names, phone numbers, addresses, email addresses, numeric identifiers (e.g., member numbers, bank account numbers), and credit card

**Note**
If text files already exist in the folder, please delete them before generating new ones. details.

In [10]:
import openai

#Enter the name of the folder where the texts will be generated.
#Creates a new folder if one does not exist
output_folder = "generated_texts_presidio"
os.makedirs(output_folder, exist_ok=True)

prompt = f"""
    Jag skapar output träningsdata som ska användas för att träna min modell.
    Användandet ska vara till att generera en löpande text som ska innehålla dummy data av påhittad personlig känslig information.
    Varje gång känslig information genereras ska den markeras tydligt i texten. Endast en löpande text, ingen annan output.
    
    Markeringsformat för känslig information:
    <name> för namn, <phone> för telefonnummer, <address> för adresser, <email> för e-postadresser, <id> för numeriska/alfanumeriska identifierare,
    och <credit_card> för kreditkortsinformation.
    Innan och efter markeringen lämna mellanrum.
    
    Exempel på hur texten ska formuleras:
    'Jag träffade en person som hette <name> Johan Svensson </name> igår. Han gav mig sitt telefonnummer <phone> 123-456-7890 </phone>
    samt hans e-postadress <email> johan.svensson@gmail.com </email>.'
    
    Personlig känslig information inkluderar:
    Person/Namn - Detta inkluderar förnamn, mellannamn, efternamn eller hela namn på individer.
    Telefonnummer - Alla telefonnummer, inklusive avgiftsfria nummer.
    Adress - Kompletta eller partiella adresser, inklusive gata, postnummer, husnummer, stad och stat.
    E-post - Alla e-postadresser.
    Numeriskt Identifierare - Alla numeriska eller alfanumeriska identifierare som ärendenummer, medlemsnummer, biljettnummer, bankkontonummer, IP-adresser, produktnycklar, serienummer, spårningsnummer för frakt, etc.
    Kreditkort - Alla kreditkortsnummer, säkerhetskoder eller utgångsdatum.
    """

def query(prompt, temperature=0.8):
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
            "Content-Type": "application/json"
        },
        data=json.dumps({
            "model": "openai/gpt-4o",  # using openai gpt4o for the generating of texts
            "messages": [
                {"role": "system", "content": "Du är en hjälpful assistent, designad för att generera text data."},
                {"role": "user", "content": prompt}
            ],
            "temperature": temperature,  #  influences the variety in the model's responses, is set in the function parameter.
            "max_tokens": 300 
        })
    )

    if response.status_code == 200:
        response_data = response.json()
        return response_data['choices'][0]['message']['content']
    else:
        raise Exception(f"API request failed with status code {response.status_code}: {response.text}")


for i in range(100):  # Enter the amout of texts to generate
    res = query(prompt)
    file_path = os.path.join(output_folder, f"text_{i+1}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(res)
    
print("Texter genererade och sparade.")



Texter genererade och sparade.


Creates arrays of the PII in the texts wich will be used to test models comparing if the PII in these arrays still exist in the text

In [11]:
import os
import re

def extract_pii_contents(text):
    #Extracts PII contents from the text using regex.
    pattern = re.compile(r'<\w+>(.*?)</\w+>')
    return [match.group(1) for match in pattern.finditer(text)]

def read_files_and_extract_pii(folder_path):
    #Reads each text file in the folder, extracts PII contents, and returns a dict of filename to PII list.
    files_pii = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                pii_contents = extract_pii_contents(text)
                files_pii[filename] = pii_contents
    return files_pii

folder_path = 'generated_texts_presidio'
files_pii = read_files_and_extract_pii(folder_path)
for filename, pii_contents in files_pii.items():
    print(f"PII contents in {filename}: {pii_contents}")

PII contents in text_1.txt: [' Maria Andersson ', ' Storgatan 45, 12345 Stockholm ', ' 070-123-4567 ', ' maria.andersson@example.com ', ' ABC123XYZ ', ' 1234-5678-9101-1121, 123 ', ' EMP987654 ']
PII contents in text_10.txt: [' Maria Eriksson ', ' Storgatan 12, 123 45 Stockholm ', ' 070-123 4567 ', ' maria.eriksson@hotmail.com ', ' Erik Johansson ', ' 8475-KJH-7643 ', ' Peter Larsson ', ' 1234-5678-9876-5432 ']
PII contents in text_100.txt: [' Anna Karlsson ', ' Västra Gatan 12, 123 45 Småstad ', ' 070-123 45 67 ', ' anna.karlsson@example.com ', ' 4111-1111-1111-1111 ', ' 987654321 ']
PII contents in text_11.txt: [' Maria Karlsson ', ' Storgatan 12, 12345 Staden ', ' 070-123-4567 ', ' maria.karlsson@example.com ', ' ABC123XYZ ', ' 1234-5678-9876-5432 ']
PII contents in text_12.txt: [' Anna Karlsson ', ' Götgatan 14, 411 05 Göteborg ', ' 070-123 45 67 ', ' anna.karlsson@mail.com ', ' BK123456 ', ' Anna ']
PII contents in text_13.txt: [' Emma Karlsson ', ' Storgatan 12, 12345 Stockholm '

In [12]:
from spacy.pipeline import EntityRuler
from spacy.tokens import Span, DocBin
from spacy.language import Language
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_analyzer.nlp_engine import NlpEngine
import os
import spacy.util

nlp_sv = spacy.load("sv_core_news_md")

# Create a Faker objects
fake = Faker('sv_SE') 
fake_en = Faker('en_US')

def add_custom_recognizers(analyzer_engine):
    #PatternRecognizer for phone numbers
    # Numbers starting with the country code +46, followed by 8 to 10 digits.
    # Numbers in a group format that might be separated by spaces or dashes.
    swedish_phone_recognizer = PatternRecognizer(
        supported_entity="PHONE_NUMBER",
        patterns=[Pattern("Swedish Phone Number", r'\+46\s?\d{1,4}\s?\d{2,8}|\d{2,4}-\d{2,8}|\d{10}', 0.8)]
    )
    
    # PatternRecognizer for email addresses
    email_recognizer = PatternRecognizer(
        supported_entity="EMAIL_ADDRESS",
        patterns=[Pattern("Email Address", r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', 0.8)]
    )
    
    # PatternRecognizer for credit card numbers
    credit_card_recognizer = PatternRecognizer(
        supported_entity="CREDIT_CARD",
        patterns=[Pattern("Credit Card Number", r'\b(?:\d{4}[-\s]?){3}\d{4}\b', 0.85)]
    )

    # Patternrecognizer for patterns like product keys, etc.
    product_key_pattern = PatternRecognizer(
        supported_entity="PRODUCT_KEY",
        patterns=[Pattern("Product Key Pattern", r"\b[A-Z0-9]{5}-[A-Z0-9]{5}-[A-Z0-9]{5}\b", 0.95)])

    ssn_recognizer = PatternRecognizer(
        supported_entity="SSN",
        patterns=[Pattern("Swedish SSN", r'\b\d{6}[-\s]?\d{4}\b', 0.95)])

    license_plate_recognizer = PatternRecognizer(
        supported_entity="LICENSE_PLATE",
        patterns=[Pattern("Swedish License Plate", r'\b[A-Z]{3}\s?\d{3}\b', 0.9)])

    ip_address_recognizer = PatternRecognizer(
        supported_entity="IP_ADDRESS",
        patterns=[Pattern("IP Address", r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', 0.9)])

    bank_account_recognizer = PatternRecognizer(
        supported_entity="BANK_ACCOUNT",
        patterns=[Pattern("Swedish Bank Account", r'\b\d{3,4}[-\s]?\d{2,4}[-\s]?\d{2,7}\b', 0.85)])
    
    # Add recognizers to the engine
    analyzer_engine.registry.add_recognizer(swedish_phone_recognizer)
    analyzer_engine.registry.add_recognizer(email_recognizer)
    analyzer_engine.registry.add_recognizer(credit_card_recognizer)
    analyzer_engine.registry.add_recognizer(product_key_pattern)
    analyzer_engine.registry.add_recognizer(ssn_recognizer)
    analyzer_engine.registry.add_recognizer(license_plate_recognizer)
    analyzer_engine.registry.add_recognizer(ip_address_recognizer)
    analyzer_engine.registry.add_recognizer(bank_account_recognizer)


    return analyzer_engine

def anonymize_with_spacy(text: str, language: str) -> str:
    
    if language == 'sv':

        doc = nlp_sv(text)
        anonymized_text = ""
        last_end = 0
        for ent in doc.ents:
            anonymized_text += text[last_end:ent.start_char]  # Add text before the entity
            if ent.label_ == "PRS":  # Person name
                anonymized_text += fake.name()
            elif ent.label_ == "LOC":  # Locations
                anonymized_text += fake.city()
            elif ent.label_ == "ORG":  # Organizations
                anonymized_text += fake.company()
            elif ent.label_ == "TME": # Time
                anonymized_text += str(fake.date_of_birth())
            elif ent.label_ == "EMAIL_ADDRESS":  # Email addresses
                anonymized_text += fake.email()
            elif ent.label_ == "CREDIT_CARD":  # Credit card numbers
                anonymized_text += fake_en.credit_card_number()
            elif ent.label_ == "PHONE_NUMBER":  # Phone numbers
                anonymized_text += fake.phone_number()
            elif ent.label == "SSN":
                anonymized_text += fake_en.ssn()
            elif ent.label == "LICENSE_PLATE":
                anonymized_text += fake.bothify(text='???-###')
            elif ent.label == "IP_ADDRESS":
                anonymized_text += fake_en.ipv4()
            elif ent.label == "BANK_ACCOUNT":
                anonymized_text += fake.bban()
            else:
                anonymized_text += '[ANONYMIZED]'  # Default anonymization
            last_end = ent.end_char
        anonymized_text += text[last_end:]  # Add the remaining text after last entity
        return anonymized_text

    return text  # Returns the original text if not Swedish

def detect_language_and_anonymize(text: str) -> dict:
    try:
        # Check if text is not empty
        if text.strip() and any(char.isalpha() for char in text):
            language = detect(text)
        else:
            language = 'sv'  # Default to Swedish
    except LangDetectException:
        language = 'sv'  # Default fallback

    anonymized_text = anonymize_with_spacy(text, language)
    return {"text": anonymized_text, "language": language}

def process_folder(input_folder, output_folder, batch_size=5):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    # List all text files
    text_files = [f for f in os.listdir(input_folder) if f.endswith(".txt")]
    
    # Process files in batches
    for batch in spacy.util.minibatch(text_files, size=batch_size):
        # Process each batch
        for file_name in batch:
            file_path = os.path.join(input_folder, file_name)
            output_file_path = os.path.join(output_folder, file_name)

            # Read file content
            with open(file_path, 'r', encoding='utf-8') as file:
                text_content = file.read()

            # Anonymize text content
            result = detect_language_and_anonymize(text_content)

            # Write the anonymized text to the output file
            with open(output_file_path, 'w', encoding='utf-8') as file:
                file.write(result['text'])

            print(f"Processed and saved anonymized text for {file_name}")

if __name__ == '__main__':
    analyzer_engine = AnalyzerEngine()
    analyzer_engine = add_custom_recognizers(analyzer_engine)
    input_folder = 'generated_texts_presidio'
    output_folder = 'anonymized_texts_presidio'
    process_folder(input_folder, output_folder)

Processed and saved anonymized text for text_1.txt
Processed and saved anonymized text for text_10.txt
Processed and saved anonymized text for text_100.txt
Processed and saved anonymized text for text_11.txt
Processed and saved anonymized text for text_12.txt
Processed and saved anonymized text for text_13.txt
Processed and saved anonymized text for text_14.txt
Processed and saved anonymized text for text_15.txt
Processed and saved anonymized text for text_16.txt
Processed and saved anonymized text for text_17.txt
Processed and saved anonymized text for text_18.txt
Processed and saved anonymized text for text_19.txt
Processed and saved anonymized text for text_2.txt
Processed and saved anonymized text for text_20.txt
Processed and saved anonymized text for text_21.txt
Processed and saved anonymized text for text_22.txt
Processed and saved anonymized text for text_23.txt
Processed and saved anonymized text for text_24.txt
Processed and saved anonymized text for text_25.txt
Processed and

In [3]:
from spacy.pipeline import EntityRuler
from spacy.tokens import Span, DocBin
from spacy.language import Language
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_analyzer.nlp_engine import NlpEngine
import os
import spacy.util

# Load Swedish NLP model
nlp_sv = spacy.load("sv_core_news_md")

def add_custom_recognizers(analyzer_engine):
    # PatternRecognizer for phone numbers
    swedish_phone_recognizer = PatternRecognizer(
        supported_entity="PHONE_NUMBER",
        patterns=[Pattern("Swedish Phone Number", r'\+46\s?\d{1,4}\s?\d{2,8}|\d{2,4}-\d{2,8}|\d{10}', 0.8)]
    )

    # PatternRecognizer for email addresses
    email_recognizer = PatternRecognizer(
        supported_entity="EMAIL_ADDRESS",
        patterns=[Pattern("Email Address", r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', 0.8)]
    )

    # PatternRecognizer for credit card numbers
    credit_card_recognizer = PatternRecognizer(
        supported_entity="CREDIT_CARD",
        patterns=[Pattern("Credit Card Number", r'\b(?:\d{4}[-\s]?){3}\d{4}\b', 0.85)]
    )

    # Patternrecognizer for patterns like product keys, etc.
    product_key_pattern = PatternRecognizer(
        supported_entity="PRODUCT_KEY",
        patterns=[Pattern("Product Key Pattern", r"\b[A-Z0-9]{5}-[A-Z0-9]{5}-[A-Z0-9]{5}\b", 0.95)]
    )

    ssn_recognizer = PatternRecognizer(
        supported_entity="SSN",
        patterns=[Pattern("Swedish SSN", r'\b\d{6}[-\s]?\d{4}\b', 0.95)]
    )

    license_plate_recognizer = PatternRecognizer(
        supported_entity="LICENSE_PLATE",
        patterns=[Pattern("Swedish License Plate", r'\b[A-Z]{3}\s?\d{3}\b', 0.9)]
    )

    ip_address_recognizer = PatternRecognizer(
        supported_entity="IP_ADDRESS",
        patterns=[Pattern("IP Address", r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', 0.9)]
    )

    bank_account_recognizer = PatternRecognizer(
        supported_entity="BANK_ACCOUNT",
        patterns=[Pattern("Swedish Bank Account", r'\b\d{3,4}[-\s]?\d{2,4}[-\s]?\d{2,7}\b', 0.85)]
    )

    # Add recognizers to the engine
    analyzer_engine.registry.add_recognizer(swedish_phone_recognizer)
    analyzer_engine.registry.add_recognizer(email_recognizer)
    analyzer_engine.registry.add_recognizer(credit_card_recognizer)
    analyzer_engine.registry.add_recognizer(product_key_pattern)
    analyzer_engine.registry.add_recognizer(ssn_recognizer)
    analyzer_engine.registry.add_recognizer(license_plate_recognizer)
    analyzer_engine.registry.add_recognizer(ip_address_recognizer)
    analyzer_engine.registry.add_recognizer(bank_account_recognizer)

    return analyzer_engine

def anonymize_with_spacy(text: str, language: str) -> str:
    if language == 'sv':
        doc = nlp_sv(text)
        anonymized_text = ""
        last_end = 0
        for ent in doc.ents:
            anonymized_text += text[last_end:ent.start_char]  # Add text before the entity
            anonymized_text += '[ANONYMIZED]'  # Anonymize the entity
            last_end = ent.end_char
        anonymized_text += text[last_end:]  # Add the remaining text after last entity
        return anonymized_text

    return text  # Returns the original text if not Swedish

def detect_language_and_anonymize(text: str) -> dict:
    try:
        # Check if text is not empty
        if text.strip() and any(char.isalpha() for char in text):
            language = detect(text)
        else:
            language = 'sv'  # Default to Swedish
    except LangDetectException:
        language = 'sv'  # Default fallback

    anonymized_text = anonymize_with_spacy(text, language)
    return {"text": anonymized_text, "language": language}

def process_folder(input_folder, output_folder, batch_size=5):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    # List all text files
    text_files = [f for f in os.listdir(input_folder) if f.endswith(".txt")]
    
    # Process files in batches
    for batch in spacy.util.minibatch(text_files, size=batch_size):
        # Process each batch
        for file_name in batch:
            file_path = os.path.join(input_folder, file_name)
            output_file_path = os.path.join(output_folder, file_name)

            # Read file content
            with open(file_path, 'r', encoding='utf-8') as file:
                text_content = file.read()

            # Anonymize text content
            result = detect_language_and_anonymize(text_content)

            # Write the anonymized text to the output file
            with open(output_file_path, 'w', encoding='utf-8') as file:
                file.write(result['text'])

            print(f"Processed and saved anonymized text for {file_name}")

if __name__ == '__main__':
    analyzer_engine = AnalyzerEngine()
    analyzer_engine = add_custom_recognizers(analyzer_engine)
    input_folder = 'generated_texts'
    output_folder = 'anonymized_texts/presidio'
    process_folder(input_folder, output_folder)


Processed and saved anonymized text for text_1.txt
Processed and saved anonymized text for text_10.txt
Processed and saved anonymized text for text_100.txt
Processed and saved anonymized text for text_11.txt
Processed and saved anonymized text for text_12.txt
Processed and saved anonymized text for text_13.txt
Processed and saved anonymized text for text_14.txt
Processed and saved anonymized text for text_15.txt
Processed and saved anonymized text for text_16.txt
Processed and saved anonymized text for text_17.txt
Processed and saved anonymized text for text_18.txt
Processed and saved anonymized text for text_19.txt
Processed and saved anonymized text for text_2.txt
Processed and saved anonymized text for text_20.txt
Processed and saved anonymized text for text_21.txt
Processed and saved anonymized text for text_22.txt
Processed and saved anonymized text for text_23.txt
Processed and saved anonymized text for text_24.txt
Processed and saved anonymized text for text_25.txt
Processed and

In [14]:
def calculate_recall(true_positives, false_negatives):
    return true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

def evaluate_anonymization(original_pii, folder_path_anonymized):
    true_positives, false_negatives = 0, 0
    missed_pii_details = []  # Store details about PII that were not anonymized correctly.

    for filename, original_pii_contents in original_pii.items():
        anonymized_file_path = os.path.join(folder_path_anonymized, filename)
        with open(anonymized_file_path, 'r', encoding='utf-8') as file:
            anonymized_text = file.read()

        original_pii_set = set(original_pii_contents)
        anonymized_pii_set = set(extract_pii_contents(anonymized_text))

        detected_pii = original_pii_set & anonymized_pii_set
        missed_pii = original_pii_set - anonymized_pii_set

        true_positives += len(detected_pii)
        false_negatives += len(missed_pii)

        if missed_pii:
            missed_pii_details.append((filename, list(missed_pii)))

    recall = calculate_recall(true_positives, false_negatives)
    return {
        "recall": recall,
        "missed_pii": missed_pii_details
    }

# Example usage
folder_path_anonymized = 'anonymized_texts_presidio'
results = evaluate_anonymization(files_pii, folder_path_anonymized)

print(f"Recall: {results['recall']}")


Recall: 0.6352941176470588


In [110]:
if results['missed_pii']:
    print("Missed PII Details:")
    for file, pii in results['missed_pii']:
        print(f"File: {file}, Missed PII: {pii}")

Missed PII Details:
File: text_1.txt, Missed PII: [' Trädkojvägen 7, 660 60 Malung ', ' Berggränd 29, 140 34 Stockholm ', ' Carl Gustafsson ', ' Sara Eriksson ', ' Techvägen 3, 753 20 Uppsala ', ' mikael.blomqvist@hotmail.com ', ' Anna Berg ', ' Adam Lindström ', ' Mikael Blomqvist ']
File: text_10.txt, Missed PII: [' Rörstrandsgatan 21A, 113 55 Stockholm ', ' Isabella Larsson ']
File: text_11.txt, Missed PII: [' Smågatan 34, 67890 Landsville ', ' Anna Johansson ', ' anna.johansson@domain.se ', ' Storgatan 54, 12345 Stadsville ', ' Erik Olsson ']
File: text_12.txt, Missed PII: [' Kungsgatan 15, 111 43 Stockholm ', ' Karin Lindgren ']
File: text_13.txt, Missed PII: [' Emma Karlsson ', ' Södergatan 24, 12345 Stockholm ', ' Jakob Nilsson ', ' Södergatan 23, 12345 Stockholm ']
File: text_14.txt, Missed PII: [' Anna Andersson ', ' Smågatan 12, 34567 Lund ', ' Grönvägen 5, 12345 Stockholm ', ' Blomstergatan 23, 67890 Göteborg ', ' Ida Bergström ', ' Erik Johansson ', ' Oskar Pettersson ', ' 

In [14]:
load_dotenv()
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
if OPENAI_API_KEY is None:
    raise ValueError("OPENAI_API_KEY does not exist, add it to env")