In [3]:
 ! python -m spacy download sv_core_news_md

C:\Users\felix\Anaconda3\python.exe: No module named spacy


In [2]:
#Import libraries
import os
import re
import spacy
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from dotenv import load_dotenv
from langchain_experimental.data_anonymizer import PresidioAnonymizer
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
from langdetect import detect, LangDetectException
from faker import Faker
from langchain.schema import runnable



In [3]:
%pip install --upgrade --quiet langdetect

Note: you may need to restart the kernel to use updated packages.


In [31]:
load_dotenv()
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
if OPENAI_API_KEY is None:
    raise ValueError("OPENAI_API_KEY does not exist, add it to env")

This cell uses the OpenAI GPT-4 API to generate pre-marked flowing-text training data.
The generated texts include dummy data of personal identifiable information (PII). Each instance of sensitive information within the text is clearly marked according to predefined tags.

**Functions of the code:**
- **Folder Creation**: Automatically creates a folder for output if it does not already exist.
- **Data Generation**: Utilizes OpenAI's GPT-4 to craft text strings embedded with marked personal information.
- **File Output**: Saves each generated text string as a `.txt` file within the designated output folder.
- **PII Tagging**: Demonstrates how to programmatically mark personal information within text using specific XML-like tags for different data types.

**Sensitive Data Includes:**
- Names, phone numbers, addresses, email addresses, numeric identifiers (e.g., member numbers, bank account numbers), and credit card

**Note**
If text files already exist in the folder, please delete them before generating new ones. details.

In [98]:
import openai

#Enter the name of the folder where the texts will be generated.
#Creates a new folder if one does not exist
output_folder = "generated_texts"
os.makedirs(output_folder, exist_ok=True)

llm = OpenAI(openai_api_key=OPENAI_API_KEY)

output_string = ""
for i in range(50):
    prompt = f"""
    Jag skapar output träningsdata som ska användas för att träna min modell.
    Användandet ska vara till att generera en löpande text som ska innehålla dummy data av påhittad personlig känslig information.
    Varje gång känslig information genereras ska den markeras tydligt i texten. Endast en löpande text, ingen annan output.
    
    Markeringsformat för känslig information:
    <name> för namn, <phone> för telefonnummer, <address> för adresser, <email> för e-postadresser, <id> för numeriska/alfanumeriska identifierare, och <credit_card> för kreditkortsinformation.
    Innan och efter markeringen lämna mellanrum.
    
    Exempel på hur texten ska formuleras:
    'Jag träffade en person som hette <name> Johan Svensson </name> igår. Han gav mig sitt telefonnummer <phone> 123-456-7890 </phone> samt hans e-postadress <email> johan.svensson@gmail.com </email>.'
    
    Personlig känslig information inkluderar:
    Person/Namn - Detta inkluderar förnamn, mellannamn, efternamn eller hela namn på individer.
    Telefonnummer - Alla telefonnummer, inklusive avgiftsfria nummer.
    Adress - Kompletta eller partiella adresser, inklusive gata, postnummer, husnummer, stad och stat.
    E-post - Alla e-postadresser.
    Numeriskt Identifierare - Alla numeriska eller alfanumeriska identifierare som ärendenummer, medlemsnummer, biljettnummer, bankkontonummer, IP-adresser, produktnycklar, serienummer, spårningsnummer för frakt, etc.
    Kreditkort - Alla kreditkortsnummer, säkerhetskoder eller utgångsdatum.
    """
    
    response = openai.chat.completions.create(
    model="gpt-4",
    messages=[
      {"role": "system", "content": "Du är en hjälpful assistent, designad för att generera text data."},
      {"role": "user", "content": prompt}
    ]
    )
    res = response.choices[0].message.content
    file_path = os.path.join(output_folder, f"text_{i+1}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(res)

print("Texter genererade och sparade.")


Texter genererade och sparade.


Creates arrays of the PII in the texts wich will be used to test models comparing if the PII in these arrays still exist in the text

In [3]:
import os
import re

def extract_pii_contents(text):
    #Extracts PII contents from the text using regex.
    pattern = re.compile(r'<\w+>(.*?)</\w+>')
    return [match.group(1) for match in pattern.finditer(text)]

def read_files_and_extract_pii(folder_path):
    #Reads each text file in the folder, extracts PII contents, and returns a dict of filename to PII list.
    files_pii = {}
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                pii_contents = extract_pii_contents(text)
                files_pii[filename] = pii_contents
    return files_pii

folder_path = 'generated_texts'
files_pii = read_files_and_extract_pii(folder_path)
for filename, pii_contents in files_pii.items():
    print(f"PII contents in {filename}: {pii_contents}")

PII contents in text_1.txt: [' Adam Lindström ', ' 076-555-7890 ', ' Berggränd 29, 140 34 Stockholm ', ' Sara Eriksson ', ' 072-123-4567 ', ' Trädkojvägen 7, 660 60 Malung ', ' Carl Gustafsson ', ' carl.gustafsson@gmail.com ', ' CG123456J ', ' Mikael Blomqvist ', ' 079-654-3210 ', ' mikael.blomqvist@hotmail.com ', ' Techvägen 3, 753 20 Uppsala ', ' Anna Berg ', ' 070-999-8888 ', ' anna.berg@yahoo.com ', ' 5555-5555-5555-5555 ']
PII contents in text_10.txt: [' Isabella Larsson ', ' 0768-123-456 ', ' isabella.larsson@samplemail.com ', ' Rörstrandsgatan 21A, 113 55 Stockholm ', ' IT789456 ', ' 1234-5678-9012-3456 ']
PII contents in text_11.txt: [' Anna Johansson ', ' 070-123-4567 ', ' Storgatan 54, 12345 Stadsville ', ' anna.johansson@domain.se ', ' AJ7659 ', ' 4564-2359-4839-1234 ', ' Erik Olsson ', ' 072-789-0123 ', ' Smågatan 34, 67890 Landsville ', ' erik.olsson@web.se ', ' EO1239 ', ' 1234-5678-9101-1121 ']
PII contents in text_12.txt: [' Karin Lindgren ', ' 070-123-1698 ', ' karin.l

In [27]:
from spacy.pipeline import EntityRuler
from spacy.tokens import Span, DocBin
from spacy.language import Language
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_analyzer.nlp_engine import NlpEngine
import os
import spacy.util

nlp_sv = spacy.load("sv_core_news_md")

# Create a Faker objects
fake = Faker('sv_SE') 
fake_en = Faker('en_US')

def add_custom_recognizers(analyzer_engine):
    #PatternRecognizer for phone numbers
    # Numbers starting with the country code +46, followed by 8 to 10 digits.
    # Numbers in a group format that might be separated by spaces or dashes.
    swedish_phone_recognizer = PatternRecognizer(
        supported_entity="PHONE_NUMBER",
        patterns=[Pattern("Swedish Phone Number", r'\+46\s?\d{1,4}\s?\d{2,8}|\d{2,4}-\d{2,8}|\d{10}', 0.8)]
    )
    
    # PatternRecognizer for email addresses
    email_recognizer = PatternRecognizer(
        supported_entity="EMAIL_ADDRESS",
        patterns=[Pattern("Email Address", r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', 0.8)]
    )
    
    # PatternRecognizer for credit card numbers
    credit_card_recognizer = PatternRecognizer(
        supported_entity="CREDIT_CARD",
        patterns=[Pattern("Credit Card Number", r'\b(?:\d{4}[-\s]?){3}\d{4}\b', 0.85)]
    )

    # Patternrecognizer for patterns like product keys, etc.
    product_key_pattern = PatternRecognizer(
        supported_entity="PRODUCT_KEY",
        patterns=[Pattern("Product Key Pattern", r"\b[A-Z0-9]{5}-[A-Z0-9]{5}-[A-Z0-9]{5}\b", 0.95)])

    ssn_recognizer = PatternRecognizer(
        supported_entity="SSN",
        patterns=[Pattern("Swedish SSN", r'\b\d{6}[-\s]?\d{4}\b', 0.95)])

    license_plate_recognizer = PatternRecognizer(
        supported_entity="LICENSE_PLATE",
        patterns=[Pattern("Swedish License Plate", r'\b[A-Z]{3}\s?\d{3}\b', 0.9)])

    ip_address_recognizer = PatternRecognizer(
        supported_entity="IP_ADDRESS",
        patterns=[Pattern("IP Address", r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', 0.9)])

    bank_account_recognizer = PatternRecognizer(
        supported_entity="BANK_ACCOUNT",
        patterns=[Pattern("Swedish Bank Account", r'\b\d{3,4}[-\s]?\d{2,4}[-\s]?\d{2,7}\b', 0.85)])
    
    # Add recognizers to the engine
    analyzer_engine.registry.add_recognizer(swedish_phone_recognizer)
    analyzer_engine.registry.add_recognizer(email_recognizer)
    analyzer_engine.registry.add_recognizer(credit_card_recognizer)
    analyzer_engine.registry.add_recognizer(product_key_pattern)
    analyzer_engine.registry.add_recognizer(ssn_recognizer)
    analyzer_engine.registry.add_recognizer(license_plate_recognizer)
    analyzer_engine.registry.add_recognizer(ip_address_recognizer)
    analyzer_engine.registry.add_recognizer(bank_account_recognizer)


    return analyzer_engine

def anonymize_with_spacy(text: str, language: str) -> str:
    
    if language == 'sv':

        doc = nlp_sv(text)
        anonymized_text = ""
        last_end = 0
        for ent in doc.ents:
            anonymized_text += text[last_end:ent.start_char]  # Add text before the entity
            if ent.label_ == "PRS":  # Person name
                anonymized_text += fake.name()
            elif ent.label_ == "LOC":  # Locations
                anonymized_text += fake.city()
            elif ent.label_ == "ORG":  # Organizations
                anonymized_text += fake.company()
            elif ent.label_ == "TME": # Time
                anonymized_text += str(fake.date_of_birth())
            elif ent.label_ == "EMAIL_ADDRESS":  # Email addresses
                anonymized_text += fake.email()
            elif ent.label_ == "CREDIT_CARD":  # Credit card numbers
                anonymized_text += fake_en.credit_card_number()
            elif ent.label_ == "PHONE_NUMBER":  # Phone numbers
                anonymized_text += fake.phone_number()
            elif ent.label == "SSN":
                anonymized_text += fake_en.ssn()
            elif ent.label == "LICENSE_PLATE":
                anonymized_text += fake.bothify(text='???-###')
            elif ent.label == "IP_ADDRESS":
                anonymized_text += fake_en.ipv4()
            elif ent.label == "BANK_ACCOUNT":
                anonymized_text += fake.bban()
            else:
                anonymized_text += '[ANONYMIZED]'  # Default anonymization
            last_end = ent.end_char
        anonymized_text += text[last_end:]  # Add the remaining text after last entity
        return anonymized_text

    return text  # Returns the original text if not Swedish

def detect_language_and_anonymize(text: str) -> dict:
    try:
        # Check if text is not empty
        if text.strip() and any(char.isalpha() for char in text):
            language = detect(text)
        else:
            language = 'sv'  # Default to Swedish
    except LangDetectException:
        language = 'sv'  # Default fallback

    anonymized_text = anonymize_with_spacy(text, language)
    return {"text": anonymized_text, "language": language}

def process_folder(input_folder, output_folder, batch_size=5):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    # List all text files
    text_files = [f for f in os.listdir(input_folder) if f.endswith(".txt")]
    
    # Process files in batches
    for batch in spacy.util.minibatch(text_files, size=batch_size):
        # Process each batch
        for file_name in batch:
            file_path = os.path.join(input_folder, file_name)
            output_file_path = os.path.join(output_folder, file_name)

            # Read file content
            with open(file_path, 'r', encoding='utf-8') as file:
                text_content = file.read()

            # Anonymize text content
            result = detect_language_and_anonymize(text_content)

            # Write the anonymized text to the output file
            with open(output_file_path, 'w', encoding='utf-8') as file:
                file.write(result['text'])

            print(f"Processed and saved anonymized text for {file_name}")

if __name__ == '__main__':
    analyzer_engine = AnalyzerEngine()
    analyzer_engine = add_custom_recognizers(analyzer_engine)
    input_folder = 'generated_texts'
    output_folder = 'anonymized_texts'
    process_folder(input_folder, output_folder)

Processed and saved anonymized text for text_1.txt
Processed and saved anonymized text for text_10.txt
Processed and saved anonymized text for text_11.txt
Processed and saved anonymized text for text_12.txt
Processed and saved anonymized text for text_13.txt
Processed and saved anonymized text for text_14.txt
Processed and saved anonymized text for text_15.txt
Processed and saved anonymized text for text_16.txt
Processed and saved anonymized text for text_17.txt
Processed and saved anonymized text for text_18.txt
Processed and saved anonymized text for text_19.txt
Processed and saved anonymized text for text_2.txt
Processed and saved anonymized text for text_20.txt
Processed and saved anonymized text for text_21.txt
Processed and saved anonymized text for text_22.txt
Processed and saved anonymized text for text_23.txt
Processed and saved anonymized text for text_24.txt
Processed and saved anonymized text for text_25.txt
Processed and saved anonymized text for text_26.txt
Processed and 

In [3]:
from spacy.pipeline import EntityRuler
from spacy.tokens import Span, DocBin
from spacy.language import Language
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_analyzer.nlp_engine import NlpEngine
import os
import spacy.util

# Load Swedish NLP model
nlp_sv = spacy.load("sv_core_news_md")

def add_custom_recognizers(analyzer_engine):
    # PatternRecognizer for phone numbers
    swedish_phone_recognizer = PatternRecognizer(
        supported_entity="PHONE_NUMBER",
        patterns=[Pattern("Swedish Phone Number", r'\+46\s?\d{1,4}\s?\d{2,8}|\d{2,4}-\d{2,8}|\d{10}', 0.8)]
    )

    # PatternRecognizer for email addresses
    email_recognizer = PatternRecognizer(
        supported_entity="EMAIL_ADDRESS",
        patterns=[Pattern("Email Address", r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', 0.8)]
    )

    # PatternRecognizer for credit card numbers
    credit_card_recognizer = PatternRecognizer(
        supported_entity="CREDIT_CARD",
        patterns=[Pattern("Credit Card Number", r'\b(?:\d{4}[-\s]?){3}\d{4}\b', 0.85)]
    )

    # Patternrecognizer for patterns like product keys, etc.
    product_key_pattern = PatternRecognizer(
        supported_entity="PRODUCT_KEY",
        patterns=[Pattern("Product Key Pattern", r"\b[A-Z0-9]{5}-[A-Z0-9]{5}-[A-Z0-9]{5}\b", 0.95)]
    )

    ssn_recognizer = PatternRecognizer(
        supported_entity="SSN",
        patterns=[Pattern("Swedish SSN", r'\b\d{6}[-\s]?\d{4}\b', 0.95)]
    )

    license_plate_recognizer = PatternRecognizer(
        supported_entity="LICENSE_PLATE",
        patterns=[Pattern("Swedish License Plate", r'\b[A-Z]{3}\s?\d{3}\b', 0.9)]
    )

    ip_address_recognizer = PatternRecognizer(
        supported_entity="IP_ADDRESS",
        patterns=[Pattern("IP Address", r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', 0.9)]
    )

    bank_account_recognizer = PatternRecognizer(
        supported_entity="BANK_ACCOUNT",
        patterns=[Pattern("Swedish Bank Account", r'\b\d{3,4}[-\s]?\d{2,4}[-\s]?\d{2,7}\b', 0.85)]
    )

    # Add recognizers to the engine
    analyzer_engine.registry.add_recognizer(swedish_phone_recognizer)
    analyzer_engine.registry.add_recognizer(email_recognizer)
    analyzer_engine.registry.add_recognizer(credit_card_recognizer)
    analyzer_engine.registry.add_recognizer(product_key_pattern)
    analyzer_engine.registry.add_recognizer(ssn_recognizer)
    analyzer_engine.registry.add_recognizer(license_plate_recognizer)
    analyzer_engine.registry.add_recognizer(ip_address_recognizer)
    analyzer_engine.registry.add_recognizer(bank_account_recognizer)

    return analyzer_engine

def anonymize_with_spacy(text: str, language: str) -> str:
    if language == 'sv':
        doc = nlp_sv(text)
        anonymized_text = ""
        last_end = 0
        for ent in doc.ents:
            anonymized_text += text[last_end:ent.start_char]  # Add text before the entity
            anonymized_text += '[ANONYMIZED]'  # Anonymize the entity
            last_end = ent.end_char
        anonymized_text += text[last_end:]  # Add the remaining text after last entity
        return anonymized_text

    return text  # Returns the original text if not Swedish

def detect_language_and_anonymize(text: str) -> dict:
    try:
        # Check if text is not empty
        if text.strip() and any(char.isalpha() for char in text):
            language = detect(text)
        else:
            language = 'sv'  # Default to Swedish
    except LangDetectException:
        language = 'sv'  # Default fallback

    anonymized_text = anonymize_with_spacy(text, language)
    return {"text": anonymized_text, "language": language}

def process_folder(input_folder, output_folder, batch_size=5):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    # List all text files
    text_files = [f for f in os.listdir(input_folder) if f.endswith(".txt")]
    
    # Process files in batches
    for batch in spacy.util.minibatch(text_files, size=batch_size):
        # Process each batch
        for file_name in batch:
            file_path = os.path.join(input_folder, file_name)
            output_file_path = os.path.join(output_folder, file_name)

            # Read file content
            with open(file_path, 'r', encoding='utf-8') as file:
                text_content = file.read()

            # Anonymize text content
            result = detect_language_and_anonymize(text_content)

            # Write the anonymized text to the output file
            with open(output_file_path, 'w', encoding='utf-8') as file:
                file.write(result['text'])

            print(f"Processed and saved anonymized text for {file_name}")

if __name__ == '__main__':
    analyzer_engine = AnalyzerEngine()
    analyzer_engine = add_custom_recognizers(analyzer_engine)
    input_folder = 'generated_texts'
    output_folder = 'anonymized_texts/presidio'
    process_folder(input_folder, output_folder)


Processed and saved anonymized text for text_1.txt
Processed and saved anonymized text for text_10.txt
Processed and saved anonymized text for text_100.txt
Processed and saved anonymized text for text_11.txt
Processed and saved anonymized text for text_12.txt
Processed and saved anonymized text for text_13.txt
Processed and saved anonymized text for text_14.txt
Processed and saved anonymized text for text_15.txt
Processed and saved anonymized text for text_16.txt
Processed and saved anonymized text for text_17.txt
Processed and saved anonymized text for text_18.txt
Processed and saved anonymized text for text_19.txt
Processed and saved anonymized text for text_2.txt
Processed and saved anonymized text for text_20.txt
Processed and saved anonymized text for text_21.txt
Processed and saved anonymized text for text_22.txt
Processed and saved anonymized text for text_23.txt
Processed and saved anonymized text for text_24.txt
Processed and saved anonymized text for text_25.txt
Processed and

In [30]:
def calculate_metrics(true_positives, false_positives, false_negatives):
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1_score

def evaluate_anonymization(original_pii, folder_path_anonymized):
    true_positives, false_positives, false_negatives = 0, 0, 0
    missed_pii_details = []  # store PII that were not anonymized correctly

    for filename, original_pii_contents in original_pii.items():
        anonymized_file_path = os.path.join(folder_path_anonymized, filename)
        with open(anonymized_file_path, 'r', encoding='utf-8') as file:
            anonymized_text = file.read()

        original_pii_set = set(original_pii_contents)
        anonymized_pii_set = set(extract_pii_contents(anonymized_text))

        detected_pii = original_pii_set & anonymized_pii_set
        missed_pii = original_pii_set - anonymized_pii_set

        true_positives += len(detected_pii)
        false_negatives += len(missed_pii)
        
        if missed_pii:
            missed_pii_details.append((filename, list(missed_pii)))

    precision, recall, f1_score = calculate_metrics(true_positives, false_positives, false_negatives)
    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "missed_pii": missed_pii_details
    }

folder_path_anonymized = 'anonymized_texts'
results = evaluate_anonymization(files_pii, folder_path_anonymized)

print(f"Precision: {results['precision']}")
print(f"Recall: {results['recall']}")
print(f"F1 Score: {results['f1_score']}")



Precision: 1.0
Recall: 0.6162046908315565
F1 Score: 0.762532981530343


In [110]:
if results['missed_pii']:
    print("Missed PII Details:")
    for file, pii in results['missed_pii']:
        print(f"File: {file}, Missed PII: {pii}")

Missed PII Details:
File: text_1.txt, Missed PII: [' Trädkojvägen 7, 660 60 Malung ', ' Berggränd 29, 140 34 Stockholm ', ' Carl Gustafsson ', ' Sara Eriksson ', ' Techvägen 3, 753 20 Uppsala ', ' mikael.blomqvist@hotmail.com ', ' Anna Berg ', ' Adam Lindström ', ' Mikael Blomqvist ']
File: text_10.txt, Missed PII: [' Rörstrandsgatan 21A, 113 55 Stockholm ', ' Isabella Larsson ']
File: text_11.txt, Missed PII: [' Smågatan 34, 67890 Landsville ', ' Anna Johansson ', ' anna.johansson@domain.se ', ' Storgatan 54, 12345 Stadsville ', ' Erik Olsson ']
File: text_12.txt, Missed PII: [' Kungsgatan 15, 111 43 Stockholm ', ' Karin Lindgren ']
File: text_13.txt, Missed PII: [' Emma Karlsson ', ' Södergatan 24, 12345 Stockholm ', ' Jakob Nilsson ', ' Södergatan 23, 12345 Stockholm ']
File: text_14.txt, Missed PII: [' Anna Andersson ', ' Smågatan 12, 34567 Lund ', ' Grönvägen 5, 12345 Stockholm ', ' Blomstergatan 23, 67890 Göteborg ', ' Ida Bergström ', ' Erik Johansson ', ' Oskar Pettersson ', ' 

In [9]:
from spacy.pipeline import EntityRuler
from spacy.tokens import Span, DocBin
from spacy.language import Language
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_analyzer.nlp_engine import NlpEngine


nlp_sv = spacy.load("sv_core_news_md")

# Create a Faker objects
fake = Faker('sv_SE') 
fake_en = Faker('en_US')

# Numbers starting with the country code +46, followed by 8 to 10 digits.
# Numbers in a group format that might be separated by spaces or dashes.
swedish_phone_regex = r'\+?46\d{8,10}|\d{2,3}[-\s]?\d{2,3}[-\s]?\d{2,3}[-\s]?\d{2,4}'

email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'

credit_card_regex = r'\b(?:\d{4}[-\s]?){3}\d{4}\b'

def anonymize_with_spacy(text: str, language: str) -> str:
    text = re.sub(email_regex, fake.email(), text)
    text = re.sub(credit_card_regex, fake_en.credit_card_number(), text)
    
    if language == 'sv':
        # Anonymize Swedish phone numbers 
        text = re.sub(swedish_phone_regex, fake.phone_number(), text)

        doc = nlp_sv(text)
        anonymized_text = ""
        last_end = 0
        for ent in doc.ents:
            anonymized_text += text[last_end:ent.start_char]  # Add text before the entity
            if ent.label_ == "PRS":  # Person names
                anonymized_text += fake.name()
            elif ent.label_ == "LOC":  # Locations
                anonymized_text += fake.city()
            elif ent.label_ == "ORG":  # Organizations
                anonymized_text += fake.company()
            elif ent.label_ == "TME": # Time
                anonymized_text += str(fake.date_of_birth())
            else:
                anonymized_text += '[ANONYMIZED]'  # Default anonymization
            last_end = ent.end_char
        anonymized_text += text[last_end:]  # Add the remaining text after last entity
        return anonymized_text

    return text  # Returns the original text if not Swedish

def detect_language_and_anonymize(text: str) -> dict:
    language = detect(text)
    anonymized_text = anonymize_with_spacy(text, language)
    print(f"Detected language: {language}")
    print(f"Anonymized text: {anonymized_text}")
    return {"text": anonymized_text, "language": language}

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

if __name__ == '__main__':
    file_path = input("Enter the path to text file: ")
    text_content = read_text_file(file_path)
    result = detect_language_and_anonymize(text_content)

#chain = runnable.RunnableLambda(detect_language_and_anonymize)
# Test the setup
#test_text = "hej, jag heter Felix och du kan nå mig på 076-1234567 eller felix.2000@gmail.com och jag bor i Sollentuna 19164 på Blåklockevägen 24"
#result = chain.invoke(test_text)

Enter the path to text file:  test.txt


Detected language: sv
Anonymized text: Hej jag heter Andreas Jonasson och detta är min email adress Karlsson Hedström AB


In [17]:
import os
import spacy
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern
from langdetect import detect

# Initialize the Swedish spaCy model
nlp_sv = spacy.load("sv_core_news_md")

# Initialize the Presidio analyzer
analyzer = AnalyzerEngine()

# Define custom recognizers for various PII types
patterns = [
    Pattern(name="credit_card", regex=r"\b(?:\d{4}[-\s]?){3}\d{4}\b", score=0.8),
    Pattern(name="swedish_ssn", regex=r"\b\d{6,8}[-|+]\d{4}\b", score=0.85)  # Swedish SSN format
]

# Add recognizers to the analyzer
for pattern in patterns:
    recognizer = PatternRecognizer(supported_entity=pattern.name.upper(), patterns=[pattern])
    analyzer.registry.add_recognizer(recognizer)

def anonymize_with_spacy(text: str, language: str) -> str:
    doc = nlp_sv(text)
    anonymized_text = ""
    last_end = 0
    for ent in doc.ents:
        anonymized_text += text[last_end:ent.start_char]
        anonymized_text += '[ANONYMIZED]'  # Replace all entities with [ANONYMIZED]
        last_end = ent.end_char
    anonymized_text += text[last_end:]
    return anonymized_text

def process_folder(input_folder):
    output_folder = os.path.join(os.getcwd(), "anonymized_files")
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            # Language detection
            language = detect(text)  # Uses langdetect to determine the language
            anonymized_text = anonymize_with_spacy(text, 'sv' if language == 'sv' else 'en')
            
            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(anonymized_text)
            print(f"Processed and saved anonymized text to {output_path}")

if __name__ == '__main__':
    input_folder = input("Enter the path to the input folder: ")
    process_folder(input_folder)


Enter the path to the input folder:  testfolder


Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_1.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_10.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_2.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_3.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_4.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_5.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_6.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_7.txt
Processed and s

In [14]:
from faker import Faker
import random
import os

# Create a Faker object for Swedish locale
fake = Faker('sv_SE')

def generate_synthetic_pii_data(num_records, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)  # Create output directory if it doesn't exist

    for i in range(num_records):
        name = fake.name()  # Generate a fake name
        address = fake.address()  # Generate a fake address
        phone = fake.phone_number()  # Generate a fake phone number
        email = fake.email()  # Generate a fake email
        job = fake.job()  # Generate a fake job title
        company = fake.company()  # Generate a fake company name

        # Format the data into a text block simulating a paragraph
        paragraph = (
            f"Namn: {name}\n"
            f"Adress: {address}\n"
            f"Telefon: {phone}\n"
            f"Email: {email}\n"
            f"Yrke: {job}\n"
            f"Företag: {company}\n"
        )

        # Define file path
        file_path = os.path.join(output_dir, f"record_{i+1}.txt")
        
        # Write the synthetic data to a text file
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(paragraph)

if __name__ == '__main__':
    output_dir = "testfolder"  # Define the directory where files will be saved
    num_records = 10  # Specify the number of records to generate
    generate_synthetic_pii_data(num_records, output_dir)
    print(f"Generated {num_records} synthetic data records in the directory '{output_dir}'.")


Generated 10 synthetic data records in the directory 'testfolder'.


In [25]:
import json
from sklearn.metrics import precision_recall_fscore_support as score

# Load annotated data
def load_annotated_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Function to get entity detection
def get_detected_entities(text):
    # calls the anonymization function
    result = detect_language_and_anonymize(text)
    # Enter the predicted catches
    return text, [
        {"start": 18, "end": 31, "label": "PERSON"},
        {"start": 58, "end": 69, "label": "PHONE_NUMBER"},
        {"start": 84, "end": 112, "label": "EMAIL"},
        {"start": 121, "end": 149, "label": "ADDRESS"},
        {"start": 165, "end": 175, "label": "PERSONAL_NUMBER"}
    ]

# Function to calculate metrics
def calculate_metrics(true_entities, predicted_entities):
    # Create dictionarie for quick lookup of predicted ranges and labels
    predicted_dict = {f"{ent['start']}-{ent['end']}": ent['label'] for ent in predicted_entities}

    # Generate y_true and y_pred lists
    y_true = [ent['label'] for ent in true_entities]
    y_pred = []
    for ent in true_entities:
        # Create a key for quick lookup
        key = f"{ent['start']}-{ent['end']}"
        # Check if the true entity has a corresponding prediction
        if key in predicted_dict:
            y_pred.append(predicted_dict[key])
        else:
            y_pred.append('None')  # No match found

    # Calculate precision, recall, and f1-score
    precision, recall, f1, _ = score(y_true, y_pred, labels=list(set(y_true + y_pred)), average='micro')
    return precision, recall, f1
    
data = load_annotated_data("data.json")
text, true_entities = data['text'], data['entities']
_, detected_entities = get_detected_entities(text)
precision, recall, f1 = calculate_metrics(true_entities, detected_entities)
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Detected language: sv
Anonymized text: Hej, mitt namn är [ANONYMIZED] och jag är [ANONYMIZED]. Du kan nå mig på min mobil 08-10 44 81 eller via e-post alundstrom@example.net. Jag bor i Skövde på Varberg 24. Mitt personnummer är 08-10 44 81.
Precision: 0.8333333333333334, Recall: 0.8333333333333334, F1 Score: 0.8333333333333334


In [14]:
nlp_config = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": "sv", "model_name": "sv_core_news_md"},
         ],
}

anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=["PERSON"],
    languages_config=nlp_config,
)

print(
    anonymizer.anonymize("Hej jag heter Felix, du kan ringa mig på 076-1234567 eller skicka ett mail på test@gmail.com", language="sv"))

Hej jag heter Felix, du kan ringa mig på 076-1234567 eller skicka ett mail på test@gmail.com


In [14]:
load_dotenv()
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
if OPENAI_API_KEY is None:
    raise ValueError("OPENAI_API_KEY does not exist, add it to env")

In [15]:

# Create the OpenAI client
llm = OpenAI(openai_api_key=OPENAI_API_KEY)

# Define your prompt template
template = """The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.

User: {user_prompt}

AI Assistant: """
prompt = PromptTemplate.from_template(template)

# Initialize the LLMChain with the prompt and the OpenAI LLM client
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Enter the data input
user_prompt = "Hello"

# Run the user prompt through the chain
response = llm_chain.run(user_prompt=user_prompt)
print(response)

Hi there! How can I assist you today?
