In [1]:
 ! python -m spacy download sv_core_news_md

Collecting sv-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/sv_core_news_md-3.7.0/sv_core_news_md-3.7.0-py3-none-any.whl (67.1 MB)
     ---------------------------------------- 0.0/67.1 MB ? eta -:--:--
     --------------------------------------- 0.0/67.1 MB 653.6 kB/s eta 0:01:43
     --------------------------------------- 0.1/67.1 MB 787.7 kB/s eta 0:01:26
     ---------------------------------------- 0.4/67.1 MB 2.8 MB/s eta 0:00:24
      --------------------------------------- 1.1/67.1 MB 5.9 MB/s eta 0:00:12
     - -------------------------------------- 1.9/67.1 MB 7.9 MB/s eta 0:00:09
     - -------------------------------------- 2.7/67.1 MB 9.5 MB/s eta 0:00:07
     -- ------------------------------------- 3.5/67.1 MB 10.5 MB/s eta 0:00:07
     -- ------------------------------------- 4.3/67.1 MB 11.5 MB/s eta 0:00:06
     --- ------------------------------------ 5.1/67.1 MB 12.1 MB/s eta 0:00:06
     --- ------------------------

In [2]:
#Import libraries
import os
import re
import spacy
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from dotenv import load_dotenv
from langchain_experimental.data_anonymizer import PresidioAnonymizer
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
from langdetect import detect
from faker import Faker
from langchain.schema import runnable



In [3]:
%pip install --upgrade --quiet langdetect

Note: you may need to restart the kernel to use updated packages.


In [31]:
load_dotenv()
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
if OPENAI_API_KEY is None:
    raise ValueError("OPENAI_API_KEY does not exist, add it to env")

This cell uses the OpenAI GPT-4 API to generate pre-marked flowing-text training data.
The generated texts include dummy data of personal identifiable information (PII). Each instance of sensitive information within the text is clearly marked according to predefined tags.

Functions of the codeok:**
- **Folder Creation**: Automatically creates a folder for output if it does not already exist.
- **Data Generation**: Utilizes OpenAI's GPT-4 to craft text strings embedded with marked personal information.
- **File Output**: Saves each generated text string as a `.txt` file within the designated output folder.
- **PII Tagging**: Demonstrates how to programmatically mark personal information within text using specific XML-like tags for different data types.

**Sensitive Data Includes:**
- Names, phone numbers, addresses, email addresses, numeric identifiers (e.g., member numbers, bank account numbers), and credit card

**Note**
If text files already exist in the folder, please delete them before generating new ones. details.

In [56]:
import openai

#Enter the name of the folder where the texts will be generated.
#Creates a new folder if one does not exist
output_folder = "generated_texts"
os.makedirs(output_folder, exist_ok=True)

llm = OpenAI(openai_api_key=OPENAI_API_KEY)

output_string = ""
for i in range(3):
    prompt = f"""
    Jag skapar output träningsdata som ska användas för att träna min modell.
    Användandet ska vara till att generera en löpande text som ska innehålla dummy data av påhittad personlig känslig information.
    Varje gång känslig information genereras ska den markeras tydligt i texten. Endast en löpande text, ingen annan output.
    
    Markeringsformat för känslig information:
    <name> för namn, <phone> för telefonnummer, <address> för adresser, <email> för e-postadresser, <id> för numeriska/alfanumeriska identifierare, och <credit_card> för kreditkortsinformation.
    
    Exempel på hur texten ska formuleras:
    'Jag träffade en person som hette <name>Johan Svensson</name> igår. Han gav mig sitt telefonnummer <phone>123-456-7890</phone> samt hans e-postadress <email>johan.svensson@gmail.com</email>.'
    
    Personlig känslig information inkluderar:
    Person/Namn - Detta inkluderar förnamn, mellannamn, efternamn eller hela namn på individer.
    Telefonnummer - Alla telefonnummer, inklusive avgiftsfria nummer.
    Adress - Kompletta eller partiella adresser, inklusive gata, postnummer, husnummer, stad och stat.
    E-post - Alla e-postadresser.
    Numeriskt Identifierare - Alla numeriska eller alfanumeriska identifierare som ärendenummer, medlemsnummer, biljettnummer, bankkontonummer, IP-adresser, produktnycklar, serienummer, spårningsnummer för frakt, etc.
    Kreditkort - Alla kreditkortsnummer, säkerhetskoder eller utgångsdatum.
    """
    
    response = openai.chat.completions.create(
    model="gpt-4",
    messages=[
      {"role": "system", "content": "Du är en hjälpful assistent, designad för att generera text data."},
      {"role": "user", "content": prompt}
    ]
    )
    res = response.choices[0].message.content
    file_path = os.path.join(output_folder, f"text_{i+1}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(res)

print("Texter genererade och sparade.")


Texter genererade och sparade.


In [54]:
import os
import re

# Mappar
input_folder = "generated_texts"  # Mapp med originaltexter
output_folder = "marked_texts"    # Mapp för bearbetade texter
os.makedirs(output_folder, exist_ok=True)

# Regex för att identifiera PII
pii_patterns = {
    'name': r"\b[A-Z][a-z]+ [A-Z][a-z]+",  # Enkel regex för namn
    'email': r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
    'phone': r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b"
}

# Gå igenom alla filer i input-mappen
for filename in os.listdir(input_folder):
    if filename.endswith(".txt"):  # Kontrollera att filen är en textfil
        file_path = os.path.join(input_folder, filename)
        
        # Kontrollera att det är en fil och inte en mapp
        if os.path.isfile(file_path):
            # Läs innehållet i filen
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            
            # Märka PII i texten
            for pii_type, pattern in pii_patterns.items():
                matches = re.findall(pattern, text)
                for match in matches:
                    text = text.replace(match, f"<{pii_type}>{match}</{pii_type}>")

            # Spara den markerade texten i output-mappen
            output_file_path = os.path.join(output_folder, filename)
            with open(output_file_path, 'w', encoding='utf-8') as file:
                file.write(text)

print("Alla texter har bearbetats och sparats med markerad PII.")


Alla texter har bearbetats och sparats med markerad PII.


In [42]:
from spacy.pipeline import EntityRuler
from spacy.tokens import Span, DocBin
from spacy.language import Language
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_analyzer.nlp_engine import NlpEngine


nlp_sv = spacy.load("sv_core_news_md")

# Create a Faker objects
fake = Faker('sv_SE') 
fake_en = Faker('en_US')

# Numbers starting with the country code +46, followed by 8 to 10 digits.
# Numbers in a group format that might be separated by spaces or dashes.
swedish_phone_regex = r'\+?46\d{8,10}|\d{2,3}[-\s]?\d{2,3}[-\s]?\d{2,3}[-\s]?\d{2,4}'

email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'

credit_card_regex = r'\b(?:\d{4}[-\s]?){3}\d{4}\b'

def anonymize_with_spacy(text: str, language: str) -> str:
    text = re.sub(email_regex, fake.email(), text)
    text = re.sub(credit_card_regex, fake_en.credit_card_number(), text)
    
    if language == 'sv':
        # Anonymize Swedish phone numbers 
        text = re.sub(swedish_phone_regex, fake.phone_number(), text)

        doc = nlp_sv(text)
        anonymized_text = ""
        last_end = 0
        for ent in doc.ents:
            anonymized_text += text[last_end:ent.start_char]  # Add text before the entity
            if ent.label_ == "PRS":  # Person names
                anonymized_text += fake.name()
            elif ent.label_ == "LOC":  # Locations
                anonymized_text += fake.city()
            elif ent.label_ == "ORG":  # Organizations
                anonymized_text += fake.company()
            elif ent.label_ == "TME": # Time
                anonymized_text += str(fake.date_of_birth())
            elif ent.label_ == "PHONE_NUMBER":
                anonymized_text += "[PHONE_NUMBER]" #str(fake_en.credit_card_number())
            else:
                anonymized_text += '[ANONYMIZED]'  # Default anonymization
            last_end = ent.end_char
        anonymized_text += text[last_end:]  # Add the remaining text after last entity
        return anonymized_text

    return text  # Returns the original text if not Swedish

def detect_language_and_anonymize(text: str) -> dict:
    language = detect(text)
    anonymized_text = anonymize_with_spacy(text, language)
    print(f"Detected language: {language}")
    print(f"Anonymized text: {anonymized_text}")
    return {"text": anonymized_text, "language": language}

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

if __name__ == '__main__':
    file_path = input("Enter the path to text file: ")
    text_content = read_text_file(file_path)
    result = detect_language_and_anonymize(text_content)

#chain = runnable.RunnableLambda(detect_language_and_anonymize)
# Test the setup
#test_text = "hej, jag heter Felix och du kan nå mig på 076-1234567 eller felix.2000@gmail.com och jag bor i Sollentuna 19164 på Blåklockevägen 24"
#result = chain.invoke(test_text)

Enter the path to text file:  test.txt


Detected language: sv
Anonymized text: Jan Karlsson bor på Piteå 3, postnummer 75320, i Borlänge. Han arbetar som IT-specialist och kan nås på telefonnummer 036-273 04 96 eller via hans e-post kristina01@example.net. För arbetsresor och andra arbetsrelaterade utgifter använder han sitt företagskort som har nummer 036-273 04 9635, med utgångsdatum 09/24 och säkerhetskod 789. Hans personalnummer på arbetsplatsen är AN987654 och han har ett prioritetsbiljettnummer för företagsresor som är A1B23FB. Andreas' hem-IP adress är 192.168.0.1 och han använder sin bank, Åberg Pettersson HB, för personliga transaktioner, där hans bankkontonummer är 036-273 04 96.


In [17]:
import os
import spacy
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern
from langdetect import detect

# Initialize the Swedish spaCy model
nlp_sv = spacy.load("sv_core_news_md")

# Initialize the Presidio analyzer
analyzer = AnalyzerEngine()

# Define custom recognizers for various PII types
patterns = [
    Pattern(name="credit_card", regex=r"\b(?:\d{4}[-\s]?){3}\d{4}\b", score=0.8),
    Pattern(name="swedish_ssn", regex=r"\b\d{6,8}[-|+]\d{4}\b", score=0.85)  # Swedish SSN format
]

# Add recognizers to the analyzer
for pattern in patterns:
    recognizer = PatternRecognizer(supported_entity=pattern.name.upper(), patterns=[pattern])
    analyzer.registry.add_recognizer(recognizer)

def anonymize_with_spacy(text: str, language: str) -> str:
    doc = nlp_sv(text)
    anonymized_text = ""
    last_end = 0
    for ent in doc.ents:
        anonymized_text += text[last_end:ent.start_char]
        anonymized_text += '[ANONYMIZED]'  # Replace all entities with [ANONYMIZED]
        last_end = ent.end_char
    anonymized_text += text[last_end:]
    return anonymized_text

def process_folder(input_folder):
    output_folder = os.path.join(os.getcwd(), "anonymized_files")
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            # Language detection
            language = detect(text)  # Uses langdetect to determine the language
            anonymized_text = anonymize_with_spacy(text, 'sv' if language == 'sv' else 'en')
            
            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(anonymized_text)
            print(f"Processed and saved anonymized text to {output_path}")

if __name__ == '__main__':
    input_folder = input("Enter the path to the input folder: ")
    process_folder(input_folder)


Enter the path to the input folder:  testfolder


Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_1.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_10.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_2.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_3.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_4.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_5.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_6.txt
Processed and saved anonymized text to C:\Users\felix\OneDrive\Skrivbord\Skola\Examensarbete\anonymized_files\record_7.txt
Processed and s

In [14]:
from faker import Faker
import random
import os

# Create a Faker object for Swedish locale
fake = Faker('sv_SE')

def generate_synthetic_pii_data(num_records, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)  # Create output directory if it doesn't exist

    for i in range(num_records):
        name = fake.name()  # Generate a fake name
        address = fake.address()  # Generate a fake address
        phone = fake.phone_number()  # Generate a fake phone number
        email = fake.email()  # Generate a fake email
        job = fake.job()  # Generate a fake job title
        company = fake.company()  # Generate a fake company name

        # Format the data into a text block simulating a paragraph
        paragraph = (
            f"Namn: {name}\n"
            f"Adress: {address}\n"
            f"Telefon: {phone}\n"
            f"Email: {email}\n"
            f"Yrke: {job}\n"
            f"Företag: {company}\n"
        )

        # Define file path
        file_path = os.path.join(output_dir, f"record_{i+1}.txt")
        
        # Write the synthetic data to a text file
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(paragraph)

if __name__ == '__main__':
    output_dir = "testfolder"  # Define the directory where files will be saved
    num_records = 10  # Specify the number of records to generate
    generate_synthetic_pii_data(num_records, output_dir)
    print(f"Generated {num_records} synthetic data records in the directory '{output_dir}'.")


Generated 10 synthetic data records in the directory 'testfolder'.


In [25]:
import json
from sklearn.metrics import precision_recall_fscore_support as score

# Load annotated data
def load_annotated_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Function to get entity detection
def get_detected_entities(text):
    # calls the anonymization function
    result = detect_language_and_anonymize(text)
    # Enter the predicted catches
    return text, [
        {"start": 18, "end": 31, "label": "PERSON"},
        {"start": 58, "end": 69, "label": "PHONE_NUMBER"},
        {"start": 84, "end": 112, "label": "EMAIL"},
        {"start": 121, "end": 149, "label": "ADDRESS"},
        {"start": 165, "end": 175, "label": "PERSONAL_NUMBER"}
    ]

# Function to calculate metrics
def calculate_metrics(true_entities, predicted_entities):
    # Create dictionarie for quick lookup of predicted ranges and labels
    predicted_dict = {f"{ent['start']}-{ent['end']}": ent['label'] for ent in predicted_entities}

    # Generate y_true and y_pred lists
    y_true = [ent['label'] for ent in true_entities]
    y_pred = []
    for ent in true_entities:
        # Create a key for quick lookup
        key = f"{ent['start']}-{ent['end']}"
        # Check if the true entity has a corresponding prediction
        if key in predicted_dict:
            y_pred.append(predicted_dict[key])
        else:
            y_pred.append('None')  # No match found

    # Calculate precision, recall, and f1-score
    precision, recall, f1, _ = score(y_true, y_pred, labels=list(set(y_true + y_pred)), average='micro')
    return precision, recall, f1
    
data = load_annotated_data("data.json")
text, true_entities = data['text'], data['entities']
_, detected_entities = get_detected_entities(text)
precision, recall, f1 = calculate_metrics(true_entities, detected_entities)
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Detected language: sv
Anonymized text: Hej, mitt namn är [ANONYMIZED] och jag är [ANONYMIZED]. Du kan nå mig på min mobil 08-10 44 81 eller via e-post alundstrom@example.net. Jag bor i Skövde på Varberg 24. Mitt personnummer är 08-10 44 81.
Precision: 0.8333333333333334, Recall: 0.8333333333333334, F1 Score: 0.8333333333333334


In [14]:
nlp_config = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": "sv", "model_name": "sv_core_news_md"},
         ],
}

anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=["PERSON"],
    languages_config=nlp_config,
)

print(
    anonymizer.anonymize("Hej jag heter Felix, du kan ringa mig på 076-1234567 eller skicka ett mail på test@gmail.com", language="sv"))

Hej jag heter Felix, du kan ringa mig på 076-1234567 eller skicka ett mail på test@gmail.com


In [14]:
load_dotenv()
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
if OPENAI_API_KEY is None:
    raise ValueError("OPENAI_API_KEY does not exist, add it to env")

In [15]:

# Create the OpenAI client
llm = OpenAI(openai_api_key=OPENAI_API_KEY)

# Define your prompt template
template = """The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.

User: {user_prompt}

AI Assistant: """
prompt = PromptTemplate.from_template(template)

# Initialize the LLMChain with the prompt and the OpenAI LLM client
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Enter the data input
user_prompt = "Hello"

# Run the user prompt through the chain
response = llm_chain.run(user_prompt=user_prompt)
print(response)

Hi there! How can I assist you today?
