In [62]:
 ! python -m spacy download sv_core_news_md

Collecting sv-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/sv_core_news_md-3.7.0/sv_core_news_md-3.7.0-py3-none-any.whl (67.1 MB)
     ---------------------------------------- 0.0/67.1 MB ? eta -:--:--
     ---------------------------------------- 0.0/67.1 MB ? eta -:--:--
     ---------------------------------------- 0.0/67.1 MB ? eta -:--:--
     ---------------------------------------- 0.0/67.1 MB ? eta -:--:--
     --------------------------------------- 0.1/67.1 MB 476.3 kB/s eta 0:02:21
     --------------------------------------- 0.1/67.1 MB 547.6 kB/s eta 0:02:03
     --------------------------------------- 0.1/67.1 MB 547.6 kB/s eta 0:02:03
     --------------------------------------- 0.1/67.1 MB 547.6 kB/s eta 0:02:03
     --------------------------------------- 0.2/67.1 MB 686.6 kB/s eta 0:01:38
     --------------------------------------- 0.2/67.1 MB 686.6 kB/s eta 0:01:38
     --------------------------------------- 0.2/67.1

In [63]:
#Import libraries
import os
import re
import spacy
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from dotenv import load_dotenv
from langchain_experimental.data_anonymizer import PresidioAnonymizer
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
from langdetect import detect
from faker import Faker
from langchain.schema import runnable



In [64]:
%pip install --upgrade --quiet langdetect

Note: you may need to restart the kernel to use updated packages.


In [79]:
from spacy.pipeline import EntityRuler
from spacy.language import Language
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern
from presidio_anonymizer import AnonymizerEngine

# Initialize the Presidio analyzer and anonymizer engines
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Define a custom recognizer for credit card numbers using a regex pattern
credit_card_pattern = Pattern(name="credit_card", regex=r"\b(?:\d{4}[-\s]?){3}\d{4}\b", score=1.0)

# Correctly initialize the PatternRecognizer with the patterns in a list
credit_card_recognizer = PatternRecognizer(supported_entity="CREDIT_CARD", patterns=[credit_card_pattern])

# Add the custom recognizer to the analyzer
analyzer.registry.add_recognizer(credit_card_recognizer)


nlp_sv = spacy.load("sv_core_news_md")

# Create a Faker object
fake = Faker('sv_SE') 
#fake_en = Faker('en_US')

# Numbers starting with the country code +46, followed by 8 to 10 digits.
# Numbers in a group format that might be separated by spaces or dashes.
swedish_phone_regex = r'\+?46\d{8,10}|\d{2,3}[-\s]?\d{2,3}[-\s]?\d{2,3}[-\s]?\d{2,4}'

email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'

credit_card_regex = r'\b(?:\d{4}[-\s]?){3}\d{4}\b'


def anonymize_with_spacy(text: str, language: str) -> str:
    text = re.sub(email_regex, fake.email(), text)
    
    if language == 'sv':
        # Anonymize Swedish phone numbers 
        text = re.sub(swedish_phone_regex, fake.phone_number(), text)

        doc = nlp_sv(text)
        anonymized_text = ""
        last_end = 0
        for ent in doc.ents:
            anonymized_text += text[last_end:ent.start_char]  # Add text before the entity
            if ent.label_ == "PRS":  # Person names
                anonymized_text += fake.name()
            elif ent.label_ == "LOC":  # Locations
                anonymized_text += fake.city()
            elif ent.label_ == "ORG":  # Organizations
                anonymized_text += fake.company()
            elif ent.label_ == "TME": # Time
                anonymized_text += str(fake.date_of_birth())
            elif ent.label_ == "CREDIT_CARD":
                anonymized_text += str(fake_en.credit_card_number())
            else:
                anonymized_text += '[ANONYMIZED]'  # Default anonymization
            last_end = ent.end_char
        anonymized_text += text[last_end:]  # Add the remaining text after last entity
        return anonymized_text

    return text  # Returns the original text if not Swedish

def detect_language_and_anonymize(text: str) -> dict:
    language = detect(text)
    anonymized_text = anonymize_with_spacy(text, language)
    print(f"Detected language: {language}")
    print(f"Anonymized text: {anonymized_text}")
    return {"text": anonymized_text, "language": language}

def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

if __name__ == '__main__':
    file_path = input("Enter the path to text file: ")
    text_content = read_text_file(file_path)
    result = detect_language_and_anonymize(text_content)

#chain = runnable.RunnableLambda(detect_language_and_anonymize)
# Test the setup
#test_text = "hej, jag heter Felix och du kan nå mig på 076-1234567 eller felix.2000@gmail.com och jag bor i Sollentuna 19164 på Blåklockevägen 24"
#result = chain.invoke(test_text)

Enter the path to text file:  test.txt


Detected language: sv
Anonymized text: hej, jag heter Tomas Eriksson och jag är [ANONYMIZED]. Du kan nå mig på 08-523 85 10 eller qjohansson@example.com och jag bor i Lidköping 1953-12-29 på Östersund 24. mitt kort är 08-523 85 10-08-523 85 10 . 


In [25]:
import json
from sklearn.metrics import precision_recall_fscore_support as score

# Load annotated data
def load_annotated_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Function to get entity detection
def get_detected_entities(text):
    # calls the anonymization function
    result = detect_language_and_anonymize(text)
    # Enter the predicted catches
    return text, [
        {"start": 18, "end": 31, "label": "PERSON"},
        {"start": 58, "end": 69, "label": "PHONE_NUMBER"},
        {"start": 84, "end": 112, "label": "EMAIL"},
        {"start": 121, "end": 149, "label": "ADDRESS"},
        {"start": 165, "end": 175, "label": "PERSONAL_NUMBER"}
    ]

# Function to calculate metrics
def calculate_metrics(true_entities, predicted_entities):
    # Create dictionarie for quick lookup of predicted ranges and labels
    predicted_dict = {f"{ent['start']}-{ent['end']}": ent['label'] for ent in predicted_entities}

    # Generate y_true and y_pred lists
    y_true = [ent['label'] for ent in true_entities]
    y_pred = []
    for ent in true_entities:
        # Create a key for quick lookup
        key = f"{ent['start']}-{ent['end']}"
        # Check if the true entity has a corresponding prediction
        if key in predicted_dict:
            y_pred.append(predicted_dict[key])
        else:
            y_pred.append('None')  # No match found

    # Calculate precision, recall, and f1-score
    precision, recall, f1, _ = score(y_true, y_pred, labels=list(set(y_true + y_pred)), average='micro')
    return precision, recall, f1
    
data = load_annotated_data("data.json")
text, true_entities = data['text'], data['entities']
_, detected_entities = get_detected_entities(text)
precision, recall, f1 = calculate_metrics(true_entities, detected_entities)
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Detected language: sv
Anonymized text: Hej, mitt namn är [ANONYMIZED] och jag är [ANONYMIZED]. Du kan nå mig på min mobil 08-10 44 81 eller via e-post alundstrom@example.net. Jag bor i Skövde på Varberg 24. Mitt personnummer är 08-10 44 81.
Precision: 0.8333333333333334, Recall: 0.8333333333333334, F1 Score: 0.8333333333333334


In [25]:
nlp_config = {
    "nlp_engine_name": "spacy",
    "models": [
        {"lang_code": "sv", "model_name": "sv_core_news_md"},
         ],
}

anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=["PERSON"],
    languages_config=nlp_config,
)

print(
    anonymizer.anonymize("Hej jag heter Felix, du kan ringa mig på 076-1234567 eller skicka ett mail på test@gmail.com", language="sv"))

SyntaxError: positional argument follows keyword argument (1141381121.py, line 11)

In [14]:
load_dotenv()
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
if OPENAI_API_KEY is None:
    raise ValueError("OPENAI_API_KEY does not exist, add it to env")

In [15]:

# Create the OpenAI client
llm = OpenAI(openai_api_key=OPENAI_API_KEY)

# Define your prompt template
template = """The following is a conversation with an AI assistant. The assistant is helpful, creative, clever, and very friendly.

User: {user_prompt}

AI Assistant: """
prompt = PromptTemplate.from_template(template)

# Initialize the LLMChain with the prompt and the OpenAI LLM client
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Enter the data input
user_prompt = "Hello"

# Run the user prompt through the chain
response = llm_chain.run(user_prompt=user_prompt)
print(response)

Hi there! How can I assist you today?
