In [8]:
import re
import spacy
from dagster import op, Out, build_op_context

# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

# Regular expression pattern to detect email addresses
EMAIL_PATTERN = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b")


# Op to load data
@op
def load_data(context):
    data = [
        "John Doe lives in New York.",
        "Jane's email is jane.doe@example.com.",
        "Juudas I think your address is 231, Muranga",
        "She is 34 years old",
    ]
    context.log.info("Original Data:")
    for idx, line in enumerate(data):
        context.log.info(f"Original Line {idx+1}: {line}")
        print(f"Original Line {idx+1}: {line}")
    return data


# Op to apply NER for PII detection
@op(out=Out())
def detect_pii(context, data):
    processed_data = []
    for doc in data:
        # First pass: Apply the NER model
        ner_doc = nlp(doc)
        redacted_doc = ""
        for token in ner_doc:
            if token.ent_type_ in ["PERSON", "GPE", "EMAIL", "ORG"]:
                redacted_doc += "[REDACTED] "
            else:
                redacted_doc += token.text + " "

        # Second pass: Use regex to find and redact emails
        redacted_doc = re.sub(EMAIL_PATTERN, "[REDACTED]", redacted_doc)

        processed_data.append(redacted_doc.strip())

    context.log.info("Processed Data:")
    for idx, line in enumerate(processed_data):
        context.log.info(f"Processed Line {idx+1}: {line}")
        print(f"Processed Line {idx+1}: {line}")

    return processed_data


# Op to save or output the processed data
@op(out=Out())
def save_data(context, processed_data):
    for idx, line in enumerate(processed_data):
        context.log.info(f"Final Processed Line {idx+1}: {line}")
        print(f"Final Processed Line {idx+1}: {line}")
    return processed_data


# Create a context for the ops
context = build_op_context()

# Load the original data
original_data = load_data(context)

# Process the data to detect and redact PII
processed_data = detect_pii(context, data=original_data)

# Display the processed data
print("\nProcessed Data:")
for line in processed_data:
    print(line)

# Final save/output step (for demonstration purposes, it just prints the data)
save_data(context, processed_data=processed_data)


2024-09-03 06:19:07 +0300 - dagster - INFO - system - Original Data:
2024-09-03 06:19:07 +0300 - dagster - INFO - system - Original Line 1: John Doe lives in New York.
2024-09-03 06:19:07 +0300 - dagster - INFO - system - Original Line 2: Jane's email is jane.doe@example.com.
2024-09-03 06:19:07 +0300 - dagster - INFO - system - Original Line 3: Juudas I think your address is 231, Muranga
2024-09-03 06:19:07 +0300 - dagster - INFO - system - Original Line 4: She is 34 years old
2024-09-03 06:19:07 +0300 - dagster - INFO - system - Processed Data:
2024-09-03 06:19:07 +0300 - dagster - INFO - system - Processed Line 1: [REDACTED] [REDACTED] lives in [REDACTED] [REDACTED] .
2024-09-03 06:19:07 +0300 - dagster - INFO - system - Processed Line 2: [REDACTED] 's email is [REDACTED] .
2024-09-03 06:19:07 +0300 - dagster - INFO - system - Processed Line 3: [REDACTED] I think your address is 231 , [REDACTED]
2024-09-03 06:19:07 +0300 - dagster - INFO - system - Processed Line 4: She is 34 years 

Original Line 1: John Doe lives in New York.
Original Line 2: Jane's email is jane.doe@example.com.
Original Line 3: Juudas I think your address is 231, Muranga
Original Line 4: She is 34 years old
Processed Line 1: [REDACTED] [REDACTED] lives in [REDACTED] [REDACTED] .
Processed Line 2: [REDACTED] 's email is [REDACTED] .
Processed Line 3: [REDACTED] I think your address is 231 , [REDACTED]
Processed Line 4: She is 34 years old

Processed Data:
[REDACTED] [REDACTED] lives in [REDACTED] [REDACTED] .
[REDACTED] 's email is [REDACTED] .
[REDACTED] I think your address is 231 , [REDACTED]
She is 34 years old
Final Processed Line 1: [REDACTED] [REDACTED] lives in [REDACTED] [REDACTED] .
Final Processed Line 2: [REDACTED] 's email is [REDACTED] .
Final Processed Line 3: [REDACTED] I think your address is 231 , [REDACTED]
Final Processed Line 4: She is 34 years old


['[REDACTED] [REDACTED] lives in [REDACTED] [REDACTED] .',
 "[REDACTED] 's email is [REDACTED] .",
 '[REDACTED] I think your address is 231 , [REDACTED]',
 'She is 34 years old']