In [1]:
%pip install presidio-analyzer presidio-anonymizer spacy

Collecting presidio-analyzer
  Downloading presidio_analyzer-2.2.357-py3-none-any.whl.metadata (3.3 kB)
Collecting presidio-anonymizer
  Downloading presidio_anonymizer-2.2.357-py3-none-any.whl.metadata (8.2 kB)
Collecting phonenumbers<9.0.0,>=8.12 (from presidio-analyzer)
  Downloading phonenumbers-8.13.53-py2.py3-none-any.whl.metadata (11 kB)
Collecting tldextract (from presidio-analyzer)
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting azure-core (from presidio-anonymizer)
  Downloading azure_core-1.32.0-py3-none-any.whl.metadata (39 kB)
Collecting pycryptodome>=3.10.1 (from presidio-anonymizer)
  Downloading pycryptodome-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting requests-file>=1.4 (from tldextract->presidio-analyzer)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading presidio_analyzer-2.2.357-py3-none-any.whl (112 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [5]:
import spacy
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
import os
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Disable SSL verification (for your use case)
os.environ['CURL_CA_BUNDLE'] = ''
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Load the local SpaCy model
nlp = spacy.load("en_core_web_sm")

# Create Presidio Analyzer and Anonymizer Engines
from presidio_analyzer.nlp_engine import SpacyNlpEngine

# Pass the model name, not the actual model object
nlp_engine = SpacyNlpEngine(models=[{"lang_code": "en", "model_name": "en_core_web_sm"}])  # Use the model name here
analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
anonymizer = AnonymizerEngine()

# Function to mask PII entities using Presidio
def mask_pii_with_presidio(text):
    # Analyze the text to find PII
    results = analyzer.analyze(text=text, language="en")

    # Print detected entities and their labels
    for result in results:
        print(f"Entity: {result.entity_type}, Text: {text[result.start:result.end]}")

    # Anonymize (mask) the detected PII entities
    anonymized_text = anonymizer.anonymize(text, results)

    return anonymized_text

# Sample input text with PII
input_text = """
John Doe lives at 123 Elm Street, Springfield. His company, TechCorp, pays him a salary of $50,000.
He was born on 12th July 1990 and works from 9 AM to 5 PM.
"""

# Mask the PII from the input text
masked_output = mask_pii_with_presidio(input_text)

print("\nMasked Output:\n", masked_output)




Entity: PERSON, Text: John Doe
Entity: LOCATION, Text: Springfield
Entity: DATE_TIME, Text: 12th July 1990
Entity: DATE_TIME, Text: 9 AM to 5 PM

Masked Output:
 text: 
<PERSON> lives at 123 Elm Street, <LOCATION>. His company, TechCorp, pays him a salary of $50,000.
He was born on <DATE_TIME> and works from <DATE_TIME>.

items:
[
    {'start': 142, 'end': 153, 'entity_type': 'DATE_TIME', 'text': '<DATE_TIME>', 'operator': 'replace'},
    {'start': 115, 'end': 126, 'entity_type': 'DATE_TIME', 'text': '<DATE_TIME>', 'operator': 'replace'},
    {'start': 35, 'end': 45, 'entity_type': 'LOCATION', 'text': '<LOCATION>', 'operator': 'replace'},
    {'start': 1, 'end': 9, 'entity_type': 'PERSON', 'text': '<PERSON>', 'operator': 'replace'}
]



In [11]:
import spacy
from presidio_analyzer import AnalyzerEngine, RecognizerResult, EntityRecognizer
from presidio_analyzer.nlp_engine import SpacyNlpEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.recognizer_registry import RecognizerRegistry

# Load SpaCy model
spacy_model_path = "en_core_web_sm"
try:
    nlp = spacy.load(spacy_model_path)
    print("SpaCy model loaded successfully!")
except Exception as e:
    print(f"Error loading SpaCy model: {e}")

# Initialize SpaCy NLP Engine
nlp_engine = SpacyNlpEngine(models=[{"lang_code": "en", "model_name": spacy_model_path}])

# Custom recognizer for allowlist
class AllowlistRecognizer(EntityRecognizer):
    def __init__(self, allowlist):
        super().__init__()
        self.allowlist = allowlist

    def load(self):
        pass  # No external models to load

    def analyze(self, text, entities, nlp_artifacts=None):
        results = []
        for word in self.allowlist:
            start = text.find(word)
            if start != -1:
                end = start + len(word)
                results.append(
                    RecognizerResult(entity_type="ALLOWLIST", start=start, end=end, score=1.0)
                )
        return results

# Custom recognizer for denylist
class DenylistRecognizer(EntityRecognizer):
    def __init__(self, denylist):
        super().__init__()
        self.denylist = denylist

    def load(self):
        pass  # No external models to load

    def analyze(self, text, entities, nlp_artifacts=None):
        results = []
        for word in self.denylist:
            start = text.find(word)
            if start != -1:
                end = start + len(word)
                results.append(
                    RecognizerResult(entity_type="DENYLIST", start=start, end=end, score=1.0)
                )
        return results

# Create allowlist and denylist
allowlist = {"TechCorp", "Springfield"}  # Words to ignore
denylist = {"John", "Doe", "Elm Street"}  # Words to always mask

# Register custom recognizers
registry = RecognizerRegistry()
registry.add_recognizer(AllowlistRecognizer(allowlist))
registry.add_recognizer(DenylistRecognizer(denylist))

# Initialize AnalyzerEngine and AnonymizerEngine
analyzer = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine)
anonymizer = AnonymizerEngine()

# Function to mask PII with allowlist and denylist handling
def mask_pii_with_lists(text):
    # Analyze the text
    results = analyzer.analyze(
        text=text,
        language="en",
        score_threshold=0.5,  # Threshold for PII detection
    )

    # Filter results based on allowlist/denylist
    filtered_results = []
    for result in results:
        if result.entity_type == "ALLOWLIST":
            print(f"Skipping allowlist word: {text[result.start:result.end]}")
            continue
        if result.entity_type == "DENYLIST":
            print(f"Forcing denylist word: {text[result.start:result.end]}")
        filtered_results.append(result)

    # Anonymize (mask) the filtered entities
    anonymized_text = anonymizer.anonymize(
        text=text,
        analyzer_results=filtered_results,
        operators={"DEFAULT": {"type": "replace", "new_value": "<MASK>"}}
    )

    return anonymized_text.text

# Sample text
input_text = """
John Doe lives at 123 Elm Street, Springfield. His company, TechCorp, pays him a salary of $50,000.
He was born on 12th July 1990 and works from 9 AM to 5 PM.
"""

# Run the masking function
masked_output = mask_pii_with_lists(input_text)
print("\nMasked Output:\n", masked_output)




SpaCy model loaded successfully!


TypeError: EntityRecognizer.__init__() missing 1 required positional argument: 'supported_entities'

In [17]:
import spacy
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

# Load SpaCy model for NLP
spacy_model_path = "en_core_web_sm"
nlp = spacy.load(spacy_model_path)

# Initialize SpaCy NLP Engine
nlp_engine = SpacyNlpEngine(models=[{"lang_code": "en", "model_name": spacy_model_path}])

# Initialize Analyzer Engine
analyzer = AnalyzerEngine(nlp_engine=nlp_engine)

# Initialize Anonymizer Engine
anonymizer = AnonymizerEngine()

# Define Allowlist and Denylist
allowlist = ["TechCorp", "Springfield"]
denylist = ["John", "Doe", "Elm Street"]

# Mask PII Function
def mask_pii(text):
    # Analyze the text for PII entities
    results = analyzer.analyze(text=text, language="en", score_threshold=0.5)

    # Filter out allowlisted entities
    filtered_results = [
        result for result in results if result.entity_type not in ["ALLOWLIST"] and result.entity_type != "DENYLIST"
    ]

    # Anonymize (mask) the text based on filtered results
    anonymized_text = anonymizer.anonymize(text=text, analyzer_results=filtered_results)

    return anonymized_text.text

# Test Input
input_text = """
John Doe lives at 123 Elm Street, Springfield. His company, TechCorp, pays him a salary of $50,000.
He was born on 12th July 1990 and works from 9 AM to 5 PM.
"""

# Run the function
masked_text = mask_pii(input_text)
print("\nMasked Output:\n", masked_text)





Masked Output:
 
<PERSON> lives at 123 Elm Street, <LOCATION>. His company, TechCorp, pays him a salary of $50,000.
He was born on <DATE_TIME> and works from <DATE_TIME>.



In [30]:
import pandas as pd
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine
import spacy
from presidio_analyzer import RecognizerRegistry
import logging

# Configure logging
logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)

# Load SpaCy model for NLP
spacy_model_path = "en_core_web_sm"
nlp = spacy.load(spacy_model_path)

# Configure NLP Engine with ignored labels
nlp_engine = SpacyNlpEngine(
    models=[{
        "lang_code": "en",
        "model_name": spacy_model_path,
        "labels_to_ignore": ["FAC"]  # Ignore facility labels
    }]
)

# Initialize Analyzer Engine with registry
registry = RecognizerRegistry()
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)

# Initialize Anonymizer Engine
anonymizer = AnonymizerEngine()

def mask_pii(text, allowlist=None, denylist=None):
    if allowlist is None:
        allowlist = []
    if denylist is None:
        denylist = []

    try:
        # Analyze the text for PII entities
        results = analyzer.analyze(
            text=text,
            language="en",
            score_threshold=0.5,
            entities=None,  # Detect all supported entities
            return_decision_process=True  # Include analysis explanation
        )

        # Filter out allowlisted entities
        filtered_results = [
            result for result in results
            if result.entity_type not in allowlist and result.entity_type not in denylist
        ]

        # Anonymize the text
        anonymized_text = anonymizer.anonymize(text=text, analyzer_results=filtered_results)

        # Prepare findings for CSV
        findings = []
        for result in filtered_results:
            entity_text = text[result.start:result.end]
            finding = {
                "Entity_Type": result.entity_type,
                "Text": entity_text,
                "Start": result.start,
                "End": result.end,
                "Confidence": result.score,
                "Source": "Presidio Analyzer"  # Instead of undefined recognizer attribute
            }
            findings.append(finding)

        # Create DataFrame and save to CSV
        if findings:  # Only create CSV if there are findings
            df = pd.DataFrame(findings)
            df.to_csv('findings.csv', index=False)
            print(f"Found {len(findings)} PII entities")
        else:
            print("No PII entities found")

        return anonymized_text.text

    except Exception as e:
        print(f"Error during PII masking: {str(e)}")
        return text  # Return original text if error occurs

# Test Input
input_text = """
John Doe lives at 123 Elm Street, Springfield. His company, TechCorp, pays him a salary of $50,000.
He was born on 12th July 1990 and works from 9 AM to 5 PM.
"""

# Specify entities to ignore (optional)
ignore_entities = ["FAC"]  # Ignore facility entities

# Run the function
masked_text = mask_pii(input_text)
print("\nMasked Output:\n", masked_text)

Found 4 PII entities

Masked Output:
 
<PERSON> lives at 123 Elm Street, <LOCATION>. His company, TechCorp, pays him a salary of $50,000.
He was born on <DATE_TIME> and works from <DATE_TIME>.

