In [7]:
!pip install presidio-analyzer presidio-anonymizer

Collecting presidio-anonymizer
  Downloading presidio_anonymizer-2.2.356-py3-none-any.whl.metadata (8.2 kB)
Collecting azure-core (from presidio-anonymizer)
  Downloading azure_core-1.32.0-py3-none-any.whl.metadata (39 kB)
Downloading presidio_anonymizer-2.2.356-py3-none-any.whl (31 kB)
Downloading azure_core-1.32.0-py3-none-any.whl (198 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.9/198.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: azure-core, presidio-anonymizer
Successfully installed azure-core-1.32.0 presidio-anonymizer-2.2.356


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [12]:
# Import necessary libraries
import os
import re
from transformers import pipeline
from datasets import load_dataset
import torch

# Set your environment variables (API keys, if needed, for cloud models)
# For Hugging Face models, you generally don’t need an API key unless using specific models on Hugging Face Hub

# Step 1: Load Hugging Face's NER pipeline
# We will use a pretrained BERT model for Named Entity Recognition (NER)
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Step 2: Define scrubbers for different types of PII
def scrub_phone_numbers(text: str) -> str:
    return re.sub(r"\(?\+?[0-9]*\)?[-.\s]?[0-9]+[-.\s]?[0-9]+[-.\s]?[0-9]+", "[REDACTED PHONE NUMBER]", text)

def scrub_credit_card_numbers(text: str) -> str:
    return re.sub(r"\b(?:\d[ -]*?){13,16}\b", "[REDACTED CREDIT CARD]", text)

def scrub_email_addresses(text: str) -> str:
    return re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "[REDACTED EMAIL ADDRESS]", text)

def scrub_postal_codes(text: str) -> str:
    return re.sub(r"\b\d{5}(-\d{4})?\b", "[REDACTED POSTAL CODE]", text)

def scrub_sin_numbers(text: str) -> str:
    return re.sub(r"\b\d{3}-\d{3}-\d{3}\b", "[REDACTED SIN]", text)

# Combine all scrubbers into a list
ALL_SCRUBBERS = [
    scrub_phone_numbers,
    scrub_credit_card_numbers,
    scrub_email_addresses,
    scrub_postal_codes,
    scrub_sin_numbers,
]

# Step 3: Function to apply all scrubbers to input text
def apply_scrubbers(text: str) -> str:
    """
    Apply all predefined scrubbers to the input text.
    """
    for scrubber in ALL_SCRUBBERS:
        text = scrubber(text)
    return text

# Step 4: Function to detect PII using Hugging Face's NER pipeline
def detect_pii_with_huggingface(text: str):
    """
    This function uses Hugging Face's NER model to detect PII in the text.
    """
    # Detect named entities using Hugging Face's NER pipeline
    ner_results = ner_pipeline(text)
    
    # Filter out non-PII entities (e.g., labels other than 'PER' for persons, 'ORG' for organizations)
    pii_entities = [entity for entity in ner_results if entity['entity'] in ['PER', 'LOC', 'ORG', 'MISC']]
    
    return pii_entities

# Step 5: Set up a function to send a request to a Hugging Face model (e.g., summarization or generation)
def send_huggingface_request(text: str, model="facebook/bart-large-cnn", max_tokens=50):
    """
    Sends a request to a Hugging Face model after scrubbing PII.
    """
    # Step 5.1: Detect and scrub PII using Hugging Face's NER model
    detected_pii = detect_pii_with_huggingface(text)
    scrubbed_text = apply_scrubbers(text)
    
    # Step 5.2: Load the Hugging Face model (for text summarization or generation)
    summarizer = pipeline("summarization", model=model)
    
    # Step 5.3: Generate a summary of the scrubbed text
    summary = summarizer(scrubbed_text, max_length=max_tokens, min_length=25, do_sample=False)
    
    return summary, detected_pii

# Step 6: Example usage of the function with a sample input
example_text = "Michael Smith (msmith@gmail.com, (+1) 111-111-1111) committed a mistake when he used PyTorch Trainer instead of HF Trainer."

# Send the request and print the response
summary, detected_pii = send_huggingface_request(
    text=f"{example_text}\n\nSummarize the above text in 1-2 sentences."
)
print("Summary:", summary)
print("Detected PII:", detected_pii)

# Step 7: Testing PII Scrubbing on Real Data (Optional)
# You can load a dataset containing PII and apply the scrubbing mechanism to it

# Load the AI4Privacy PII Masking dataset
pii_ds = load_dataset("ai4privacy/pii-masking-200k")

# Example input from the dataset
example_text = pii_ds["train"][36]["source_text"]

# Send the request with the scrubbed dataset example
summary, detected_pii = send_huggingface_request(
    text=f"{example_text}\n\nSummarize the above text in 1-2 sentences."
)
print("Summary:", summary)
print("Detected PII:", detected_pii)


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Your max_length is set to 50, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)


Summary: [{'summary_text': 'Michael Smith committed a mistake when he used PyTorch Trainer instead of HF Trainer. He used the wrong name for the training program.'}]
Detected PII: []


README.md:   0%|          | 0.00/12.8k [00:00<?, ?B/s]

english_pii_43k.jsonl:   0%|          | 0.00/73.8M [00:00<?, ?B/s]

french_pii_62k.jsonl:   0%|          | 0.00/116M [00:00<?, ?B/s]

german_pii_52k.jsonl:   0%|          | 0.00/97.8M [00:00<?, ?B/s]

italian_pii_50k.jsonl:   0%|          | 0.00/93.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/209261 [00:00<?, ? examples/s]

Summary: [{'summary_text': "I need the latest update on assessment results. Please send the files to [REDACTED EMAIL ADDRESS]. For your extra time, we'll offer you Kip[REDACTED PHONE NUMBER]. But please provide your л"}]
Detected PII: []


In [10]:
# 1. Import necessary libraries

# Presidio is for PII detection and redaction
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, OperatorConfig
from presidio_analyzer import RecognizerResult

# Hugging Face's transformers for NER (Named Entity Recognition)
from transformers import pipeline

# 2. Setup Presidio Analyzer

# Initialize Presidio's AnalyzerEngine, which will detect various PII categories
analyzer = AnalyzerEngine()

# List available PII recognizers in Presidio
available_recognizers = analyzer.get_recognizers()
print("Available recognizers in Presidio:", [r.name for r in available_recognizers])

# 3. Setup Hugging Face for NER

# Using Hugging Face pipeline for Named Entity Recognition (NER)
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# 4. Function to detect and redact PII using Presidio

def detect_and_redact_pii_with_presidio(text: str):
    """
    This function uses Presidio to detect and redact PII from the text.
    """
    # Analyze the text to detect PII
    results = analyzer.analyze(text=text, language='en')
    
    # Create an anonymizer engine
    anonymizer = AnonymizerEngine()
    
    # Redact detected PII
    anonymized_text = anonymizer.anonymize(text, results)
    
    # Return the anonymized text and detected entities
    return anonymized_text, results

# Example input text with PII
input_text = "John Doe's email is john.doe@example.com and his phone number is +1234567890."

# Detect and redact PII using Presidio
anonymized_text_presidio, detected_pii_presidio = detect_and_redact_pii_with_presidio(input_text)
print("Anonymized text using Presidio:", anonymized_text_presidio)

# 5. Function to detect PII using Hugging Face's NER pipeline

from transformers import pipeline

# Using Hugging Face's NER pipeline
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Function to detect PII using Hugging Face's NER pipeline
def detect_pii_with_huggingface(text: str):
    """
    This function uses Hugging Face's NER model to detect PII in the text.
    """
    # Detect named entities using Hugging Face's NER pipeline
    ner_results = ner_pipeline(text)
    
    # Filter out non-PII entities (e.g., labels other than 'PER' for persons, 'ORG' for organizations)
    pii_entities = [entity for entity in ner_results if entity['entity'] in ['PER', 'LOC', 'ORG', 'MISC']]
    
    return pii_entities

# Example input text with PII
input_text = "John Doe's email is john.doe@example.com and his phone number is +1234567890."

# Detect PII using Hugging Face
detected_pii_huggingface = detect_pii_with_huggingface(input_text)
print("Detected PII using Hugging Face:", detected_pii_huggingface)


# Detect PII using Hugging Face
detected_pii_huggingface = detect_pii_with_huggingface(input_text)
print("Detected PII using Hugging Face:", detected_pii_huggingface)

# 6. Compare the results from both methods

print("Comparing Presidio and Hugging Face NER Results:")
print(f"Presidio detected PII: {detected_pii_presidio}")
print(f"Hugging Face detected PII: {detected_pii_huggingface}")

# 7. Customizing Presidio for specific PII types

# Example: Adding custom recognizers or modifying existing ones
custom_recognizers = analyzer.get_recognizers()
print("Customizable Recognizers:", custom_recognizers)

# Presidio allows customization of the recognizers, for example, to redact specific keywords
def custom_pii_redaction(text: str):
    """
    Custom PII redaction using custom recognizers in Presidio
    """
    custom_results = analyzer.analyze(text=text, language='en')
    custom_anonymizer = AnonymizerEngine()
    custom_anonymized_text = custom_anonymizer.anonymize(text, custom_results)
    return custom_anonymized_text

# Example input with custom text
custom_input_text = "Jane Doe was seen at 1234 Elm Street and her email is jane.doe@customdomain.com."
custom_anonymized_text = custom_pii_redaction(custom_input_text)
print("Custom Anonymized text using Presidio:", custom_anonymized_text)


Available recognizers in Presidio: ['UsBankRecognizer', 'SpacyRecognizer', 'AuAbnRecognizer', 'IbanRecognizer', 'InAadhaarRecognizer', 'InPanRecognizer', 'AuMedicareRecognizer', 'InPassportRecognizer', 'MedicalLicenseRecognizer', 'PhoneRecognizer', 'InVehicleRegistrationRecognizer', 'CryptoRecognizer', 'IpRecognizer', 'UsItinRecognizer', 'SgFinRecognizer', 'UsLicenseRecognizer', 'UrlRecognizer', 'AuTfnRecognizer', 'DateRecognizer', 'UsSsnRecognizer', 'EmailRecognizer', 'AuAcnRecognizer', 'UsPassportRecognizer', 'NhsRecognizer', 'InVoterRecognizer', 'UkNinoRecognizer', 'CreditCardRecognizer']


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Anonymized text using Presidio: text: <PERSON> email is <EMAIL_ADDRESS> and his phone number is +<US_BANK_NUMBER>.
items:
[
    {'start': 59, 'end': 75, 'entity_type': 'US_BANK_NUMBER', 'text': '<US_BANK_NUMBER>', 'operator': 'replace'},
    {'start': 18, 'end': 33, 'entity_type': 'EMAIL_ADDRESS', 'text': '<EMAIL_ADDRESS>', 'operator': 'replace'},
    {'start': 0, 'end': 8, 'entity_type': 'PERSON', 'text': '<PERSON>', 'operator': 'replace'}
]



Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Detected PII using Hugging Face: []
Detected PII using Hugging Face: []
Comparing Presidio and Hugging Face NER Results:
Presidio detected PII: [type: EMAIL_ADDRESS, start: 20, end: 40, score: 1.0, type: PERSON, start: 0, end: 10, score: 0.85, type: URL, start: 20, end: 27, score: 0.5, type: URL, start: 29, end: 40, score: 0.5, type: US_BANK_NUMBER, start: 66, end: 76, score: 0.05, type: US_DRIVER_LICENSE, start: 66, end: 76, score: 0.01]
Hugging Face detected PII: []
Customizable Recognizers: [<presidio_analyzer.predefined_recognizers.us_bank_recognizer.UsBankRecognizer object at 0x7fb493b1d810>, <presidio_analyzer.predefined_recognizers.spacy_recognizer.SpacyRecognizer object at 0x7fb493b1d840>, <presidio_analyzer.predefined_recognizers.au_abn_recognizer.AuAbnRecognizer object at 0x7fb493b1d870>, <presidio_analyzer.predefined_recognizers.iban_recognizer.IbanRecognizer object at 0x7fb493b1d8a0>, <presidio_analyzer.predefined_recognizers.in_aadhaar_recognizer.InAadhaarRecognizer object