In [1]:
import csv
import os
import random
import unicodedata
from collections import Counter
from functools import partial
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
from datasets import load_dataset
from scipy.special import softmax
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from spacy.lang.en import English
from tqdm import tqdm

from SanText import SanText_plus, SanText_plus_init

In [2]:
# Load the "sst2" dataset; its 'train', 'validation' and 'test' splits are
# converted to DataFrames in the next cell.
dataset = load_dataset("sst2")

Found cached dataset sst2 (/Users/deathscope/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
def _split_to_frame(split_name):
    """Return one split of `dataset` as a DataFrame indexed by 'id' (renamed from 'idx')."""
    frame = dataset[split_name].to_pandas()
    return frame.rename(columns={'idx': 'id'}).set_index('id')


# Convert every split to a pandas DataFrame using the same recipe.
train_df, validation_df, test_df = map(_split_to_frame, ('train', 'validation', 'test'))

In [4]:
# Show the training DataFrame as this cell's output (columns: sentence, label; index: id).
train_df

Unnamed: 0_level_0,sentence,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0
...,...,...
67344,a delightful comedy,1
67345,"anguish , anger and frustration",0
67346,"at achieving the modest , crowd-pleasing goals...",1
67347,a patient viewer,1


In [5]:
def word_normalize(text):
    """Return ``text`` converted to Unicode normalization form NFD.

    Differently encoded but canonically equivalent strings (e.g. a precomposed
    accented letter vs. base letter + combining mark) map to the same result.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return decomposed

In [6]:
def get_vocab_SST2(dataset, tokenizer, tokenizer_type):
    """Count token frequencies over the ``'sentence'`` entries of ``dataset``.

    Args:
        dataset: Any object for which ``dataset['sentence']`` is an iterable of
            strings (e.g. a pandas DataFrame).
        tokenizer: For ``"word"``, a callable that maps text to tokens exposing
            a ``.text`` attribute. For ``"subword"``, an object providing
            ``tokenize(text)`` and an iterable ``vocab``.
        tokenizer_type: Either ``"word"`` or ``"subword"``.

    Returns:
        A ``collections.Counter`` mapping token -> number of occurrences. For
        ``"subword"`` every entry of ``tokenizer.vocab`` is additionally
        incremented by one, so the whole tokenizer vocabulary is present.

    Raises:
        ValueError: If ``tokenizer_type`` is not one of the supported values.
            (Previously this surfaced as an UnboundLocalError inside the loop,
            or was silently ignored for an empty dataset.)
    """
    if tokenizer_type not in ("subword", "word"):
        raise ValueError(
            "Unsupported tokenizer_type %r; expected 'subword' or 'word'" % (tokenizer_type,)
        )

    vocab = Counter()

    # Loop through the 'sentence' column of the given dataset
    for text in dataset['sentence']:
        if tokenizer_type == "subword":
            tokenized_text = tokenizer.tokenize(text)
        else:
            tokenized_text = [token.text for token in tokenizer(text)]
        for token in tokenized_text:
            vocab[token] += 1

    if tokenizer_type == "subword":
        # Explicit loop on purpose: Counter.update() on a mapping would add the
        # mapping's *values* as counts instead of +1 per key.
        for token in tokenizer.vocab:
            vocab[token] += 1

    return vocab

In [7]:
def cal_probability(word_embed_1, word_embed_2, epsilon=2.0):
    """Turn pairwise Euclidean distances into row-wise sampling probabilities.

    Entry (i, j) of the result is the softmax, taken along each row, of
    ``-epsilon * d(word_embed_1[i], word_embed_2[j]) / 2``; every row therefore
    sums to 1 and closer vectors receive larger probability.

    Args:
        word_embed_1: Embedding matrix whose rows index the result's rows.
        word_embed_2: Embedding matrix whose rows index the result's columns.
        epsilon: Scaling factor applied to the negated distances.

    Returns:
        The probability matrix produced by ``softmax(..., axis=1)``.
    """
    # Negated distance acts as the similarity score.
    neg_distance = -euclidean_distances(word_embed_1, word_embed_2)
    return softmax(epsilon * neg_distance / 2, axis=1)

In [8]:
# Fraction of the vocabulary (taken from the least frequent end) treated as sensitive.
SENSITIVE_WORD_PERCENTAGE = 0.9
# Forwarded to SanText_plus_init.
# NOTE(review): presumably the probability of also sanitizing non-sensitive words — confirm against SanText.
P = 0.3
# Text file with the pre-trained word embeddings read further below.
WORD_EMBEDDING_PATH = 'glove.42B.300d.txt'
# Selects the tokenization branch when documents are prepared ("glove" -> call the tokenizer directly).
EMBEDDING_TYPE = 'glove'
# Passed as `epsilon` to cal_probability.
EPSILON = 3

In [9]:
# spaCy `English` language object; it is called on raw text to obtain tokens.
tokenizer = English()
# Tells get_vocab_SST2 to call the tokenizer directly and read `.text` from each token.
tokenizer_type = 'word'

In [10]:
# Token frequency counts over the training sentences.
vocab = get_vocab_SST2(train_df, tokenizer, tokenizer_type)

In [11]:
# Number of vocabulary entries to treat as sensitive.
sensitive_word_count = int(SENSITIVE_WORD_PERCENTAGE * len(vocab))
# Vocabulary ordered from most to least frequent.
words = [key for key, _ in vocab.most_common()]
# Keep the tail of that ordering, i.e. the least frequent words.
# NOTE(review): because of the `- 1`, this slice holds sensitive_word_count + 1
# entries (12499 rather than int(0.9 * 13887) = 12498 in the recorded run), and
# a percentage of 0 would still select one word — confirm this is intended.
sensitive_words = words[-sensitive_word_count - 1:]

In [12]:
# Map each sensitive word to its position within `sensitive_words`.
sensitive_words2id = dict(zip(sensitive_words, range(len(sensitive_words))))

total_word_count = len(words)
sensitive_id_count = len(sensitive_words2id)
print('#Total Words: %d, #Sensitive Words: %d' % (total_word_count, sensitive_id_count))

#Total Words: 13887, #Sensitive Words: 12499


In [13]:
# Accumulators filled while the embedding file is read in the next cell.
# Embedding vectors of sensitive words, in order of first appearance in the file.
sensitive_word_embed = []
# Embedding vectors of every vocabulary word found in the file.
all_word_embed=[]
# word -> row index into all_word_embed
word2id = {}
# sensitive word -> row index into sensitive_word_embed
sword2id = {}
# Next free row index for sensitive_word_embed / all_word_embed respectively.
sensitive_count = 0
all_count = 0

In [14]:
# Count the lines up front so tqdm can display a total. The `with` block closes
# the handle; a bare open() inside the generator expression left it open.
with open(WORD_EMBEDDING_PATH) as f:
    num_lines = sum(1 for _ in f)
print("Loading Word Embedding File: %s" % WORD_EMBEDDING_PATH)

with open(WORD_EMBEDDING_PATH) as f:
    # Skip first line if of form count/dim.
    line = f.readline().rstrip().split(' ')
    has_header = len(line) == 2
    if not has_header:
        f.seek(0)
    # Only discount the header from the progress total when one was actually
    # skipped; otherwise the bar overshoots its total by one row.
    num_rows = num_lines - 1 if has_header else num_lines
    for row in tqdm(f, total=num_rows):
        content = row.rstrip().split(' ')
        # NOTE(review): cur_word is NFD-normalized while the keys of `vocab` are
        # used as produced by the tokenizer; tokens containing precomposed
        # accented characters may therefore never match — confirm.
        cur_word = word_normalize(content[0])
        # Keep only vocabulary words, and only their first occurrence in the file.
        if cur_word in vocab and cur_word not in word2id:
            word2id[cur_word] = all_count
            all_count += 1
            emb = [float(value) for value in content[1:]]
            all_word_embed.append(emb)
            if cur_word in sensitive_words2id:
                sword2id[cur_word] = sensitive_count
                sensitive_count += 1
                sensitive_word_embed.append(emb)

# Each id map must have exactly one entry per stored embedding row.
assert len(word2id) == len(all_word_embed)
assert len(sword2id) == len(sensitive_word_embed)

Loading Word Embedding File: glove.42B.300d.txt


1917495it [00:16, 118339.43it/s]                                                


In [15]:
# Convert the collected vectors to single-precision (float32) NumPy arrays.
all_word_embed = np.array(all_word_embed, dtype=np.float32)
sensitive_word_embed = np.array(sensitive_word_embed, dtype=np.float32)

In [16]:
# Report the shapes of both embedding matrices as a quick sanity check.
print("All Word Embedding Matrix: %s" % str(all_word_embed.shape))
print("Sensitive Word Embedding Matrix: %s" % str(sensitive_word_embed.shape))

All Word Embedding Matrix: (13713, 300)
Sensitive Word Embedding Matrix: (12328, 300)


In [17]:
print("Calculating Prob Matrix for Exponential Mechanism...")
# One row per word in all_word_embed, one column per sensitive word; each row is
# the softmax computed by cal_probability with epsilon=EPSILON.
prob_matrix = cal_probability(all_word_embed, sensitive_word_embed, EPSILON)

Calculating Prob Matrix for Exponential Mechanism...


In [18]:
# Number of worker processes for the sanitization pool: at most 12, never more than the CPU count.
threads = min(12, cpu_count())

In [19]:
current_directory = os.getcwd()

# Mapping filenames to the respective DataFrames
dataframes = {"train.tsv": train_df, "dev.tsv": validation_df}

output_directory = os.path.join(current_directory, "outputs")
# exist_ok=True replaces the separate exists() check and its check-then-create race.
os.makedirs(output_directory, exist_ok=True)

for file_name, df in dataframes.items():
    out_file_path = os.path.join(output_directory, file_name)
    print(f"Processing DataFrame corresponding to {file_name}. Will write to: {out_file_path}")

    # SST-2 processing: tokenize every sentence and keep its label alongside.
    # Iterating the two columns directly avoids building a Series per row as
    # DataFrame.iterrows() does.
    docs = []
    labels = []
    for text, label in zip(df['sentence'], df['label']):
        if EMBEDDING_TYPE == "glove":
            doc = [token.text for token in tokenizer(text)]
        else:
            doc = tokenizer.tokenize(text)
        docs.append(doc)
        labels.append(label)

    # Multiprocessing with Pool for sanitizing. imap yields results in input
    # order, so results[i] corresponds to labels[i]. The context manager shuts
    # the pool down once all results have been collected.
    with Pool(threads, initializer=SanText_plus_init, initargs=(prob_matrix, word2id, sword2id, words, P, tokenizer)) as p:
        results = list(
            tqdm(
                p.imap(SanText_plus, docs, chunksize=32),
                total=len(docs),
                desc="Sanitize docs using SanText",
            )
        )

    print("Saving ...")
    # Saving for SST-2: one "<sanitized sentence>\t<label>" line per document.
    # The file is opened only now, and via `with`, so a failure during
    # sanitization neither truncates an existing file nor leaks the handle.
    with open(out_file_path, 'w') as out_file:
        for predicted_text, label in zip(results, labels):
            out_file.write(predicted_text + "\t" + str(label) + "\n")

Processing DataFrame corresponding to train.tsv. Will write to: /Users/deathscope/Research/Differential Privacy/privacy_rag/outputs/train.tsv


Sanitize docs using SanText: 100%|██████| 67349/67349 [00:17<00:00, 3846.77it/s]


Saving ...
Processing DataFrame corresponding to dev.tsv. Will write to: /Users/deathscope/Research/Differential Privacy/privacy_rag/outputs/dev.tsv


Sanitize docs using SanText: 100%|██████████| 872/872 [00:00<00:00, 2211.71it/s]


Saving ...


In [20]:
column_names = ["sentence", "label"]

# The TSVs were written as raw "<sentence>\t<label>" lines with no quoting or
# escaping, so they must be read back the same way:
#   * QUOTE_NONE stops a sentence that starts with a double quote from being
#     treated as a quoted field that can swallow following rows.
#   * keep_default_na=False keeps tokens such as "NA" or "null" as strings
#     instead of converting them to NaN.
tsv_read_options = dict(
    sep="\t",
    names=column_names,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)

sanitized_train = pd.read_csv(os.path.join(output_directory, "train.tsv"), **tsv_read_options)
sanitized_validation = pd.read_csv(os.path.join(output_directory, "dev.tsv"), **tsv_read_options)

# Using Vanilla Presidio

In [21]:
import json

from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from pprint import pprint

In [22]:
# Presidio engines with default configuration: `analyzer` produces the detection
# results, `anonymizer` rewrites the text based on those results (see the
# evaluation cell below).
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

In [23]:
# Deny-list based recognizers for honorific titles and gendered pronouns.
titles_recognizer = PatternRecognizer(
    supported_entity="TITLE",
    deny_list=["Mr.", "Mrs.", "Miss"],
)
pronoun_recognizer = PatternRecognizer(
    supported_entity="PRONOUN",
    deny_list=["he", "He", "his", "His", "she", "She", "hers", "Hers"],
)

# Register both with the analyzer's recognizer registry.
for extra_recognizer in (titles_recognizer, pronoun_recognizer):
    analyzer.registry.add_recognizer(extra_recognizer)

# Evaluations

In [24]:
N_EXAMPLES = 30   # maximum number of differing sentences to print
SAMPLE_SEED = 0   # fixed seed so the printed sample is reproducible across runs

# The comparison below is positional; zip() would silently truncate if the
# sanitized file did not contain exactly one row per original sentence.
assert len(sanitized_train) == len(train_df), (
    "sanitized_train has %d rows but train_df has %d" % (len(sanitized_train), len(train_df))
)

# Find indices of differing sentences
differing_indices = [
    i
    for i, (predicted, actual) in enumerate(zip(sanitized_train['sentence'], train_df['sentence']))
    if predicted.strip() != actual.strip()
]

# Randomly select up to N_EXAMPLES of these indices. random.sample raises
# ValueError when asked for more items than are available, hence the min().
sample_size = min(N_EXAMPLES, len(differing_indices))
random_indices = random.Random(SAMPLE_SEED).sample(differing_indices, sample_size)

# Anonymization operators are the same for every sentence, so build them once:
# phone numbers are masked with '*', every other entity is replaced by a placeholder.
presidio_operators = {
    "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}),
    "PHONE_NUMBER": OperatorConfig("mask", {"type": "mask", "masking_char" : "*", "chars_to_mask" : 12, "from_end" : True}),
}

# Print the sentences side by side for better clarity
for idx in random_indices:
    original_sentence = train_df['sentence'].iloc[idx]
    changed_sentence = sanitized_train['sentence'].iloc[idx]

    analyzer_results = analyzer.analyze(text=original_sentence, language='en')
    presidio_sentence = anonymizer.anonymize(
        text=original_sentence,
        analyzer_results=analyzer_results,
        operators=presidio_operators,
    )
    print(f"Original: {original_sentence}\nPresidio: {presidio_sentence.text}\nSanText: {changed_sentence}\n{'-'*50}")

Original: forgotten the movie 
Presidio: forgotten the movie 
SanText: forgotten lunacy movie
--------------------------------------------------
Original: end up trying to drown yourself in a lake afterwards 
Presidio: end up trying to drown yourself in a lake afterwards 
SanText: backed up trying assert drown yourself in a lake staying
--------------------------------------------------
Original: , action-packed chiller 
Presidio: , action-packed chiller 
SanText: , action - packed chiller
--------------------------------------------------
Original: of a copenhagen neighborhood coping with the befuddling complications life 
Presidio: of a <ANONYMIZED> neighborhood coping with the befuddling complications life 
SanText: of a copenhagen neighborhood coping with overwhelm befuddling complications whatever
--------------------------------------------------
Original: that lifts your spirits 
Presidio: that lifts your spirits 
SanText: horns lifts your spirits
-------------------------------