In [65]:
import os
import random
import unicodedata
import numpy as np
import pandas as pd

from tqdm import tqdm
from functools import partial
from spacy.lang.en import English
from scipy.special import softmax
from collections import Counter
from datasets import load_dataset
from multiprocessing import Pool, cpu_count
from SanText import SanText_plus, SanText_plus_init
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances

In [66]:
dataset = load_dataset("sst2")

Found cached dataset sst2 (/Users/deathscope/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [67]:
# Convert to pandas DataFrames
# train_df = dataset['train'].to_pandas().rename(columns={'idx': 'id'}).set_index('id')
# validation_df = dataset['validation'].to_pandas().rename(columns={'idx': 'id'}).set_index('id')
# test_df = dataset['test'].to_pandas().rename(columns={'idx': 'id'}).set_index('id')

In [68]:
data = {
        "sentence": [
            "I loved the movie because of its plot and character development.",
            "The acting was subpar, and I was really disappointed.",
            "One of the best movies I've ever watched. Highly recommended!",
            "The storyline was predictable and lacked depth.",
            "Stunning visuals and outstanding performances by the lead actors.",
            "I wouldn't watch it again. The pace was too slow.",
            "A cinematic masterpiece that's both touching and captivating.",
            "The soundtrack perfectly complemented the movie's tone.",
            "While the movie had a strong start, it failed to maintain that momentum.",
            "A decent one-time watch, but not something to rave about.",
        ]
    }

train_df = pd.DataFrame(data)

In [69]:
def word_normalize(text):
    """Resolve different type of unicode encodings."""
    return unicodedata.normalize('NFD', text)

In [70]:
def get_vocab_SST2(dataset, tokenizer, tokenizer_type):
    vocab = Counter()

    # Loop through the 'sentence' column of the train_df
    for text in dataset['sentence']:
        if tokenizer_type == "subword":
            tokenized_text = tokenizer.tokenize(text)
        elif tokenizer_type == "word":
            tokenized_text = [token.text for token in tokenizer(text)]
        for token in tokenized_text:
            vocab[token] += 1

    if tokenizer_type == "subword":
        for token in tokenizer.vocab:
            vocab[token] += 1

    return vocab

In [71]:
def cal_probability(word_embed_1, word_embed_2, epsilon=2.0):
    distance = euclidean_distances(word_embed_1, word_embed_2)
    sim_matrix = -distance
    prob_matrix = softmax(epsilon * sim_matrix / 2, axis=1)
    return prob_matrix

In [72]:
SENSITIVE_WORD_PERCENTAGE = 0.9
P = 0.3
WORD_EMBEDDING_PATH = 'glove.42B.300d.txt'
EMBEDDING_TYPE = 'glove'
EPSILON = 3

In [73]:
tokenizer = English()
tokenizer_type = 'word'

In [74]:
vocab = get_vocab_SST2(train_df, tokenizer, tokenizer_type)

In [75]:
sensitive_word_count = int(SENSITIVE_WORD_PERCENTAGE * len(vocab))
words = [key for key, _ in vocab.most_common()]
sensitive_words = words[-sensitive_word_count - 1:]

In [76]:
print("WORDS: ", words)
print("SENSITIVE WORDS: ", sensitive_words)

WORDS:  ['.', 'the', 'and', 'I', 'The', 'was', 'movie', ',', 'of', 'watch', 'it', 'A', 'that', "'s", 'to', 'loved', 'because', 'its', 'plot', 'character', 'development', 'acting', 'subpar', 'really', 'disappointed', 'One', 'best', 'movies', "'ve", 'ever', 'watched', 'Highly', 'recommended', '!', 'storyline', 'predictable', 'lacked', 'depth', 'Stunning', 'visuals', 'outstanding', 'performances', 'by', 'lead', 'actors', 'would', "n't", 'again', 'pace', 'too', 'slow', 'cinematic', 'masterpiece', 'both', 'touching', 'captivating', 'soundtrack', 'perfectly', 'complemented', 'tone', 'While', 'had', 'a', 'strong', 'start', 'failed', 'maintain', 'momentum', 'decent', 'one', '-', 'time', 'but', 'not', 'something', 'rave', 'about']
SENSITIVE WORDS:  [',', 'of', 'watch', 'it', 'A', 'that', "'s", 'to', 'loved', 'because', 'its', 'plot', 'character', 'development', 'acting', 'subpar', 'really', 'disappointed', 'One', 'best', 'movies', "'ve", 'ever', 'watched', 'Highly', 'recommended', '!', 'storyli

In [77]:
sensitive_words2id = {word: k for k, word in enumerate(sensitive_words)}
print('#Total Words: %d, #Sensitive Words: %d' % (len(words),len(sensitive_words2id)))

#Total Words: 77, #Sensitive Words: 70


In [78]:
sensitive_word_embed = []
all_word_embed=[]
word2id = {}
sword2id = {}
sensitive_count = 0
all_count = 0

In [79]:
num_lines = sum(1 for _ in open(WORD_EMBEDDING_PATH))
print("Loading Word Embedding File: %s" % WORD_EMBEDDING_PATH)

with open(WORD_EMBEDDING_PATH) as f:
    # Skip first line if of form count/dim.
    line = f.readline().rstrip().split(' ')
    if len(line) != 2:
        f.seek(0)
    for row in tqdm(f, total=num_lines - 1):
        content = row.rstrip().split(' ')
        cur_word=word_normalize(content[0])
        if cur_word in vocab and cur_word not in word2id:
            word2id[cur_word] = all_count
            all_count += 1
            emb=[float(i) for i in content[1:]]
            all_word_embed.append(emb)
            if cur_word in sensitive_words2id:
                sword2id[cur_word] = sensitive_count
                sensitive_count += 1
                sensitive_word_embed.append(emb)
        assert len(word2id)==len(all_word_embed)
        assert len(sword2id) == len(sensitive_word_embed)
    f.close()

Loading Word Embedding File: glove.42B.300d.txt


1917495it [00:19, 96945.16it/s]                                                 


In [80]:
all_word_embed=np.array(all_word_embed, dtype='f')
sensitive_word_embed = np.array(sensitive_word_embed, dtype='f')

In [81]:
print("All Word Embedding Matrix: %s" % str(all_word_embed.shape))
print("Sensitive Word Embedding Matrix: %s" % str(sensitive_word_embed.shape))

All Word Embedding Matrix: (70, 300)
Sensitive Word Embedding Matrix: (65, 300)


In [82]:
print("Calculating Prob Matrix for Exponential Mechanism...")
prob_matrix = cal_probability(all_word_embed, sensitive_word_embed, EPSILON)

Calculating Prob Matrix for Exponential Mechanism...


In [83]:
print(prob_matrix)

[[9.9864072e-01 1.0672838e-05 1.4418360e-05 ... 2.7827216e-06
  3.4367824e-06 1.1529975e-06]
 [3.5734470e-03 2.3980686e-02 5.6089237e-02 ... 6.6333573e-04
  7.5063779e-04 3.0566554e-04]
 [2.2675011e-02 1.0487794e-02 6.5108766e-03 ... 1.6250034e-03
  1.5021058e-03 5.6710804e-04]
 ...
 [2.7835506e-06 1.9244369e-06 1.9149497e-06 ... 9.9893826e-01
  8.3348277e-05 6.2886234e-06]
 [3.4378870e-06 2.4326989e-06 2.4328476e-06 ... 8.3350227e-05
  9.9896169e-01 2.3809767e-05]
 [1.1539561e-06 7.6035218e-07 9.1737854e-07 ... 6.2919771e-06
  2.3821905e-05 9.9947101e-01]]


In [84]:
threads = min(12, cpu_count())

In [88]:
current_directory = os.getcwd()

# Mapping filenames to the respective DataFrames
dataframes = {"train.tsv": train_df}

output_directory = os.path.join(current_directory, "outputs")
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

for file_name, df in dataframes.items():
    out_file_path = os.path.join(output_directory, file_name)
    out_file = open(out_file_path, 'w')
    print(f"Processing DataFrame corresponding to {file_name}. Will write to: {out_file_path}")

    # Initialize empty lists to store docs and labels
    docs = []
    labels = []

    # SST-2 processing
    for _, row in df.iterrows():
        text = row['sentence']
        if EMBEDDING_TYPE == "glove":
            doc = [token.text for token in tokenizer(text)]
        else:
            doc = tokenizer.tokenize(text)
        docs.append(doc)

    # Multiprocessing with Pool for sanitizing
    with Pool(threads, initializer=SanText_plus_init, initargs=(prob_matrix, word2id, sword2id, words, P, tokenizer)) as p:
        annotate_ = partial(SanText_plus)
        results = list(
            tqdm(
                p.imap(annotate_, docs, chunksize=32),
                total=len(docs),
                desc="Sanitize docs using SanText",
            )
        )
        p.close()

    print("Saving ...")
    # Saving for SST-2
    for i, predicted_text in enumerate(results):
        write_content = predicted_text + "\n"
        out_file.write(write_content)

    out_file.close()

Processing DataFrame corresponding to train.tsv. Will write to: /Users/deathscope/Research/Differential Privacy/privacy_rag/outputs/train.tsv


Sanitize docs using SanText: 100%|██████████████| 10/10 [00:02<00:00,  4.86it/s]

Saving ...





In [89]:
column_names = ["sentence"]

sanitized_train = pd.read_csv(os.path.join(output_directory, "train.tsv"), sep="\t", names=column_names)
sanitized_validation = pd.read_csv(os.path.join(output_directory, "dev.tsv"), sep="\t", names=column_names)

# Using Vanilla Presidio

In [90]:
import json

from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from pprint import pprint

In [91]:
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

In [92]:
titles_recognizer = PatternRecognizer(supported_entity="TITLE", deny_list=["Mr.","Mrs.","Miss"])

pronoun_recognizer = PatternRecognizer(supported_entity="PRONOUN", deny_list=["he", "He", "his", "His", "she", "She", "hers", "Hers"])

analyzer.registry.add_recognizer(titles_recognizer)
analyzer.registry.add_recognizer(pronoun_recognizer)

# Evaluations

In [94]:
# Find indices of differing sentences
differing_indices = [i for i, (predicted, actual) in enumerate(zip(sanitized_train['sentence'], train_df['sentence'])) if predicted.strip() != actual.strip()]

# Randomly select 30 of these indices
random_indices = random.sample(differing_indices, 10)

# Print the sentences side by side for better clarity
for idx in random_indices:
    original_sentence = train_df['sentence'].iloc[idx]
    changed_sentence = sanitized_train['sentence'].iloc[idx]
    
    analyzer_results = analyzer.analyze(text=original_sentence, language='en')
    presidio_sentence = anonymizer.anonymize(
        text=original_sentence,
        analyzer_results=analyzer_results,    
        operators={"DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}), 
                    "PHONE_NUMBER": OperatorConfig("mask", {"type": "mask", "masking_char" : "*", "chars_to_mask" : 12, "from_end" : True})}
    )
    print(f"Original: {original_sentence}\nPresidio: {presidio_sentence.text}\nSanText: {changed_sentence}\n{'-'*50}")

Original: The acting was subpar, and I was really disappointed.
Presidio: The acting was subpar, and I was really disappointed.
SanText: subpar acting a subpar , not ever was really disappointed .
--------------------------------------------------
Original: While the movie had a strong start, it failed to maintain that momentum.
Presidio: While the movie had a strong start, it failed to maintain that momentum.
SanText: I both movies had a strong start , it failed to maintain that momentum but
--------------------------------------------------
Original: The storyline was predictable and lacked depth.
Presidio: The storyline was predictable and lacked depth.
SanText: development storyline was predictable and lacked depth .
--------------------------------------------------
Original: The soundtrack perfectly complemented the movie's tone.
Presidio: The soundtrack perfectly complemented the movie's tone.
SanText: One soundtrack perfectly complemented the movie 's tone not
-----------------