# Load necessary libraries

In [1]:
import json
import re
import numpy as np
import pandas as pd
import scipy.spatial.distance as sp_dist
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import Counter, defaultdict
from spacy.lang.en import English
from scipy.special import softmax
from mechanisms.detectors.presidio_detector import PresidioDetector

# Define parameters

In [2]:
# Path of the JSON dataset file passed to read_json_file.
TAB_FILE_PATH = "echr_train.json"
# Path of the plain-text word-embedding file (one token followed by its vector per row).
WORD_EMBEDDING_PATH = "glove.840B.300d.txt"
# Number of nearest neighbours kept when computing the per-word sensitivity.
TOP_K = 20
# Multiplier applied to the similarity scores before the softmax.
EPSILON = 1
# Order `p` of the Minkowski distance passed to scipy's cdist.
P = 2

# Load dataset

In [None]:
def normalize_distances(distances):
    """Rescale distances so the smallest maps to 0 and the largest to -1.

    Parameters:
    - distances: a sized sequence of numbers.

    Returns:
    - list: one value per input distance, computed as
      ``-(d - min) / (max - min)``. An empty input yields an empty list, and
      an input whose values are all equal yields a list of zeros.
    """
    # Guard: min()/max() raise ValueError on an empty sequence.
    if len(distances) == 0:
        return []
    min_distance = min(distances)
    distance_range = max(distances) - min_distance
    # All distances identical: avoid dividing by zero and return zeros instead.
    if distance_range == 0:
        return [0 for _ in distances]
    return [-(dist - min_distance) / distance_range for dist in distances]

In [None]:
def has_header(file):
    """Report whether the first line read from `file` has exactly two fields.

    Consumes one line from `file`; the caller is responsible for rewinding
    if that line turns out to be data.
    """
    first_line_fields = file.readline().split()
    return len(first_line_fields) == 2

In [None]:
def read_json_file(file_path):
    """Open `file_path` as UTF-8 text and return its parsed JSON content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

In [None]:
def extract_text_values(data):
    """Collect the 'text' value of every item in `data` that has that key."""
    texts = []
    for entry in data:
        if "text" in entry:
            texts.append(entry["text"])
    return texts

In [None]:
def build_vocab_from_dataset(df):
    """Count alphabetic and digit tokens across the "sentence" column of `df`.

    Each sentence is tokenized with a blank spaCy English pipeline; tokens that
    are neither purely alphabetic nor purely digits are ignored.

    Returns:
    - Counter: token text -> number of occurrences.
    """
    nlp = English()
    token_counts = Counter()
    for sentence in df["sentence"]:
        for tok in nlp(sentence):
            if tok.is_alpha or tok.is_digit:
                token_counts[tok.text] += 1
    return token_counts

In [None]:
def parse_embedding_row(row):
    """Split one embeddings-file row into ``(token, vector)``.

    The row is split on single literal spaces (not on arbitrary whitespace);
    the first field is the token and every remaining field is converted to
    float.
    """
    token, *raw_values = row.rstrip().split(" ")
    return token, list(map(float, raw_values))

In [None]:
def process_word_embedding(
    word,
    embedding,
    vocab,
    word_to_id,
    general_word_embeddings,
):
    """Register `word` and its `embedding` if the word qualifies.

    A word is registered only when it is in `vocab`, has not been registered
    yet, and does not consist solely of digits. Registration appends the
    embedding to `general_word_embeddings` and records its position in
    `word_to_id`; both containers are modified in place.
    """
    if word not in vocab:
        return
    if word in word_to_id:
        return
    if re.match(r"^\d+$", word):
        return
    next_id = len(general_word_embeddings)
    word_to_id[word] = next_id
    general_word_embeddings.append(embedding)

In [None]:
def process_word_embeddings(vocab):
    """Load the embeddings of every vocabulary word found in WORD_EMBEDDING_PATH.

    Parameters:
    - vocab: container supporting ``in``; only words it contains are kept.

    Returns:
    - tuple: ``(embeddings, word_to_id)`` where ``embeddings`` is a NumPy array
      with one row per kept word and ``word_to_id`` maps each kept word to its
      row index.
    """
    word_to_id = {}
    general_word_embeddings = []

    with open(WORD_EMBEDDING_PATH, encoding="utf-8") as file:
        header_present = has_header(file)

        # Count the rows once so tqdm can show a meaningful total.
        file.seek(0)
        num_lines = sum(1 for _ in file)
        file.seek(0)

        # Skip the header row so it is not parsed as an embedding.
        if header_present:
            file.readline()
            num_lines -= 1

        for row in tqdm(file, total=num_lines):
            word, embedding = parse_embedding_row(row)
            process_word_embedding(
                word,
                embedding,
                vocab,
                word_to_id,
                general_word_embeddings,
            )

    return (np.asarray(general_word_embeddings), word_to_id)

In [None]:
# Instantiate the project's Presidio-based detector.
detector = PresidioDetector()
# Read the dataset and keep only the "text" field of each record.
data = read_json_file(TAB_FILE_PATH)
text_values = extract_text_values(data)
tab_df = pd.DataFrame(text_values, columns=["sentence"])
# Token frequency counts over the whole dataset.
vocab = build_vocab_from_dataset(tab_df)
# Vocabulary tokens ordered from most to least frequent.
words = [key for key, _ in vocab.most_common()]
# NOTE(review): `detect` is given the Counter itself; presumably it returns the
# tokens flagged as sensitive — confirm against PresidioDetector.
sensitive_words = detector.detect(vocab)
# Embedding matrix (one row per kept vocabulary word) and the word -> row-index map.
processed_data = process_word_embeddings(vocab)
(general_embeddings, word_to_id) = processed_data
# Inverse lookup: row index -> word.
id_to_word = {v: k for k, v in word_to_id.items()}

# Analysis

In [None]:
# Look up the embedding row for "Denmark"; raises KeyError if the word was not kept in word_to_id.
print(general_embeddings[word_to_id["Denmark"]]) # get GloVe embedding of a word

In [None]:
# Minkowski distance of order P between the "Denmark" and "Norway" embeddings.
# cdist expects 2-D inputs, hence the reshape to a single row; [0] takes the
# only row of the (1, 1) result, leaving a one-element array.
distance = sp_dist.cdist(
    general_embeddings[word_to_id["Denmark"]].reshape(1, -1),
    general_embeddings[word_to_id["Norway"]].reshape(1, -1),
    metric="minkowski",
    p=P,
)[0]

print(distance) # get distance between Denmark and Norway

In [None]:
# Same computation as for Denmark/Norway, now between "Denmark" and "Macedonia";
# overwrites the previous value of `distance`.
distance = sp_dist.cdist(
    general_embeddings[word_to_id["Denmark"]].reshape(1, -1),
    general_embeddings[word_to_id["Macedonia"]].reshape(1, -1),
    metric="minkowski",
    p=P,
)[0]

print(distance)

# Calculate sensitivity

In [None]:
# Maps each sensitive word to the distance between it and the farthest of its
# TOP_K nearest embeddings.
max_distance_dict = defaultdict(float)

for word in sensitive_words:
    # Skip words without an embedding and words that were already processed.
    if word in word_to_id and word not in max_distance_dict:
        # Row indices of the TOP_K smallest distances, in ascending order.
        similar_indices = sp_dist.cdist(
            general_embeddings[word_to_id[word]].reshape(1, -1),
            general_embeddings,
            metric="minkowski",
            p=P,
        )[0].argsort()[:TOP_K]
        # Last of the TOP_K indices, i.e. the farthest of the nearest neighbours.
        max_distance_index = similar_indices[-1]
        # Distance to that neighbour. cdist on two single rows returns a
        # (1, 1) array, so the stored value is a 2-D array rather than a scalar.
        max_distance = sp_dist.cdist(
            general_embeddings[word_to_id[word]].reshape(1, -1),
            general_embeddings[max_distance_index].reshape(1, -1),
            metric="minkowski",
            p=P,
        )
        max_distance_dict[word] = max_distance  

In [None]:
# Pick the (word, distance) pair with the largest stored distance; that
# distance is used as the sensitivity. max() raises ValueError if
# max_distance_dict is empty. The values stored by the loop are (1, 1) arrays,
# so `sensitivity` is one too.
max_word, sensitivity = max(max_distance_dict.items(), key=lambda item: item[1])

In [None]:
# Show which word produced the largest distance, and the distance itself.
print(max_word)
print(sensitivity)

In [None]:
# Target word for which a substitute is sampled in the following cells
# (exponential mechanism over the embedding vocabulary).
word = "Denmark"

In [None]:
# Distance from the target word to every embedding in the vocabulary.
distances = sp_dist.cdist(
    general_embeddings[word_to_id[word]].reshape(1, -1),
    general_embeddings,
    metric="minkowski",
    p=P,
)[0]
# Utility of a candidate is its negated distance: closer words score higher.
sim_matrix = -distances
# Cubed similarities; not used by any cell visible in this notebook.
pow_scaled_sim_matrix = np.power(sim_matrix, 3)
# Exponential-mechanism probabilities. softmax normalises over the whole array;
# ravel() yields a 1-D vector whether `sensitivity` is a scalar or the (1, 1)
# array produced by cdist (the latter broadcasts the scores to shape (1, N)).
prob_matrix = np.ravel(softmax(EPSILON * sim_matrix / (2 * sensitivity)))
# Sample one substitute index according to those probabilities.
substitute_idx = np.random.choice(len(prob_matrix), 1, p=prob_matrix)
print(id_to_word[substitute_idx[0]])

In [None]:
# Define amplification function
def amplify_probabilities(probabilities, amplification_factor, similar_indices):
    """Boost the probabilities at `similar_indices` and damp all the others.

    Parameters:
    - probabilities: sequence of probabilities; it is NOT modified.
    - amplification_factor: entries listed in `similar_indices` are multiplied
      by this factor, all other entries by its reciprocal.
    - similar_indices: iterable of integer positions to boost.

    Returns:
    - list: the rescaled probabilities, renormalised to sum to 1.
    """
    # A set gives O(1) membership tests instead of scanning the index array.
    boosted = {int(idx) for idx in similar_indices}
    damping = 1 / amplification_factor
    # Build a new list so the caller's array is left untouched.
    scaled = [
        p * amplification_factor if i in boosted else p * damping
        for i, p in enumerate(probabilities)
    ]
    # Renormalize probabilities
    total_probability = sum(scaled)
    return [p / total_probability for p in scaled]

In [None]:
# Row indices of the 50 embeddings closest to `word`.
similar_indices = sp_dist.cdist(
    general_embeddings[word_to_id[word]].reshape(1, -1),
    general_embeddings,
    metric="minkowski",
    p=P,
)[0].argsort()[:50]

# Boost those neighbours by a factor of 2 and damp every other word.
# NOTE(review): the amplification is applied twice in a row — confirm the
# repetition is intentional and not a copy-paste leftover.
prob_matrix = amplify_probabilities(prob_matrix, 2, similar_indices)
prob_matrix = amplify_probabilities(prob_matrix, 2, similar_indices)

In [None]:
# Sample one substitute index from the amplified distribution and show its word.
substitute_idx = np.random.choice(len(prob_matrix), 1, p=prob_matrix)
print(id_to_word[substitute_idx[0]])

In [None]:
# Probability currently assigned to "Denmark" as a substitute.
prob_matrix[word_to_id["Denmark"]]

In [None]:
# Probability currently assigned to "Norway" as a substitute.
prob_matrix[word_to_id["Norway"]]

In [None]:
# Probability currently assigned to "Norwegian" as a substitute.
prob_matrix[word_to_id["Norwegian"]]

In [None]:
def probability_distribution_for_word(prob_matrix, word, top_n=100):
    """Plot the substitution probabilities of the words closest to `word`.

    Relies on the notebook globals `general_embeddings`, `word_to_id`,
    `id_to_word` and `P` (Minkowski order).

    Parameters:
    - prob_matrix: sequence of probabilities indexed by embedding row.
    - word (str): the word whose neighbourhood is plotted; must be a key of
      `word_to_id`.
    - top_n (int): number of nearest words to display (default 100).

    Returns:
    - None; a bar chart is shown via matplotlib.
    """
    # Row indices of the `top_n` closest embeddings, nearest first.
    similar_indices = sp_dist.cdist(
        general_embeddings[word_to_id[word]].reshape(1, -1),
        general_embeddings,
        metric="minkowski",
        p=P,
    )[0].argsort()[:top_n]
    similar_words = [id_to_word[idx] for idx in similar_indices]

    # Probabilities are indexed by embedding row, so the row indices can be
    # used directly without going back through word_to_id.
    probabilities = [prob_matrix[idx] for idx in similar_indices]

    # Visualization
    plt.figure(figsize=(10, 8))
    plt.bar(range(len(similar_words)), probabilities, tick_label=similar_words)
    plt.xlabel('Words')
    plt.ylabel('Probability')
    plt.title(f'Probability Distribution for "{word}" Among Its {top_n} Closest Words')
    plt.xticks(rotation=90)  # Rotate labels to avoid overlap
    plt.show()

In [None]:
# Plot the current substitution distribution around the target word.
probability_distribution_for_word(prob_matrix, word)

# New Algorithm

In [3]:
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.predefined_recognizers import SpacyRecognizer
from spacy.lang.en import English

In [13]:
# Presidio analyzer engine created with its default configuration.
analyzer = AnalyzerEngine()
# Register an additional spaCy-based recognizer on the engine's registry.
# NOTE(review): the default registry may already contain a SpacyRecognizer —
# confirm this extra registration is needed.
spacy_recognizer = SpacyRecognizer()
analyzer.registry.add_recognizer(spacy_recognizer)

In [4]:
def has_header(file):
    """Tell whether the next line of `file` looks like a two-field header.

    Reads (and therefore consumes) one line from `file`. This cell re-declares
    a helper with the same name and logic as one defined earlier in the
    notebook.
    """
    header_candidate = file.readline()
    field_count = len(header_candidate.split())
    return field_count == 2

In [5]:
def get_locations_and_nrps():
    """Scan the embeddings file and keep the words tagged LOCATION or NRP.

    Every token of WORD_EMBEDDING_PATH is passed to the global Presidio
    `analyzer`; a token is stored once per entity type it is tagged with, even
    if the analyzer reports several results of that type for it.

    Returns:
    - tuple of eight items, in this order:
      location words, location embeddings, location index->word array,
      location word->index dict, then the same four items for NRP.

    Errors raised while reading or analysing are printed rather than
    propagated, so the returned collections may be partial.
    """
    labels = ("LOCATION", "NRP")
    categories = {
        label: {"words": [], "embeddings": [], "index_to_word": [], "word_to_index": {}}
        for label in labels
    }

    try:
        with open(WORD_EMBEDDING_PATH, "r", encoding="utf-8") as file:
            # Rewind when the first line is data rather than a header.
            if not has_header(file):
                file.seek(0)

            for row in tqdm(file):
                content = row.rstrip().split(" ")
                word, vector = content[0], list(map(float, content[1:]))
                analysis_result = analyzer.analyze(text=word, language="en", entities=["LOCATION", "NRP"])

                # Collapse multiple results of the same type so a word is
                # added at most once per category.
                matched_types = {result.entity_type for result in analysis_result}
                for entity_type in labels:
                    if entity_type not in matched_types:
                        continue
                    cat_dict = categories[entity_type]
                    if word in cat_dict["word_to_index"]:
                        continue
                    cat_dict["word_to_index"][word] = len(cat_dict["index_to_word"])
                    cat_dict["words"].append(word)
                    cat_dict["embeddings"].append(vector)
                    cat_dict["index_to_word"].append(word)
    except Exception as e:
        # Best effort: report the problem and return whatever was collected.
        print(f"An error occurred: {e}")

    return_values = []
    for category, cat_dict in categories.items():
        return_values.extend([
            np.asarray(cat_dict["words"]),
            np.asarray(cat_dict["embeddings"]),
            np.asarray(cat_dict["index_to_word"]),
            cat_dict["word_to_index"]
        ])

    return tuple(return_values)

In [6]:
# Long-running cell: the recorded output shows roughly 1h06 for ~2.2M rows.
locations_and_nrps = get_locations_and_nrps()

2196017it [1:06:33, 549.93it/s]


### Location Cluster 

In [7]:
# Items 0-3 of the returned tuple describe the LOCATION cluster.
location_words = locations_and_nrps[0]  # array of words
location_embeddings = locations_and_nrps[1]  # array of embedding vectors, one row per word
location_index_to_word = locations_and_nrps[2]  # array mapping row index -> word
location_word_to_index = locations_and_nrps[3]  # dict mapping word -> row index

### NRP Cluster 

In [8]:
# Items 4-7 of the returned tuple describe the NRP cluster.
nrp_words = locations_and_nrps[4]  # array of words
nrp_embeddings = locations_and_nrps[5]  # array of embedding vectors, one row per word
nrp_index_to_word = locations_and_nrps[6]  # array mapping row index -> word
nrp_word_to_index = locations_and_nrps[7]  # dict mapping word -> row index

In [9]:
# Report how many words ended up in each cluster.
print(f"Total number of Locations: {len(location_words)}")
print(f"Total number of NRPs: {len(nrp_words)}")

Total number of Locations: 82198
Total number of NRPs: 34246


#### Example Usages 

In [10]:
# Print all words classified as locations to verify the list content.
print(location_words)

# Display the first word's embedding vector from the location_embeddings list for inspection.
print(location_embeddings[0])

# Reveal the first word in the location_index_to_word list to check the mapping from indices to words.
print(location_index_to_word[0])

# Retrieve and display the index of 'Japan' in the location_words list.
print(location_word_to_index['Japan'])

# Extract and print the embedding vector for 'Japan' using its index.
print(location_embeddings[location_word_to_index['Japan']])

['US' 'York' 'U.S.' ... 'pizookie' 'procrastina' 'wwent']
[-6.1300e-01  3.5952e-01  6.1369e-01  2.2584e-01  2.1979e-01 -3.8877e-01
 -5.6797e-01  1.1757e-01 -4.2763e-02  2.0155e+00 -4.5882e-01  9.4706e-02
 -9.0584e-02 -3.3069e-01  1.2620e-01 -4.1112e-01 -4.1397e-01  1.0438e+00
  2.4548e-01 -2.2299e-01  1.2817e-01  1.9747e-01 -1.8282e-01  3.8635e-01
 -2.0372e-01 -2.6187e-01 -3.8953e-01 -2.0268e-01  3.4471e-01  3.2418e-01
 -1.2618e-01  5.7121e-01  3.6365e-01 -2.4217e-01  4.1622e-01 -3.3974e-01
  2.5321e-01 -5.0867e-01  7.5067e-01  6.8774e-01  1.8997e-01  1.9956e-02
  7.8944e-01 -2.0719e-01  1.7613e-01  3.4474e-01  2.0023e-01 -2.9991e-01
  7.2304e-02  2.1757e-02 -1.7221e-01 -5.6177e-03 -4.1844e-01  4.6432e-01
 -5.7699e-02 -1.3214e-01  6.1830e-02 -4.0080e-01  3.1318e-01 -2.5913e-01
 -2.7152e-01 -8.5833e-02 -5.9223e-02  3.1800e-01  3.1722e-01  9.1133e-02
 -1.9776e-01  1.8802e-01 -2.2686e-01  2.4194e-01 -1.7460e-01 -2.5992e-02
 -2.5309e-01  1.4346e-01  4.0506e-01  9.9204e-02 -3.3167e-01 -4.33

In [14]:
def get_similar_words(word: str, distance_metric: str, top_k: int = 20) -> list:
    """
    Finds and returns the closest words to the given word within its own cluster.

    The cluster is chosen by looking the word up in the location index first and
    in the NRP index second, so a word present in either index is always found.

    Parameters:
    - word (str): The word to find similar words for.
    - distance_metric (str): The distance metric passed to scipy's cdist ('cosine', 'euclidean', etc.).
    - top_k (int): How many neighbours to return (default 20). The word itself
      is part of the candidates, so it normally appears first.

    Returns:
    - list: Up to `top_k` closest words, nearest first; an empty list (after
      printing a message) when the word is in neither cluster.
    """
    if word in location_word_to_index:
        embeddings = location_embeddings
        word_to_index = location_word_to_index
        words = location_words
    elif word in nrp_word_to_index:
        embeddings = nrp_embeddings
        word_to_index = nrp_word_to_index
        words = nrp_words
    elif analyzer.analyze(text=word, language="en", entities=["LOCATION"]):
        # Recognised as a location, but no embedding was loaded for it.
        print(f"Word '{word}' does not have an embedding.")
        return []
    else:
        print(f"Word '{word}' not found.")
        return []

    # cdist needs 2-D inputs, hence the single-row reshape.
    word_embedding = embeddings[word_to_index[word]].reshape(1, -1)
    distances = sp_dist.cdist(word_embedding, embeddings, metric=distance_metric).flatten()
    similar_indices = distances.argsort()[:top_k]

    return [words[index] for index in similar_indices]

In [15]:
# Example usage
similar_words = get_similar_words('Japan', 'cosine')
print(similar_words)

['Japan', 'Tokyo', 'Korea', 'Taiwan', 'Osaka', 'China', 'Asia', 'JAPAN', 'japan', 'Kyoto', 'Okinawa', 'Nippon', 'Europe', 'Hokkaido', 'Nagoya', 'Kong', 'Thailand', 'Hong', 'Seoul', 'Kyushu']


## Pre-compute sensitivities for Intra-Cluster mappings 

In [16]:
# Pre-compute sensitivity for Location cluster
max_distance_dict = defaultdict(float)

for word in location_words:
    if word not in max_distance_dict:
        distances = sp_dist.cdist(
            location_embeddings[location_word_to_index[word]].reshape(1, -1),
            location_embeddings,
            metric='euclidean'
        )[0]
        max_distance_dict[word] = max(distances)
        
word_with_greatest_dist = max(
    max_distance_dict,
    key=max_distance_dict.get
)
location_sensitivity = max_distance_dict[word_with_greatest_dist]

In [17]:
# Pre-compute sensitivity for NRP cluster
max_distance_dict = defaultdict(float)

for word in nrp_words:
    if word not in max_distance_dict:
        distances = sp_dist.cdist(
            nrp_embeddings[nrp_word_to_index[word]].reshape(1, -1),
            nrp_embeddings,
            metric='euclidean'
        )[0]
        max_distance_dict[word] = max(distances)
        
word_with_greatest_dist = max(
    max_distance_dict,
    key=max_distance_dict.get
)
nrp_sensitivity = max_distance_dict[word_with_greatest_dist]

In [20]:
# Pre-compute inter-cluster sensitivity
centroid_embeddings = [
    np.mean(location_embeddings, axis=0),  # Centroid for location cluster
    np.mean(nrp_embeddings, axis=0)        # Centroid for NRP cluster
]
cluster_sensitivity = sp_dist.cdist(
    centroid_embeddings[0].reshape(1, -1),
    centroid_embeddings[1].reshape(1, -1),
    metric='euclidean'
)[0][0]

In [21]:
# Report the three pre-computed sensitivities.
print(f"Sensitivity for Location cluster: {location_sensitivity}")
print(f"Sensitivity for NRP cluster: {nrp_sensitivity}")
print(f"Inter-cluster sensitivity: {cluster_sensitivity}")

Sensitivity for Location cluster: 30.714837075298064
Sensitivity for NRP cluster: 27.285209268130146
Inter-cluster sensitivity: 0.48370653954521364


## Sanitization 

In [22]:
# Sample text used as input for the sanitization demo.
passage = "On a sunny day in Paris, two friends, Alex and Jordan, decided to embark on a leisurely exploration of the city's most charming spots. Alex is Jordanian and Jordan is British. Their journey began at the foot of the iconic Eiffel Tower, where they marveled at the iron lattice structure that towered above them, its peak almost touching the clear blue sky.With a map in hand and a sense of adventure in their hearts, they meandered through the cobblestone streets, making their way to the historic heart of Paris, the Marais. Here, they discovered quaint boutiques, art galleries, and bistros that seemed to have frozen in time. Alex suggested they grab a coffee at a small café tucked away on Rue des Rosiers, a spot he'd heard was beloved by locals and tourists alike for its rich espresso and warm, flaky croissants. As the day unfolded, Jordan, who had a keen interest in art history, insisted they visit the Louvre Museum. They spent hours wandering through the vast halls, admiring masterpieces from different eras and cultures. The highlight was, without a doubt, standing before the Mona Lisa, where they joined a crowd of onlookers, each trying to decipher the enigmatic smile of Leonardo da Vinci's famous subject. The sun was beginning to set, casting a golden hue over the city, when Alex and Jordan found themselves on the banks of the Seine. They decided to cap off their day with a scenic boat cruise, offering them a view of Paris from a different perspective. As the boat glided under the Pont Neuf and past the illuminated Notre-Dame Cathedral, they reflected on the beauty and history that enveloped them at every turn. Their day in Paris was a testament to the enduring charm of the City of Light, a place where every street, every corner, holds a story waiting to be discovered. As they disembarked from the boat, the Eiffel Tower sparkled in the distance, a perfect end to an unforgettable day."

In [23]:
# Show the original passage before sanitization.
print(passage)

On a sunny day in Paris, two friends, Alex and Jordan, decided to embark on a leisurely exploration of the city's most charming spots. Alex is Jordanian and Jordan is British. Their journey began at the foot of the iconic Eiffel Tower, where they marveled at the iron lattice structure that towered above them, its peak almost touching the clear blue sky.With a map in hand and a sense of adventure in their hearts, they meandered through the cobblestone streets, making their way to the historic heart of Paris, the Marais. Here, they discovered quaint boutiques, art galleries, and bistros that seemed to have frozen in time. Alex suggested they grab a coffee at a small café tucked away on Rue des Rosiers, a spot he'd heard was beloved by locals and tourists alike for its rich espresso and warm, flaky croissants. As the day unfolded, Jordan, who had a keen interest in art history, insisted they visit the Louvre Museum. They spent hours wandering through the vast halls, admiring masterpieces 

In [28]:
import numpy as np
from scipy.spatial import distance as sp_dist
from scipy.special import softmax
from spacy.lang.en import English

def sanitize_passage(passage, centroid_embeddings, cluster_sensitivity, location_sensitivity, nrp_sensitivity):
    """
    Sanitizes a passage by replacing every token that belongs to the location or
    NRP cluster with a randomly sampled substitute.

    For each such token a cluster is first sampled from the distances between the
    token's embedding and the two cluster centroids, then a substitute word is
    sampled inside that cluster. Both draws use a softmax scaled by the notebook
    global EPSILON. All other tokens are copied unchanged. Only alphabetic and
    digit tokens are kept, so punctuation is dropped from the result.

    Parameters:
    - passage (str): The passage to sanitize.
    - centroid_embeddings: Two centroid vectors, location first, NRP second.
    - cluster_sensitivity (float): Sensitivity used for the cluster draw.
    - location_sensitivity (float): Sensitivity used when sampling inside the location cluster.
    - nrp_sensitivity (float): Sensitivity used when sampling inside the NRP cluster.

    Returns:
    - list: A list of words representing the sanitized passage.
    """

    sanitized_passage = []
    tokenizer = English()
    tokens = [token.text for token in tokenizer(passage) if token.is_alpha or token.is_digit]
    centroids = np.array(centroid_embeddings)

    for word in tokens:
        # Dictionary lookups are O(1); scanning the word arrays would be O(N) per token.
        in_location = word in location_word_to_index
        in_nrp = word in nrp_word_to_index
        if not (in_location or in_nrp):
            sanitized_passage.append(word)
            continue

        # The location cluster takes precedence when a word is in both.
        if in_location:
            word_embedding = location_embeddings[location_word_to_index[word]].reshape(1, -1)
        else:
            word_embedding = nrp_embeddings[nrp_word_to_index[word]].reshape(1, -1)

        # Determine initial cluster choice based on distance to centroids
        distances_to_centroids = sp_dist.cdist(word_embedding, centroids, metric='euclidean')[0]
        prob_matrix_to_centroids = softmax(EPSILON * (-distances_to_centroids) / (2 * cluster_sensitivity))
        initial_cluster_choice = np.random.choice([0, 1], p=prob_matrix_to_centroids)

        # Use the embeddings, vocabulary and sensitivity of the cluster that was
        # actually chosen, not of the cluster the word came from.
        if initial_cluster_choice == 0:
            cluster_embeddings = location_embeddings
            cluster_index_to_word = location_index_to_word
            sensitivity = location_sensitivity
        else:
            cluster_embeddings = nrp_embeddings
            cluster_index_to_word = nrp_index_to_word
            sensitivity = nrp_sensitivity

        # Calculate distances within the chosen cluster
        # NOTE(review): these distances are cosine while the sensitivities passed
        # in by the notebook were computed with Euclidean distances — confirm the
        # mismatch is intended.
        distances_within_cluster = sp_dist.cdist(word_embedding, cluster_embeddings, metric='cosine')[0]
        prob_matrix_within_cluster = softmax(EPSILON * (-distances_within_cluster) / (2 * sensitivity))
        substitute_idx = np.random.choice(len(prob_matrix_within_cluster), p=prob_matrix_within_cluster)

        sanitized_passage.append(cluster_index_to_word[substitute_idx])

    return sanitized_passage

In [29]:
# Sanitize the sample passage; the result is a list of tokens, joined with spaces for display.
# The substitutions are sampled with np.random, so the output varies between runs.
sanitized_passage_text = sanitize_passage(passage, centroid_embeddings, cluster_sensitivity, location_sensitivity, nrp_sensitivity)
print(' '.join(sanitized_passage_text))

On a sunny day in Mesoamericans two friends Alex and Japan-only decided to embark on a leisurely exploration of the city most charming spots Alex is Yasuragi and stockish is Oldan Their journey began at the foot of the iconic Eiffel Tower where they marveled at the iron lattice structure that towered above them its peak almost touching the clear blue sky With a map in hand and a sense of adventure in their hearts they meandered through the cobblestone streets making their way to the historic heart of Theophostic the Marais Here they discovered quaint boutiques art galleries and bistros that seemed to have frozen in time Alex suggested they grab a coffee at a small café tucked away on Rue des Rosiers a spot he heard was beloved by locals and tourists alike for its rich espresso and warm flaky croissants As the day unfolded Houston.The who had a keen interest in art history insisted they visit the Louvre Museum They spent hours wandering through the vast halls admiring masterpieces from 