In [1]:
## Library

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
# Load the processed dataset into `df`. The path is relative to the current
# working directory of the kernel.
# NOTE(review): an earlier comment said the file sits in the root of Google
# Drive, but the path below points at Data/Processed/ - confirm where this
# notebook is expected to be run from.

df = pd.read_csv('Data/Processed/technical_phase_data.csv')

In [3]:
## generate randomized response for each quantized value from all labels
def randomized_response(quantized_value, labels, epsilon):
  """Perturb one categorical value with k-ary randomized response.

  If `quantized_value` is one of `labels`, it is kept with probability
  (e^eps - 1) / (e^eps + k - 1), where k = len(labels); otherwise a label is
  drawn uniformly from `labels` (the draw may return the true value again, so
  the overall chance of a truthful report is e^eps / (e^eps + k - 1)).
  A value that is not in `labels` is always replaced by a uniform draw.

  Args:
    quantized_value: the true value to be reported.
    labels: non-empty sequence of candidate output labels.
    epsilon: privacy parameter; larger values keep the true value more often.

  Returns:
    Either `quantized_value` itself or an element of `labels`.

  Raises:
    IndexError: if `labels` is empty (raised by `random.choice`).
  """
  if quantized_value in labels:
    # The keep-probability is written as (1 - e^-eps) / (1 + (k - 1) e^-eps),
    # which is the original ratio divided through by e^eps. Evaluating
    # np.exp(epsilon) directly overflows to inf for large epsilon and turns
    # the ratio into nan, which made every comparison False and the function
    # always return a uniform draw.
    exp_neg_eps = np.exp(-epsilon)
    keep_probability = -np.expm1(-epsilon) / (1.0 + (len(labels) - 1) * exp_neg_eps)
    if random.random() < keep_probability:
      return quantized_value

  # Either the value is unknown or the coin flip said "randomize".
  return random.choice(labels)

## RR Top k

def new_randomized_response(quantized_value, top_k_labels, epsilon):
  """Randomized response restricted to a top-k label set.

  If `quantized_value` is one of `top_k_labels`, it is kept with probability
  (e^eps - 1) / (e^eps + k - 1), where k = len(top_k_labels); otherwise one of
  the top-k labels is drawn uniformly. A value outside the top-k set is always
  replaced by a uniform draw from `top_k_labels`, so the output is always
  either the input itself (when it is in the set) or a member of the set.

  Args:
    quantized_value: the true value to be reported.
    top_k_labels: non-empty sequence of labels that may be reported.
    epsilon: privacy parameter; larger values keep the true value more often.

  Returns:
    An element of `top_k_labels` (possibly `quantized_value` itself).

  Raises:
    IndexError: if `top_k_labels` is empty (raised by `random.choice`).
  """
  if quantized_value in top_k_labels:
    # Apply randomized response if the label is in the top k.
    # The keep-probability is written as (1 - e^-eps) / (1 + (k - 1) e^-eps),
    # i.e. the original ratio divided through by e^eps, because
    # np.exp(epsilon) overflows to inf for large epsilon and the resulting
    # nan made the comparison always False.
    exp_neg_eps = np.exp(-epsilon)
    keep_probability = -np.expm1(-epsilon) / (1.0 + (len(top_k_labels) - 1) * exp_neg_eps)
    if random.random() < keep_probability:
      return quantized_value

  # Uniformly report one of the top k labels.
  return random.choice(top_k_labels)

In [4]:
# Privacy parameters (epsilon) and top-K size used by the cells below.

simple_epsilon = 10  ## epsilon for randomized response over all labels (no top K); values noted by the author: [1, 5, 10]
noise_epsilon = 5 ## epsilon for the top-K randomized response; values noted by the author: [0.5, 2.5, 5]
dist_epsilon = simple_epsilon - noise_epsilon ## difference of the two; passed to the Laplace mechanism on the frequency distribution

k = 50 # number of most frequent labels kept by get_top_k_labels


In [5]:
# Get unique postal codes as labels (the full output alphabet for plain RR)
labels = df['merch_postal_code'].unique().tolist()

# Perturb every row independently with randomized response over all labels.
# This adds a column to `df` in place; no random seed is set in the visible
# cells, so re-running this cell produces a different perturbation.
df['Perturbed_PostCode'] = df['merch_postal_code'].apply(lambda x: randomized_response(x, labels, simple_epsilon))

# Distinct labels that actually appear after perturbation
rr_labels = df['Perturbed_PostCode'].unique().tolist()


In [6]:
# Sanity check: how many rows were changed by plain randomized response, and
# how many distinct labels exist before and after perturbation.

total_entries = len(df)
print(f"Total number of data entries: {total_entries}")
diff_count = (df['merch_postal_code'] != df['Perturbed_PostCode']).sum()
print(f"Number of entries with different original and perturbed postal codes: {diff_count}")

num_labels = len(labels)
print(f"The number of unique labels (postal codes) is: {num_labels}")
num_rr_labels = len(rr_labels)
# Report the perturbed-label count here; printing num_labels again would just
# repeat the line above regardless of what the perturbation produced.
print(f"The number of unique labels (Perturbed postal codes) is: {num_rr_labels}")

Total number of data entries: 4180209
Number of entries with different original and perturbed postal codes: 56751
The number of unique labels (postal codes) is: 303
The number of unique labels (Perturbed postal codes) is: 303


In [7]:
# Compute the label frequency distributions of the original and the
# RR-perturbed postal codes.

# NOTE(review): Counter is not referenced in any cell visible here - confirm
# whether this import is still needed.
from collections import Counter

# Calculate the frequency distribution of original labels
# (value_counts returns a Series indexed by label, sorted by descending count)
label_counts = df['merch_postal_code'].value_counts()

# Calculate the frequency distribution of the randomized-response output
perturbed_label_counts = df['Perturbed_PostCode'].value_counts()

# Function to apply Laplace Mechanism to a frequency distribution
def laplace_mechanism(frequency_distribution, sensitivity, epsilon):
  """Add independent Laplace noise to every count of a frequency table.

  Args:
    frequency_distribution: mapping from label to its count.
    sensitivity: numerator of the Laplace scale.
    epsilon: denominator of the Laplace scale (scale = sensitivity / epsilon).

  Returns:
    A new dict with the same keys, in the same order, whose values are the
    original counts plus one zero-centred Laplace draw each. The noisy
    values are floats and are neither rounded nor clipped.
  """
  # One draw per entry, taken in the mapping's iteration order.
  return {
      key: value + np.random.laplace(loc=0, scale=sensitivity / epsilon)
      for key, value in frequency_distribution.items()
  }

# Set the sensitivity of the frequency query (maximum possible change in count)
sensitivity = 1

# Apply the Laplace mechanism to the frequency distribution of the original
# labels only (the RR-perturbed counts are not passed through it).
# NOTE(review): privatized_label_counts_laplace is not read by any later cell
# visible here - confirm whether the top-K selection was meant to use it.
privatized_label_counts_laplace = laplace_mechanism(label_counts.to_dict(), sensitivity, dist_epsilon)


In [8]:
# Select the top K most frequent perturbed labels

def get_top_k_labels(label_counts, k):
  """Return the labels of the k largest counts, most frequent first.

  Args:
    label_counts: pandas Series of counts indexed by label.
    k: number of labels to return.

  Returns:
    A plain Python list of index labels, ordered by descending count.
  """
  largest = label_counts.nlargest(k)
  return largest.index.tolist()

# The top-K set is taken from the counts of the RR-perturbed column.
# NOTE(review): this does not use the Laplace-noised counts computed with
# dist_epsilon - confirm that selecting from the RR output is intended.
top_k_perturbed_labels = get_top_k_labels(perturbed_label_counts, k)


In [9]:
# Perturb the original postal codes with randomized response restricted to
# the top-K label set, using noise_epsilon.
# NOTE(review): the column name hard-codes "50" while the set size comes from
# `k`; keep the two in sync if `k` is changed.

df['Perturbed_PostCode_50'] = df['merch_postal_code'].apply(lambda x: new_randomized_response(x,top_k_perturbed_labels , noise_epsilon))

# Distinct labels that actually appear in the top-K perturbed column
new_rr_labels = df['Perturbed_PostCode_50'].unique().tolist()


In [10]:
# Sanity check for the top-K mechanism: how many rows differ from the
# original postal code, and how many distinct labels remain afterwards.

# Overwrites total_entries / diff_count / num_labels from the earlier check.
total_entries = len(df)
print(f"Total number of data entries: {total_entries}")
diff_count = (df['merch_postal_code'] != df['Perturbed_PostCode_50']).sum()
print(f"Number of entries with different original and perturbed postal codes: {diff_count}")

num_labels = len(labels)
print(f"The number of unique labels (postal codes) is: {num_labels}")
# The output can only contain top-K labels, so this is at most len(top_k_perturbed_labels).
num_new_rr_labels = len(new_rr_labels)
print(f"The number of unique labels (Perturbed_PostCode_50') is: {num_new_rr_labels}")


Total number of data entries: 4180209
Number of entries with different original and perturbed postal codes: 3237598
The number of unique labels (postal codes) is: 303
The number of unique labels (Perturbed_PostCode_50') is: 50


In [38]:
# Write the DataFrame (original columns plus both perturbed columns added
# above) to a CSV file in the working directory. 'utf-8-sig' prefixes the
# file with a UTF-8 byte-order mark; the row index is written as well because
# index=False is not passed.
# NOTE(review): the file name hard-codes the top-K size and epsilon; update it
# when `k` or the epsilon settings change. No download step is performed here.

df.to_csv('top50_ep10.csv', encoding = 'utf-8-sig')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>