In [1]:
## Library

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
## Load the country_data from my google drive

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the technical phase data
# df = pd.read_csv('/content/drive/My Drive/Pets/technical_phase_data.csv')

# select all data with postal code starts with 11 and save it to google drive
# df_selected = df[df['Postal Code'].str.startswith('11', na=False)]

# Create Bogoda dataset
# df_selected.to_csv('/content/drive/My Drive/Pets/Bogoda.csv', index=False)


In [4]:
# Assuming the file is in the root of your Google Drive

# df = pd.read_csv('/content/drive/My Drive/Pets/technical_phase_data.csv')
df = pd.read_csv('/content/drive/My Drive/Pets/Bogoda.csv')

# Only load first 1000 data for test
# df = pd.read_csv('/content/drive/My Drive/Pets/Bogoda.csv', nrows=1000)

In [5]:
# randomly select 10% of the dataset
# df = df.sample(frac=0.1, random_state=42)  # 10% sample, set random_state for reproducibility

# Set random
seed = 42


In [6]:
## generate randomized response for each quantized value from all labels
def randomized_response(quantized_value, labels, epsilon):

  if quantized_value in labels:
    a = random.random()
    print(a)
    if a <= ((np.exp(epsilon)-1) / (np.exp(epsilon) + len(labels) - 1)):
      return quantized_value
    else:
      return random.choice(labels)

  else:
    return random.choice(labels)
    print('test')


In [7]:
# Set your desired privacy parameter

simple_epsilon = 1  ## No top K [1, 5, 10]
dist_epsilon = 0.5 ## Top K
noise_epsilon = simple_epsilon - dist_epsilon ## used for frequency distrubution


In [8]:
# how many ID-Postal paris are there

# Create a new column 'combo' by concatenating ID and postal code
df['combo'] = df['ID'].astype(str) + '_' + df['merch_postal_code'].astype(str)

# Now you have a new column 'combo_number' with a unique numerical representation for each combo

print(df[['ID', 'merch_postal_code', 'combo']])

num_users = df['ID'].nunique()
print(f"The number of unique users is: {num_users}")
num_postcodes = df['merch_postal_code'].nunique()
print(f"The number of unique post codes is: {num_postcodes}")
unique_combos_count = df['combo'].nunique()
print(f"The number of unique combos is: {unique_combos_count}")

           ID  merch_postal_code        combo
0           2             110621     2_110621
1           4             111941     4_111941
2           4             111111     4_111111
3           4             111061     4_111061
4           5             111321     5_111321
...       ...                ...          ...
1731075  9995             112041  9995_112041
1731076  9995             111821  9995_111821
1731077  9996             111021  9996_111021
1731078  9997             110571  9997_110571
1731079  9999             110121  9999_110121

[1731080 rows x 3 columns]
The number of unique users is: 7265
The number of unique post codes is: 80
The number of unique combos is: 477942


In [9]:
# Define new Perturbed combo

df['Perturbed_combo'] = ''
all_possible_combos = df['combo'].unique().tolist()

In [10]:
# Regular RR on combo

%%capture
for index, row in df.iterrows():
  df.at[index, 'Perturbed_combo'] = randomized_response(row['combo'], all_possible_combos, simple_epsilon)


In [11]:
# Retrive the perturbed combo to ID and merch_post_code
# Create new columns 'id_new' and 'post_code_new'
df['ID_Perturbed'] = ''
df['merch_postal_code_perturbed'] = ''

# Split the 'perturbed_combo' column into 'id_new' and 'post_code_new'
for index, row in df.iterrows():
  perturbed_combo = row['Perturbed_combo']
  id_part, post_code_part = perturbed_combo.split('_')
  df.at[index, 'ID_Perturbed'] = id_part
  df.at[index, 'merch_postal_code_perturbed'] = post_code_part

# print(df)

In [12]:
# Convert 'ID_Perturbed', and 'merch_postal_code_perturbed' to numeric

df['ID_Perturbed'] = pd.to_numeric(df['ID_Perturbed'], errors='coerce')
df['merch_postal_code_perturbed'] = pd.to_numeric(df['merch_postal_code_perturbed'], errors='coerce')

print(df.dtypes)


Unnamed: 0                       int64
ID                               int64
date                            object
merch_category                  object
merch_postal_code                int64
transaction_type                object
spendamt                       float64
nb_transactions                  int64
combo                           object
Perturbed_combo                 object
ID_Perturbed                     int64
merch_postal_code_perturbed      int64
dtype: object


In [13]:
# Sanity check to see how many ID are different from ID perturbed

diff_ids_count = df[df['ID'] != df['ID_Perturbed']].shape[0]
print(f"The number of rows where 'ID' and 'ID_Perturbed' are different: {diff_ids_count}")

# In theory
diff_ids_theory  = (unique_combos_count - 1) / (np.exp(simple_epsilon) + unique_combos_count - 1) *unique_combos_count
print(f"The number of rows where 'ID' and 'ID_Perturbed' are different in theory: {diff_ids_theory}")


The number of rows where 'ID' and 'ID_Perturbed' are different: 1730809
The number of rows where 'ID' and 'ID_Perturbed' are different in theory: 477939.2817279442


In [14]:
# Find the frequency distribution of labels and privatized this distribution

from collections import Counter

# Calculate the frequency distribution of original labels
label_counts = df['combo'].value_counts()

# Function to apply Laplace Mechanism to a frequency distribution
def laplace_mechanism(frequency_distribution, sensitivity, epsilon):
  privatized_distribution = {}
  for label, count in frequency_distribution.items():
    noise = np.random.laplace(loc=0, scale=sensitivity / epsilon)
    privatized_count = count + noise
    privatized_distribution[label] = privatized_count
  return privatized_distribution

# Set the sensitivity of the frequency query (maximum possible change in count)
sensitivity = 1

# Apply the Laplace Mechanism to the original and privatized frequency distributions
privatized_label_counts_laplace = laplace_mechanism(label_counts.to_dict(), sensitivity, dist_epsilon)


In [15]:
# Normalize the privatized frequency distribution
total_privatized_count = sum(privatized_label_counts_laplace.values())
normalized_privatized_label_counts = {label: count / total_privatized_count for label, count in privatized_label_counts_laplace.items()}

# print(normalized_privatized_label_counts)


In [16]:
# Find the top k combos that consist 90% of the this private count

def find_top_k_combos(normalized_privatized_label_counts, threshold):

  sorted_combos = sorted(normalized_privatized_label_counts.items(), key=lambda item: item[1], reverse=True)
  current_count = 0
  top_k_combos = []

  for combo, count in sorted_combos:
    top_k_combos.append(combo)
    current_count += count
    if current_count >= threshold:
      break

  return top_k_combos

threshold = 0.9
top_combos = find_top_k_combos(normalized_privatized_label_counts, threshold)

k = top_combos_length = len(top_combos)
print(f"The length of top combos is: {top_combos_length}")
print(f"Top combos constituting at least {threshold * 100}% of the private count")
# print(top_combos)


The length of top combos is: 213499
Top combos constituting at least 90.0% of the private count


In [None]:
# Select the top K most frequent labels
%%capture
for index, row in df.iterrows():
  df.at[index, 'Perturbed_combo_k'] = randomized_response(row['combo'], top_combos, noise_epsilon)

In [None]:
# Retrive the perturbed combo to ID and merch_post_code
# Create new columns 'id_new' and 'post_code_new'
df['ID_Perturbed_k'] = ''
df['merch_postal_code_perturbed_k'] = ''

# Split the 'perturbed_combo' column into 'id_new' and 'post_code_new'
for index, row in df.iterrows():
  perturbed_combo = row['Perturbed_combo_k']
  id_part, post_code_part = perturbed_combo.split('_')
  df.at[index, 'ID_Perturbed_k'] = id_part
  df.at[index, 'merch_postal_code_perturbed_k'] = post_code_part

# print(df)

In [None]:
# Convert 'ID_Perturbed', and 'merch_postal_code_perturbed' to numeric

df['ID_Perturbed_k'] = pd.to_numeric(df['ID_Perturbed_k'], errors='coerce')
df['merch_postal_code_perturbed_k'] = pd.to_numeric(df['merch_postal_code_perturbed_k'], errors='coerce')

print(df.dtypes)

In [None]:
#  generate the csv file and download

from google.colab import files
df.to_csv('topk_ep10.csv', encoding = 'utf-8-sig')
files.download('topk_ep10.csv')
