# CC5215: Privacidad de Datos

## Laboratorio 8

Integrantes:

- Nombre

In [None]:
# Load the data and libraries
import pandas as pd
import numpy as np
import random
from scipy import stats
import matplotlib.pyplot as plt

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` perturbed with Laplace noise.

    The noise is drawn from a zero-centred Laplace distribution whose scale
    is ``sensitivity / epsilon``, using NumPy's global random state.

    Args:
        v: The true value to be released.
        sensitivity: Sensitivity of the query that produced `v`.
        epsilon: Privacy parameter; must be non-zero because it is used as
            a divisor.

    Returns:
        `v` plus one Laplace noise sample.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def pct_error(orig, priv):
    """Return the percentage error of `priv` relative to `orig`.

    Computed as ``|orig - priv| / |orig| * 100``. The denominator uses the
    magnitude of `orig` so the result is non-negative even when the true
    value is negative.

    Args:
        orig: The true (non-private) value; must be non-zero because it is
            used as a divisor.
        priv: The privately released value.

    Returns:
        The absolute error expressed as a percentage of ``|orig|``.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Download the course's "adult_with_pii" CSV and load it into a DataFrame.
# Requires network access to the course server when this cell runs.
adult = pd.read_csv('https://users.dcc.uchile.cl/~mtoro/cursos/cc5215/adult_with_pii.csv')

## Randomized response

## Question 1 (5 points)

Complete the definition of `encode_rand_resp_yes_no`. The true response should be returned with probability `p`, otherwise the result should be a coin flip.

In [None]:
def encode_rand_resp_yes_no(p, true_response):
    """Randomized-response encoding of a yes/no answer (exercise stub).

    Contract from the exercise statement: return `true_response` with
    probability `p`; otherwise return the outcome of a coin flip.

    Args:
        p: Probability of reporting the true response.
        true_response: The respondent's truthful yes/no answer.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    raise NotImplementedError()

# Print five independent randomized encodings of a truthful "yes" with p = 0.2.
for _ in range(5):
    print(
        'Randomized response:',
        encode_rand_resp_yes_no(0.2, True),
    )

## Question 2 (10 points)

Find the equation to calculate the privacy level, $\epsilon$, of the randomized response algorithm.

First, provide an intuition for the following questions:

- What do you think the value of `epsilon` should be if `p` is 0?
- What do you think the value of `epsilon` should be if `p` is 1?

- YOUR ANSWER HERE
- YOUR ANSWER HERE

Second, explain your reasoning and complete the definition of `rr_epsilon`:

YOUR ANSWER HERE

In [None]:
def rr_epsilon(p):
    """Return the privacy level epsilon of randomized response (exercise stub).

    Args:
        p: Probability with which the randomized response algorithm reports
            the true answer.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    raise NotImplementedError()

In [None]:
# TESTS

# Reference values for rr_epsilon at three probabilities, compared with
# np.isclose's default tolerances. Kept as separate asserts so a failure
# points at the specific probability that is wrong.
assert np.isclose(rr_epsilon(0.3), 0.619039)
assert np.isclose(rr_epsilon(0.5), 1.098612)
assert np.isclose(rr_epsilon(0.8), 2.197224)


## Question 3 (6 points)

Write the code to plot the level of privacy for 1000 samples of probabilities between 0 and 1.

In [None]:
# YOUR CODE HERE



- Describe the obtained plot in a few sentences. Does the plot coincide with the intuitions you gave in the previous question? How does increasing the probability $p$ affect the privacy level of the algorithm?

- YOUR ANSWER HERE

## Question 4 (15 points)

Implement `decode_rand_resp_yes_no` that computes the unbiased estimator of `encode_rand_resp_yes_no`. Notice that this function is also parametrized by `p`, so the result seen in class 11 must be generalized from 0.5 to p.

_HINT_: (class 12, slide 15) -> What are the new values of `P[resp SI | ventas]` and `P[resp SI | no-ventas]`?

In [None]:
def decode_rand_resp_yes_no(p, responses):
    """Unbiased estimator for randomized yes/no responses (exercise stub).

    Contract from the exercise statement: compute the unbiased estimator
    matching `encode_rand_resp_yes_no`, generalized to an arbitrary `p`.
    The cells below use the result as an estimate of the number of true
    "yes" answers.

    Args:
        p: Probability with which each response reported the truth.
        responses: The randomized responses to decode.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    raise NotImplementedError()

# Example: 1000 "yesses" and 500 "nos"
true_responses = [True for _ in range(1000)] + [False for _ in range(500)]
print('Number of "True" yesses:', np.sum(true_responses))

# Randomized responses
# Responses are encoded with p = 0.8. The test cell for rr_epsilon expects
# rr_epsilon(0.8) ~= 2.197, so each response satisfies ~2.20-differential
# privacy here (the 1.09 figure corresponds to p = 0.5, not p = 0.8).
rand_responses = [encode_rand_resp_yes_no(0.8, r) for r in true_responses]

# Decode the responses by subtracting "fake" yesses
print('Decoded randomized response yesses:', decode_rand_resp_yes_no(0.8, rand_responses))

In [None]:
BASIC_P = 0.5

# TEST CASE: 1000 truthful "yes" answers followed by 500 truthful "no" answers
true_responses = [True] * 1000 + [False] * 500

# Randomize every answer with p = BASIC_P.
# At p = 0.5 each response satisfies 1.09-differential privacy.
rand_responses = [
    encode_rand_resp_yes_no(BASIC_P, answer) for answer in true_responses
]

# The decoded count of yesses must land within 100 of the 1000 true yesses.
# NOTE: the responses are random, so this check is probabilistic.
assert decode_rand_resp_yes_no(BASIC_P, rand_responses) < 1100
assert decode_rand_resp_yes_no(BASIC_P, rand_responses) > 900

## Question 5 (5 points)

Implement the function `calc_rand_responses` that computes the unbiased estimator of the randomized response algorithm over a list of truthful responses. Additionally, implement `generate_rr_errors`, which produces a list of length `n` of percentage errors of RR.

In [None]:
def calc_rand_responses(p, true_responses):
    """Unbiased randomized-response estimate over true answers (exercise stub).

    Contract from the exercise statement: compute the unbiased estimator of
    the randomized response algorithm over a list of truthful responses.

    Args:
        p: Probability of reporting the true response.
        true_responses: List of truthful responses.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    raise NotImplementedError()

def generate_rr_errors(n, p, true_responses):
    """Percentage errors of randomized response (exercise stub).

    Contract from the exercise statement: produce a list of length `n` of
    percentage errors of the randomized response algorithm.

    Args:
        n: Length of the returned list of errors.
        p: Probability of reporting the true response.
        true_responses: List of truthful responses.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    raise NotImplementedError()

## Question 6 (4 points)

Observe the following plot:


In [None]:
probabilities = [0.25, 0.5, 0.75]

# Begin with a bin *count*; once the first histogram is drawn, its bin
# edges are reused so every histogram shares the same bins.
bins = 20

for p in probabilities:
    errors = generate_rr_errors(400, p, true_responses)

    print(f"Average error for p={p}:", np.mean(errors))

    # The legend shows epsilon truncated to its first five characters.
    _, binsp, _ = plt.hist(
        errors,
        bins=bins,
        label=f'p={p}; eps={str(rr_epsilon(p))[0:5]}',
        alpha=0.8,
    )

    bins = binsp

plt.legend()



- Describe the plot in a few sentences
- What properties of the randomized response algorithm can you observe?

- YOUR ANSWER HERE
- YOUR ANSWER HERE

## Unary encoding

These are the definitions provided in class:

In [None]:
# Domain of the unary encoding: the distinct non-null values of the
# 'Occupation' column.
domain = adult['Occupation'].dropna().unique()

# Default perturbation probabilities. In perturb_bit, a 1 bit is reported as 1
# when a uniform sample is <= p, and a 0 bit is reported as 1 when it is <= q.
# Presumably these globals are passed as the p/q arguments of the functions
# below; confirm against the calling cells.
p, q = .75, .25

def encode(response):
    """Return the unary (one-hot) encoding of `response` over `domain`.

    The result has one entry per element of the module-level `domain`:
    1 where the element equals `response`, 0 everywhere else.
    """
    return [int(candidate == response) for candidate in domain]

def perturb(p, q, encoded_response):
    """Perturb every bit of an encoded response independently.

    Args:
        p: Forwarded to `perturb_bit` as its `p` argument.
        q: Forwarded to `perturb_bit` as its `q` argument.
        encoded_response: Iterable of bits to perturb.

    Returns:
        A list with the result of `perturb_bit` for each input bit, in order.
    """
    noisy_bits = []
    for bit in encoded_response:
        noisy_bits.append(perturb_bit(p, q, bit))
    return noisy_bits

def perturb_bit(p, q, bit):
    """Randomly perturb a single bit.

    One uniform sample is drawn from NumPy's global random state on every
    call, before `bit` is inspected.

    Args:
        p: Threshold applied when `bit` is 1; the bit is reported as 1 when
            the sample is <= `p`.
        q: Threshold applied when `bit` is 0; the bit is reported as 1 when
            the sample is <= `q`.
        bit: The bit to perturb.

    Returns:
        1 or 0 for a `bit` of 1 or 0; None for any other value of `bit`.
    """
    draw = np.random.random()
    if bit == 1:
        return 1 if draw <= p else 0
    if bit == 0:
        return 1 if draw <= q else 0

def aggregate(p, q, responses):
    """Estimate per-position counts from a collection of perturbed responses.

    The responses are summed position-wise, and each sum ``v`` is corrected
    as ``(v - n*q) / (p - q)``, where ``n`` is the number of responses.

    Args:
        p: The `p` used when perturbing; must differ from `q`, since
            ``p - q`` is used as a divisor.
        q: The `q` used when perturbing.
        responses: Sequence of equally long perturbed bit vectors.

    Returns:
        A list with one corrected count per position.
    """
    total = len(responses)
    ones_per_position = np.sum(responses, axis=0)
    return [(count - total*q) / (p-q) for count in ones_per_position]

def unary_epsilon(p, q):
    """Return epsilon for unary encoding with parameters `p` and `q`.

    Computed as ``ln( p*(1-q) / ((1-p)*q) )``.

    Args:
        p: The `p` parameter of the unary encoding.
        q: The `q` parameter of the unary encoding.

    Returns:
        The natural logarithm of the ratio above.
    """
    numerator = p*(1-q)
    denominator = (1-p)*q
    return np.log(numerator / denominator)

## Question 7 (5 points)

Complete the definition of `unary_sales_count`, that returns a private answer to how many people work in 'Sales', using the unary encoding algorithm.

In [None]:
def unary_sales_count(p, q):
    """Private count of people working in 'Sales' (exercise stub).

    Contract from the exercise statement: return a private answer to how
    many people work in 'Sales', using the unary encoding algorithm.

    Args:
        p: The `p` parameter of the unary encoding.
        q: The `q` parameter of the unary encoding.

    Raises:
        NotImplementedError: Always, until the exercise is completed.
    """
    raise NotImplementedError()

## Question 8 (6 points)

Write a script that plots the relative errors for 50 iterations of the Laplace mechanism and the unary mechanism. Additionally, print the average error for both mechanisms.

_HINT_: Use `plt.subplot` to make 2 plots side-by-side.

In [None]:
# YOUR CODE HERE

## Question 9 (4 points)

Analyze the plot of the previous question:

- How do the accuracies of both mechanisms compare?
- Besides the difference in accuracy between the two mechanisms, what is the primary guarantee that unary encoding offers over the Laplace mechanism?

YOUR ANSWER HERE