# CS211: Data Privacy
## Homework 9

In [None]:
# Load the data and libraries
import pandas as pd
import numpy as np
import random
from scipy import stats
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever spelling this
# installation provides (falling back to the default style if neither exists).
plt.style.use(next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
))

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` with zero-centred Laplace noise added.

    The noise scale is `sensitivity / epsilon`, so a larger `epsilon`
    (or a smaller `sensitivity`) produces less noise.

    v: the true value to perturb.
    sensitivity: sensitivity of the query that produced `v`.
    epsilon: privacy parameter; must be non-zero.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def laplace_mech_vec(vec, sensitivity, epsilon):
    """Return a list with independent Laplace noise added to each element of `vec`.

    Every element receives its own draw with scale `sensitivity / epsilon`.

    vec: iterable of true values.
    sensitivity: sensitivity used to calibrate the noise.
    epsilon: privacy parameter used to calibrate the noise.
    """
    noisy = []
    for value in vec:
        # The scale is evaluated per element, so an empty `vec` never divides.
        noise = np.random.laplace(loc=0, scale=sensitivity / epsilon)
        noisy.append(value + noise)
    return noisy

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` with zero-mean Gaussian noise added.

    The standard deviation is
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.

    v: the true value to perturb.
    sensitivity: sensitivity of the query that produced `v`.
    epsilon: privacy parameter (divides the noise scale).
    delta: privacy parameter appearing inside the logarithm.
    """
    spread = np.sqrt(2 * np.log(1.25 / delta))
    sigma = sensitivity * spread / epsilon
    return v + np.random.normal(loc=0, scale=sigma)

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Return a list with independent Gaussian noise added to each element of `vec`.

    Every element receives its own zero-mean draw with standard deviation
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.

    vec: iterable of true values.
    sensitivity, epsilon, delta: parameters that calibrate the noise.
    """
    noisy = []
    for value in vec:
        # Sigma is evaluated per element, so an empty `vec` does no arithmetic.
        sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon
        noisy.append(value + np.random.normal(loc=0, scale=sigma))
    return noisy

def pct_error(orig, priv):
    """Return the percent error of `priv` relative to `orig`.

    Computed as `|orig - priv| / |orig| * 100`. The magnitude of `orig` is
    used in the denominator so the result is non-negative even when the
    true value is negative. Works element-wise on NumPy arrays.

    orig: the true value(s); a zero here yields a division by zero.
    priv: the privately released value(s).
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Download the homework CSV from the course's GitHub repository into `adult`
# (requires network access when this cell runs).
adult = pd.read_csv('https://github.com/jnear/cs211-data-privacy/raw/master/homework/adult_with_pii.csv')

In [None]:
def above_threshold(query_results, T, epsilon):
    """Return the index of the first query result that is noisily above `T`.

    Per the original author's note, this preserves epsilon-differential
    privacy. The threshold is perturbed once with Laplace noise of scale
    `2 / epsilon`; each query result is then perturbed with fresh Laplace
    noise of scale `4 / epsilon` and compared against the noisy threshold.

    query_results: iterable of already-computed query answers.
    T: the threshold to compare against.
    epsilon: privacy parameter used to calibrate both noise scales.

    Returns the position of the first noisy result >= the noisy threshold,
    or None if no result qualifies.
    """
    noisy_threshold = T + np.random.laplace(loc=0, scale=2 / epsilon)

    # Lazy generator: noise is drawn for one query at a time, and drawing
    # stops as soon as the first qualifying query is found.
    hits = (
        position
        for position, answer in enumerate(query_results)
        if answer + np.random.laplace(loc=0, scale=4 / epsilon) >= noisy_threshold
    )
    return next(hits, None)

## Question 1 (20 points)

Implement a function `above_10000` that releases the **value** of the first query in a sequence of queries whose value is above 10000. Your function should have a **total** privacy cost equal to the privacy parameter $\epsilon$ passed in when it is called.

**Note**: this function (and the others you'll define in this assignment) takes a list of *query results* rather than the queries themselves (as we saw in class). This simplification makes your code a little simpler.

In [None]:
def above_10000(query_results, epsilon):
    """Release the value of the first query result that is above 10000.

    Unimplemented homework stub: the body raises until a solution is
    written. Per the assignment text, the finished function should have a
    total privacy cost equal to `epsilon`.

    query_results: sequence of already-computed query answers (not the
        queries themselves).
    epsilon: total privacy budget for the whole call.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Materialize the counts as a plain list (as the Question 3 cell does) so that
# integer indexing inside `above_10000` is positional. On the label-indexed
# Series, `queries[i]` relies on a deprecated positional fallback in pandas.
queries = list(adult['Marital Status'].value_counts())
print(f"above_10000 #1: {above_10000(queries, 100)}")
print(f"above_10000 #2: {above_10000(queries, 1)}")
print(f"above_10000 #3: {above_10000(queries, .01)}")

In [None]:
# TEST CASE
# Average 20 independent releases at epsilon = 1.0; the mean must fall
# strictly between 14900 and 15000.
results = [above_10000(queries, 1.0) for _ in range(20)]
print(np.mean(results))
assert 14900 < np.mean(results) < 15000

## Question 2 (10 points)
In 2-3 sentences, argue informally (via the definition of the sparse vector technique, post-processing, and sequential composition) that your implementation of `above_10000` has a total privacy cost of $\epsilon$.

YOUR ANSWER HERE

## Question 3 (20 points)

Implement a function `bounded_all_above_10000` that releases the **values** of the first **$c$ queries** in a sequence of queries whose values are above 10000 (where $c$ is an analyst-provided parameter limiting the number of returned results). Your function should have a **total privacy cost** bounded by its parameter $\epsilon$.

In [None]:
def bounded_all_above_10000(query_results, c, epsilon):
    """Release the values of up to `c` query results that are above 10000.

    Unimplemented homework stub: the body raises until a solution is
    written. Per the assignment text, the finished function should have a
    total privacy cost bounded by `epsilon`.

    query_results: sequence of already-computed query answers.
    c: analyst-provided limit on the number of returned results.
    epsilon: upper bound on the total privacy cost of the call.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Note: the official solution also returns the total budget used.
# That total is always <= ε, and is often strictly less than ε.
queries = list(adult['Marital Status'].value_counts())
print(f"bounded_all_above_10000 #1: {bounded_all_above_10000(queries, 3, 100)}")
print(f"bounded_all_above_10000 #2: {bounded_all_above_10000(queries, 3, 1)}")
print(f"bounded_all_above_10000 #3: {bounded_all_above_10000(queries, 3, .01)}")

In [None]:
# TEST CASE
# Average over repeated trials, like the other test cells in this notebook:
# a mean taken over a single noisy release is just that one sample.
results = [bounded_all_above_10000(queries, 2, 1.0) for _ in range(20)]
results_1 = [r[0] for r in results]
results_2 = [r[1] for r in results]

assert np.mean(results_1) > 14900
assert np.mean(results_1) < 15000
assert np.mean(results_2) > 10600
assert np.mean(results_2) < 10700

## Question 4 (10 points)

In 2-3 sentences, argue informally that your implementation of `bounded_all_above_10000` has privacy cost bounded by $\epsilon$.

YOUR ANSWER HERE

## Question 5 (30 points)

Implement a function `mean_age` that computes the mean age of participants in the `adult` dataset. Your function should have a **total** privacy cost of $\epsilon$. It should work as follows:

1. Compute an *upper* clipping parameter based on the data
2. Compute a *lower* clipping parameter based on the data
3. Clip the data using the lower and upper clipping parameters
4. Use `laplace_mech` to release a differentially private mean of the clipped data

*Hint*: Use the sparse vector technique (`above_threshold`) to compute the clipping parameters. Consider using a sequence of queries that looks like `df.clip(lower=0, upper=b).sum() - df.clip(lower=0, upper=b+1).sum()`.

*Hint*: Be careful of sensitivities and set the scale of the noise accordingly!

In [None]:
# Candidate clipping bounds: 0, 10, 20, ..., 190.
bs = list(range(0, 200, 10))
# The 'Age' column of `adult` (a single column, despite the name `df`).
df = adult['Age']

def mean_age(epsilon):
    """Release a differentially private mean of the ages in `df`.

    Unimplemented homework stub: the body raises until a solution is
    written. Per the assignment text, the finished function should choose
    upper and lower clipping parameters from the data, clip the ages, and
    release the mean with `laplace_mech`, for a total privacy cost of
    `epsilon`.

    epsilon: total privacy budget for the whole computation.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    
# Print one `mean_age` release for each privacy budget, smallest to largest.
for epsilon in [0.001, 0.01, 0.1, 0.5, 1, 10]:
    print(f"epsilon: {epsilon}, mean age: {mean_age(epsilon)}")

In [None]:
# TEST CASE
# Average 20 independent releases at epsilon = 1.0; the mean must fall
# strictly between 38 and 39.
results = [mean_age(1.0) for _ in range(20)]
assert 38 < np.mean(results) < 39

## Question 6 (10 points)

In 3-5 sentences, describe your approach to implementing `mean_age` and argue informally that your implementation has privacy cost bounded by $\epsilon$.

YOUR ANSWER HERE