# CS211: Data Privacy
## In-Class Exercise, week of 10/31/2022

In [1]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# The seaborn style sheets were renamed to 'seaborn-v0_8-*' in Matplotlib 3.6 and the old
# names were removed in later releases. Use whichever name this Matplotlib version ships
# so the notebook runs on both old and new installations.
_whitegrid_style = 'seaborn-v0_8-whitegrid'
if _whitegrid_style not in plt.style.available:
    _whitegrid_style = 'seaborn-whitegrid'
plt.style.use(_whitegrid_style)

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` plus one Laplace noise sample with scale `sensitivity / epsilon`.

    The sample is drawn from NumPy's global random state, so every call
    produces a different result unless the state is re-seeded.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` plus one Gaussian noise sample.

    The standard deviation is `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.
    The sample is drawn from NumPy's global random state.
    """
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Add independent Gaussian noise to every element of `vec`.

    Each element gets its own sample with standard deviation
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`, drawn in order from
    NumPy's global random state. The result is a plain Python list.
    """
    def add_noise(v):
        # One fresh noise sample per coordinate
        sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
        return v + np.random.normal(loc=0, scale=sigma)

    return list(map(add_noise, vec))

def pct_error(orig, priv):
    """Return the percent error of `priv` relative to `orig`.

    The absolute difference is normalised by the magnitude of `orig`, so the
    result is non-negative even when `orig` is negative (dividing by the signed
    value would report a negative "error" in that case). For positive `orig`
    the result is the same as dividing by `orig` directly.

    `orig == 0` is not special-cased; the division is left to NumPy/Python.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# adult = pd.read_csv('https://github.com/jnear/cs211-data-privacy/raw/master/homework/adult_with_pii.csv')

In [12]:
# Load data files
import numpy as np
import urllib.request
import io

# Remote locations of the two .npy files: X is later used as the feature matrix
# and y as the label vector when fitting and evaluating models.
url_x = 'https://github.com/jnear/cs211-data-privacy/raw/master/slides/adult_processed_x.npy'
url_y = 'https://github.com/jnear/cs211-data-privacy/raw/master/slides/adult_processed_y.npy'

# Read the whole HTTP response into an in-memory buffer, then let NumPy parse
# the array from that buffer (np.load is called with its default arguments).
with urllib.request.urlopen(url_x) as url:
    f = io.BytesIO(url.read())
X = np.load(f)

# Same procedure for the second file; the names `url` and `f` are reused.
with urllib.request.urlopen(url_y) as url:
    f = io.BytesIO(url.read())
y = np.load(f)

In [13]:
# Split data into training and test sets:
# the first 80% of the rows (in file order) train the model, the remaining rows test it
training_size = int(X.shape[0] * 0.8)

X_train, X_test = X[:training_size], X[training_size:]
y_train, y_test = y[:training_size], y[training_size:]

print('Train and test set sizes:', len(y_train), len(y_test))

Train and test set sizes: 36176 9044


## Question 1

Using scikit-learn, train a logistic regression model on the training data loaded above.

In [4]:
from sklearn.linear_model import LogisticRegression

In [5]:
def train_model():
    """Fit a scikit-learn logistic regression on the training split and return the fitted estimator."""
    classifier = LogisticRegression(max_iter=1000)
    return classifier.fit(X_train, y_train)

model = train_model()
print('Model coefficients:', model.coef_[0])
# Accuracy = fraction of test examples whose predicted label matches the true label
print('Model accuracy:', np.mean(model.predict(X_test) == y_test))

Model coefficients: [ 7.12843459e-01  4.03115227e-02  2.03668555e-01  3.03399365e-01
 -3.49085623e-01 -1.30264647e-01 -7.89046462e-01 -4.84280257e-01
 -4.83121335e-01 -3.52070934e-01 -4.82226146e-01 -1.16623517e-01
 -5.10892351e-01 -6.09961426e-01  9.39198361e-02  1.72695409e-01
  5.11946054e-01  9.36487398e-01 -1.12024921e-02  7.19784481e-01
 -7.99418435e-01  1.21953621e+00  1.87253680e-01 -6.53540142e-01
  1.71197926e+00  1.51818448e+00 -3.53658922e-01 -1.06503147e+00
 -6.51703018e-01 -5.14404018e-01  8.32844519e-02  1.21773688e-01
  1.39520033e-01  8.59699325e-01 -7.61815650e-01 -5.55754524e-01
 -1.84741033e-01 -7.96587410e-01 -1.14160664e+00  6.10503587e-01
  6.12135081e-01  3.58268868e-01  6.44688013e-01  2.45838356e-03
 -1.06008120e-01  2.92462169e-01 -4.97695367e-01 -7.68881226e-01
  9.38140594e-02  9.78134655e-01 -1.93782726e-01  3.17945118e-01
 -1.54762640e-01 -8.20262822e-02  1.04452701e-01 -3.38610379e-01
  3.30436550e-01  9.40924414e-01  6.66686212e-01 -2.40896748e-01
 -1.2

## Question 2

Implement the *average gradient* of the loss below.

In [6]:
# The loss function measures how good our model is. The training goal is to minimize the loss.
# This is the logistic loss function.
def loss(theta, xi, yi):
    """Logistic loss log(1 + exp(-yi * (xi . theta))) of model `theta` on example(s) `xi`, `yi`.

    Evaluated with `np.logaddexp(0, z)`, which equals log(exp(0) + exp(z)) but
    does not overflow: the naive `np.log(1 + np.exp(z))` returns `inf` once
    `z` is large enough for `exp(z)` to overflow, and rounds to 0 for very
    negative `z`.
    """
    exponent = - yi * (xi.dot(theta))
    return np.logaddexp(0, exponent)

# This is the gradient of the logistic loss
# The gradient is a vector that indicates the rate of change of the loss in each direction
def gradient(theta, xi, yi):
    """Gradient of the logistic loss with respect to `theta` for one example.

    Returns `-(yi * xi) / (1 + exp(yi * (xi . theta)))`, a vector with the same
    shape as `xi`.
    """
    margin = yi * (xi.dot(theta))
    denominator = 1 + np.exp(margin)
    return -(yi * xi) / denominator

In [7]:
def avg_grad(theta, X, y):
    """Average the per-example logistic-loss gradients over all rows of `X` and labels `y`."""
    per_example_grads = [gradient(theta, row, label) for row, label in zip(X, y)]
    return np.mean(per_example_grads, axis=0)

## Question 3

Use the average gradient from above to implement a gradient descent algorithm.

In [8]:
def gradient_descent(iterations):
    """Run `iterations` steps of full-batch gradient descent on the training split.

    Starts from the all-zeros vector and takes steps of fixed size 1 along the
    negative average gradient. Returns the final parameter vector.
    """
    step_size = 1
    weights = np.zeros(X_train.shape[1])

    for _step in range(iterations):
        update = avg_grad(weights, X_train, y_train)
        weights = weights - step_size * update

    return weights

theta = gradient_descent(10)
theta

array([ 1.63933476e-02, -2.62737908e-02, -3.76703876e-01,  5.75414219e-02,
       -6.45755008e-02, -2.73486282e-02, -1.30672661e-03, -5.91543132e-02,
       -7.64880798e-02, -2.62061378e-02, -1.57561761e-02, -2.86662073e-02,
       -4.72301785e-02, -3.76260473e-02, -9.90166218e-03, -1.97245917e-02,
        1.55177596e-01,  4.49929106e-02, -3.29796604e-01,  1.19518106e-01,
       -4.89454322e-03,  6.88790663e-02, -1.55396891e-01, -1.80599005e-01,
        1.54260151e-03,  3.89119966e-01, -1.97894731e-02, -5.15748994e-01,
       -5.58626968e-02, -4.09361505e-02, -1.28065226e-01, -1.62203385e-04,
       -1.08119029e-01,  1.88498244e-01, -6.47481586e-02, -8.66006277e-02,
       -9.99400278e-02, -2.05565204e-01, -1.11388992e-02,  1.61706332e-01,
       -3.59862313e-03, -1.50012146e-02,  1.68869178e-03, -5.12278071e-02,
        3.20211424e-01, -2.99730645e-01, -6.21807885e-02, -2.79987375e-01,
       -1.81265739e-01,  8.06793703e-02, -1.62147320e-02, -2.28589562e-02,
       -1.37829430e-01, -

In [9]:
# Prediction: take a model (theta) and a single example (xi) and return its predicted label
def predict(xi, theta, bias=0):
    """Return the sign of the linear score `xi @ theta + bias`.

    The result is +1 or -1 according to the side of the decision boundary;
    a score of exactly zero yields 0, because that is what `np.sign` returns.
    """
    score = xi @ theta + bias
    return np.sign(score)

def accuracy(theta):
    """Fraction of test-set examples whose predicted label equals the true label in `y_test`."""
    predicted_labels = predict(X_test, theta)
    num_correct = np.sum(predicted_labels == y_test)
    return num_correct / X_test.shape[0]

accuracy(theta)

0.7787483414418399

## Question 4

Implement a *noisy gradient descent* algorithm.

1. Calculate gradients for each example
2. Clip the gradients to have bounded $L2$ norm
3. Sum the clipped gradients
4. Use the Gaussian mechanism to add noise to the sum of gradients

In [18]:
def L2_clip(v, b):
    """Scale `v` down so that its L2 norm is at most `b`.

    A vector whose norm does not exceed `b` is returned unchanged (the same
    object); otherwise the result is `v` rescaled to have norm `b`.
    """
    magnitude = np.linalg.norm(v, ord=2)
    return b * (v / magnitude) if magnitude > b else v

def noisy_gradient_descent(iterations, epsilon, delta):
    """Gradient descent on the training split with Laplace/Gaussian noise added.

    The budget is divided up front (sequential composition):
      * 10% of `epsilon` pays for one Laplace-noised count of the training set;
      * the remaining 90% of `epsilon`, and all of `delta`, are split evenly
        across the `iterations` Gaussian-noised gradient sums.

    Each iteration clips every per-example gradient to L2 norm `b`, sums the
    clipped gradients, adds Gaussian noise with sensitivity `b`, and divides by
    the noisy count to obtain a noisy average gradient.

    Returns the final parameter vector. For `iterations <= 0` the initial
    all-zeros vector is returned and no noise is drawn.
    """
    theta = np.zeros(X_train.shape[1])
    eta = 1
    b = 3 # L2 clipping param

    # Nothing to train: return the starting point instead of dividing the
    # budget by zero below.
    if iterations <= 0:
        return theta

    # seq comp
    epsilon_count = .1 * epsilon
    epsilon_loop = epsilon - epsilon_count
    epsilon_i = epsilon_loop / iterations
    delta_i = delta / iterations

    noisy_count = laplace_mech(len(X_train), 1, epsilon_count)
    # With a small epsilon the Laplace noise can push the count to zero or
    # below, which would blow up or flip the sign of every update. Clamping is
    # post-processing of an already-private value, so it costs no extra budget.
    noisy_count = max(noisy_count, 1.0)

    for _ in range(iterations):
        grads = [gradient(theta, xi, yi) for xi, yi in zip(X_train, y_train)]
        clipped_grads = [L2_clip(g, b) for g in grads]
        grad_sum = np.sum(clipped_grads, axis=0)

        # Each clipped gradient has L2 norm <= b, so the sum has L2 sensitivity b
        noisy_grad_sum = gaussian_mech_vec(grad_sum, b, epsilon_i, delta_i)
        noisy_grad_avg = np.array(noisy_grad_sum) / noisy_count
        theta = theta - eta * noisy_grad_avg

    return theta

theta = noisy_gradient_descent(10, 0.001, 1e-5)
print('Final accuracy:', accuracy(theta))

Final accuracy: 0.5553958425475454


In [17]:
# TEST CASE
# Training is randomized (Laplace and Gaussian noise), so these are sanity checks on
# the accuracy range, not exact-value tests.

# A very small privacy budget (epsilon = 0.001) is expected to keep accuracy below 0.76.
assert accuracy(noisy_gradient_descent(5, 0.001, 1e-5)) < 0.76
# A larger budget (epsilon = 1.0) is expected to give accuracy above 0.70.
assert accuracy(noisy_gradient_descent(5, 1.0, 1e-5)) > 0.70

## Question 5

What is the *total privacy cost* of the noisy gradient descent algorithm above, and why? Argue informally that the algorithm satisfies this privacy cost. Use sequential composition.

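**Outside the loop:** the Laplace mechanism releases the count, which has sensitivity 1, so this step costs $\epsilon$.

**Per iteration:**
- Sensitivity: every clipped gradient has $L2$ norm at most $b$, so the sum of clipped gradients has $L2$ sensitivity $b$.
- Cost: the Gaussian mechanism with sensitivity $b$ costs $(\epsilon, \delta)$.
- Post-processing: `theta` is updated using only differentially private values, so the update itself adds no cost.

**Total privacy cost** (10 iterations, with the count and each iteration run at the full $\epsilon$ and $\delta$):
by sequential composition, the total cost is $(\epsilon + 10\epsilon,\ 10\delta) = (11\epsilon,\ 10\delta)$.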

## Question 6

Repeat the above, but using advanced composition.

Outside the loop: (epsilon, 0)
Inside the loop: (epsilon, delta) per iter

use adv comp for loop, then seq comp to combine with outside loop
```
delta' = delta = 1e-5   (with epsilon = 1 and k = 10 iterations)
epsilon' = 2 * epsilon * sqrt(2 * k * log(1/delta'))
         = 2 * 1 * sqrt(2 * 10 * log(1/1e-5))
         ≈ 30.35
final epsilon = epsilon' + epsilon = 30.35 + 1 = 31.35
final delta = k * delta + delta' = 10 * 1e-5 + 1e-5 = 0.00011
```


## Question 7

Implement a version of noisy gradient descent that satisfies a *total* of $(\epsilon, \delta)$-differential privacy. Use sequential composition.

In [19]:
# TEST CASE
# Training is randomized (Laplace and Gaussian noise), so these are sanity checks on
# the accuracy range, not exact-value tests.

# A very small privacy budget (epsilon = 0.001) is expected to keep accuracy below 0.76.
assert accuracy(noisy_gradient_descent(5, 0.001, 1e-5)) < 0.76
# A larger budget (epsilon = 1.0) is expected to give accuracy above 0.70.
assert accuracy(noisy_gradient_descent(5, 1.0, 1e-5)) > 0.70