In [None]:
# Classes File

In [6]:
'''
Do's
- look to replicate a regression from Chetty Friedman Atlas of Opportunity
- rubenstein
'''

"\nDo's\n- look to replicate a regression from Chetty Friedman Atlas of Opportunity\n- rubenstein\n"

In [176]:
# Local Import
import numpy as np
import pandas as pd
import math
import scipy.stats as ss
import numpy.linalg as la
from itertools import product
import numpy.random as nr
import pickle

# Plot Tools
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import cycler
colors = cycler('color', 
       ['#EE6666', '#3388BB', '#9988DD', '#EECC55', 
       '#88BB44', '#FFBBBB'])

plt.rc('axes', facecolor='#E6E6E6', edgecolor='none',
      axisbelow=True, grid=True, prop_cycle=colors)
plt.rc('grid', color='w', linestyle='solid')
plt.rc('xtick', direction='out', color='gray')
plt.rc('ytick', direction='out', color='gray')
plt.rc('patch', edgecolor='#E6E6E6')
plt.rc('lines', linewidth=2)

In [131]:
# R in Python Interface
import rpy2
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
import rpy2.robjects.numpy2ri as rnp
robustbase = importr('robustbase')
base = importr('base')
utils = importr('utils')
diffpriv = importr('diffpriv')

In [194]:
# Run this in an R console if it ever disappears
# install.packages('diffpriv')
# install.packages("devtools")
# devtools::install_github("brubinstein/diffpriv")

## a target function we'd like to run on private data X, releasing the result
target <- function(X) mean(X)

## target seeks to release a numeric, so we'll use the Laplace mechanism---a
## standard generic mechanism for privatizing numeric responses
library(diffpriv)
mech <- DPMechLaplace(target = target)

'''
To run mech on a dataset X we must first determine the sensitivity of target to small changes to input dataset. 
One avenue is to analytically bound sensitivity (on paper; see the vignette) 
and supply it via the sensitivity argument of mechanism construction: in this case not hard if we assume bounded data, 
but in general sensitivity can be very non-trivial to calculate manually. The other approach, which we follow in this example, 
is sensitivity sampling: repeated probing of target to estimate sensitivity automatically. 
We need only specify a distribution for generating random probe datasets; sensitivitySampler() takes care of the rest. 
The price we pay for this convenience is the weaker form of random differential privacy.
'''

## set a dataset sampling distribution, then estimate target sensitivity with
## sufficient samples for subsequent mechanism responses to achieve random
## differential privacy with confidence 1-gamma
distr <- function(n) rnorm(n)
mech <- sensitivitySampler(mech, oracle = distr, n = 5, gamma = 0.1)
#> Sampling sensitivity with m=285 gamma=0.1 k=285
mech@sensitivity    
## DPMech and subclasses are S4: slots accessed via @
#> [1] 0.8089517

X <- c(0.328,-1.444,-0.511,0.154,-2.062) # length is sensitivitySampler() n
r <- releaseResponse(mech, privacyParams = DPParamsEps(epsilon = 1), X = X)
cat("Private response r$response:   ", r$response,
  "\nNon-private response target(X):", target(X))
#> Private response r$response:    -1.119506 
#> Non-private response target(X): -0.707

In [5]:
# Test Data - Census Tract
%cd '/home/nbuser/library/example_code_implementation_guide/'
stata = pd.read_stata('private_data_by_cells.dta')
data = stata[stata.columns[::-1]]

/home/nbuser/library/example_code_implementation_guide


In [218]:
def npl(x):
    return np.asarray([i.tolist() for i in x])

class Wrangle:
    '''
    Data Wrangling Class'''
    def __init__(self, data, aggregate=False, partition=False):
        # Numpy-nize Census Tract
        self.Key = data.keys()
        if aggregate:
            Y = np.asarray(data.loc[:, self.Key[1]])
            X = np.asarray(data.loc[:, self.Key[2]])
            self.N = len(X)
        elif partition:
            None
        else:
            if len(self.Key) > 3:
                None
            else:
                Cell = np.unique(np.array(data[self.Key[0]])).astype(int)
                Y = [np.asarray(data.loc[data.loc[:, self.Key[0]]== i, self.Key[1]]) for i in Cell]
                X = [np.asarray(data.loc[data.loc[:, self.Key[0]]== i, self.Key[2]]) for i in Cell]
                self.N = np.asarray([len(i) for i in X])
        self.X = X
        self.Y = Y 
        self.v = len(self.Key) - 1
                
    def __call__(self):
        return self.X, self.Y, self.N, self.v

class Methods:
    ''' Class of regression methods and their parts
    '''
    def __init__(self, X, Y, N, v, method=None):
        ''' Make sure you are using at most 2-dimensional arrays for X, Y
        '''
        # We assume X is organised by index with dependent variables inside each
        self.X = np.asarray(X)
        self.Y = np.asarray(Y)
        self.method = method
        if self.method == "SMDM":
            self.form = ro.Formula("y~x")
        self.N = N
        self.v = v
        self.o = np.asarray([list(i) for i in product([0, 1], repeat = self.v)])
        if type(self.N) is int:
            self.index = [None]
        else:
            self.index = range(len(self.N))
        self.range = range(4) 
        self.nd75 = ss.norm.ppf(0.75)
    
    def sort(self, I=None, O=None):
        ''' Sort arrays and append outlier variations
        '''
        if I is None:
            y = self.Y
            x = self.X
        else:
            y = self.Y[I]
            x = self.X[I]
        if O is not None:
            y = np.block([y, self.o[O][0]])
            x = np.block([x, self.o[O][1]])
        return x, y
    
    def stack(self, x):
        ''' Add constant to estimate 
        '''
        return np.block([[np.ones(len(x))], [x.T]]).T
    
    def lstsq(self, x, y, w=None, se=False):
        ''' Calculus linear least-squares
        '''
        x = self.stack(x)
        if w is not None:
            X = x.T.dot(np.diag(w)).T
        else:
            X = x
        β̂ = np.dot(la.inv(X.T.dot(x)), X.T.dot(y))
        r = y - np.dot(X, β̂)
        if se:
            se = np.sqrt(r.dot(r) / np.sum(np.square(X.T[1] - np.mean(X.T[1]))) / (len(r) - self.v))
        return β̂, r, se

    # OLS Method:
    def OLS(self, I=None, O=None, se=False):
        x, y = self.sort(I=I, O=O)
        β̂, r, se = self.lstsq(x, y, None, se)
        return β̂, r, se
        
    # MM Method:
    def MAD(self, r):
        ''' Median Absolute Deviation 
        '''
        return np.median(np.absolute(r - np.median(r)))
    
    def Tukey(self, u, c=1.547):
        ''' Tukey's biweight function ''' 
        ρ = np.power(u, 2) / 2 - np.power(u, 4) / (2 * math.pow(c, 2)) + np.power(u, 6) / (6 * math.pow(c, 4))
        if np.any(np.abs(u) > c):
            ρ[np.abs(u) > c] = math.pow(c, 2) / 6            
        return ρ
            
    def σ(self, r, w=None, K=0.199):
        ''' The scale for the S-Estimator we wish to minimize
        '''
        if w is None:
            σ = self.MAD(r) / self.nd75 
        else:
            σ = np.sqrt(np.sum(np.multiply(np.square(r),w)) / (len(r) * K))
        return σ
    
    def u(self, r, σ):
        ''' Bisquare ratio 
        '''
        return r / σ
    
    def weight(self, u, method="S", it=False):
        ''' Estimator Reweighing
        '''
        if method=="S":
            if not it:
            # iteration = 1
                w = np.square(u / 1.547)
                if np.any(np.abs(u) > 1.547):
                    w[np.abs(u) > 1.547] = 0
                weight = np.square(1 - w)
            else:
                weight = self.Tukey(u) / np.square(u)
        if method=="MM":
            w = np.square(u / 4.685)
            if np.any(np.abs(u) > 4.685):
                w[np.abs(u) > 4.685] = 0
            weight = np.square(1 - w)
        return weight

    def MM(self, I=None, O=None, se=False, maxiter=100):
        ''' MM-Estimator regression algorithm as specified in Susanti 2009
        '''
        x, y = self.sort(I, O)
        β̂, r, se = self.lstsq(x, y, None, False)
        # 2. S-Estimation
        # 1st iteration
        σ = self.σ(r)
        u = self.u(r, σ)
        w = self.weight(u, "S", False)
        β̂, r, se = self.lstsq(x, y, w, False)
        # IRWLS
        for i in range(maxiter):
            σ = self.σ(r, w)
            u = self.u(r, σ)
            w = self.weight(u, "S", True)
            S, r, se = self.lstsq(x, y, w, False)
            if np.allclose(S, β̂, rtol=1e-09):
                break
            else:
                β̂ = S
        # We use the MAD of the residuals of the S-estimator
        σ = self.σ(r)
        # 3. MM-Estimate Convergence 
        for i in range(maxiter):
            u = self.u(r, σ)
            w = self.weight(u, "MM")
            M, r, se = self.lstsq(x, y, w, False)
            if np.allclose(M, β̂, rtol=1e-03):
                break
            else:
                β̂ = M
        if se:
            β̂s, rs, se = self.lstsq(x, y, w, True)
        return M, r, se, σ   
        
    # SMDM Method:
    def SMDM(self, I=None, O=None):
        ''' Run SMDM from R's 'lmrob' package via rpy2
        '''
        x, y = self.sort(I, O)
        try:
            nr, nc = x.shape
        except:
            nr = x.shape[0]
            nc = 1
        # Set up formula environment
        rnp.activate()
        form.environment["y"] = ro.r['matrix'](y, nrow=nr, ncol=1)
        form.environment["x"] = ro.r['matrix'](x, nrow=nr, ncol=nc)
        # Run Robust Regression
        lmr = robustbase.lmrob(self.form, method = "SMDM", setting="KS2014")
        # must copy variables because of memory constraint
        betas = np.array(lmr.rx2("coefficients"), copy=True)
        scale = np.array(lmr.rx2("scale"), copy=True)
        residuals = np.array(lmr.rx2("residuals"), copy=True)
        stderr = np.array(np.diag(lmr.rx2("cov")), copy=True)
        return betas, residuals, stderr, scale
    
    def __call__(self, withoutliers=True):
        ''' Specify regression method and run all regression in one/two line(s)
        '''
        if self.method == "OLS":
            if self.index is None:
                regression = self.OLS(None, None, False)
                if withoutliers:
                    withoutliers = np.asarray([self.OLS(None, O, False) for O in self.range])
            else:
                regression = np.asarray([self.OLS(I, None, False) for I in self.index])
                if withoutliers:
                    withoutliers = np.asarray([[self.OLS(I, O, False) for O in self.range] for I in self.index])    
        elif self.method == "MM":
            if self.index is None:
                regression = self.MM(None, None, False, 100)
                if withoutliers:
                    withoutliers = np.asarray([self.MM(None, O, False) for O in self.range])
            else:
                regression = np.asarray([self.MM(I, None, False) for I in self.index])
                if withoutliers:
                    withoutliers = np.asarray([[self.MM(I, O, False) for O in self.range] for I in self.index])
        elif self.method == "SMDM":
            if self.index is None:
                regression = self.SMDM(None, None)
                if withoutliers:
                    withoutliers = np.asarray([self.SMDM(None, O) for O in self.range])
            else:
                regression = np.asarray([self.SMDM(I, None) for I in self.index])
                if withoutliers:
                    withoutliers = np.asarray([[self.MM(I, O) for O in self.range] for I in self.index])
        return regression, withoutliers

class LocSnoises:
    ''' Calculates Local Sensivity and releases statistics a la Chetty & Friedman
    '''
    def __init__(self, regression, withoutliers, N):
        self.RA = regression
        self.RB = withoutliers
        self.range = range(4)
        self.N = N
        if type(self.N) is int:
            self.index = [None]
        else:
            self.index = range(len(self.N))
            
    def ω(self):
        ''' draws from Laplace Distribution 
        '''
        return nr.laplace(0, 1/math.sqrt(2))
    def LSR(self):
        self.oR()
        ''' 
        Calculate Local Sensitivity 
        '''
        MOSE = max(np.multiply(self.N, 
                [max([abs(self.RB[i][j][0][1] - self.RA[i][0][1]) for j in self.range]) for i in self.index]))
        return np.transpose([[self.RA[i][j] for j in [0, 2]] for i in self.index]), MOSE
    
    def noise(self, θseθ, χ):
        ''' 
        Add Noise to Statistics 
        '''
        NN = np.asarray(self.D.N).astype(float)
        S = NN * self.ϵ
        noise = lambda x: x * math.sqrt(2)
        
        nθ = [i[1] for i in θseθ[0]] + χ * noise(self.ω()) / S
        sen = np.square(θseθ[1]) + 2 * np.square(χ / S)
        senθ = np.sqrt(sen.astype(float))
        nsenθ = senθ + χ * noise(self.ω()) / S
        nsenθ = np.asarray(["Sample Size Too Small" if i==np.inf else i for i in nsenθ])
        nN = self.D.N * (1 + noise(self.ω()) / S)
        return nθ, nsenθ, nN
    
    def __call__(self):
        ''' 
        Release Noise Infused Statistics 
        '''
        m1, m2 = self.LSR()
        nθ, nsenθ, nN = self.noise(m1, m2)
        return nθ, nsenθ, nN
    
class DWORK:
    def __init__(self, Tr, r, o, ϵ, psize):
        ''' We have assumed aggregate data instead of tract partitioning
            ### b is org by cells with b's in cols
            ### B.T is B0,B1 arrays
            ### db is org by cell with alt's within - 3dim
            ### dB.T splits B_0 and B_1 alts into two arrays with 4 subarrys each
        '''
        # length of each cell
        self.n = psize
        # base for each cell (for S algorithm)
        self.base = np.asarray([1 + 1 / np.log(i) for i in self.n])
        
        # store β̂'s and β̂ᵢ's:
        self.Tb = Tr[0][0][0]
        self.b = npl(r.T[0]).T
        self.db = npl(o.T[0].T)
        
        # number of partitions range
        self.range = range(len(self.b.T))
        # total sample size
        self.N = len(Tr[0][0][1])
        self.ϵ = ϵ
        
    def ω(self, h=1):
        ''' Laplace Noise 
        '''
        return nr.laplace(0, h/self.ϵ)
    # define normal noise
    
    # 2. Run S algorithm
    def IQR(self, b, db=None):
        ''' Calc Interquartile Range of partitioned β̂'s
        '''
        if db is not None:
            # Array like O array but with iqr calc for switching old β with new β
            nb = np.asarray([[np.block([[np.delete(b.T, i, 0)], [db[i][j]]]) for j in range(4)] for i in self.range])
            IQR = np.asarray([[[np.percentile(k, 75) - np.percentile(k, 25) for k in i.T] for i in j]for j in nb])
        else:
            # Original IQR for β
            IQR = np.asarray([np.percentile(i, 75) - np.percentile(i, 25) for i in b])
        return IQR
    
    def H(self, o=False):
        ''' compute H for each β̂ and compute H' for alt β̂'s'
        '''
        if o:
            IQR = self.IQR(self.b, self.db)
            H = np.asarray([[np.log(IQR[i][j]) / np.log(self.base[i]) for j in range(4)]                          
                           for i in self.range])
        else:
            IQR = self.IQR(self.b)
            H = np.asarray([np.log(IQR) / np.log(self.base[i]) for i in self.range])
        return H
    
    def S(self, BOOL=False):
        ''' part 2 of the Dwork & Lei (2009) RH Algorithm - compute noise infused SCALE
        '''
        H = self.H()
        dH = self.H(True)
        # 2.3 compute bins for each H
        bins = np.asarray([[np.abs(H[i] - dH[i][j] + self.ω()) for j in range(4)] for i in self.range])
        # 2.4 return TRUE for violation of 2.3 <= 1
        booL = np.asarray([np.all(i > 1) for i in bins])
        if BOOL:
            return booL
        else:
            s = np.asarray([self.IQR(self.b) * self.base[i] ** self.ω() for i in self.range])
            s[booL] = np.inf
            return s

    # 3. compute h for each s
    def h(self, s):
        h = np.asarray([i / math.pow(self.N, 0.25) for i in s])
        h[h==0] = 1 / math.sqrt(self.N)
        h[h==np.inf] = 0
        return h
    
    def z(self, h):
        return np.asarray([
            self.ω(i) for i in h])
    
    def RH(self, s, BOOL=False):
        ''' part 2 of the Dwork & Lei (2009) RH Algorithm - compute true β + noise
        '''
        h = self.h(s)
        # 3.1 np.abs(alt β̂'s - β̂) <= h array
        bins = np.asarray([[np.abs(self.b.T[i] - self.db[i][j]) for j in range(4)] for i in self.range])
        booL = np.asarray([[bins[i][j] > h[i] for j in range(4)] for i in self.range])
        # return True for violation
        anybooL = np.asarray([not np.any([not np.any(booL[i][j]) for j in range(4)]) for i in self.range])
        # 3.2 for TRUE compute β + noise
        RHb = self.Tb + self.z(h)
        RHb[anybooL] = None
        # 3.3 find min(β + noise)
        if BOOL:
            return anybooL
        else:
            try:
                np.nanmin(RHb, 0)
                return np.nanmin(RHb, 0)
            except:
                return None
            
    def __call__(self, BOOL=False):
        if BOOL:
            s = self.S()
            RH = self.RH(s, BOOL)
            S = self.S(BOOL)
            return S, RH
        else:
            s = self.S()
            RH = self.RH(s, BOOL)
            if RH is None:
                return "Too Sensitive"
            else:
                return RH 

In [183]:
XX, YY, NN, vv = Wrangle(data, aggregate=True)()
X, Y, N, v = Wrangle(data)()

In [184]:
OLSA = Methods(XX, YY, NN, vv, "OLS")
OLSP = Methods(X, Y, N, v, "OLS")

MMA = Methods(XX, YY, NN, vv, "MM")
MMP = Methods(X, Y, N, v, "MM")

SMDMA = Methods(XX, YY, NN, vv, "SMDM")
SMDMP = Methods(X, Y, N, v, "SMDM")

In [185]:
# 0. calc true statistic (aggregate level)
# 1. store partition stats (cell/tract level)
OTr = np.asarray(OLSA(withoutliers=False))
Or, Oo = OLSP()
Opsize = OLSP.N

MTr = np.asarray(MMA(withoutliers=False))
Mr, Mo = MMP()
Mpsize = mP.N

STr = np.asarray(SMDMA(withoutliers=False))
Sr, So = SMDMP()
Spsize = SMDMP.N

In [192]:
#pickle.dump([OTr, Or, Oo, Opsize, MTr, Mr, Mo, Mpsize, STr, Sr, So, Spsize], open("trial.p", "wb"))
# a1, b1, c1, d1, a2, b2, c2, d2, c3, b3, c3, d3  = pickle.load(open("trial.p","rb"))

In [219]:
OLSDWORK = DWORK(OTr, Or, Oo, 4, Opsize)
MMDWORK = DWORK(MTr, Mr, Mo, 4, Mpsize)
SMDMDWORK = DWORK(STr, Sr, So, 4, Spsize)

In [220]:
OD = OLSDWORK()
OB = OLSDWORK(True)
MD = MMDWORK()
MB = MMDWORK(True)
SD = SMDMDWORK()
SB = SMDMDWORK(True)



In [186]:
# Timeit MM v SMDM

In [221]:
t = OTr[0][0][0]
print(abs(t - OD))
print(abs(t - MD))
print(abs(t - SD))

[nan nan]
[0.00124992 0.02779986]
[0.00235805 0.00128671]


In [231]:
print(np.sum(OB[0]), np.sum(OB[1]))
print(np.sum(MB[0]), np.sum(MB[1]))
print(np.sum(SB[0]), np.sum(SB[1]))

0 111
0 57
0 96
