In [None]:
# Classes File

In [None]:
'''
Do's
- use PUMS data, i.e. large scale
'''

In [1]:
# Local Import
import numpy as np
import pandas as pd
import math
import scipy.stats as ss
import numpy.linalg as la
from itertools import product
import numpy.random as nr

# Plot Tools
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import cycler
# Notebook-wide figure style: light-grey axes with a solid white grid, grey
# outward ticks, thick lines and a six-colour line cycle.
colors = cycler('color',
                ['#EE6666', '#3388BB', '#9988DD',
                 '#EECC55', '#88BB44', '#FFBBBB'])

plt.rcParams.update({
    # axes
    'axes.facecolor': '#E6E6E6',
    'axes.edgecolor': 'none',
    'axes.axisbelow': True,
    'axes.grid': True,
    'axes.prop_cycle': colors,
    # grid
    'grid.color': 'w',
    'grid.linestyle': 'solid',
    # ticks
    'xtick.direction': 'out',
    'xtick.color': 'gray',
    'ytick.direction': 'out',
    'ytick.color': 'gray',
    # patches and lines
    'patch.edgecolor': '#E6E6E6',
    'lines.linewidth': 2,
})

In [12]:
# R in Python Interface
# rpy2 bridges to a local R installation; each importr() call loads the named
# R package and fails if it is not installed on the R side.
import rpy2
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
# robustbase provides lmrob(), used by Alg_1.lmrob.
robustbase = importr('robustbase')
# base and utils are loaded here but only referenced from commented-out code
# in the visible part of the notebook.
base = importr('base')
utils = importr('utils')

In [11]:
# url = 'https://github.com/Betements/8002/blob/master/private_data_by_cells.dta'
# import requests
# response = requests.get(url)

In [13]:
# Test Data - Census Tract
# NOTE(review): absolute, machine-specific path; the %cd also changes the
# working directory for every later cell (Alg_1.plot saves 'fig.png' relative
# to it).
%cd '/home/nbuser/library/example_code_implementation_guide/'
stata = pd.read_stata('private_data_by_cells.dta')
# Reverse the column order. Wrangle reads columns by position (0 = cell id,
# 1 = Y, 2 = X), so this reversal determines which variable plays which role.
data = stata[stata.columns[::-1]]

/home/nbuser/library/example_code_implementation_guide


In [14]:
# 1. Data Wrangle - Pandas to Numpy
class Wrangle:
    '''
    Data Wrangling Class

    Splits a DataFrame into one NumPy array per cell. Columns are read by
    position: column 0 holds the cell identifier, column 1 becomes ``Y`` and
    column 2 becomes ``X``.

    Parameters
    ----------
    data : pandas.DataFrame
        Exactly three columns are supported (cell id, Y, X).
    partition : {None, 1, 2}, optional
        ``None`` keeps every observation. ``1`` / ``2`` keep the first /
        second half of each cell as produced by ``np.array_split(..., 2)``.
        Any other non-None value leaves the data unsplit.

    Attributes
    ----------
    Key : column labels of ``data``
    X, Y : list of 1-D ndarrays, one per cell, ordered by sorted cell id
    N : ndarray with the number of observations per cell
    Cells : number of cells

    Raises
    ------
    NotImplementedError
        If ``data`` has more than three columns.
    '''

    def __init__(self, data, partition=None):
        # Numpy-nize Census Tract
        self.Key = data.keys()
        if len(self.Key) > 3:
            # The multi-regressor case was never implemented; fail with a
            # clear message instead of an unbound-variable error further down.
            raise NotImplementedError(
                "Wrangle supports exactly one regressor "
                "(columns: cell id, Y, X); got %d columns" % len(self.Key))

        cell_ids = data.loc[:, self.Key[0]]
        Cell = np.unique(np.array(cell_ids)).astype(int)
        Y = [np.array(data.loc[cell_ids == i, self.Key[1]]) for i in Cell]
        X = [np.array(data.loc[cell_ids == i, self.Key[2]]) for i in Cell]

        # optional partitioning
        if partition is not None:
            pX = [np.array_split(i, 2) for i in X]
            pY = [np.array_split(i, 2) for i in Y]
            # Pick the requested half cell by cell. Transposing the nested
            # list would reverse every axis when all halves share a length,
            # and cannot be converted to an array at all when they do not.
            if partition == 1:
                X = [halves[0] for halves in pX]
                Y = [halves[0] for halves in pY]
            elif partition == 2:
                X = [halves[1] for halves in pX]
                Y = [halves[1] for halves in pY]

        self.X = X
        self.Y = Y
        self.N = np.array([len(i) for i in X])
        self.Cells = len(self.N)

In [19]:
class Alg_1:
    '''
    Chetty and Friedman Algorithm

    For every cell produced by ``Wrangle`` this class fits a regression of Y
    on X, refits it with one extra extreme observation for each entry of
    ``self.Outlier``, takes the largest resulting change in the slope as the
    cell's local sensitivity, and releases Laplace-noised statistics scaled
    by the maximum observed sensitivity.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed unchanged to ``Wrangle``.
    method : str
        "OLS" (``lstsq``), "MM" (``MM``) or "LmRob" (``lmrob`` through rpy2).
    figure : list of str
        Any of "F1", "F2", "F3". Selects what ``plot`` draws and which extra
        attributes ``LSR`` stores. Never mutated by this class.
    ϵ : float
        Privacy parameter; the noise added in ``noise`` is scaled by 1 / ϵ.
    partition : {None, 1, 2}
        Passed unchanged to ``Wrangle``.
    '''
    def __init__(self, data, method, figure=[], ϵ=4.0, partition=None):
        ''' Setup selves '''
        self.D = Wrangle(data, partition=partition)
        # Number of non-identifier columns (Y and X).
        self.col = len(self.D.Key) - 1
        # Every 0/1 combination for (Y, X): the candidate extreme points.
        self.Outlier = np.array([list(i) for i in product([0, 1], repeat = self.col)])
        self.figure = figure
        self.method = method
        self.ϵ = ϵ
        # 75th percentile of the standard normal, used to rescale the MAD.
        self.nd75 = ss.norm.ppf(0.75)
        self.W = np.array([1])
        self.N = self.D.N
        self.index = range(self.D.Cells)
        self.range = range(4)
        if self.method == "LmRob":
            self.form = ro.Formula("y~x")

    @staticmethod
    def _pack(nested, shape):
        '''
        Copy a nested list/tuple structure into an object ndarray of ``shape``.

        Fit results mix arrays of different lengths with scalars. Filling an
        object array element by element avoids NumPy's ragged-sequence
        inference, which raises on NumPy >= 1.24.
        '''
        packed = np.empty(shape, dtype=object)
        for idx in np.ndindex(*shape):
            item = nested
            for k in idx:
                item = item[k]
            packed[idx] = item
        return packed

    def ω(self, size=None):
        '''
        Draw from Laplace Distribution

        Returns a draw from Laplace(0, 1/sqrt(2)): a scalar when ``size`` is
        None, otherwise an array of independent draws of that shape.
        '''
        return nr.laplace(0, 1/math.sqrt(2), size)

    def r(self, x):
        ''' Index range over ``x``. '''
        return range(len(x))

    def lsR(self, I, O=None):
        '''
        Sort Arrays for Outlier Variations

        Returns ``(x, y)`` for cell ``I``. When ``O`` is given, the point
        ``self.Outlier[O]`` is appended: element 0 goes to y, element 1 to x.
        '''
        if O is None:
            y = np.asarray(self.D.Y[I])
            x = np.asarray(self.D.X[I])
        else:
            y = np.block([self.D.Y[I], self.Outlier[O][0]])
            x = np.block([self.D.X[I], self.Outlier[O][1]])

        return x, y

    def stack(self, x):
        ''' Prepend a column of ones to ``x`` to form the design matrix. '''
        add = np.ones(len(x))
        return np.block([[add], [x.T]]).T

    def lstsq(self, x, y, stderr=False, w=None):
        '''
        Calculus Linear Least Squares Optimisation via Numpy

        Solves (X' W X) b = X' W y, with W the identity when ``w`` is None.

        Parameters
        ----------
        x, y : ndarray
            Regressor and response for one cell.
        stderr : bool
            When True, also compute the slope standard error; it is
            ``np.inf`` when there are 5 or fewer observations.
        w : ndarray, optional
            Per-observation weights.

        Returns
        -------
        betas : ndarray, ``[intercept, slope]``
        ϵ : ndarray of residuals ``y - X b``
        stderr : float, ``np.inf``, or the ``stderr`` flag unchanged (False)
        '''
        design = self.stack(x)
        if w is not None:
            # Row-scaling is diag(w) @ design without the n x n matrix.
            XX = design * np.asarray(w, dtype=float)[:, None]
        else:
            XX = design

        betas = np.dot(la.inv(XX.T.dot(design)), XX.T.dot(y))
        # Residuals come from the unweighted design matrix: the weights
        # enter the normal equations, not the fitted values.
        ϵ = y - np.dot(design, betas)
        if stderr:
            if len(y) > 5:
                # NOTE(review): for weighted fits the regressor spread is
                # taken from the weighted design column, as before; confirm
                # that this is the intended weighted standard error.
                stderr = np.sqrt(ϵ.dot(ϵ) / np.sum(np.square(XX.T[1] - np.mean(XX.T[1]))) / (len(ϵ) - self.col))
            else:
                stderr = np.inf
        return betas, ϵ, stderr

    def MAD(self, ϵ):
        ''' Median absolute deviation about the median. '''
        return np.median(np.absolute(ϵ - np.median(ϵ)))

    def Tukey(self, u, c=1.547):
        '''
        Tukey bisquare rho function with tuning constant ``c``; constant at
        c^2 / 6 for |u| > c.
        '''
        ρ = np.power(u, 2) / 2 - np.power(u, 4) / (2 * math.pow(c, 2)) + np.power(u, 6) / (6 * math.pow(c, 4))
        if np.any(np.abs(u) > c):
            ρ[np.abs(u) > c] = math.pow(c, 2) / 6

        return ρ

    def Scale(self, ϵ, w=None, K=0.199):
        '''
        Scale calculation for S-Estimator part of MM

        Without weights: MAD rescaled by the normal 75th percentile.
        With weights: sqrt(sum(w * ϵ^2) / (n * K)).
        '''
        if w is None:
            σ = self.MAD(ϵ) / self.nd75
        else:
            σ = np.sqrt(np.sum(np.multiply(np.square(ϵ),w)) / (len(ϵ) * K))

        return σ

    def U(self, ϵ, σ):
        ''' Residuals standardised by the scale ``σ``. '''
        return ϵ / σ

    def weight(self, u, method="S", it=False):
        '''
        Robustness weights for standardised residuals ``u``.

        * ``method="S", it=False`` - bisquare weights with c = 1.547.
        * ``method="S", it=True``  - rho(u) / u^2 (``Tukey``), with the limit
          value 0.5 at u == 0.
        * ``method="MM"``          - bisquare weights with c = 4.685.

        Bisquare weights are (1 - (u/c)^2)^2 inside the cutoff and 0 beyond
        it, so observations past the cutoff are excluded from the fit.

        Raises
        ------
        ValueError
            For any other ``method``.
        '''
        u = np.asarray(u, dtype=float)

        if method == "S" and it:
            u_sq = np.square(u)
            # rho(u)/u^2 tends to 1/2 as u -> 0; prefill with the limit so an
            # exactly-zero residual does not become NaN.
            weight = np.full(u.shape, 0.5)
            np.divide(self.Tukey(u), u_sq, out=weight, where=u_sq > 0)
            return weight

        if method == "S":
            c = 1.547
        elif method == "MM":
            c = 4.685
        else:
            raise ValueError("unknown weight method: %r" % (method,))

        inside = np.square(1 - np.square(u / c))
        return np.where(np.abs(u) > c, 0.0, inside)

    def OLS(self, I, O=None, stderr=False):
        '''
        OLS fit for cell ``I`` (optionally with outlier ``O``); for bug
        testing. ``stderr`` is forwarded to ``lstsq``.
        '''
        R01, R02 = self.lsR(I=I, O=O)
        R, ϵ, boo = self.lstsq(R01, R02, stderr)
        return R, ϵ, boo

    def MM(self, I, O=None, stderr=False):
        '''
        MM - Estimator Regression Algorithm as specificied in S 2009

        Returns ``(betas, residuals, stderr, scale)``; ``stderr`` is False
        unless requested.
        '''
        x, y = self.lsR(I=I, O=O)
        maxiter = 100

        # 1. Ordinary Least Squares
        beta, ϵ, boo = self.lstsq(x, y)

        # 2. S-Estimate Convergence
        σ = self.Scale(ϵ)
        W = self.weight(self.U(ϵ, σ), "S", False)
        S, ϵ, boo = self.lstsq(x, y, w=W)
        for _ in range(maxiter):
            σ = self.Scale(ϵ, w=W)
            W = self.weight(self.U(ϵ, σ), "S", True)
            S, ϵ, boo = self.lstsq(x, y, w=W)
            if np.allclose(S, beta, rtol=1e-09):
                break
            beta = S

        # 3. MM-Estimate Convergence
        # The scale is fixed from the S-step residuals; only the weights and
        # coefficients are iterated.
        σ = self.Scale(ϵ)
        for _ in range(maxiter):
            W = self.weight(self.U(ϵ, σ), "MM")
            M, ϵ, boo = self.lstsq(x, y, w=W)
            if np.allclose(M, beta, rtol=1e-03):
                break
            beta = M

        if stderr:
            _, _, boo = self.lstsq(x, y, True, w=W)

        # return betas, residuals, stderr, scale
        return M, ϵ, boo, σ

    def lmrob(self, x, y):
        '''
        Run M-Estimator Robust Regression in R via rpy2

        Returns ``(betas, residuals, stderr, scale)`` where ``stderr`` is the
        standard error of the slope.
        '''
        # Set up formula environment
        self.form.environment["y"] = ro.r['matrix'](ro.FloatVector(y.flatten()), ncol=1)
        self.form.environment["x"] = ro.r['matrix'](ro.FloatVector(x.flatten()), ncol=1)
        # Run Robust Regression
        lmr = robustbase.lmrob(self.form, method = "SMDM", setting="KS2014")
        # must copy variables because of memory constraint
        betas = np.array(lmr.rx2("coefficients"), copy=True)
        scale = np.array(lmr.rx2("scale"), copy=True)
        residuals = np.array(lmr.rx2("residuals"), copy=True)
        cov = np.array(lmr.rx2("cov"), copy=True)
        # `cov` is the coefficient covariance matrix ordered (intercept, x).
        # noise() squares this value as the slope's standard error, matching
        # what lstsq returns, so take sqrt of the slope variance.
        stderr = np.sqrt(cov[1][1])
        return betas, residuals, stderr, scale

    def oR(self, bool=False):
        '''
        Specify regression method and run all regression in one/two line(s)

        Sets ``self.RA`` (object array, cells x result fields: fit on the
        actual data) and ``self.RB`` (cells x outliers x result fields: fits
        with each outlier added). With ``bool=True`` returns
        ``(self.RA.T[0], self.RB.T[0])``, i.e. the coefficient entries.

        Raises
        ------
        ValueError
            If ``self.method`` is not "OLS", "MM" or "LmRob".
        '''
        if self.method == "OLS":
            actual = [self.lstsq(*self.lsR(i), True) for i in self.index]
            shifted = [[self.lstsq(*self.lsR(i, O=j)) for j in self.range] for i in self.index]
        elif self.method == "MM":
            actual = [self.MM(i, stderr=True) for i in self.index]
            shifted = [[self.MM(i, j) for j in self.range] for i in self.index]
        elif self.method == "LmRob":
            actual = [self.lmrob(*self.lsR(i)) for i in self.index]
            shifted = [[self.lmrob(*self.lsR(i, O=j)) for j in self.range] for i in self.index]
        else:
            raise ValueError("unknown regression method: %r" % (self.method,))

        cells = len(actual)
        self.RA = self._pack(actual, (cells, len(actual[0]) if cells else 0))
        self.RB = self._pack(
            shifted,
            (cells, len(self.range), len(shifted[0][0]) if cells and len(self.range) else 0))

        if bool:
            return self.RA.T[0], self.RB.T[0]

    def LSR(self):
        '''
        Calculate Local Sensitivity

        Runs every regression (``oR``), then for each cell takes the largest
        absolute slope change over the outlier variations.

        Returns
        -------
        stats : object ndarray, shape (2, cells)
            Row 0 holds each cell's coefficient array, row 1 its standard
            error.
        MOSE : float
            max over cells of N * local sensitivity.

        Side effects: with "F1" in ``self.figure`` sets ``MOSE``, ``k``,
        ``j``, ``cell``; otherwise with "F2" sets ``fD``, ``MOSE``, ``LS``,
        ``k``.
        '''
        self.oR()
        gaps = [[abs(self.RB[i][j][0][1] - self.RA[i][0][1]) for j in self.range]
                for i in self.index]

        if "F1" in self.figure or "F2" in self.figure:
            LS = np.asarray([[max(g), np.argmax(g)] for g in gaps])
            OSE = np.multiply(self.N, np.transpose(LS)[0])
            MOSE = max(OSE)
            if "F1" in self.figure:
                self.MOSE = np.around(max(LS.T[0] * 0.25), 3)
                self.k = np.argmax(LS.T[0] * 0.25)
                self.j = LS.T[1][self.k].astype(int)
                self.cell = str(int(self.index[self.k]))
            elif "F2" in self.figure:
                self.fD = np.vstack((np.asarray(self.D.N), LS.T[0]))
                self.MOSE = MOSE
                self.LS = LS.T[0]
                self.k = np.argmax(OSE)
        else:
            MOSE = max(np.multiply(self.N, [max(g) for g in gaps]))

        stats = np.empty((2, len(self.index)), dtype=object)
        for i in self.index:
            stats[0, i] = self.RA[i][0]
            stats[1, i] = self.RA[i][2]
        return stats, MOSE

    def noise(self, θseθ, χ):
        '''
        Add Noise to Statistics

        Parameters
        ----------
        θseθ : sequence of two sequences
            ``θseθ[0]`` holds per-cell coefficient arrays, ``θseθ[1]`` the
            per-cell standard errors (the first return value of ``LSR``).
        χ : float
            Sensitivity bound (the second return value of ``LSR``).

        Returns
        -------
        nθ : ndarray of noised slopes
        nsenθ : ndarray of noised standard errors. Cells whose standard error
            is infinite carry the string "Sample Size Too Small"; the array
            is then of object dtype so the other cells stay numeric.
        nN : ndarray of noised cell counts

        Every cell and every statistic gets its own independent Laplace draw,
        so learning the noise in one released value reveals nothing about
        another.
        '''
        NN = np.asarray(self.D.N).astype(float)
        S = NN * self.ϵ
        noise = lambda x: x * math.sqrt(2)

        θ = np.asarray([i[1] for i in θseθ[0]], dtype=float)
        seθ = np.asarray(θseθ[1], dtype=float)

        nθ = θ + χ * noise(self.ω(NN.shape)) / S
        sen = np.square(seθ) + 2 * np.square(χ / S)
        senθ = np.sqrt(sen)
        nsenθ = senθ + χ * noise(self.ω(NN.shape)) / S
        too_small = nsenθ == np.inf
        if too_small.any():
            labelled = nsenθ.astype(object)
            labelled[too_small] = "Sample Size Too Small"
            nsenθ = labelled
        nN = self.D.N * (1 + noise(self.ω(NN.shape)) / S)
        return nθ, nsenθ, nN

    def __call__(self):
        '''
        Release Noise Infused Statistics
        '''
        m1, m2 = self.LSR()
        nθ, nsenθ, nN = self.noise(m1, m2)
        return nθ, nsenθ, nN

    def plot(self, I=None, O=None):
        '''
        Draw the figure(s) named in ``self.figure``, save to 'fig.png', show.

        "F1": data, fitted lines with and without the most influential
        outlier, for the cell with the largest local sensitivity.
        "F2": local sensitivity against cell size with the MOSE envelope.
        "F3": data and fitted line for cell ``I``, optionally with outlier
        ``O`` added.
        '''
        fig, ax = plt.subplots(figsize = (8,6))
        if "F1" in self.figure:
            self.LSR()
            # Relevant Data
            fD = np.vstack((self.D.X[self.k], self.D.Y[self.k]))
            # lsR appends Outlier[.][1] to x and Outlier[.][0] to y; plot the
            # point at the coordinates the regression actually used.
            fo = np.vstack((self.Outlier[self.j][1], self.Outlier[self.j][0]))
            fLA = lambda λ: self.RA[self.k][0][1]*λ + self.RA[self.k][0][0]
            fLB = lambda λ: self.RB[self.k][self.j][0][1]*λ + self.RB[self.k][self.j][0][0]

            # Figure 1 - Effect of Outlier on Regression + MOSE line
            ax.scatter(fD[0], fD[1])
            ax.scatter(fo[0], fo[1])
            l = np.linspace(0, 1, 5)
            ax.plot(l, fLA(l), label="%s-Estimate in Actual Data"%self.method)
            ax.plot(l, fLB(l), linestyle = 'dashed', label="%s-Estimate with Outlier"%self.method)
            # LS line at 25th pctile
            p25 = [0.25, 0.25]
            yp25 = [fLA(p25[0]), fLB(p25[1])]
            ax.plot(p25, yp25, color='black')

            ax.set_title("FIGURE 1: Calculation of Local Sensitivity $= %s$"%str(self.MOSE), pad=35)
            ax.set_ylabel("Child's Income Rank for Tract %s"%self.cell)
            ax.set_xlabel("Parent's Income Rank for Tract %s"%self.cell)

            plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='lower left', ncol=2, mode="expand", borderaxespad=0.)
            ax.set_xlim(-0.025, 1.025)
            ax.set_ylim(-0.025, 1.025)

        if "F2" in self.figure:
            self.LSR()
            fLA = lambda λ: self.MOSE / λ
            ax.scatter(self.fD[0], self.fD[1])
            xmin = min(self.D.N)
            xmax = max(self.D.N)
            l = np.array([xmin, xmax])
            # Raw string: the label text keeps its literal "\," sequences.
            ax.plot(l, fLA(l), label=r"$MOSE = χ\,/\,N = %s\,/\,N $"%str(np.around(self.MOSE, 2)), color='black')
            ax.scatter(self.fD[0][self.k], self.fD[1][self.k], label="Tract %d"%self.k)
            ax.set_title("FIGURE 2: Maximum Observed Sensitivity Envelope for %s"%self.method, pad=35)
            ax.set_ylabel("Local Sensitivity of $̂β_1$ Estimates")
            ax.set_xlabel("Number of Individuals in Tract")

            ax.set_xscale('log')
            ax.set_yscale('log')

            ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='lower left', ncol=2, mode="expand", borderaxespad=0.)
            ax.set_xlim(xmin *0.9, xmax * 1.1)
            ax.set_ylim(min(self.LS)*0.9, self.MOSE/(xmin *0.9))

        if "F3" in self.figure:
            # Figure 3
            l = np.linspace(0, 1, 5)
            if O is not None:
                # Same (x, y) convention as lsR: element 1 is x, element 0 is y.
                ax.scatter(self.Outlier[O][1], self.Outlier[O][0])
                ao, bo = self.lsR(I, O)
                if self.method == "OLS":
                    co, do, eo = self.lstsq(ao, bo)
                if self.method == "MM":
                    co, do, eo, fo = self.MM(I, O)
                if self.method == "LmRob":
                    co, do, eo, fo = self.lmrob(ao, bo)
                lineo = lambda x: co[1]*x + co[0]
                ax.plot(l, lineo(l))
            ax.scatter(self.D.X[I], self.D.Y[I])
            a, b = self.lsR(I)
            if self.method == "OLS":
                c, d, e = self.lstsq(a, b)
            if self.method == "MM":
                c, d, e, f = self.MM(I)
            if self.method == "LmRob":
                c, d, e, f = self.lmrob(a, b)
            line = lambda x: c[1]*x + c[0]
            ax.plot(l, line(l))
            ax.set_xlim(-0.025, 1.025)
            ax.set_ylim(-0.025, 1.025)

        ax.grid(which='minor', color='w', alpha=0.3)
        plt.savefig('fig.png')
        plt.show()

In [16]:
# One estimator per regression method for each figure type, built in the
# order MM, OLS, LmRob.
mmf1, olsf1, Rf1 = (Alg_1(data, method, figure=["F1"]) for method in ("MM", "OLS", "LmRob"))
mmf2, olsf2, Rf2 = (Alg_1(data, method, figure=["F2"]) for method in ("MM", "OLS", "LmRob"))

In [None]:
# The receiver of this call was missing, which is a syntax error.
# NOTE(review): assumed to be the OLS Figure-1 instance, matching the MM and
# lmrob cells that follow; switch to olsf2 for Figure 2.
olsf1.plot()

In [None]:
# `testmm` is not defined anywhere in the notebook.
# NOTE(review): assumed to mean the MM Figure-1 instance; switch to mmf2 for
# Figure 2.
mmf1.plot()

In [None]:
# `testR` is not defined anywhere in the notebook.
# NOTE(review): assumed to mean the lmrob Figure-1 instance; switch to Rf2
# for Figure 2.
Rf1.plot()

In [None]:
# %%timeit
# Test = Alg_1(data, "MM", figure=[])
# Test3 = Alg_1(data, "LmRob")
# Test3()
# A = Test.oR(True)
# B = Test2.oR(True)
# C = Test3.oR(True)
# Index = range(10)
# Range = range(4)
# # R0 = [Test.lsR(i) for i in Index]
# # RA = [Test.lmrob(i[0], i[1], True) for i in R0]
# # R1 = [[Test.lsR(i, O=j) for j in Range] for i in Index]
# # RB = [[Test.lmrob(i[j][0], i[j][1]) for j in Range] for i in R1]

In [None]:
# LS line not correct
# Not fitting data; test against scipy

# `Test` was only ever created in a commented-out cell; build it here so this
# cell runs on a fresh kernel.
Test = Alg_1(data, "MM", figure=[])
x0, y0 = Test.lsR(39)
a, b, c= Test.lstsq(x0, y0)

# Reference fit from scipy for the comparison noted above.
# `a` is [intercept, slope].
ref = ss.linregress(x0, y0)
a, (ref.intercept, ref.slope)

In [None]:
# Monte-Carlo mean squared error of the noised estimates. The loop variable i
# (1..10) multiplies the cell size in the noise denominator.
# NOTE(review): θ, N and χ are not defined in any visible cell; they must
# already exist in the kernel (presumably per-cell estimates, cell counts and
# the sensitivity bound - confirm) or this cell raises NameError.
MSE = []

for i in range(1,11):
    noise_θ = []
    diff = []
    
    # But also, this needs to be done 500 times
    for j in range(0,500):
        # Draws random samples from Laplace or Normal:
        # (only a Laplace(0, 1/sqrt(2)) draw is made here; one draw is shared
        # by every cell within a replication)
        ω = np.random.laplace(0, 1 / np.sqrt(2))
        
        # Noise infused Statistics
        noise_θ = [a + np.sqrt(2) * (χ / (i * b)) * ω for (a, b) in zip(θ, N)]
        # Squared error of every cell for this replication.
        diff.append(np.square(np.subtract(noise_θ, θ)))
        
    # Compute MSE
    # np.mean averages over all 500 replications and all cells at once.
    MSE.append(np.mean(diff))

To-Do List

O. DWORK DP Algorithm

A. SDL Techniques for Microdata
 - Local Suppression
 - Uncorrelated Additive Noise - perturbation methods, not differentially private
    $Z = X + \epsilon$
    $\epsilon_i \sim N(0, \alpha \sigma_i^2)$
 - Shuffling


B. PUMS NY data at the 5% level from 2000 US Census
 - dumb regression model with molto numbers

C. ways to compare methods
 - proximity to true values
 - reduce sensitivity 

In [None]:
# DWORK LEI ALGORITHM

In [None]:
class Alg_2:
    '''
    Work-in-progress scale computation built on two half-sample MM fits.

    Runs ``Alg_1`` with method "MM" on partition 1 and partition 2 of the
    global ``data`` and keeps the coefficient entries returned by
    ``oR(True)`` for the actual data (``self.B``) and for the
    outlier-augmented fits (``self.dB``).

    NOTE(review): the class looks unfinished - ``h`` cannot run as written
    and ``S`` indexes ``iqr`` by column position only; see the notes on each
    method.
    '''
    def __init__(self, ϵ=1):
        '''Fit both partitions and store the stacked coefficient arrays.'''
        # oR(True) returns (RA.T[0], RB.T[0]): coefficients on the actual
        # data and on each outlier variation.
        pB1, pdB1 = Alg_1(data, "MM", ϵ=ϵ, partition=1).oR(True)
        pB2, pdB2 = Alg_1(data, "MM", ϵ=ϵ, partition=2).oR(True)
        self.B = np.stack((pB1, pB2)).T
        self.dB =  np.stack((pdB1, pdB2)).T
        # Number of partitions.
        self.n = 2 
        self.Range = range(len(self.B))
        self.ϵ = ϵ
        # Logarithm base used to discretise the IQR in H().
        self.base = 1 + 1 / np.log(self.n)

    # omega (remove later)
    def ω(self):
        '''One draw from Laplace(0, 1/ϵ).'''
        return nr.laplace(0, 1/self.ϵ)
    
    def iqr(self, x):
        '''
        75th minus 25th percentile of every element ``j`` of every row ``i``
        of ``x``.
        '''
        Q = np.asarray([[np.percentile(j, 75) - np.percentile(j, 25) for j in i] for i in x])
        return Q
         
    def H(self, x=True):
        '''
        Logarithm to base ``self.base`` of the IQRs: of ``self.B`` when ``x``
        is true, otherwise of ``self.dB`` (transposed before the log).
        '''
        if x:
            iqr = self.iqr(self.B)
            H = np.asarray([np.log(i) / np.log(self.base) for i in iqr])
        else:
            iqr = np.asarray([self.iqr(i) for i in self.dB])
            H = np.asarray([np.log(i) / np.log(self.base) for i in np.transpose(iqr)])
        return H
    
    def i(self):
        '''IQRs of ``self.B`` (same as ``self.iqr(self.B)``).'''
        return self.iqr(self.B)
    
    def S(self):
        '''
        For each row, check whether every outlier-variation log-IQR falls in
        [b - 0.5, b + 1.5) around the floored log-IQR ``b`` of the actual
        data; where it does, return the IQR scaled by ``base ** ω()``,
        otherwise None.
        '''
        iqr = self.iqr(self.B)
        H = self.H()
        b = np.floor(H)
        dH = self.H(False)
        pb = np.asarray([np.logical_and(dH.T[i] >= b[i] - 0.5, 
                            dH.T[i] < b[i] + 1.5) for i in self.Range])
        Bool = np.asarray([[np.all(pb[i].T[j]) for j in range(2)] for i in self.Range]) 
        # NOTE(review): iqr[j] selects row j of `iqr` for j in {0, 1}
        # regardless of which row of Bool is being processed - confirm this
        # should not be the current row's j-th entry.
        s = np.asarray([[iqr[j] * self.base ** self.ω() if i[j] else None for j in range(2)] for i in Bool])
        return s
        
    def h(self, s):
        '''
        NOTE(review): cannot run as written - the math module has no
        ``power`` function (``math.pow`` exists) and ``D`` is not defined in
        this scope. ``j`` also ranges over ``self.Range`` while indexing the
        entries of a row of ``s``.
        '''
        h = [[
            1 / math.sqrt(len(D)) if i[j]==0 else i[j] / math.power(self.n, 0.25)
        for j in self.Range] for i in s]
        return h
    
#         Bool2 = [np.all(np.logical_and(np.greater_equal(dH[i], b[i] - 0.5), 
#                                       np.less(dH[i],b[i] + 0.5))) for i in self.Range]      

In [None]:
# Build the two half-sample MM fits with the default ϵ=1 (runs every
# regression twice).
Test3 =Alg_2()

In [None]:
# Display the scaled IQRs (None where the stability check in S() fails).
Test3.S()

$\,$

In [20]:
### TREAT EACH CELL AS A SUBPARTITION

In [None]:
def M(data, S, h=None, ϵ=4):
    '''
    Noisy median of ``data``.

    Parameters
    ----------
    data : sequence of numbers
    S : float or array
        Scale multiplier for the noise.
    h : float or array, optional
        Second scale multiplier; defaults to ``len(data) ** (-1/3)``.
    ϵ : float
        Privacy parameter; the Laplace scale is ``S * h / ϵ``.

    Returns
    -------
    ``np.median(data)`` plus a Laplace(0, S * h / ϵ) draw. The result is an
    array when ``S`` or ``h`` is an array.
    '''
    # Identity check, so an array-valued h is not compared element-wise.
    if h is None:
        h = len(data) ** (- 1 / 3)
    ω = np.random.laplace(0, S * h / ϵ)
    return np.median(data) + ω

def RS(a, b, p=2, ϵ=4):
    '''
    Subsample-and-aggregate slope: split the data into ``p`` parts, fit a
    least-squares line of ``a`` on ``b`` in each part, and release the noisy
    median of the slopes via ``M``.

    Parameters
    ----------
    a, b : array-like
        Response and regressor.
    p : int
        Number of partitions.
    ϵ : float
        Privacy parameter, forwarded to ``M``.

    NOTE(review): ``S`` must be a module-level function returning the scale
    of the slope list; no such function is defined in the visible notebook.
    '''
    # partition algorithm
    a_split = np.array_split(a, p)
    b_split = np.array_split(b, p)

    # least squares β array
    β = []
    for a_part, b_part in zip(a_split, b_split):
        # Design matrix [b, 1]: the first fitted coefficient is the slope.
        X = np.vstack([b_part, np.ones(len(b_part))]).T
        m, c = la.lstsq(X, a_part, rcond=None)[0]
        β.append(m)

    # run M algorithm
    return M(β, S(β), ϵ=ϵ)

def RH(a, b, p=2, ϵ=4):
    '''
    Unfinished: computes the full-sample slope, the per-partition slopes and
    per-partition scale terms, then returns None.

    Parameters
    ----------
    a, b : array-like
        Response and regressor.
    p : int
        Number of partitions.
    ϵ : float
        Privacy parameter (currently unused).

    NOTE(review): ``S`` must be a module-level function; none is defined in
    the visible notebook. The final loop passes the whole list of partitions
    to ``S`` on every iteration and never uses ``j`` - confirm whether
    ``a_split[j]`` / ``b_split[j]`` was intended.
    '''
    # compute true β
    θ = []
    X = np.vstack([b, np.ones(len(b))]).T
    m, c = la.lstsq(X, a, rcond=None)[0]
    θ.append(m)

    # partition algorithm
    a_split = np.array_split(a, p)
    b_split = np.array_split(b, p)

    # least squares β array
    β = []
    for a_part, b_part in zip(a_split, b_split):
        X = np.vstack([b_part, np.ones(len(b_part))]).T
        m, c = la.lstsq(X, a_part, rcond=None)[0]
        β.append(m)

    s = []
    h = []
    for j in range(len(a_split)):
        # np.concatenate takes one sequence of arrays; a second positional
        # argument would be read as the axis.
        data = np.concatenate((S(a_split), S(b_split)))
        s.append(data)
        h.append(S(a_split) / (len(a) ** (1 / (2 * len(a_split)))))

    return None

$\,$

$\,$

$\,$

$\,$

$\,$

$\,$

$\,$