In [None]:
# Classes File

In [1]:
# Local Import

import numpy as np
import pandas as pd
import math
import scipy.stats as ss
import scipy.optimize as so
import numpy.linalg as la
from itertools import product
import numba as nb
import numpy.random as nr

%cd '/home/nbuser/library/example_code_implementation_guide/'

/home/nbuser/library/example_code_implementation_guide


In [2]:
# Plot Tools
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import cycler
# Six-colour palette installed below as the default axes colour cycle.
colors = cycler(color=['#EE6666', '#3388BB', '#9988DD',
                       '#EECC55', '#88BB44', '#FFBBBB'])

# Light-grey panels with a white, solid grid kept below the plotted artists.
plt.rc('axes', facecolor='#E6E6E6', edgecolor='none',
       axisbelow=True, grid=True, prop_cycle=colors)
plt.rc('grid', color='w', linestyle='solid')
# Both tick groups share the same settings, so set them in one call.
plt.rc(('xtick', 'ytick'), direction='out', color='gray')
plt.rc('patch', edgecolor='#E6E6E6')
plt.rc('lines', linewidth=2)

In [3]:
# Test Data - Census Tract
# The path is relative: it resolves against the working directory selected by
# the `%cd` magic in the imports cell, so that cell must run first.
stata = pd.read_stata('private_data_by_cells.dta')

In [4]:
# Same rows, columns in reverse order (Wrangle reads columns by position).
data = stata.loc[:, stata.columns[::-1]]

In [5]:
# 1. Data Wrangle - Pandas to Numpy
class Wrangle:
    '''
    Data Wrangling Class

    Splits a cell-keyed pandas DataFrame into per-cell NumPy arrays.  Columns
    are read by position: column 0 is the cell identifier, column 1 the
    outcome and column 2 onwards the regressor(s).

    Attributes
    ----------
    Key   : column labels of `data` (the result of `data.keys()`).
    Cell  : 1-D int array of the distinct cell identifiers, ascending.
    Cells : number of distinct cells.
    N     : list with the row count of every cell, ordered like `Cell`.
    Y     : list of 1-D arrays holding column 1 for every cell.
    X     : list of arrays holding the regressor(s) for every cell; 1-D when
            `data` has three columns, 2-D (rows x regressors) when it has more.

    Raises
    ------
    IndexError if `data` has fewer than three columns.
    '''

    def __init__(self, data):
        # Numpy-nize Census Tract
        self.Key = data.keys()
        cell_col, y_col = self.Key[0], self.Key[1]
        # A scalar label selects a Series (1-D values); a list of labels
        # selects a DataFrame (2-D values).  Previously the multi-regressor
        # case silently left `X` undefined.
        x_cols = self.Key[2] if len(self.Key) <= 3 else list(self.Key[2:])

        cell_ids, self.N, self.Y, self.X = [], [], [], []
        # One grouped pass instead of re-scanning the frame for every cell.
        # sort=True yields ascending keys; rows keep their original order
        # inside each group.  observed=True skips unused categories.
        for cell_id, rows in data.groupby(cell_col, sort=True, observed=True):
            cell_ids.append(cell_id)
            self.N.append(len(rows))
            self.Y.append(np.array(rows[y_col]))
            self.X.append(np.array(rows[x_cols]))

        self.Cell = np.array(cell_ids).astype(int)
        self.Cells = len(self.Cell)

In [6]:
class Alg_1:
    '''
    Chetty and Friedman Algorithm

    For every cell the outcome is regressed on the regressor (plus an
    intercept), once on the actual rows and once more for each row of
    `Outlier` appended as a single extra observation.  The largest absolute
    slope change per cell, multiplied by the cell size and maximised over
    cells, is the scaling constant (`MOSE`) for the Laplace noise added to the
    released statistics.

    Typical use: `Alg_1(data, "OLS")()` returns the noisy slopes, noisy
    standard errors and noisy counts; `plot()` draws the diagnostic selected
    through `figure`.
    '''

    def __init__(self, data, method, ϵ=4.0, figure=None):
        '''
        Parameters
        ----------
        data   : DataFrame handed to `Wrangle` (cell id, outcome, regressor).
        method : "OLS" or "MM"; selects the per-cell estimator used by `LSR`.
        ϵ      : divisor of the noise scale (together with the cell size).
        figure : container of figure flags ("F1", "F2"); defaults to empty.
        '''
        self.D = Wrangle(data)
        self.col = len(self.D.Key) - 1
        # Every 0/1 combination of (outcome, regressor): the candidate points
        # appended one at a time to each cell.
        self.Outlier = np.array([list(i) for i in product([0, 1], repeat=self.col)])
        # `None` replaces the former shared mutable default `[]`.
        self.figure = [] if figure is None else figure
        self.method = method
        self.ϵ = ϵ
        self.nd75 = ss.norm.ppf(0.75)
        self.W = np.array([1])
        self.N = self.D.N
        # Positions into D.Y / D.X.  Formerly `Cell - 1`, which is the same
        # thing only when the ids are exactly 1..n.
        self.index = np.arange(len(self.D.Cell))
        # One entry per candidate outlier (formerly hard-coded range(0, 4)).
        self.range = range(len(self.Outlier))

        self.laps = 1 / math.sqrt(2)

    def lsR(self, I, W=None, O=None):
        '''
        Least-squares set up.

        Returns the design matrix `[1, x]` and response `y` of cell position
        `I`, each row scaled by `sqrt(W)`.  When `O` is given, row `O` of
        `Outlier` is appended as one extra observation first.
        '''
        if W is None:
            W = self.W
        root_w = np.sqrt(W)

        if O is None:
            y = np.asarray(self.D.Y[I])
            x = np.asarray(self.D.X[I])
        else:
            y = np.concatenate((self.D.Y[I], self.Outlier[O][0]), axis=None)
            x = np.concatenate((self.D.X[I], self.Outlier[O][1]), axis=None)

        # Weighted least squares scales the whole row, intercept included;
        # the constant column used to be left unweighted.
        ones = np.ones_like(x, dtype=float) * root_w
        X = np.vstack([ones, x * root_w]).T
        return X, y * root_w

    def lstsq(self, X, y, stderr=False):
        '''
        Solve the least-squares problem.

        Returns `(betas, residuals, stderr)`.  `betas` is
        `[intercept, slope]`; `stderr` is the slope standard error when
        requested and `False` otherwise.
        '''
        betas = la.lstsq(X, y, rcond=None)[0]
        residuals = y - np.dot(X, betas)
        if stderr:
            spread = np.sum(np.square(X.T[1] - np.mean(X.T[1])))
            dof = len(residuals) - self.col
            stderr = np.sqrt(residuals.dot(residuals) / spread / dof)
        return betas, residuals, stderr

    def Scale(self, residuals, scale=None, w=None, c=4.685, K=0.199):
        '''
        Scale Calculation as per S-Estimator

        Returns `(scale, w)`.  Without `w` the scale is the median absolute
        deviation of the residuals divided by the 0.75 normal quantile; with
        `w` it is `sqrt(w . residuals**2 / (L * K))`.  The incoming `scale`
        argument is overwritten and never read.
        '''
        L = len(residuals)
        if w is None:
            scale = np.median(np.absolute(residuals - np.median(residuals))) / np.array([self.nd75])
        else:
            scale = np.sqrt((np.dot(w, np.square(residuals))) / np.array([L * K]))

        # NOTE(review): as written the weights take only two values:
        # (1 - 1/c**2)**2 where residual / L <= c, and 1 elsewhere.  This looks
        # like it was meant to be a bisquare weight on residual / scale, but
        # the intended estimator cannot be confirmed from this file, so the
        # numerics are left unchanged - TODO confirm.
        w = np.square(self.W - np.square(((residuals / L) <= c) / np.array([c])))
        return scale, w

    def MM(self, I, O=None, stderr=False, max_iter=100):
        '''
        MM - Estimator Regression

        Starts from the unweighted fit and re-fits with the weights from
        `Scale` until the slope stops changing (`math.isclose`) or `max_iter`
        re-fits have run.  Returns `(betas, residuals, stderr)` of the last
        weighted fit.
        '''
        # Ordinary Least Squares
        X0, y0 = self.lsR(I=I, O=O)
        start, resid, _ = self.lstsq(X0, y0)
        S, W = self.Scale(resid)
        beta = start[1]
        # The loop used to be unbounded and never ended on a NaN slope.
        for _ in range(max(1, int(max_iter))):
            # WLR
            Xw, yw = self.lsR(I=I, W=W, O=O)
            RW1, RW2, RW3 = self.lstsq(Xw, yw)
            # Convergence
            if math.isclose(RW1[1], beta):
                break
            S, W = self.Scale(RW2, S, W)
            beta = RW1[1]
        else:
            import warnings
            warnings.warn("MM did not converge within max_iter iterations; "
                          "returning the last iterate")
        # betas, residuals, stderr
        if stderr:
            RW1, RW2, RW3 = self.lstsq(Xw, yw, True)
        return RW1, RW2, RW3

    def LSR(self):
        '''
        Run every cell regression and derive the sensitivity statistics.

        Returns `(stats, MOSE)`.  `stats` is a (2, cells) object array:
        row 0 holds each cell's `[intercept, slope]`, row 1 its slope
        standard error.  `MOSE` is the maximum over cells of
        `N * max |slope with outlier - slope|`.

        When "F1" or "F2" is in `figure` the attributes read by `plot` are
        stored as well ("F1" wins if both are present).

        Raises ValueError for an unknown `method`.
        '''
        if self.method == "OLS":
            RA = [self.lstsq(*self.lsR(i), stderr=True) for i in self.index]
            RB = [[self.lstsq(*self.lsR(i, O=j)) for j in self.range] for i in self.index]
        elif self.method == "MM":
            RA = [self.MM(i, stderr=True) for i in self.index]
            RB = [[self.MM(i, O=j) for j in self.range] for i in self.index]
        else:
            raise ValueError("method must be 'OLS' or 'MM', got %r" % (self.method,))

        n_cells = len(RA)
        # Largest slope shift per cell and the outlier that produced it.
        ls_values = np.empty(n_cells, dtype=float)
        ls_args = np.empty(n_cells, dtype=int)
        for p in range(n_cells):
            base = RA[p][0][1]
            shifts = [abs(fit[0][1] - base) for fit in RB[p]]
            ls_values[p] = max(shifts)
            ls_args[p] = np.argmax(shifts)

        OSE = np.multiply(self.N, ls_values)
        MOSE = max(OSE)

        # The former test `"F1" or "F2" in self.figure` was always true.
        if "F1" in self.figure:
            self.MOSE = MOSE * 0.25
            k = int(np.argmax(OSE))
            j = int(ls_args[k])
            # Row 0 = regressor, row 1 = outcome, for data and outlier alike.
            self.fD = np.vstack((self.D.X[k], self.D.Y[k]))
            self.fo = np.vstack((self.Outlier[j][1], self.Outlier[j][0]))
            a0, b0 = RA[k][0]
            a1, b1 = RB[k][j][0]
            self.fLA = lambda λ: b0 * λ + a0
            self.fLB = lambda λ: b1 * λ + a1
            # Label with the real cell identifier, not its position.
            self.cell = str(int(self.D.Cell[k]))
            self.LS = ls_values[k]
        elif "F2" in self.figure:
            self.fD = np.vstack((self.D.N, ls_values))
            self.MOSE = MOSE
            self.fLA = lambda λ: MOSE / λ

        # Built element-wise: the rows are ragged (array vs scalar), which
        # np.transpose/np.array reject on recent NumPy.
        stats = np.empty((2, n_cells), dtype=object)
        for p, fit in enumerate(RA):
            stats[0, p] = fit[0]
            stats[1, p] = fit[2]
        return stats, MOSE

    def noise(self, θseθ, χ):
        '''
        Add Noise to Statistics

        `θseθ` is the first value returned by `LSR`, `χ` the second.  Each
        cell gets its own Laplace(0, 1/sqrt(2)) draws, multiplied by sqrt(2)
        and divided by `N * ϵ`.  Draws come from NumPy's global random state.
        Returns `(noisy slopes, noisy standard errors, noisy counts)`.
        '''
        NN = np.asarray(self.D.N, dtype=float)
        S = NN * self.ϵ

        def draw():
            # One independent draw per cell; a single scalar used to be
            # shared by all cells.
            return math.sqrt(2) * nr.laplace(0, self.laps, size=NN.shape)

        θ = np.array([b[1] for b in θseθ[0]], dtype=float)
        se = np.asarray(θseθ[1], dtype=float)

        nθ = θ + χ * draw() / S
        senθ = np.sqrt(np.square(se) + 2 * np.square(χ / S))
        nsenθ = senθ + χ * draw() / S
        nN = NN * (1 + draw() / S)
        return nθ, nsenθ, nN

    def __call__(self):
        ''' Release Noise Infused Statistics '''
        m1, m2 = self.LSR()
        return self.noise(m1, m2)

    def plot(self):
        '''
        Draw the figure(s) selected through `figure`.

        Runs `LSR` first when the figure state has not been computed yet.
        '''
        wants_f1 = "F1" in self.figure
        wants_f2 = "F2" in self.figure
        if (wants_f1 or wants_f2) and not hasattr(self, "fD"):
            self.LSR()

        fig, ax = plt.subplots(figsize=(8, 6))
        if wants_f1:
            # Figure 1 - Effect of Outlier on Regression + MOSE line
            # Regressor on the x axis so the points match the fitted lines.
            ax.scatter(self.fD[0], self.fD[1])
            ax.scatter(self.fo[0], self.fo[1])
            l = np.linspace(0, 1, 5)
            ax.plot(l, self.fLA(l), label="%s-Estimate in Actual Data"%self.method)
            ax.plot(l, self.fLB(l), linestyle='dashed', label="%s-Estimate with Outlier"%self.method)
            # LS line at 25th pctile
            p25 = [0.25, 0.25]
            yp25 = [self.fLA(p25[0]), self.fLB(p25[1])]
            ax.plot(p25, yp25, label="LS at 25th pctile = %s"%str(self.MOSE))

            ax.set_title("FIGURE 1: Calculation of Local Sensitivity", pad=35)
            ax.set_ylabel("Child's Income Rank for cell = %s"%self.cell)
            ax.set_xlabel("Parent's Income Rank for cell = %s"%self.cell)

            plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc='lower left',
                       ncol=2, mode="expand", borderaxespad=0.)
            ax.set_xlim(-0.025, 1.025)

        if wants_f2:
            # NOTE(review): when "F1" is also set, LSR stores only the F1
            # state, so this branch would draw F1's data - confirm intent.
            n_min, n_max = min(self.D.N), max(self.D.N)
            ax.scatter(self.fD[0], self.fD[1])
            # Evaluate MOSE / N over the cell sizes actually shown.
            l = np.linspace(n_min, n_max, 200)
            ax.plot(l, self.fLA(l), label=r"MOSE = $\frac{χ}{N}=\frac{%s}{N}$"%str(self.MOSE))
            ax.set_xscale('log')
            ax.set_xlim(n_min, n_max)
            ax.legend()

        # TODO: Figure 3 (kind == "sdl") and Figure 4 (kind == "reg").

        ax.set_ylim(-0.025, 1.025)

        ax.grid(which='minor', color='w', alpha=0.3)
        plt.show()

In [None]:
%%timeit
# NOTE(review): `Test` is not defined in any cell shown in this notebook, so on
# a fresh kernel this cell raises NameError unless it is defined elsewhere.
Test()

In [None]:
# To-Do List

# 1 - Cell Suppression Algorithm (SDL techniques) 

# 2 - 

$\,$

$\,$

$\,$

$\,$

$\,$

$\,$

$\,$