In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import itertools, random
from mbi import Dataset, FactoredInference
from cdp2adp import cdp_rho

In [2]:
# Load data
data = Dataset.load('data.csv', 'domain.json')
total = data.df.shape[0]
display(data.df.head())
display(data.domain)

Unnamed: 0,TransAmtDisc,DayOfWeek,Hour,Product,CardType,CardCategory,EmailDomain,Region,DaysSinceLastTrans,isFraud
0,2,2,7,4,1,0,3,0,2,0
1,0,2,7,4,2,0,3,4,3,0
2,2,2,7,4,3,1,8,0,3,0
3,1,2,7,4,2,1,9,0,0,0
4,1,2,7,1,2,0,3,0,3,0


Domain(TransAmtDisc: 5, DayOfWeek: 7, Hour: 24, Product: 5, CardType: 4, CardCategory: 2, EmailDomain: 10, Region: 6, DaysSinceLastTrans: 4, isFraud: 2)

In [3]:
# Set eps, delta and calculate sigma
epsilon = 1.0
delta = 1e-9
rho = cdp_rho(epsilon, delta)
sigma = np.sqrt(3/(2*rho))
display(sigma)

10.008992891756666

In [4]:
# MEASURE the 1-way marginals and log the noisy answers
cliques = [(col,) for col in data.domain]

measurements_1d = []
for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.normal(loc=0, scale=sigma, size=x.size)
    I = sparse.eye(x.size)
    measurements_1d.append( (I, y, sigma, cl) )

In [5]:
# SELECT the marginals we'd like to measure
cliques = [('DayOfWeek', 'Hour'),
('Product', 'CardCategory'),
('Product', 'CardType'),
('Product', 'DaysSinceLastTrans'),
('Product', 'EmailDomain'),
('Product', 'Region'),
('Product', 'isFraud'),
('TransAmtDisc','EmailDomain'),
('TransAmtDisc', 'Product'),
('TransAmtDisc', 'Region')]

In [6]:
# MEASURE the selected 2-way marginals and log the noisy answers
measurements_2d = []
for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.normal(loc=0, scale=sigma, size=x.size)
    I = sparse.eye(x.size)
    measurements_2d.append( (I, y, sigma, cl) )

In [7]:
# GENERATE synthetic data using Private-PGM 
engine = FactoredInference(data.domain, iters=2000)
model = engine.estimate(measurements_1d+measurements_2d)
synth = model.synthetic_data(rows = total)

In [8]:
# Evaluate the errors on 1-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 1):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)

Average Error 0.000432 



CardCategory          0.00014
DaysSinceLastTrans    0.00014
Region                0.00018
CardType              0.00024
isFraud               0.00026
Product               0.00028
TransAmtDisc          0.00030
EmailDomain           0.00046
DayOfWeek             0.00066
Hour                  0.00166
dtype: float64

In [9]:
# Evaluate the errors on 2-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 2):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)

Average Error 0.034235111111111105 



Product             CardCategory          0.00056
CardType            isFraud               0.00064
Product             isFraud               0.00074
                    DaysSinceLastTrans    0.00136
TransAmtDisc        Product               0.00186
                    Region                0.00192
Product             CardType              0.00222
Region              isFraud               0.00264
Product             Region                0.00312
                    EmailDomain           0.00366
TransAmtDisc        EmailDomain           0.00376
DayOfWeek           isFraud               0.00546
Hour                isFraud               0.00936
DaysSinceLastTrans  isFraud               0.01006
DayOfWeek           CardType              0.01144
CardType            DaysSinceLastTrans    0.01164
DayOfWeek           Hour                  0.01220
TransAmtDisc        isFraud               0.01312
CardCategory        isFraud               0.01524
EmailDomain         isFraud               0.01626


In [10]:
# Product-Fraud example
printmsrmts = []
cl = ('Product', 'isFraud')
x = data.project(cl).datavector()
y = x + np.random.normal(loc=0, scale=sigma, size=x.shape)
z = synth.project(cl).datavector()
printmsrmts.append( (x, y, z) )
    
df = pd.DataFrame(np.concatenate(printmsrmts))
df.columns = ['P=1, F=0', 'P=1, F=1', 'P=2, F=0', 'P=2, F=1', 'P=3, F=0',
             'P=3, F=1', 'P=4, F=0', 'P=4, F=1', 'P=5, F=0', 'P=5, F=1']
df.loc[len(df.index)] =  abs(df.loc[0]-df.loc[2])
df.index = ['Original Data', 'Noisy Data', 'Synthetic Data', 'Abs Error']

display(df)

Unnamed: 0,"P=1, F=0","P=1, F=1","P=2, F=0","P=2, F=1","P=3, F=0","P=3, F=1","P=4, F=0","P=4, F=1","P=5, F=0","P=5, F=1"
Original Data,9339.0,1001.0,15356.0,302.0,13926.0,138.0,2478.0,60.0,56342.0,1058.0
Noisy Data,9351.354801,1009.501945,15342.913227,296.326562,13931.323136,145.407874,2482.929315,39.070771,56360.320213,1062.905063
Synthetic Data,9331.0,1017.0,15366.0,298.0,13928.0,133.0,2479.0,52.0,56350.0,1046.0
Abs Error,8.0,16.0,10.0,4.0,2.0,5.0,1.0,8.0,8.0,12.0
