In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import itertools, random
from mbi import Dataset, FactoredInference
from cdp2adp import cdp_rho

In [2]:
# Load data
data = Dataset.load('data.csv', 'domain.json')
total = data.df.shape[0]
display(data.df.head())
display(data.domain)

Unnamed: 0,TransAmtDisc,DayOfWeek,Hour,Product,CardType,CardCategory,EmailDomain,Region,DaysSinceLastTrans,isFraud
0,4,1,0,2,2,0,3,2,3,1
1,2,1,0,4,3,1,3,0,2,0
2,4,1,0,2,2,0,3,2,3,1
3,2,1,0,4,3,1,3,0,3,0
4,1,1,0,1,0,0,3,3,3,0


Domain(TransAmtDisc: 5, DayOfWeek: 7, Hour: 24, Product: 5, CardType: 4, CardCategory: 2, EmailDomain: 10, Region: 6, DaysSinceLastTrans: 4, isFraud: 2)

In [3]:
# Set eps, delta and calculate sigma
epsilon = 1.0
delta = 1e-9
rho = cdp_rho(epsilon, delta)
sigma = np.sqrt(3/(2*rho))
display(sigma)

10.008992891756666

In [4]:
# MEASURE the 1-way marginals and log the noisy answers
cliques = [(col,) for col in data.domain]

measurements_1d = []
for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.normal(loc=0, scale=sigma, size=x.size)
    I = sparse.eye(x.size)
    measurements_1d.append( (I, y, sigma, cl) )

In [5]:
# SELECT the MST marginals
df = pd.read_csv("mst_marginals.csv")
display(df)
cliques = df[["from", "to"]].to_records(index=False)
cliques = tuple(map(tuple, cliques))

Unnamed: 0,fromN,toN,from,to,error
0,8,7,Product,isFraud,0.063662
1,3,6,DayOfWeek,Hour,0.079306
2,6,9,Hour,Region,0.089581
3,2,1,CardType,CardCategory,0.093208
4,8,5,Product,EmailDomain,0.161585
5,8,1,Product,CardCategory,0.180686
6,10,8,TransAmtDisc,Product,0.213883
7,8,4,Product,DaysSinceLastTrans,0.239222
8,8,9,Product,Region,0.333495


In [6]:
# MEASURE the selected 2-way marginals and log the noisy answers
measurements_2d = []
for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.normal(loc=0, scale=sigma, size=x.size)
    I = sparse.eye(x.size)
    measurements_2d.append( (I, y, sigma, cl) )

In [7]:
# GENERATE synthetic data using Private-PGM 
engine = FactoredInference(data.domain, iters=2000)
model = engine.estimate(measurements_1d+measurements_2d)
synth = model.synthetic_data(rows = total)

In [8]:
synth.df.to_csv('mst_data.csv')

In [9]:
# Evaluate the errors on 1-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 1):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)

Average Error 0.0004714814187963925 



isFraud               0.000073
Product               0.000145
TransAmtDisc          0.000242
CardType              0.000242
CardCategory          0.000242
Region                0.000387
DaysSinceLastTrans    0.000411
DayOfWeek             0.000556
EmailDomain           0.000629
Hour                  0.001789
dtype: float64

In [10]:
# Evaluate the errors on 2-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 2):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)

Average Error 0.03083461613890299 



Product             CardCategory          0.000604
CardType            CardCategory          0.000629
Product             isFraud               0.000677
                    DaysSinceLastTrans    0.001402
TransAmtDisc        Product               0.001717
Product             Region                0.002442
CardType            isFraud               0.003578
Product             EmailDomain           0.003917
DayOfWeek           isFraud               0.005126
Region              isFraud               0.007157
Hour                Region                0.012113
                    isFraud               0.014459
DayOfWeek           Hour                  0.014846
DaysSinceLastTrans  isFraud               0.015087
CardCategory        Region                0.016079
CardType            DaysSinceLastTrans    0.017215
DayOfWeek           CardType              0.018472
TransAmtDisc        isFraud               0.019125
EmailDomain         isFraud               0.020794
DayOfWeek           CardCategor

In [11]:
# Product-Fraud example
printmsrmts = []
cl = ('Product', 'isFraud')
x = data.project(cl).datavector()
y = x + np.random.normal(loc=0, scale=sigma, size=x.shape)
z = synth.project(cl).datavector()
printmsrmts.append( (x, y, z) )
    
df = pd.DataFrame(np.concatenate(printmsrmts))
df.columns = ['P=1, F=0', 'P=1, F=1', 'P=2, F=0', 'P=2, F=1', 'P=3, F=0',
             'P=3, F=1', 'P=4, F=0', 'P=4, F=1', 'P=5, F=0', 'P=5, F=1']
df.loc[len(df.index)] =  abs(df.loc[0]-df.loc[2])
df.index = ['Original Data', 'Noisy Data', 'Synthetic Data', 'Abs Error']

display(df)

Unnamed: 0,"P=1, F=0","P=1, F=1","P=2, F=0","P=2, F=1","P=3, F=0","P=3, F=1","P=4, F=0","P=4, F=1","P=5, F=0","P=5, F=1"
Original Data,6672.0,1171.0,2140.0,340.0,2947.0,320.0,1212.0,79.0,66475.0,1362.0
Noisy Data,6680.730878,1168.975671,2125.949688,334.022956,2931.78177,325.305387,1213.28567,84.438128,66491.416144,1368.429393
Synthetic Data,6673.0,1166.0,2142.0,337.0,2944.0,329.0,1219.0,72.0,66465.0,1371.0
Abs Error,1.0,5.0,2.0,3.0,3.0,9.0,7.0,7.0,10.0,9.0
