In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import itertools, random
from mbi import Dataset, FactoredInference
from cdp2adp import cdp_rho

In [2]:
# Load data
data = Dataset.load('data.csv', 'domain.json')
total = data.df.shape[0]
display(data.df.head())
display(data.domain)

Unnamed: 0,TransAmtDisc,DayOfWeek,Hour,Product,CardType,CardCategory,EmailDomain,Region,DaysSinceLastTrans,isFraud
0,4,1,0,2,2,0,3,2,3,1
1,2,1,0,4,3,1,3,0,2,0
2,4,1,0,2,2,0,3,2,3,1
3,2,1,0,4,3,1,3,0,3,0
4,1,1,0,1,0,0,3,3,3,0


Domain(TransAmtDisc: 5, DayOfWeek: 7, Hour: 24, Product: 5, CardType: 4, CardCategory: 2, EmailDomain: 10, Region: 6, DaysSinceLastTrans: 4, isFraud: 2)

In [3]:
# Set eps, delta and calculate sigma
epsilon = 1.0
delta = 1e-9
rho = cdp_rho(epsilon, delta)
sigma = np.sqrt(3/(2*rho))
display(sigma)

10.008992891756666

In [4]:
# MEASURE all the 1-way marginals and log the noisy answers
cliques = set(itertools.combinations(data.domain, 1))
measurements = []
for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.normal(loc=0, scale=sigma, size=x.shape)
    I = sparse.eye(x.size)
    measurements.append( (I, y, sigma, cl) )

In [5]:
# GENERATE synthetic data using Private-PGM 
engine = FactoredInference(data.domain, iters=2000)
model = engine.estimate(measurements)
synth = model.synthetic_data(rows = total)

In [6]:
synth.df.to_csv('baseline_data.csv')

In [7]:
# Evaluate the errors on 1-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 1):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)

Average Error 0.0006334775985879736 



isFraud               0.000024
CardCategory          0.000121
TransAmtDisc          0.000435
DaysSinceLastTrans    0.000435
Product               0.000459
DayOfWeek             0.000580
EmailDomain           0.000653
Region                0.000653
CardType              0.000677
Hour                  0.002297
dtype: float64

In [8]:
# Evaluate the errors on 2-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 2):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)
df.to_csv('baseline.csv')

Average Error 0.07861731021865455 



DayOfWeek           isFraud               0.005585
CardType            isFraud               0.006649
TransAmtDisc        isFraud               0.017263
DayOfWeek           CardType              0.017288
Hour                isFraud               0.018908
DayOfWeek           CardCategory          0.023139
                    Region                0.025025
CardType            EmailDomain           0.026137
EmailDomain         isFraud               0.026790
Hour                CardType              0.027273
CardType            DaysSinceLastTrans    0.029353
Hour                CardCategory          0.029522
DayOfWeek           DaysSinceLastTrans    0.032665
                    Product               0.032689
                    EmailDomain           0.036703
DaysSinceLastTrans  isFraud               0.037090
TransAmtDisc        CardType              0.037888
Region              isFraud               0.039242
CardCategory        isFraud               0.043231
TransAmtDisc        DayOfWeek  

In [9]:
# Product-Fraud example
printmsrmts = []
cl = ('Product', 'isFraud')
x = data.project(cl).datavector()
y = x + np.random.normal(loc=0, scale=sigma, size=x.shape)
z = synth.project(cl).datavector()
printmsrmts.append( (x, y, z) )
    
df = pd.DataFrame(np.concatenate(printmsrmts))
df.columns = ['P=1, F=0', 'P=1, F=1', 'P=2, F=0', 'P=2, F=1', 'P=3, F=0',
             'P=3, F=1', 'P=4, F=0', 'P=4, F=1', 'P=5, F=0', 'P=5, F=1']
df.loc[len(df.index)] =  abs(df.loc[0]-df.loc[2])
df.index = ['Original Data', 'Noisy Data', 'Synthetic Data', 'Abs Error']

display(df)

Unnamed: 0,"P=1, F=0","P=1, F=1","P=2, F=0","P=2, F=1","P=3, F=0","P=3, F=1","P=4, F=0","P=4, F=1","P=5, F=0","P=5, F=1"
Original Data,6672.0,1171.0,2140.0,340.0,2947.0,320.0,1212.0,79.0,66475.0,1362.0
Noisy Data,6671.315498,1176.866462,2136.038252,351.206445,2937.791673,315.34517,1205.264635,69.769258,66458.429788,1364.566823
Synthetic Data,7522.0,316.0,2373.0,93.0,3134.0,133.0,1251.0,45.0,65165.0,2686.0
Abs Error,850.0,855.0,233.0,247.0,187.0,187.0,39.0,34.0,1310.0,1324.0
