In [1]:
import numpy as np
import pandas as pd
from mbi import FactoredInference, Dataset, Domain
from scipy import sparse
from disjoint_set import DisjointSet
import networkx as nx
import itertools, random
from scipy.special import logsumexp
import argparse
from cdp2adp import cdp_rho
from msthelpers import measure, compress_domain, select

In [2]:
# Load the dataset together with its domain description.
data = Dataset.load('data.csv', 'domain.json')
# Row count of the loaded frame; reused later as the number of synthetic rows.
total = len(data.df)
display(data.df.head())

Unnamed: 0,TransAmtDisc,DayOfWeek,Hour,Product,CardType,CardCategory,EmailDomain,Region,DaysSinceLastTrans,isFraud
0,2,2,7,4,1,0,3,0,2,0
1,0,2,7,4,2,0,3,4,3,0
2,2,2,7,4,3,1,8,0,3,0
3,1,2,7,4,2,1,9,0,0,0
4,1,2,7,1,2,0,3,0,3,0


In [3]:
# Show the domain attached to the loaded dataset
# (presumably attribute name -> number of distinct values -- confirm against domain.json).
display(data.domain)

Domain(TransAmtDisc: 5, DayOfWeek: 7, Hour: 24, Product: 5, CardType: 4, CardCategory: 2, EmailDomain: 10, Region: 6, DaysSinceLastTrans: 4, isFraud: 2)

In [4]:
# Privacy parameters of the overall (epsilon, delta) target.
epsilon, delta = 1.0, 1e-9
# cdp_rho presumably converts (epsilon, delta) into a zCDP budget rho -- confirm in cdp2adp.
rho = cdp_rho(epsilon, delta)
# sqrt(3 / (2 * rho)) equals sqrt(1 / (2 * (rho / 3))), i.e. the noise scale that
# corresponds to one third of rho (assumed: budget is split over three steps -- confirm).
sigma = np.sqrt(3 / (2 * rho))
display(sigma)

10.008992891756666

In [5]:
# Start from the 1-way cliques: one single-attribute tuple per attribute in the domain.
cliques = [(col,) for col in data.domain]
display(cliques)
# First measurement pass over the 1-way cliques with noise scale sigma
# (presumably noisy marginals produced by msthelpers.measure -- TODO confirm).
log1 = measure(data, cliques, sigma)
# NOTE(review): `data` and `log1` are rebound here, so this cell is not idempotent --
# re-running it feeds the already-compressed dataset back into compress_domain.
# `undo_compress_fn` is not used by any cell visible in this notebook section.
data, log1, undo_compress_fn = compress_domain(data, log1)
# Pick the cliques for the second pass; select receives a budget of rho/3.0 and the first log.
cliques = select(data, rho/3.0, log1)
display(cliques)
# Second measurement pass over the selected cliques, same noise scale.
log2 = measure(data, cliques, sigma)

[('TransAmtDisc',),
 ('DayOfWeek',),
 ('Hour',),
 ('Product',),
 ('CardType',),
 ('CardCategory',),
 ('EmailDomain',),
 ('Region',),
 ('DaysSinceLastTrans',),
 ('isFraud',)]

[('TransAmtDisc', 'Product'),
 ('DayOfWeek', 'Hour'),
 ('Hour', 'Region'),
 ('Product', 'DaysSinceLastTrans'),
 ('Product', 'CardCategory'),
 ('Product', 'Region'),
 ('Product', 'EmailDomain'),
 ('Product', 'CardType'),
 ('Product', 'isFraud')]

In [6]:
# Generate synthetic data using Private-PGM:
# build the inference engine over the current domain, fit it on the measurements
# from both passes, then sample as many rows as the original dataset had.
engine = FactoredInference(data.domain, iters=2000)
est = engine.estimate(log1 + log2)
synth = est.synthetic_data(rows=total)

In [7]:
# Evaluate the errors on 2-way marginals
def score(synth, reference=None):
    """Return the normalized L1 error of every 2-way marginal of `synth`.

    For each pair of attributes in ``reference.domain`` the data vectors of the
    projected marginals of `reference` and `synth` are compared; the L1 norm of
    their difference is divided by ``reference.records``.

    Parameters
    ----------
    synth
        Dataset-like object exposing ``project(cols).datavector()``.
    reference
        Dataset-like object additionally exposing ``domain`` and ``records``.
        Defaults to the notebook-level ``data`` variable (looked up at call
        time), which keeps the original ``score(synth)`` call working.

    Returns
    -------
    pandas.Series
        Errors indexed by attribute pair, sorted in ascending order. The mean
        error is also printed.
    """
    if reference is None:
        # Backward-compatible fallback: the original implementation always read
        # the global `data` as it is bound when score() is called.
        reference = data

    errors = {}
    for pair in itertools.combinations(reference.domain, 2):
        true_marginal = reference.project(pair).datavector()
        est_marginal = synth.project(pair).datavector()
        l1_distance = np.linalg.norm(true_marginal - est_marginal, 1)
        errors[pair] = l1_distance / reference.records

    errors = pd.Series(errors).sort_values()

    print('Average Error', errors.mean(), '\n')
    return errors

# Score the synthetic dataset and show the per-pair errors.
# NOTE: despite its name, `df` holds the pandas Series returned by score().
df = score(synth)
display(df)

Average Error 0.03432533333333333 



Product             CardCategory          0.00112
                    isFraud               0.00158
CardType            isFraud               0.00200
TransAmtDisc        Product               0.00332
Product             CardType              0.00402
Region              isFraud               0.00410
Product             DaysSinceLastTrans    0.00416
DayOfWeek           isFraud               0.00470
Product             Region                0.00652
CardType            DaysSinceLastTrans    0.00830
Product             EmailDomain           0.00992
DaysSinceLastTrans  isFraud               0.01068
Hour                isFraud               0.01136
TransAmtDisc        isFraud               0.01252
DayOfWeek           CardType              0.01418
CardCategory        isFraud               0.01534
EmailDomain         isFraud               0.01584
Region              DaysSinceLastTrans    0.02396
CardType            EmailDomain           0.02460
CardCategory        Region                0.02474
