In [1]:
import numpy as np
import pandas as pd
import itertools
from mbi import Dataset, GraphicalModel, FactoredInference
from scipy.special import softmax
from scipy import sparse
from cdp2adp import cdp_rho
from mwemhelpers import mwem_pgm, worst_approximated

In [2]:
# Read the records and the accompanying domain description, then preview the first rows.
data = Dataset.load('data.csv', 'domain.json')
total = len(data.df)  # number of rows in the loaded frame
display(data.df.head())

Unnamed: 0,TransAmtDisc,DayOfWeek,Hour,Product,CardType,CardCategory,EmailDomain,Region,DaysSinceLastTrans,isFraud
0,2,2,7,4,1,0,3,0,2,0
1,0,2,7,4,2,0,3,4,3,0
2,2,2,7,4,3,1,8,0,3,0
3,1,2,7,4,2,1,9,0,0,0
4,1,2,7,1,2,0,3,0,3,0


In [3]:
# Show the dataset's domain: each attribute name paired with its size
# (presumably the number of discrete values it can take — confirm against mbi.Domain).
display(data.domain)

Domain(TransAmtDisc: 5, DayOfWeek: 7, Hour: 24, Product: 5, CardType: 4, CardCategory: 2, EmailDomain: 10, Region: 6, DaysSinceLastTrans: 4, isFraud: 2)

In [4]:
# Values handed to mwem_pgm as `epsilon` and `delta`
# (presumably the differential-privacy budget — confirm in mwemhelpers).
epsilon = 1.0
delta = 1e-9

# Candidate queries: every unordered pair of attributes in the domain.
workload = list(itertools.combinations(data.domain, 2))

# Run mwem_pgm for 9 rounds; the returned object is scored as synthetic data below.
synth = mwem_pgm(
    data,
    epsilon,
    delta,
    workload=workload,
    rounds=9,
    pgm_iters=2000,
    noise='Gaussian',
)

Round 1 Selected ('CardType', 'isFraud') Model Size (MB) 0.00052642822265625
Round 2 Selected ('Product', 'DaysSinceLastTrans') Model Size (MB) 0.00054168701171875
Round 3 Selected ('EmailDomain', 'Region') Model Size (MB) 0.0006256103515625
Round 4 Selected ('Hour', 'CardCategory') Model Size (MB) 0.0009613037109375
Round 5 Selected ('Product', 'CardCategory') Model Size (MB) 0.001129150390625
Round 6 Selected ('Product', 'Region') Model Size (MB) 0.0012054443359375
Round 7 Selected ('TransAmtDisc', 'Product') Model Size (MB) 0.001434326171875
Round 8 Selected ('DayOfWeek', 'Hour') Model Size (MB) 0.0015869140625
Round 9 Selected ('Product', 'EmailDomain') Model Size (MB) 0.00296783447265625
Generating Data...


In [5]:
# Evaluate the errors on 2-way marginals
def score(synth, reference=None, workload=None):
  """Report the normalized L1 error of `synth` on a set of marginals.

  For each attribute tuple in the workload, the marginal vector of `synth`
  is compared with the marginal vector of the reference dataset. The L1
  distance between the two vectors is divided by `reference.records`.

  Args:
    synth: dataset under evaluation; must provide `project(cl).datavector()`.
    reference: dataset supplying the true marginals, the attribute list
      (`.domain`) and the normalizer (`.records`). When omitted, the
      notebook-level `data` is used, so `score(synth)` behaves as before.
    workload: iterable of attribute tuples (or lists) to score. When
      omitted, every 2-way combination of `reference.domain` is scored.

  Returns:
    pandas.Series of errors keyed by attribute tuple, sorted ascending.
    The mean of the errors is printed as a side effect.
  """
  if reference is None:
    # Fall back to the notebook global; looked up at call time so a
    # re-loaded `data` is picked up without redefining this function.
    reference = data
  if workload is None:
    workload = itertools.combinations(reference.domain, 2)

  per_marginal = {}
  for cl in workload:
    true_marginal = reference.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    # tuple(cl) keeps the key hashable even if the caller passed lists.
    per_marginal[tuple(cl)] = np.linalg.norm(true_marginal - est_marginal, 1) / reference.records

  # Explicit dtype keeps an empty workload from producing an object Series.
  errors = pd.Series(per_marginal, dtype=float).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

# Score the synthetic data and show the per-pair errors, smallest first.
# NOTE: despite its name, `df` holds the pandas Series returned by score().
df = score(synth)
display(df)

Average Error 0.04341333333333334 



Product             CardCategory          0.00218
CardType            isFraud               0.00358
Product             DaysSinceLastTrans    0.00364
                    Region                0.00440
TransAmtDisc        Product               0.00608
                    isFraud               0.00688
DayOfWeek           isFraud               0.00798
Product             EmailDomain           0.01234
DaysSinceLastTrans  isFraud               0.01288
Hour                CardCategory          0.01438
DayOfWeek           CardType              0.01490
Hour                isFraud               0.01548
EmailDomain         Region                0.01578
CardCategory        isFraud               0.01598
                    Region                0.02330
Hour                CardType              0.02344
EmailDomain         isFraud               0.02476
Region              DaysSinceLastTrans    0.02646
EmailDomain         DaysSinceLastTrans    0.02872
DayOfWeek           Region                0.02930
