In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import itertools, random
from mbi import Dataset, FactoredInference
from cdp2adp import cdp_rho

In [2]:
# Load data
data = Dataset.load('data.csv', 'domain.json')
total = data.df.shape[0]
display(data.df.head())
display(data.domain)

Unnamed: 0,TransAmtDisc,DayOfWeek,Hour,Product,CardType,CardCategory,EmailDomain,Region,DaysSinceLastTrans,isFraud
0,4,1,0,2,2,0,3,2,3,1
1,2,1,0,4,3,1,3,0,2,0
2,4,1,0,2,2,0,3,2,3,1
3,2,1,0,4,3,1,3,0,3,0
4,1,1,0,1,0,0,3,3,3,0


Domain(TransAmtDisc: 5, DayOfWeek: 7, Hour: 24, Product: 5, CardType: 4, CardCategory: 2, EmailDomain: 10, Region: 6, DaysSinceLastTrans: 4, isFraud: 2)

In [3]:
# Set eps, delta and calculate sigma
epsilon = 1.0
delta = 1e-9
rho = cdp_rho(epsilon, delta)
sigma = np.sqrt(3/(2*rho))
display(sigma)

10.008992891756666

In [4]:
# MEASURE the 1-way marginals and log the noisy answers
cliques = [(col,) for col in data.domain]

measurements_1d = []
for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.normal(loc=0, scale=sigma, size=x.size)
    I = sparse.eye(x.size)
    measurements_1d.append( (I, y, sigma, cl) )

In [5]:
# SELECT the Fraud marginals
df = pd.read_csv("fraud_marginals.csv")
display(df)
cliques = df[["from", "to"]].to_records(index=False)
cliques = tuple(map(tuple, cliques))

Unnamed: 0,from,to
0,DayOfWeek,isFraud
1,CardType,isFraud
2,TransAmtDisc,isFraud
3,Hour,isFraud
4,EmailDomain,isFraud
5,DaysSinceLastTrans,isFraud
6,Region,isFraud
7,CardCategory,isFraud
8,Product,isFraud


In [6]:
# MEASURE the selected 2-way marginals and log the noisy answers
measurements_2d = []
for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.normal(loc=0, scale=sigma, size=x.size)
    I = sparse.eye(x.size)
    measurements_2d.append( (I, y, sigma, cl) )

In [7]:
# GENERATE synthetic data using Private-PGM 
engine = FactoredInference(data.domain, iters=2000)
model = engine.estimate(measurements_1d+measurements_2d)
synth = model.synthetic_data(rows = total)

In [8]:
synth.df.to_csv('fraud_wl_data.csv')

In [9]:
# Evaluate the errors on 1-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 1):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)

Average Error 0.0005150027805314441 



CardCategory          0.000073
isFraud               0.000073
DaysSinceLastTrans    0.000290
CardType              0.000314
TransAmtDisc          0.000338
DayOfWeek             0.000338
Product               0.000411
EmailDomain           0.000604
Region                0.000629
Hour                  0.002079
dtype: float64

In [10]:
# Evaluate the errors on 2-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 2):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)

Average Error 0.07084256819018298 



CardCategory        isFraud               0.000121
DaysSinceLastTrans  isFraud               0.000338
CardType            isFraud               0.000604
Product             isFraud               0.000750
TransAmtDisc        isFraud               0.000967
Region              isFraud               0.000991
DayOfWeek           isFraud               0.001040
EmailDomain         isFraud               0.001644
Hour                isFraud               0.003820
DayOfWeek           CardType              0.017167
CardType            EmailDomain           0.021180
Hour                CardCategory          0.024154
DayOfWeek           Region                0.026983
Hour                CardType              0.027684
DayOfWeek           Product               0.028700
                    DaysSinceLastTrans    0.029885
                    CardCategory          0.030126
CardType            DaysSinceLastTrans    0.030126
DayOfWeek           EmailDomain           0.032955
TransAmtDisc        CardType   

In [11]:
# Product-Fraud example
printmsrmts = []
cl = ('Product', 'isFraud')
x = data.project(cl).datavector()
y = x + np.random.normal(loc=0, scale=sigma, size=x.shape)
z = synth.project(cl).datavector()
printmsrmts.append( (x, y, z) )
    
df = pd.DataFrame(np.concatenate(printmsrmts))
df.columns = ['P=1, F=0', 'P=1, F=1', 'P=2, F=0', 'P=2, F=1', 'P=3, F=0',
             'P=3, F=1', 'P=4, F=0', 'P=4, F=1', 'P=5, F=0', 'P=5, F=1']
df.loc[len(df.index)] =  abs(df.loc[0]-df.loc[2])
df.index = ['Original Data', 'Noisy Data', 'Synthetic Data', 'Abs Error']

display(df)

Unnamed: 0,"P=1, F=0","P=1, F=1","P=2, F=0","P=2, F=1","P=3, F=0","P=3, F=1","P=4, F=0","P=4, F=1","P=5, F=0","P=5, F=1"
Original Data,6672.0,1171.0,2140.0,340.0,2947.0,320.0,1212.0,79.0,66475.0,1362.0
Noisy Data,6670.538258,1171.006057,2136.035154,355.309709,2943.360487,308.517391,1214.605395,75.853689,66471.378045,1356.979334
Synthetic Data,6674.0,1166.0,2150.0,333.0,2950.0,325.0,1213.0,84.0,66456.0,1367.0
Abs Error,2.0,5.0,10.0,7.0,3.0,5.0,1.0,5.0,19.0,5.0
