In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import itertools, random
from mbi import Dataset, FactoredInference
from cdp2adp import cdp_rho

In [2]:
# Load data
data = Dataset.load('data.csv', 'domain.json')
total = data.df.shape[0]
display(data.df.head())
display(data.domain)

Unnamed: 0,Region,Residence.Type,Family.Composition,Population.Base,Sex,Age,Marital.Status,Student,Country.of.Birth,Health,Ethnic.Group,Religion,Economic.Activity,Occupation,Industry,Hours.worked.per.week,Approximated.Social.Grade
0,7,1,0,0,1,5,3,1,1,1,0,1,0,3,7,2,1
1,1,1,1,0,1,2,1,1,0,1,0,1,0,3,11,1,1
2,7,1,0,0,0,2,1,1,1,0,4,6,0,0,2,2,0
3,1,1,0,0,1,1,0,0,0,1,2,5,5,3,7,4,1
4,1,1,2,0,1,2,0,1,0,0,0,0,8,8,5,4,1


Domain(Region: 10, Residence.Type: 2, Family.Composition: 7, Population.Base: 3, Sex: 2, Age: 8, Marital.Status: 5, Student: 2, Country.of.Birth: 3, Health: 6, Ethnic.Group: 6, Religion: 10, Economic.Activity: 10, Occupation: 10, Industry: 13, Hours.worked.per.week: 5, Approximated.Social.Grade: 5)

In [3]:
# Set eps, delta and calculate sigma
epsilon = 1.0
delta = 1e-9
rho = cdp_rho(epsilon, delta)
sigma = np.sqrt(3/(2*rho))
display(sigma)

10.008992891756666

In [4]:
# MEASURE the 1-way marginals and log the noisy answers
cliques = [(col,) for col in data.domain]

measurements_1d = []
for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.normal(loc=0, scale=sigma, size=x.size)
    I = sparse.eye(x.size)
    measurements_1d.append( (I, y, sigma, cl) )

In [5]:
# SELECT the MST marginals
df = pd.read_csv("mst_marginals.csv")
display(df)
cliques = df[["from", "to"]].to_records(index=False)
cliques = tuple(map(tuple, cliques))

Unnamed: 0,fromN,toN,from,to,error
0,6,12,Family.Composition,Population.Base,0.05884
1,15,6,Residence.Type,Family.Composition,0.07382
2,13,5,Region,Ethnic.Group,0.16942
3,1,14,Age,Religion,0.2483
4,3,5,Country.of.Birth,Ethnic.Group,0.25784
5,16,11,Sex,Occupation,0.26388
6,5,14,Ethnic.Group,Religion,0.29008
7,7,4,Health,Economic.Activity,0.42308
8,17,4,Student,Economic.Activity,0.64864
9,6,10,Family.Composition,Marital.Status,0.65654


In [6]:
# MEASURE the selected 2-way marginals and log the noisy answers
measurements_2d = []
for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.normal(loc=0, scale=sigma, size=x.size)
    I = sparse.eye(x.size)
    measurements_2d.append( (I, y, sigma, cl) )

In [7]:
# GENERATE synthetic data using Private-PGM 
engine = FactoredInference(data.domain, iters=2000)
model = engine.estimate(measurements_1d+measurements_2d)
synth = model.synthetic_data(rows = total)

In [8]:
synth.df.to_csv('mst_data.csv')

In [9]:
# Evaluate the errors on 1-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 1):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)

Average Error 0.0006635294117647059 



Country.of.Birth             0.00020
Sex                          0.00022
Approximated.Social.Grade    0.00022
Residence.Type               0.00046
Marital.Status               0.00048
Region                       0.00048
Student                      0.00050
Occupation                   0.00064
Family.Composition           0.00064
Population.Base              0.00068
Economic.Activity            0.00070
Hours.worked.per.week        0.00072
Ethnic.Group                 0.00072
Health                       0.00108
Religion                     0.00112
Industry                     0.00118
Age                          0.00124
dtype: float64

In [10]:
# Evaluate the errors on 2-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 2):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)

Average Error 0.07092323529411765 



Residence.Type      Sex                      0.00128
Population.Base     Sex                      0.00130
Student             Hours.worked.per.week    0.00134
Residence.Type      Family.Composition       0.00136
Sex                 Occupation               0.00138
                                              ...   
Occupation          Hours.worked.per.week    0.25392
Family.Composition  Economic.Activity        0.26240
Economic.Activity   Industry                 0.26470
Family.Composition  Age                      0.26598
Economic.Activity   Occupation               0.27522
Length: 136, dtype: float64

In [11]:
# Occupation-Sex example
printmsrmts = []
cl = ('Occupation', 'Sex')
x = data.project(cl).datavector()
y = x + np.random.normal(loc=0, scale=sigma, size=x.shape)
z = synth.project(cl).datavector()
printmsrmts.append( (x, y, z) )
    
df = pd.DataFrame(np.concatenate(printmsrmts))
df.columns = ['Senior Official, Male', 'Senior Official, Female', 'Professional, Male', 'Professional, Female', 
              'Technical, Male', 'Technical, Female', 'Administrative, Male', 'Administrative, Female', 
              'Skilled Trades, Male', 'Skilled Trades, Female', 'Caring, Male', 'Caring, Female',
              'Sales, Male', 'Sales, Female', 'Machine, Male', 'Machine, Female',
              'Elementary, Male', 'Elementary, Female', 'Unemployed, Male', 'Unemployed, Female']
df.loc[len(df.index)] =  abs(df.loc[0]-df.loc[2])
df.index = ['Original Data', 'Noisy Data', 'Synthetic Data', 'Abs Error']

display(df.T)

Unnamed: 0,Original Data,Noisy Data,Synthetic Data,Abs Error
"Senior Official, Male",4490.0,4487.941652,4494.0,4.0
"Senior Official, Female",2419.0,2414.188016,2407.0,12.0
"Professional, Male",5361.0,5357.785205,5351.0,10.0
"Professional, Female",5961.0,5966.652533,5968.0,7.0
"Technical, Male",4494.0,4494.03379,4487.0,7.0
"Technical, Female",3347.0,3349.299022,3348.0,1.0
"Administrative, Male",1849.0,1866.681574,1843.0,6.0
"Administrative, Female",7470.0,7450.941073,7473.0,3.0
"Skilled Trades, Male",7147.0,7169.094393,7134.0,13.0
"Skilled Trades, Female",1259.0,1271.767573,1266.0,7.0
