In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import itertools, random
from mbi import Dataset, FactoredInference
from cdp2adp import cdp_rho

In [2]:
# Load data
data = Dataset.load('data.csv', 'domain.json')
total = data.df.shape[0]
display(data.df.head())
display(data.domain)

Unnamed: 0,Region,Residence.Type,Family.Composition,Population.Base,Sex,Age,Marital.Status,Student,Country.of.Birth,Health,Ethnic.Group,Religion,Economic.Activity,Occupation,Industry,Hours.worked.per.week,Approximated.Social.Grade
0,7,1,0,0,1,5,3,1,1,1,0,1,0,3,7,2,1
1,1,1,1,0,1,2,1,1,0,1,0,1,0,3,11,1,1
2,7,1,0,0,0,2,1,1,1,0,4,6,0,0,2,2,0
3,1,1,0,0,1,1,0,0,0,1,2,5,5,3,7,4,1
4,1,1,2,0,1,2,0,1,0,0,0,0,8,8,5,4,1


Domain(Region: 10, Residence.Type: 2, Family.Composition: 7, Population.Base: 3, Sex: 2, Age: 8, Marital.Status: 5, Student: 2, Country.of.Birth: 3, Health: 6, Ethnic.Group: 6, Religion: 10, Economic.Activity: 10, Occupation: 10, Industry: 13, Hours.worked.per.week: 5, Approximated.Social.Grade: 5)

In [3]:
# Set eps, delta and calculate sigma
epsilon = 1.0
delta = 1e-9
rho = cdp_rho(epsilon, delta)
sigma = np.sqrt(3/(2*rho))
display(sigma)

10.008992891756666

In [4]:
# MEASURE all the 1-way marginals and log the noisy answers
cliques = set(itertools.combinations(data.domain, 1))
measurements = []
for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.normal(loc=0, scale=sigma, size=x.shape)
    I = sparse.eye(x.size)
    measurements.append( (I, y, sigma, cl) )

In [11]:
# GENERATE synthetic data using Private-PGM 
engine = FactoredInference(data.domain, iters=2000)
model = engine.estimate(measurements)
synth = model.synthetic_data(rows = total)

In [12]:
display(synth.df)

Unnamed: 0,Region,Residence.Type,Family.Composition,Population.Base,Sex,Age,Marital.Status,Student,Country.of.Birth,Health,Ethnic.Group,Religion,Economic.Activity,Occupation,Industry,Hours.worked.per.week,Approximated.Social.Grade
0,1,1,1,0,0,1,0,1,0,0,0,8,0,1,1,4,4
1,4,1,1,0,1,1,1,0,0,0,0,0,9,6,8,4,0
2,2,1,1,0,1,7,1,1,0,0,0,0,4,0,5,1,2
3,5,1,1,0,0,2,0,1,0,0,0,1,4,8,9,3,3
4,1,1,2,1,1,1,0,1,0,1,4,1,0,4,3,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,6,1,0,0,1,3,1,1,0,1,0,0,6,4,11,2,1
99996,2,1,1,0,1,0,0,1,0,3,0,0,0,9,11,2,0
99997,5,1,2,0,0,7,1,0,0,1,0,0,4,9,9,0,4
99998,3,1,1,0,0,4,0,1,0,0,0,1,0,7,8,1,1


In [None]:
synth.df.to_csv('baseline_data.csv')

In [13]:
# Evaluate the errors on 1-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 1):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)

Average Error 0.0005200000000000001 



Sex                          0.00006
Residence.Type               0.00014
Country.of.Birth             0.00016
Student                      0.00020
Population.Base              0.00028
Hours.worked.per.week        0.00030
Family.Composition           0.00036
Approximated.Social.Grade    0.00038
Marital.Status               0.00044
Occupation                   0.00062
Age                          0.00066
Health                       0.00068
Ethnic.Group                 0.00068
Religion                     0.00078
Economic.Activity            0.00088
Industry                     0.00106
Region                       0.00116
dtype: float64

In [14]:
# Evaluate the errors on 2-way marginals
def score(synth):
  errors = {}
  for cl in itertools.combinations(data.domain, 2):
    true_marginal = data.project(cl).datavector()
    est_marginal = synth.project(cl).datavector()
    errors[cl] = np.linalg.norm(true_marginal-est_marginal, 1) / data.records

  errors = pd.Series(errors).sort_values()

  print('Average Error', errors.mean(), '\n')
  return errors

df = score(synth)
display(df)

Average Error 0.21689323529411764 



Residence.Type     Sex                          0.00052
Population.Base    Sex                          0.00174
Residence.Type     Ethnic.Group                 0.00304
                   Population.Base              0.00306
                   Religion                     0.00496
                                                 ...   
Age                Marital.Status               0.75220
Economic.Activity  Hours.worked.per.week        1.00134
Age                Economic.Activity            1.02312
Occupation         Industry                     1.03802
                   Approximated.Social.Grade    1.08140
Length: 136, dtype: float64

In [None]:
df.to_csv('baseline.csv')

In [15]:
# Occupation-Sex example
printmsrmts = []
cl = ('Occupation', 'Sex')
x = data.project(cl).datavector()
y = x + np.random.normal(loc=0, scale=sigma, size=x.shape)
z = synth.project(cl).datavector()
printmsrmts.append( (x, y, z) )
    
df = pd.DataFrame(np.concatenate(printmsrmts))
df.columns = ['Senior Official, Male', 'Senior Official, Female', 'Professional, Male', 'Professional, Female', 
              'Technical, Male', 'Technical, Female', 'Administrative, Male', 'Administrative, Female', 
              'Skilled Trades, Male', 'Skilled Trades, Female', 'Caring, Male', 'Caring, Female',
              'Sales, Male', 'Sales, Female', 'Machine, Male', 'Machine, Female',
              'Elementary, Male', 'Elementary, Female', 'Unemployed, Male', 'Unemployed, Female']
df.loc[len(df.index)] =  abs(df.loc[0]-df.loc[2])
df.index = ['Original Data', 'Noisy Data', 'Synthetic Data', 'Abs Error']

display(df.T)

Unnamed: 0,Original Data,Noisy Data,Synthetic Data,Abs Error
"Senior Official, Male",4490.0,4493.507098,3393.0,1097.0
"Senior Official, Female",2419.0,2405.010955,3502.0,1083.0
"Professional, Male",5361.0,5357.694919,5616.0,255.0
"Professional, Female",5961.0,5964.680101,5698.0,263.0
"Technical, Male",4494.0,4500.225138,3869.0,625.0
"Technical, Female",3347.0,3357.063833,3968.0,621.0
"Administrative, Male",1849.0,1848.865463,4583.0,2734.0
"Administrative, Female",7470.0,7461.34266,4746.0,2724.0
"Skilled Trades, Male",7147.0,7141.090972,4223.0,2924.0
"Skilled Trades, Female",1259.0,1254.988929,4189.0,2930.0
