In [1]:
import mst as mst
import mst_fair_greedy as fairMST
import mst_fair_optimal as fairMSTOpt
import numpy as np
from mbi import FactoredInference, Dataset, Domain
import scipy
from disjoint_set import DisjointSet
import networkx as nx
import itertools
from cdp2adp import cdp_rho
from scipy.special import logsumexp
import argparse
import heapq
import timeit

In [8]:

# --- Experiment configuration (Adult files) ---
# Input data and its domain description.
dataset = "data/adult.csv"
domain = "data/adult-domain.json"

# Privacy parameters handed to the MST synthesizers.
epsilon, delta = 0.1, 1e-9

# Workload used for the marginal-error evaluation.
degree = 2
num_marginals = None
max_cells = 100000

# Destination CSVs for the synthetic tables of each method.
save = "data/results.csv"
save_fair = "data/results_fair.csv"
save_fair_opt = "data/results_fair_opt.csv"

# Attribute roles passed to the fair synthesizers.
outcome = ['income>50K']
#admissible = ['age','workclass','fnlwgt','education-num','marital-status','occupation','relationship','capital-gain','capital-loss','hours-per-week']
admissible = [
    'workclass', 'fnlwgt', 'education-num', 'occupation',
    'capital-gain', 'capital-loss', 'hours-per-week',
]
protected = ['race', 'sex', 'native-country']

data = Dataset.load(dataset, domain)

In [None]:
# greedy_fair_mst
# Fit the greedy fair MST synthesizer, optionally save the synthetic table,
# and print the average marginal error over the workload.
data = Dataset.load(dataset, domain)

# Workload: every `degree`-way attribute combination whose marginal has at
# most `max_cells` cells.
workload = list(itertools.combinations(data.domain, degree))
workload = [cl for cl in workload if data.domain.size(cl) <= max_cells]
if num_marginals is not None:
    # `prng` was never defined in this notebook; sample with NumPy's global RNG.
    chosen = np.random.choice(len(workload), num_marginals, replace=False)
    workload = [workload[i] for i in chosen]

#synth = mst.MST(data, epsilon, delta)
synth = fairMST.MST(data, epsilon, delta, outcome, admissible)

# Guard on the path that is actually written (the check used to test `save`).
if save_fair is not None:
    synth.df.to_csv(save_fair, index=False)

errors = []
for proj in workload:
    X = data.project(proj).datavector()
    Y = synth.project(proj).datavector()
    # Half the L1 distance between the two normalized marginals.
    e = 0.5*np.linalg.norm(X/X.sum() - Y/Y.sum(), 1)
    errors.append(e)
print('Average Error: ', np.mean(errors))

In [None]:
#fair_mst_optimal
# Fit the optimal fair MST synthesizer, optionally save the synthetic table,
# and print the average marginal error over the workload.
data = Dataset.load(dataset, domain)

# Workload: every `degree`-way attribute combination whose marginal has at
# most `max_cells` cells.
workload = list(itertools.combinations(data.domain, degree))
workload = [cl for cl in workload if data.domain.size(cl) <= max_cells]
if num_marginals is not None:
    # `prng` was never defined in this notebook; sample with NumPy's global RNG.
    chosen = np.random.choice(len(workload), num_marginals, replace=False)
    workload = [workload[i] for i in chosen]


synth = fairMSTOpt.MST(data, epsilon, delta, outcome, admissible, protected)

# Guard on the path that is actually written (the check used to test `save`).
if save_fair_opt is not None:
    synth.df.to_csv(save_fair_opt, index=False)

errors = []
for proj in workload:
    X = data.project(proj).datavector()
    Y = synth.project(proj).datavector()
    # Half the L1 distance between the two normalized marginals.
    e = 0.5*np.linalg.norm(X/X.sum() - Y/Y.sum(), 1)
    errors.append(e)
print('Average Error: ', np.mean(errors))

In [None]:
#original MST
# Fit the original MST synthesizer, optionally save the synthetic table,
# and print the average marginal error over the workload.
data = Dataset.load(dataset, domain)

# Workload: every `degree`-way attribute combination whose marginal has at
# most `max_cells` cells.
workload = list(itertools.combinations(data.domain, degree))
workload = [cl for cl in workload if data.domain.size(cl) <= max_cells]
if num_marginals is not None:
    # `prng` was never defined in this notebook; sample with NumPy's global RNG.
    chosen = np.random.choice(len(workload), num_marginals, replace=False)
    workload = [workload[i] for i in chosen]

synth = mst.MST(data, epsilon, delta)
#synth = fairMST.MST(data, epsilon, delta, outcome, admissible)

if save is not None:
    synth.df.to_csv(save, index=False)

errors = []
for proj in workload:
    X = data.project(proj).datavector()
    Y = synth.project(proj).datavector()
    # Half the L1 distance between the two normalized marginals.
    e = 0.5*np.linalg.norm(X/X.sum() - Y/Y.sum(), 1)
    errors.append(e)
print('Average Error: ', np.mean(errors))

## Testing over 10 runs to compare error

In [None]:
# Reload the data and rebuild the evaluation workload: every `degree`-way
# attribute combination whose marginal has at most `max_cells` cells.
data = Dataset.load(dataset, domain)
workload = list(itertools.combinations(data.domain, degree))
workload = [cl for cl in workload if data.domain.size(cl) <= max_cells]
if num_marginals is not None:
    # `prng` was never defined in this notebook; sample with NumPy's global RNG.
    chosen = np.random.choice(len(workload), num_marginals, replace=False)
    workload = [workload[i] for i in chosen]

In [None]:

# Time 10 runs of the greedy fair MST synthesizer at the configured epsilon.
fair_greedy_adult_times = []
for i in range(10):
    start_time = timeit.default_timer()
    synth = fairMST.MST(data, epsilon, delta, outcome, admissible)
    #synth.df.to_csv("data/fair_greedy/results_greedy_" + str(i) + ".csv", index=False)
    elapsed = timeit.default_timer() - start_time
    # Record into this cell's own list; the original appended to
    # `fair_opt_adult_times`, which is not defined yet on a fresh kernel.
    fair_greedy_adult_times.append(elapsed)

In [None]:
# Wall-clock seconds for 10 optimal fair MST fits at the configured epsilon.
fair_opt_adult_times = []
for i in range(10):
    tic = timeit.default_timer()
    synth = fairMSTOpt.MST(data, epsilon, delta, outcome, admissible, protected)
    # Only the MST() call sits inside the timed region.
    fair_opt_adult_times.append(timeit.default_timer() - tic)

In [None]:
# Time 10 runs of the original MST synthesizer at the configured epsilon.
mst_adult_times = []
for i in range(10):
    start_time = timeit.default_timer()
    synth = mst.MST(data, epsilon, delta)
    #synth.df.to_csv("data/original/results_original_" + str(i) + ".csv", index=False)
    elapsed = timeit.default_timer() - start_time
    # Record into this cell's own list; the original appended to
    # `fair_opt_adult_times`, mixing baseline timings into the fair-optimal ones.
    mst_adult_times.append(elapsed)

In [None]:
# Ten optimal fair MST runs at epsilon=1, one CSV per run.
# (`fair_opt_errors` is initialised here but never appended to in this cell.)
fair_opt_errors = []
for i in range(10):
    out_path = "data/fair_opt/eps=1/results_opt_" + str(i) + ".csv"
    synth = fairMSTOpt.MST(data, 1, delta, outcome, admissible, protected)
    synth.df.to_csv(out_path, index=False)

In [None]:
# Ten optimal fair MST runs at epsilon=10, one CSV per run.
# (`fair_opt_errors` is initialised here but never appended to in this cell.)
fair_opt_errors = []
for i in range(10):
    out_path = "data/fair_opt/eps=10/results_opt_" + str(i) + ".csv"
    synth = fairMSTOpt.MST(data, 10, delta, outcome, admissible, protected)
    synth.df.to_csv(out_path, index=False)

In [None]:
# Ten optimal fair MST runs at epsilon=0.1, one CSV per run.
# (`fair_opt_errors` is initialised here but never appended to in this cell.)
fair_opt_errors = []
for i in range(10):
    out_path = "data/fair_opt/eps=0.1/results_opt_" + str(i) + ".csv"
    synth = fairMSTOpt.MST(data, 0.1, delta, outcome, admissible, protected)
    synth.df.to_csv(out_path, index=False)

# Additional Experiments

## Super high epsilon

In [3]:
# Reload the data and rebuild the evaluation workload: every `degree`-way
# attribute combination whose marginal has at most `max_cells` cells.
data = Dataset.load(dataset, domain)
workload = list(itertools.combinations(data.domain, degree))
workload = [cl for cl in workload if data.domain.size(cl) <= max_cells]
if num_marginals is not None:
    # `prng` was never defined in this notebook; sample with NumPy's global RNG.
    chosen = np.random.choice(len(workload), num_marginals, replace=False)
    workload = [workload[i] for i in chosen]

In [7]:
# For every epsilon, generate 10 synthetic tables per method
# (greedy fair MST, original MST, optimal fair MST) and write each to CSV.
eps = [1000, 100, 10, 1, 0.1]
for e in eps:
    print(e)
    eps_tag = str(e)
    for i in range(10):
        run_tag = str(i)

        synth = fairMST.MST(data, e, delta, outcome, admissible)
        synth.df.to_csv("data/fair_greedy/eps=" + eps_tag + "/results_greedy_" + run_tag + ".csv", index=False)

        synth = mst.MST(data, e, delta)
        synth.df.to_csv("data/original/eps=" + eps_tag + "/results_original_" + run_tag + ".csv", index=False)

        synth = fairMSTOpt.MST(data, e, delta, outcome, admissible, protected)
        synth.df.to_csv("data/fair_opt/eps=" + eps_tag + "/results_opt_" + run_tag + ".csv", index=False)


1000
100
10


KeyboardInterrupt: 

In [4]:
#todo when you get back
# Same three-method sweep, restricted to epsilon = 1 and 0.1.
eps = [1, 0.1]
for e in eps:
    print(e)
    eps_tag = str(e)
    for i in range(10):
        run_tag = str(i)

        synth = fairMST.MST(data, e, delta, outcome, admissible)
        synth.df.to_csv("data/fair_greedy/eps=" + eps_tag + "/results_greedy_" + run_tag + ".csv", index=False)

        synth = mst.MST(data, e, delta)
        synth.df.to_csv("data/original/eps=" + eps_tag + "/results_original_" + run_tag + ".csv", index=False)

        synth = fairMSTOpt.MST(data, e, delta, outcome, admissible, protected)
        synth.df.to_csv("data/fair_opt/eps=" + eps_tag + "/results_opt_" + run_tag + ".csv", index=False)


1
0.1


In [None]:
# Ten greedy fair MST runs at epsilon=1000000, one CSV per run.
for i in range(10):
    print(i)
    out_path = "data/fair_greedy/eps=1000000/results_greedy_" + str(i) + ".csv"
    synth = fairMST.MST(data, 1000000, delta, outcome, admissible)
    synth.df.to_csv(out_path, index=False)

In [None]:
# Ten optimal fair MST runs at epsilon=1000000, one CSV per run.
for i in range(10):
    print(i)
    out_path = "data/fair_opt/eps=1000000/results_opt_" + str(i) + ".csv"
    synth = fairMSTOpt.MST(data, 1000000, delta, outcome, admissible, protected)
    synth.df.to_csv(out_path, index=False)

In [None]:
# Ten original MST runs at epsilon=1000000, one CSV per run.
for i in range(10):
    print(i)
    out_path = "data/original/eps=1000000/results_original_" + str(i) + ".csv"
    synth = mst.MST(data, 1000000, delta)
    synth.df.to_csv(out_path, index=False)

## Smaller admissible set

In [None]:
# Repeat the three-method sweep with a two-attribute admissible set.
admissible = ['education-num', 'occupation']
protected = ['race', 'sex', 'native-country']
data = Dataset.load(dataset, domain)

eps = [1000000, 10, 1, 0.1]
for e in eps:
    print(e)
    eps_tag = str(e)
    for i in range(10):
        run_tag = str(i)

        synth = fairMST.MST(data, e, delta, outcome, admissible)
        synth.df.to_csv("data/fair_greedy/small_admis/eps=" + eps_tag + "/results_greedy_" + run_tag + ".csv", index=False)

        synth = mst.MST(data, e, delta)
        synth.df.to_csv("data/original/small_admis/eps=" + eps_tag + "/results_original_" + run_tag + ".csv", index=False)

        synth = fairMSTOpt.MST(data, e, delta, outcome, admissible, protected)
        synth.df.to_csv("data/fair_opt/small_admis/eps=" + eps_tag + "/results_opt_" + run_tag + ".csv", index=False)


In [None]:
# Small admissible set, optimal fair MST only (the greedy and original runs
# of the full sweep are disabled in this cell).
admissible = ['education-num', 'occupation']
protected = ['race', 'sex', 'native-country']
data = Dataset.load(dataset, domain)

eps = [10, 1, 0.1]
for e in eps:
    print(e)
    eps_tag = str(e)
    for i in range(10):
        out_path = "data/fair_opt/small_admis/eps=" + eps_tag + "/results_opt_" + str(i) + ".csv"
        synth = fairMSTOpt.MST(data, e, delta, outcome, admissible, protected)
        synth.df.to_csv(out_path, index=False)


# Compas Dataset

In [3]:
# --- Configuration for the Compas files ---
delta = 1e-9

# Input data and its domain description.
dataset = "data/cleaned_Compas.csv"
domain = "data/cleanded_Compas-domain.json"

# Attribute roles passed to the fair synthesizers.
outcome = ['Two_yr_Recidivism']
admissible = ['Misdemeanor', 'Number_of_Priors']
protected = ['Sex', 'Race']

data = Dataset.load(dataset, domain)

In [5]:

# Optimal fair MST only: 10 synthetic tables per epsilon, one CSV each.
#eps = [1000000,10,1,0.1]
eps = [1000, 100, 10, 1, 0.1]

for e in eps:
    print(e)
    eps_tag = str(e)
    for i in range(10):
        out_path = "data/fair_opt/Compas/eps=" + eps_tag + "/results_opt_" + str(i) + ".csv"
        synth = fairMSTOpt.MST(data, e, delta, outcome, admissible, protected)
        synth.df.to_csv(out_path, index=False)


1000


KeyboardInterrupt: 

In [6]:

# For every epsilon, generate 10 synthetic tables per method
# (greedy fair MST, original MST, optimal fair MST) under the Compas paths.
eps = [1000, 100, 10, 1, 0.1]
#eps = [10,1,0.1]

for e in eps:
    print(e)
    eps_tag = str(e)
    for i in range(10):
        run_tag = str(i)

        synth = fairMST.MST(data, e, delta, outcome, admissible)
        synth.df.to_csv("data/fair_greedy/Compas/eps=" + eps_tag + "/results_greedy_" + run_tag + ".csv", index=False)

        synth = mst.MST(data, e, delta)
        synth.df.to_csv("data/original/Compas/eps=" + eps_tag + "/results_original_" + run_tag + ".csv", index=False)

        synth = fairMSTOpt.MST(data, e, delta, outcome, admissible, protected)
        synth.df.to_csv("data/fair_opt/Compas/eps=" + eps_tag + "/results_opt_" + run_tag + ".csv", index=False)


1000
100
10
1
0.1


# For timing

In [3]:

# --- Configuration for the timing runs (Adult files) ---
# Input data and its domain description.
dataset = "data/adult.csv"
domain = "data/adult-domain.json"

# Privacy parameters handed to the MST synthesizers.
epsilon, delta = 0.1, 1e-9

# Workload settings.
degree = 2
num_marginals = None
max_cells = 10000

# Destination CSVs for the synthetic tables of each method.
save = "data/results.csv"
save_fair = "data/results_fair.csv"
save_fair_opt = "data/results_fair_opt.csv"

# Attribute roles passed to the fair synthesizers.
outcome = ['income>50K']
#admissible = ['age','workclass','fnlwgt','education-num','marital-status','occupation','relationship','capital-gain','capital-loss','hours-per-week']
admissible = [
    'workclass', 'fnlwgt', 'education-num', 'occupation',
    'capital-gain', 'capital-loss', 'hours-per-week',
]
protected = ['race', 'sex', 'native-country']

data = Dataset.load(dataset, domain)

In [4]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=10.
fair_greedy_adult_times10 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 10, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times10.append(timeit.default_timer() - tic)
print(fair_greedy_adult_times10)

0
1
2
3
4
5
6
7
8
9
[212.39918466499967, 197.8623674800001, 191.32921687299995, 201.1442108350002, 193.23384540899997, 211.17635349600005, 183.79868317499995, 190.95482658300034, 174.89360575799992, 217.21736015500028]


In [5]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=1.
fair_greedy_adult_times1 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 1, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times1.append(timeit.default_timer() - tic)
print(fair_greedy_adult_times1)

0
1
2
3
4
5
6
7
8
9
[213.80042834899996, 195.72985860100016, 163.98932678999972, 173.62428774599994, 167.13612769199972, 174.79143019999992, 170.36452583400023, 178.19464891799998, 184.05615884300005, 166.54191918800007]


In [6]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=0.1.
fair_greedy_adult_times01 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 0.1, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times01.append(timeit.default_timer() - tic)
print(fair_greedy_adult_times01)

0
1
2
3
4
5
6
7
8
9
[230.13836387900028, 227.0165357779997, 245.85672224399968, 241.758930512, 250.3604582730004, 223.8691247860006, 225.15802639499998, 237.54128124, 246.6268733299994, 249.30580612000085]


In [7]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=100.
fair_greedy_adult_times100 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 100, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times100.append(timeit.default_timer() - tic)
print(fair_greedy_adult_times100)

0
1
2
3
4
5
6
7
8
9
[306.4823575900009, 164.5033842089997, 159.49412701599977, 157.80584821400043, 155.36938416500016, 154.47040704000028, 154.3644473530003, 154.479138228, 196.55185694099964, 180.52453599099863]


In [8]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=1000.
fair_greedy_adult_times1000 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 1000, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times1000.append(timeit.default_timer() - tic)
print(fair_greedy_adult_times1000)

0
1
2
3
4
5
6
7
8
9
[183.50408650899953, 175.822150598, 166.1314558539998, 171.47190003300057, 164.1450147530013, 167.235414286999, 171.44589917600024, 163.32023173800007, 143.2588310400006, 142.38663801099938]


In [4]:
# Wall-clock seconds for 10 optimal fair MST fits on `data` with epsilon=10.
fair_opt_adult_times10 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMSTOpt.MST(data, 10, delta, outcome, admissible, protected)
    # Only the MST() call sits inside the timed region.
    fair_opt_adult_times10.append(timeit.default_timer() - tic)
print(fair_opt_adult_times10)

0
1
2
3
4
5
6
7
8
9
[364.392184947, 350.69011776300005, 351.431450637, 350.02349901499997, 348.5643997719999, 354.495522831, 349.24960375599994, 353.52176839899994, 348.85698521400036, 353.8694545070002]


In [5]:
# Wall-clock seconds for 10 optimal fair MST fits on `data` with epsilon=1.
fair_opt_adult_times1 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMSTOpt.MST(data, 1, delta, outcome, admissible, protected)
    # Only the MST() call sits inside the timed region.
    fair_opt_adult_times1.append(timeit.default_timer() - tic)
print(fair_opt_adult_times1)

0
1
2
3
4
5
6
7
8
9
[641.1338809669996, 8687.562606071999, 726.8615125209999, 735.2726047439992, 507.31379485099933, 1146.1574729270014, 926.5620283650005, 321.7229800940004, 539.990770280001, 702.3433852269991]


In [4]:
# Wall-clock seconds for 10 optimal fair MST fits on `data` with epsilon=0.1.
fair_opt_adult_times01 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMSTOpt.MST(data, 0.1, delta, outcome, admissible, protected)
    # Only the MST() call sits inside the timed region.
    fair_opt_adult_times01.append(timeit.default_timer() - tic)
print(fair_opt_adult_times01)

0
1
2
3
4
5
6
7
8
9
[339.450682611, 585.963610517, 276.90973247199986, 1298.294222519, 2056.727351571, 324.88087384400023, 1547.8208502610005, 389.5506675300003, 6285.104720495, 337.08974024300005]


In [4]:
# Wall-clock seconds for 10 optimal fair MST fits on `data` with epsilon=1000.
fair_opt_adult_times1000 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMSTOpt.MST(data, 1000, delta, outcome, admissible, protected)
    # Only the MST() call sits inside the timed region.
    fair_opt_adult_times1000.append(timeit.default_timer() - tic)
print(fair_opt_adult_times1000)

0
1
2
3


KeyboardInterrupt: 

In [5]:
# Show whatever timings were collected in `fair_opt_adult_times1000` so far
# (the recorded run of the loop above ended with a KeyboardInterrupt).
print(fair_opt_adult_times1000)

[6637.577593938, 6813.157968849, 6797.356252063]


In [3]:
# Wall-clock seconds for 10 optimal fair MST fits on `data` with epsilon=100.
fair_opt_adult_times100 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMSTOpt.MST(data, 100, delta, outcome, admissible, protected)
    # Only the MST() call sits inside the timed region.
    fair_opt_adult_times100.append(timeit.default_timer() - tic)
print(fair_opt_adult_times100)

0
1
2
3
4
5
6
7
8
9
[7425.523065003, 6122.765789476, 448.4830102529995, 7815.752163006, 443.2091261429996, 417.83751367700097, 442.09474955100086, 418.3847146629996, 432.30730597500224, 454.39109104300223]


In [9]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=10.
mst_adult_times10 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 10, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times10.append(timeit.default_timer() - tic)
print(mst_adult_times10)

0
1
2
3
4
5
6
7
8
9
[145.80017850900003, 139.69550369099852, 149.21022252799958, 161.15337991700108, 171.58620065300056, 153.44104480499846, 174.44673466200038, 181.84304023799996, 161.9469233689997, 195.97609018899857]


In [10]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=1.
mst_adult_times1 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 1, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times1.append(timeit.default_timer() - tic)
print(mst_adult_times1)

0
1
2
3
4
5
6
7
8
9
[188.2113847090004, 198.02416013000038, 205.9857278089985, 207.9048749859994, 212.3861125809999, 212.52139032100058, 211.42588652000086, 162.1246642089991, 153.05837820399938, 172.73556160800035]


In [11]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=0.1.
mst_adult_times01 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 0.1, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times01.append(timeit.default_timer() - tic)
print(mst_adult_times01)

0
1
2
3
4
5
6
7
8
9
[147.7736806720004, 135.047024427, 141.93845520099967, 139.58362263900017, 122.94265923700004, 130.26025700600076, 122.37749179200182, 123.74661494500106, 137.2774541530016, 135.90771430699897]


In [12]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=100.
mst_adult_times100 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 100, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times100.append(timeit.default_timer() - tic)
print(mst_adult_times100)

0
1
2
3
4
5
6
7
8
9
[136.79602197699933, 140.61315133000244, 146.17913709300046, 141.9014242589983, 142.00703877400156, 159.46221452600003, 152.9059304600014, 144.18336160200124, 140.46153624499857, 143.26188585899945]


In [13]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=1000.
mst_adult_times1000 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 1000, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times1000.append(timeit.default_timer() - tic)
print(mst_adult_times1000)

0
1
2
3
4
5
6
7
8
9
[156.92372260100092, 160.14496154399967, 164.9611748520001, 149.8228728819995, 140.1212448869992, 156.5109765519992, 156.8525612980011, 134.39008572200328, 134.67402931900142, 171.3021954750002]


## Smaller admissible set

In [None]:
# Switch to the two-attribute admissible set and reload the data from the
# currently configured `dataset` / `domain` paths.
admissible = ['education-num', 'occupation']
protected = ['race', 'sex', 'native-country']

data = Dataset.load(dataset, domain)

In [None]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=10.
fair_greedy_adult_times_small10 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 10, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times_small10.append(timeit.default_timer() - tic)

In [None]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=1.
fair_greedy_adult_times_small1 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 1, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times_small1.append(timeit.default_timer() - tic)

In [None]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=0.1.
fair_greedy_adult_times_small01 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 0.1, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times_small01.append(timeit.default_timer() - tic)

In [None]:
# Wall-clock seconds for 10 optimal fair MST fits on `data` with epsilon=10.
fair_opt_adult_times_small10 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMSTOpt.MST(data, 10, delta, outcome, admissible, protected)
    # Only the MST() call sits inside the timed region.
    fair_opt_adult_times_small10.append(timeit.default_timer() - tic)

In [None]:
# Wall-clock seconds for 10 optimal fair MST fits on `data` with epsilon=1.
fair_opt_adult_times_small1 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMSTOpt.MST(data, 1, delta, outcome, admissible, protected)
    # Only the MST() call sits inside the timed region.
    fair_opt_adult_times_small1.append(timeit.default_timer() - tic)

In [None]:
# Wall-clock seconds for 10 optimal fair MST fits on `data` with epsilon=0.1.
fair_opt_adult_times_small01 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMSTOpt.MST(data, 0.1, delta, outcome, admissible, protected)
    # Only the MST() call sits inside the timed region.
    fair_opt_adult_times_small01.append(timeit.default_timer() - tic)

In [None]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=10.
mst_adult_times_small10 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 10, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times_small10.append(timeit.default_timer() - tic)

In [None]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=1.
mst_adult_times_small1 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 1, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times_small1.append(timeit.default_timer() - tic)

In [None]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=0.1.
mst_adult_times_small01 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 0.1, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times_small01.append(timeit.default_timer() - tic)

## Compas

In [20]:

# --- Configuration for the Compas timing runs ---
# Input data and its domain description.
dataset = "data/cleaned_Compas.csv"
domain = "data/cleanded_Compas-domain.json"

# Attribute roles passed to the fair synthesizers.
outcome = ['Two_yr_Recidivism']
admissible = ['Misdemeanor', 'Number_of_Priors']
protected = ['Sex', 'Race']

data = Dataset.load(dataset, domain)

In [None]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=10.
fair_greedy_adult_times_compas10 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 10, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times_compas10.append(timeit.default_timer() - tic)

In [21]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=1.
fair_greedy_adult_times_compas1 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 1, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times_compas1.append(timeit.default_timer() - tic)

print(fair_greedy_adult_times_compas1)

0
1
2
3
4
5
6
7
8
9
[71.89930995099712, 77.61293670296436, 75.01399767299881, 70.97692023898708, 74.97912407497643, 71.61850200704066, 75.56107703899033, 75.67180527100572, 73.66242839797633, 76.28192502900492]


In [None]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=0.1.
fair_greedy_adult_times_compas01 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 0.1, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times_compas01.append(timeit.default_timer() - tic)

In [None]:
# Time 10 runs of the optimal fair MST synthesizer on `data` with epsilon=10.
# The list was previously named `fair_opt_adult_times_small10`, which
# overwrote the small-admissible-set timings; it now follows the `_compas`
# naming used by the neighbouring cells.
fair_opt_adult_times_compas10 = []
for i in range(10):
    print(str(i))
    start_time = timeit.default_timer()
    synth = fairMSTOpt.MST(data, 10, delta, outcome, admissible, protected)
    #synth.df.to_csv("data/fair_opt/results_opt_" + str(i) + ".csv", index=False)
    elapsed = timeit.default_timer() - start_time
    fair_opt_adult_times_compas10.append(elapsed)

In [None]:
# Wall-clock seconds for 10 optimal fair MST fits on `data` with epsilon=1.
fair_opt_adult_times_compas1 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMSTOpt.MST(data, 1, delta, outcome, admissible, protected)
    # Only the MST() call sits inside the timed region.
    fair_opt_adult_times_compas1.append(timeit.default_timer() - tic)

In [None]:
# Wall-clock seconds for 10 optimal fair MST fits on `data` with epsilon=0.1.
fair_opt_adult_times_compas01 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMSTOpt.MST(data, 0.1, delta, outcome, admissible, protected)
    # Only the MST() call sits inside the timed region.
    fair_opt_adult_times_compas01.append(timeit.default_timer() - tic)

In [None]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=10.
mst_adult_times_compas10 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 10, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times_compas10.append(timeit.default_timer() - tic)

In [22]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=1.
mst_adult_times_compas1 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 1, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times_compas1.append(timeit.default_timer() - tic)
print(mst_adult_times_compas1)

0
1
2
3
4
5
6
7
8
9
[76.3417847030214, 75.81159630499315, 76.06947568600299, 76.27165862702532, 75.95898043300258, 75.91101439803606, 76.60570637404453, 73.47383972804528, 76.67854331899434, 76.78013724205084]


In [None]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=0.1.
mst_adult_times_compas01 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 0.1, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times_compas01.append(timeit.default_timer() - tic)

In [12]:
# Scratch check: draw one Bernoulli(p) sample and print it.
# Import the submodule explicitly; `import scipy` alone is not guaranteed to
# load `scipy.stats` on every SciPy version.
import scipy.stats

p = 0.3
x = scipy.stats.bernoulli.rvs(p, size=1)
print(x[0])

1


# KDD dataset

In [10]:
# --- Configuration for the KDD files ---
delta = 1e-9

# Workload settings.
degree = 2
num_marginals = None
max_cells = 10000

# Input data and its domain description.
dataset = "data/cleaned_KDD.csv"
domain = "data/cleanded_KDD-domain.json"

# Attribute roles passed to the fair synthesizers.
admissible = [
    'age', 'class_worker', 'det_ind_code', 'det_occ_code', 'education',
    'wage_per_hour', 'hs_college', 'marital_stat', 'major_ind_code',
    'major_occ_code', 'hisp_origin', 'union_member', 'unemp_reason',
    'full_or_part_emp', 'capital_gains', 'capital_losses', 'stock_dividends',
    'tax_filer_stat', 'region_prev_res', 'state_prev_res', 'det_hh_fam_stat',
    'det_hh_summ', 'unknown', 'mig_chg_msa', 'mig_chg_reg', 'mig_move_reg',
    'mig_same', 'mig_prev_sunbelt', 'num_emp', 'fam_under_18',
    'country_father', 'country_mother', 'country_self', 'citizenship',
    'own_or_self', 'vet_question', 'vet_benefits', 'weeks_worked', 'year',
]
protected = ['race', 'sex']
outcome = ['income_50k']

data = Dataset.load(dataset, domain)

In [11]:
# For every epsilon, generate 10 synthetic tables with the greedy fair MST
# and the original MST, writing each to CSV under the KDD paths.
eps = [10, 1, 0.1]
for e in eps:
    print(e)
    eps_tag = str(e)
    for i in range(10):
        run_tag = str(i)

        synth = fairMST.MST(data, e, delta, outcome, admissible)
        synth.df.to_csv("data/fair_greedy/KDD/eps=" + eps_tag + "/results_greedy_" + run_tag + ".csv", index=False)

        synth = mst.MST(data, e, delta)
        synth.df.to_csv("data/original/KDD/eps=" + eps_tag + "/results_original_" + run_tag + ".csv", index=False)

        # Progress marker printed after both methods finish for this run.
        print(i)


10
0
1
2
3
4
5
6
7
8
9
1
0
1
2
3
4
5
6
7
8
9
0.1
0
1
2
3
4
5
6
7
8
9


# KDD Timing


In [12]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=10.
fair_greedy_adult_times_KDD10 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 10, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times_KDD10.append(timeit.default_timer() - tic)

print(fair_greedy_adult_times_KDD10)

0
1
2
3
4
5
6
7
8
9
[401.96726351800316, 478.6330233320041, 472.11514911599807, 472.88426063398947, 465.1650229359948, 484.35546018200694, 474.82494554200093, 461.3767029540031, 464.5621049860056, 464.9241725039901]


In [14]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=1.
fair_greedy_adult_times_KDD1 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 1, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times_KDD1.append(timeit.default_timer() - tic)
print(fair_greedy_adult_times_KDD1)

0
1
2
3
4
5
6
7
8
9
[418.1049694559915, 434.814901567006, 440.23468023400346, 456.74184287500975, 445.13064547900285, 451.9480824830098, 453.9571228910063, 444.496403637997, 446.2246601700026, 449.8132188430027]


In [15]:

# Wall-clock seconds for 10 greedy fair MST fits on `data` with epsilon=0.1.
fair_greedy_adult_times_KDD01 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = fairMST.MST(data, 0.1, delta, outcome, admissible)
    # Only the MST() call sits inside the timed region.
    fair_greedy_adult_times_KDD01.append(timeit.default_timer() - tic)
print(fair_greedy_adult_times_KDD01)

0
1
2
3
4
5
6
7
8
9
[564.1048684909765, 524.2504519509966, 527.3878363190161, 548.0244833499892, 514.6405399760115, 539.5298188939923, 547.6553871179931, 566.2478995089768, 510.5129596129991, 516.5536537879962]


In [16]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=10.
mst_adult_times_KDD10 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 10, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times_KDD10.append(timeit.default_timer() - tic)
print(mst_adult_times_KDD10)

0
1
2
3
4
5
6
7
8
9
[406.3444576819893, 421.63963049900485, 417.11325825797394, 425.2309247759986, 422.5558614099864, 421.1016034630011, 418.2692585499899, 431.47393345899764, 433.071168495022, 423.27310167698306]


In [18]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=1.
mst_adult_times_KDD1 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 1, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times_KDD1.append(timeit.default_timer() - tic)
print(mst_adult_times_KDD1)

0
1
2
3
4
5
6
7
8
9
[470.9472916969971, 484.9027877660119, 493.82846953201806, 472.50624610399245, 482.53425256398623, 480.05761338199954, 484.9207532389846, 478.337113918009, 493.2337973669928, 442.34391242900165]


In [19]:
# Wall-clock seconds for 10 original MST fits on `data` with epsilon=0.1.
mst_adult_times_KDD01 = []
for i in range(10):
    print(i)
    tic = timeit.default_timer()
    synth = mst.MST(data, 0.1, delta)
    # Only the MST() call sits inside the timed region.
    mst_adult_times_KDD01.append(timeit.default_timer() - tic)
print(mst_adult_times_KDD01)

0
1
2
3
4
5
6
7
8
9
[432.13935321298777, 449.60822906999965, 556.3908512299822, 537.3422294720076, 491.5021447270119, 520.1693237930012, 495.78481124699465, 490.2819970700075, 534.9495899100148, 531.9331634860137]
