Commit
Add fix to noisy distribution of attributes dataframe creation to stop process from running out of memory with large datasets
Alex Swann committed Apr 22, 2020
1 parent 0a5f13f commit 1abe702
Showing 1 changed file with 20 additions and 2 deletions.
22 changes: 20 additions & 2 deletions DataSynthesizer/lib/PrivBayes.py
@@ -1,6 +1,6 @@
 import random
 import warnings
-from itertools import combinations, product
+from itertools import combinations, product, islice, chain
 from math import log, ceil
 from multiprocessing.pool import Pool
 
@@ -202,7 +202,25 @@ def get_noisy_distribution_of_attributes(attributes, encoded_dataset, epsilon=0.
     stats = data.groupby(attributes).sum()
 
     iterables = [range(int(encoded_dataset[attr].max()) + 1) for attr in attributes]
-    full_space = DataFrame(columns=attributes, data=list(product(*iterables)))
+    products = product(*iterables)
+
+    def grouper_it(iterable, n):
+        while True:
+            chunk_it = islice(iterable, n)
+            try:
+                first_el = next(chunk_it)
+            except StopIteration:
+                return
+            yield chain((first_el,), chunk_it)
+
+    full_space = None
+    for item in grouper_it(products, 1000000):
+        if full_space is None:
+            full_space = DataFrame(columns=attributes, data=list(item))
+        else:
+            data_frame_append = DataFrame(columns=attributes, data=list(item))
+            full_space = full_space.append(data_frame_append)
+
     stats.reset_index(inplace=True)
     stats = merge(full_space, stats, how='left')
     stats.fillna(0, inplace=True)
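
For reference outside the diff, here is the chunking helper on its own, with a usage sketch. It matches the committed grouper_it except for one defensive tweak: wrapping the argument in iter(), so the helper also chunks plain sequences (the commit always passes it an itertools.product iterator, for which this makes no difference). The domains example below is hypothetical, not from the repository.

from itertools import chain, islice, product

def grouper_it(iterable, n):
    """Lazily yield successive chunks of up to n items from iterable."""
    it = iter(iterable)  # no-op for iterators such as product(...)
    while True:
        chunk_it = islice(it, n)
        try:
            first_el = next(chunk_it)  # peek one item to detect exhaustion
        except StopIteration:
            return  # end the generator cleanly (PEP 479)
        yield chain((first_el,), chunk_it)  # put the peeked item back in front

# Usage sketch: a cross-product of three 10-value domains (1,000 rows)
# arrives in chunks of 300 rather than as one giant list.
domains = [range(10)] * 3
for chunk in grouper_it(product(*domains), 300):
    rows = list(chunk)   # only this chunk is materialised in memory
    print(len(rows))     # 300, 300, 300, 100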
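A note on the accumulation loop: rebuilding full_space with repeated DataFrame.append copies everything accumulated so far on every call (quadratic in the number of chunks), and append was later deprecated and removed in pandas 2.0. Below is a sketch of an alternative that concatenates once at the end; it assumes pandas is imported as pd and reuses grouper_it from the sketch above, and build_full_space is a hypothetical name, not code from this repository.

from itertools import product
import pandas as pd

def build_full_space(attributes, iterables, chunk_size=1000000):
    # Hypothetical variant of the commit's loop: build one DataFrame per
    # chunk, then concatenate exactly once.
    chunks = (pd.DataFrame(data=list(chunk), columns=attributes)
              for chunk in grouper_it(product(*iterables), chunk_size))
    # pd.concat copies the data a single time, unlike repeated .append,
    # which re-copies the accumulated frame on every call.
    return pd.concat(chunks, ignore_index=True)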
