### Run MPoM on a dataset

This code block trains and saves an MPoM model, generates synthetic data, and evaluates the quality of that data using the list of metrics provided.

In [None]:
import sys
import os
import pandas as pd
sys.path.append('..')
sys.path.append('synthetic_algorithms_comparison/step2_synthetic_algorithms/MPoM/')
from design import Experiment
from design import Dataset
from SEERData import SEERData
from MixtureProductMultinomials import MixtureProductMultinomials

# --- Configuration ---------------------------------------------------------
# Input CSV and the root folder handed to the experiment (placeholders:
# replace with real locations before running).
data_path = "path_to_data/synthetic.csv"
out_folder = 'path_to_save/'

# Settings forwarded unchanged to the MixtureProductMultinomials constructor.
K = 30
burn_in_steps = 1000
n_gibbs_steps = 10000

# Passed to the experiment as `nb_samples`.
nb_samples = 1000000

# --- Data and method -------------------------------------------------------
dataset = SEERData(data_path=data_path)
dataset.prepare_dataset()

# The experiment takes a list of methods; only MPoM is run here.
methods = [
    MixtureProductMultinomials(
        K=K,
        burn_in_steps=burn_in_steps,
        n_gibbs_steps=n_gibbs_steps,
        name='MPoM',
    )
]

# Names of the metrics used to measure the method's performance.
metrics = [
    'kl_divergence',
    'cross_classification',
    'cca_accuracy',
    'cluster_measure',
    'pairwise_correlation_difference',
    'coverage',
    'membership_disclosure',
    'percentage_revealed',
    'attribute_disclosure',
]

# --- Create the experiment and execute it ----------------------------------
# Folder name: "mpom_seer_<file name up to its first dot>_<K>_<burn-in>_<gibbs>".
dataset_stem = data_path.rsplit("/", 1)[-1].partition(".")[0]
exp_folder = "mpom_seer_" + "_".join(
    str(part) for part in (dataset_stem, K, burn_in_steps, n_gibbs_steps)
)

exp = Experiment(
    out_folder, exp_folder, dataset, methods, metrics,
    nb_gens=1, nb_samples=nb_samples, decodeflag=0,
)
exp.execute()

### Generate datasets using saved method.pkl

This code block only generates synthetic data (of any size) from a trained MPoM model that has been saved as a pkl file; it does not train or evaluate anything.

In [None]:
import os
import types
import shutil
import pickle
import numpy as np
import pandas as pd
from abc import ABCMeta, abstractmethod
import sys
sys.path.append('..')
sys.path.append('synthetic_algorithms_comparison/step2_synthetic_algorithms/MPoM/')
from design import Experiment
from design import Dataset
from MixtureProductMultinomials import MixtureProductMultinomials
from SEERData import SEERData

# --- Locate the saved model ------------------------------------------------
# The folder name is rebuilt from these values with the same recipe as the
# training cell, so they presumably have to match the values used when the
# model was trained; otherwise the pickle below will not be found.
data_path = "path_to_data/synthetic.csv"
K = 30
burn_in_steps = 1000
n_gibbs_steps = 10000

# NOTE(review): the training cell uses a different placeholder for its
# output root ('path_to_save/'); confirm both point to the same location.
output_folder = '../outputs'

# Folder name: "mpom_seer_<file name up to its first dot>_<K>_<burn-in>_<gibbs>".
dataset_stem = data_path.rsplit("/", 1)[-1].partition(".")[0]
exp_folder = "mpom_seer_" + "_".join(
    str(part) for part in (dataset_stem, K, burn_in_steps, n_gibbs_steps)
)

directory = os.path.join(output_folder, exp_folder)
method_directory = os.path.join(directory, 'MPoM')
pickle_name = 'method_{}.pkl'.format('MixtureProductMultinomials')
output_fname = os.path.join(method_directory, pickle_name)

# --- Load the trained method -----------------------------------------------
# NOTE: unpickling can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open(output_fname, 'rb') as handle:
    mpom_method = pickle.load(handle)

# --- Generate and decode samples -------------------------------------------
nb_samples = 1000000  # Number of samples to generate

dataset = SEERData(data_path=data_path)
dataset.prepare_dataset()

generated = mpom_method.generate_samples(nb_samples)
synth_data = dataset.decode_data(generated)

# --- Save synthetic data to a csv file -------------------------------------
# The file name is fixed; rename it by hand if nb_samples is changed.
filename = 'synth_1M.csv'
output_fname = os.path.join(method_directory, filename)
# NOTE(review): assuming synth_data is a pandas DataFrame, to_csv also writes
# the row index as the first column by default; confirm that is intended.
synth_data.to_csv(output_fname)