## Comparing distributions

In [1]:
import pandas as pd
import numpy as np
from itertools import product
numeric_variable_nums = dict([('boston', 12), ('house',8),('sim_1', 0),('sim_2',0), ('sim_1_tiny',0), ('sim_2_tiny',0)])

In [2]:
# generate complete data's conditional distributions
def generate_cond(dataset, mr, size, sample_id):
    complete_data_path = './samples/' + dataset + '/complete_' + str(mr) + '_' + str(size) + '/sample_' + str(sample_id) + '.csv'
    data = pd.read_csv(complete_data_path, header=None)

    # divide cat/num type
    num_index = list(range(data.shape[1] - numeric_variable_nums[dataset], data.shape[1]))
    cat_index = list(range(0, data.shape[1] - numeric_variable_nums[dataset]))

    # get all possible levels' combination for categorical variable
    all_levels = [np.unique(data.iloc[:,i]).tolist() for i in cat_index]
    all_levels_comb = list(product(*all_levels[:-1]))

    # calculate conditional distributions for complete data
    cond_dist_complete = dict.fromkeys(all_levels_comb, None)
    for index, item in data.iterrows():
        cond = (item[0], item[1], item[2])
        if cond_dist_complete[cond] == None:
            cond_dist_complete[cond] = [0 for x in range(len(all_levels[-1]))]
        cond_dist_complete[cond][int(item[3])] += 1

    for key in cond_dist_complete.keys():
        denom = sum(cond_dist_complete[key])
        cond_dist_complete[key] = [round(x / denom, 3) for x in cond_dist_complete[key]]

    return all_levels, all_levels_comb, cond_dist_complete

In [3]:
# output KL divergence for each pair of conditional distribution in one sample
def kl_comparison(method, imputed_data_folder, all_levels, all_levels_comb, cond_dist_complete, sample_id, impute_num):
    # calculate conditional distributions from imputed datasets
    cond_dist_imputed = dict.fromkeys(all_levels_comb, None)
    for i in range(impute_num):
        current_imputed_dir = imputed_data_folder + 'imputed_' + str(sample_id) + '_' + str(i) + '.csv'
        imputed_data = pd.read_csv(current_imputed_dir, header=None)
        for index, item in imputed_data.iterrows():
            if method == 'cart':
                cond = (item[0]-1, item[1]-1, item[2]-1)
            else:
                cond = (item[0], item[1], item[2])
            if cond_dist_imputed[cond] == None:
                cond_dist_imputed[cond] = [0 for x in range(len(all_levels[-1]))]
            cond_dist_imputed[cond][int(item[3] - 1)] += 1

    for key in cond_dist_imputed.keys():
        denom = sum(cond_dist_imputed[key])
        cond_dist_imputed[key] = [round(x / denom, 3) for x in cond_dist_imputed[key]]


    # output comparing KL divergence
    from scipy.special import rel_entr
    comparison_dict = dict.fromkeys(all_levels_comb, None)
    for key in comparison_dict.keys():
        comparison_dict[key] = round(sum(rel_entr(cond_dist_complete[key], cond_dist_imputed[key])),6)
    average_kl = np.average(list(comparison_dict.values()))
    return average_kl, comparison_dict


In [None]:
# calculate average result [complete || imputed]
def average_kl_comparison(dataset, mr, size, sample_num, impute_num, method_list):
    metric_avr_kl = pd.DataFrame(columns=['method','sample_id','avr_kl'])
    for sample_id in range(sample_num):
        all_levels, all_levels_comb, cond_dist_complete = generate_cond(dataset, mr, size, sample_id)
        for method in method_list:
            imputed_data_folder = './results/' + dataset + '/MCAR_' + str(mr) + '_' + str(size) + '/' + method + '/'
            average_kl, kl_dict = kl_comparison(method, imputed_data_folder,all_levels, all_levels_comb, cond_dist_complete, sample_id, impute_num)

## sim_1_tiny

In [4]:
# complete data
# readin datasets (complete dataset)
dataset = 'sim_1_tiny'
mr = 0.3
size = 5000
sample_id = 0
impute_num = 10
all_levels, all_levels_comb, cond_dist_complete = generate_cond(dataset, mr, size, sample_id)

In [8]:
method = 'gain'
imputed_data_folder = './results/' + dataset + '/MCAR_' + str(mr) + '_' + str(size) + '/' + method + '/'
average_kl, comparison_dict = kl_comparison(method, imputed_data_folder,all_levels, all_levels_comb, cond_dist_complete, sample_id, impute_num)
print(average_kl)
comparison_dict

0.7193284999999999


{(0.0, 0.0, 0.0): 0.897203,
 (0.0, 0.0, 1.0): 0.359239,
 (0.0, 0.0, 2.0): 0.150884,
 (0.0, 1.0, 0.0): 0.816253,
 (0.0, 1.0, 1.0): 0.131686,
 (0.0, 1.0, 2.0): 0.886153,
 (1.0, 0.0, 0.0): 0.715641,
 (1.0, 0.0, 1.0): 0.174782,
 (1.0, 0.0, 2.0): 0.494988,
 (1.0, 1.0, 0.0): 0.806424,
 (1.0, 1.0, 1.0): 0.249321,
 (1.0, 1.0, 2.0): 1.650961,
 (2.0, 0.0, 0.0): 0.435121,
 (2.0, 0.0, 1.0): 0.264683,
 (2.0, 0.0, 2.0): 1.280886,
 (2.0, 1.0, 0.0): 0.305652,
 (2.0, 1.0, 1.0): 0.475506,
 (2.0, 1.0, 2.0): 2.85253}

In [6]:
method = 'cart'
imputed_data_folder = './results/' + dataset + '/MCAR_' + str(mr) + '_' + str(size) + '/' + method + '/'
kl_comparison(method, imputed_data_folder,all_levels, all_levels_comb, cond_dist_complete, sample_id, impute_num)

(0.002291611111111111,
 {(0.0, 0.0, 0.0): 0.002585,
  (0.0, 0.0, 1.0): 0.001789,
  (0.0, 0.0, 2.0): 0.001293,
  (0.0, 1.0, 0.0): 0.001408,
  (0.0, 1.0, 1.0): 0.001067,
  (0.0, 1.0, 2.0): 0.004128,
  (1.0, 0.0, 0.0): 0.000248,
  (1.0, 0.0, 1.0): -0.000108,
  (1.0, 0.0, 2.0): 0.002078,
  (1.0, 1.0, 0.0): 0.001093,
  (1.0, 1.0, 1.0): 0.006755,
  (1.0, 1.0, 2.0): 0.004271,
  (2.0, 0.0, 0.0): 0.00599,
  (2.0, 0.0, 1.0): 0.002979,
  (2.0, 0.0, 2.0): -0.000246,
  (2.0, 1.0, 0.0): 0.003024,
  (2.0, 1.0, 1.0): 0.00034,
  (2.0, 1.0, 2.0): 0.002555})