## Comparing distributions

In [109]:
import pandas as pd
import numpy as np
from itertools import product
from scipy.special import rel_entr
numeric_variable_nums = dict([('boston', 12), ('house',8),('sim_1', 0),('sim_2',0), ('sim_1_tiny',0), ('sim_2_tiny',0)])

In [110]:
# generate complete data's conditional distributions
def generate_cond(dataset, mr, size, sample_id, is_vae = False):
    if is_vae:
        complete_data_path = '../vaeac/samples/'+ dataset + '/complete_' + str(mr) + '_' + str(size) + '/sample_' + str(sample_id) + '.csv'
    else:
        complete_data_path = './samples/' + dataset + '/complete_' + str(mr) + '_' + str(size) + '/sample_' + str(sample_id) + '.csv'
    data = pd.read_csv(complete_data_path, header=None)

    # divide cat/num type
    num_index = list(range(data.shape[1] - numeric_variable_nums[dataset], data.shape[1]))
    cat_index = list(range(0, data.shape[1] - numeric_variable_nums[dataset]))

    # get all possible levels' combination for categorical variable
    all_levels = [np.unique(data.iloc[:,i]).tolist() for i in cat_index]
    all_levels_comb = list(product(*all_levels[:-1]))

    # calculate conditional distributions for complete data
    cond_dist_complete = dict.fromkeys(all_levels_comb, None)
    for index, item in data.iterrows():
        cond = (int(item[0]), int(item[1]), int(item[2]))
        if cond_dist_complete[cond] == None:
            cond_dist_complete[cond] = [0 for x in range(len(all_levels[-1]))]
        cond_dist_complete[cond][int(item[3])] += 1

    for key in cond_dist_complete.keys():
        denom = sum(cond_dist_complete[key])
        cond_dist_complete[key] = [round(x / denom, 3) for x in cond_dist_complete[key]]
    
    pdata = data.copy()
    pdata[3] = np.random.permutation(pdata[3])

    return all_levels, all_levels_comb, cond_dist_complete, pdata

In [111]:
# calculate kl result for permutation data
def perm_kl_result(all_levels, all_levels_comb, cond_dist_complete, perm_data):
    cond_dist_imputed = dict.fromkeys(all_levels_comb, None)
    for index, item in perm_data.iterrows():
            cond = (item[0], item[1], item[2])
            if cond_dist_imputed[cond] == None:
                cond_dist_imputed[cond] = [0 for x in range(len(all_levels[-1]))]
            cond_dist_imputed[cond][int(item[3])] += 1

    for key in cond_dist_imputed.keys():
        denom = sum(cond_dist_imputed[key])
        cond_dist_imputed[key] = [round(x / denom, 3) for x in cond_dist_imputed[key]]

    # output comparing KL divergence
    comparison_dict = dict.fromkeys(all_levels_comb, None)
    for key in comparison_dict.keys():
        if cond_dist_imputed[key] == None:
            continue
        comparison_dict[key] = round(sum(rel_entr(cond_dist_complete[key], cond_dist_imputed[key])),6)
    print(list(comparison_dict.values()))
    average_kl = np.average(list(comparison_dict.values()))
    return average_kl, comparison_dict

In [126]:
# output KL divergence for each pair of conditional distribution in one sample
def kl_comparison(method, imputed_data_folder, all_levels, all_levels_comb, cond_dist_complete, sample_id, impute_num):
    # calculate conditional distributions from imputed datasets
    cond_dist_imputed = dict.fromkeys(all_levels_comb, None)
    for i in range(impute_num):
        current_imputed_dir = imputed_data_folder + 'imputed_' + str(sample_id) + '_' + str(i) + '.csv'
        imputed_data = pd.read_csv(current_imputed_dir, header=None)
        for index, item in imputed_data.iterrows():
            if (method == 'cart' or method == 'vaeac'):
                item = [int(item[0] - 1), int(item[1] - 1), int(item[2] - 1), int(item[3] - 1)]
            cond = (int(item[0]), int(item[1]), int(item[2]))
            
            if cond_dist_imputed[cond] == None:
                cond_dist_imputed[cond] = [0 for x in range(len(all_levels[-1]))]
            cond_dist_imputed[cond][int(item[3])] += 1

    for key in cond_dist_imputed.keys():
        if cond_dist_imputed[key] == None:
            continue
        denom = sum(cond_dist_imputed[key])
        cond_dist_imputed[key] = [round(x / denom, 3) for x in cond_dist_imputed[key]]

    # output comparing KL divergence
    comparison_dict = dict.fromkeys(all_levels_comb, None)
    for key in comparison_dict.keys():
        if cond_dist_imputed[key] == None:
            continue
        comparison_dict[key] = round(sum(rel_entr(cond_dist_complete[key], cond_dist_imputed[key])),6)
    print(list(comparison_dict.values()))
    average_kl = np.average(list(comparison_dict.values()))
    return average_kl, comparison_dict


In [65]:
# calculate average result [complete || imputed]
def average_kl_comparison(dataset, mr, size, sample_num, impute_num, method_list):
    metric_avr_kl = pd.DataFrame(columns=['method','sample_id','avr_kl'])
    for sample_id in range(sample_num):
        all_levels, all_levels_comb, cond_dist_complete = generate_cond(dataset, mr, size, sample_id)
        for method in method_list:
            imputed_data_folder = './results/' + dataset + '/MCAR_' + str(mr) + '_' + str(size) + '/' + method + '/'
            average_kl, kl_dict = kl_comparison(method, imputed_data_folder,all_levels, all_levels_comb, cond_dist_complete, sample_id, impute_num)

## sim_1_tiny

In [113]:
# complete data
# readin datasets (complete dataset)
dataset = 'sim_1_tiny'
mr = 0.3
size = 5000
sample_id = 0
impute_num = 10
all_levels, all_levels_comb, cond_dist_complete, perm_data = generate_cond(dataset, mr, size, sample_id)

In [114]:
# rand result
perm_kl_result(all_levels, all_levels_comb, cond_dist_complete, perm_data)

[0.391457, 0.136594, 0.034667, 0.284934, 0.016709, 0.273354, 0.316254, 0.073562, 0.142621, 0.233405, 0.025106, 0.316026, 0.219032, 0.050958, 0.40536, 0.064692, 0.08999, 0.523802]


(0.19991794444444444,
 {(0.0, 0.0, 0.0): 0.391457,
  (0.0, 0.0, 1.0): 0.136594,
  (0.0, 0.0, 2.0): 0.034667,
  (0.0, 1.0, 0.0): 0.284934,
  (0.0, 1.0, 1.0): 0.016709,
  (0.0, 1.0, 2.0): 0.273354,
  (1.0, 0.0, 0.0): 0.316254,
  (1.0, 0.0, 1.0): 0.073562,
  (1.0, 0.0, 2.0): 0.142621,
  (1.0, 1.0, 0.0): 0.233405,
  (1.0, 1.0, 1.0): 0.025106,
  (1.0, 1.0, 2.0): 0.316026,
  (2.0, 0.0, 0.0): 0.219032,
  (2.0, 0.0, 1.0): 0.050958,
  (2.0, 0.0, 2.0): 0.40536,
  (2.0, 1.0, 0.0): 0.064692,
  (2.0, 1.0, 1.0): 0.08999,
  (2.0, 1.0, 2.0): 0.523802})

In [115]:
method = 'gain'
imputed_data_folder = './results/' + dataset + '/MCAR_' + str(mr) + '_' + str(size) + '/' + method + '/'
average_kl, comparison_dict = kl_comparison(method, imputed_data_folder,all_levels, all_levels_comb, cond_dist_complete, sample_id, impute_num)
print(average_kl)
comparison_dict

[0.039814, 0.072842, 0.072016, 0.036914, 0.167206, 0.057063, 0.102783, 0.112805, 0.087603, 0.151329, 0.079316, 0.111291, 0.181049, 0.126689, 0.051071, 0.083868, 0.086296, 0.122726]
0.09681561111111109


{(0.0, 0.0, 0.0): 0.039814,
 (0.0, 0.0, 1.0): 0.072842,
 (0.0, 0.0, 2.0): 0.072016,
 (0.0, 1.0, 0.0): 0.036914,
 (0.0, 1.0, 1.0): 0.167206,
 (0.0, 1.0, 2.0): 0.057063,
 (1.0, 0.0, 0.0): 0.102783,
 (1.0, 0.0, 1.0): 0.112805,
 (1.0, 0.0, 2.0): 0.087603,
 (1.0, 1.0, 0.0): 0.151329,
 (1.0, 1.0, 1.0): 0.079316,
 (1.0, 1.0, 2.0): 0.111291,
 (2.0, 0.0, 0.0): 0.181049,
 (2.0, 0.0, 1.0): 0.126689,
 (2.0, 0.0, 2.0): 0.051071,
 (2.0, 1.0, 0.0): 0.083868,
 (2.0, 1.0, 1.0): 0.086296,
 (2.0, 1.0, 2.0): 0.122726}

In [106]:
method = 'cart'
imputed_data_folder = './results/' + dataset + '/MCAR_' + str(mr) + '_' + str(size) + '/' + method + '/'
kl_comparison(method, imputed_data_folder,all_levels, all_levels_comb, cond_dist_complete, sample_id, impute_num)

[0.003231, 0.000728, 0.000128, 0.002845, 0.002234, 0.006296, 0.000271, 0.000966, 0.001838, 0.000858, 0.005284, 0.002473, 0.009529, 0.003479, -0.000121, 0.002401, 0.000168, 0.003401]


(0.0025560555555555558,
 {(0.0, 0.0, 0.0): 0.003231,
  (0.0, 0.0, 1.0): 0.000728,
  (0.0, 0.0, 2.0): 0.000128,
  (0.0, 1.0, 0.0): 0.002845,
  (0.0, 1.0, 1.0): 0.002234,
  (0.0, 1.0, 2.0): 0.006296,
  (1.0, 0.0, 0.0): 0.000271,
  (1.0, 0.0, 1.0): 0.000966,
  (1.0, 0.0, 2.0): 0.001838,
  (1.0, 1.0, 0.0): 0.000858,
  (1.0, 1.0, 1.0): 0.005284,
  (1.0, 1.0, 2.0): 0.002473,
  (2.0, 0.0, 0.0): 0.009529,
  (2.0, 0.0, 1.0): 0.003479,
  (2.0, 0.0, 2.0): -0.000121,
  (2.0, 1.0, 0.0): 0.002401,
  (2.0, 1.0, 1.0): 0.000168,
  (2.0, 1.0, 2.0): 0.003401})

In [128]:
all_levels_vae, all_levels_comb_vae, cond_dist_complete_vae,_ = generate_cond(dataset, mr, size, sample_id, True)

method = 'vaeac'
imputed_data_folder = './results/' + dataset + '/MCAR_' + str(mr) + '_' + str(size) + '/' + method + '/'
kl_comparison(method, imputed_data_folder,all_levels_vae, all_levels_comb_vae, cond_dist_complete_vae, sample_id, impute_num)

KeyError: (1, -1, 1)

In [136]:
path = 'D:/sjx/ISU/projects/DataImputation/MissingData_DL/results/sim_1_tiny/MCAR_0.3_5000/vaeac/imputed_0_0.csv'
data = pd.read_csv(path, header = None)
data.describe()

Unnamed: 0,0,1,2,3
count,5000.0,5000.0,5000.0,5000.0
mean,3.0118,1.494,1.13,1.0398
std,0.817187,0.500014,0.901811,1.159347
min,2.0,1.0,0.0,0.0
25%,2.0,1.0,0.0,0.0
50%,3.0,1.0,1.0,1.0
75%,4.0,2.0,2.0,2.0
max,4.0,2.0,4.0,3.0
