# Results for Generalization using KL Divergence

In [1]:
import os

import numpy as np
import pandas as pd
from scipy.stats import entropy

In [2]:
#Get probability distribution files for weak generalization - Standard set transformer
wg_pt_path1 = './probability_output_files/Weak_Generalization/64_16_128_3_0.05/'
wg_pt_output_files1 = [file for file in os.listdir(wg_pt_path1) if file.startswith('output_file_') and file.endswith('.csv')]

#Get probability distribution files for strong generalization - Standard set transformer
sg_pt_path1 = './probability_output_files/Strong_Generalization/64_16_128_3_0.05/'
sg_pt_output_files1 = [file for file in os.listdir(sg_pt_path1) if file.startswith('output_file_') and file.endswith('.csv')]

#Get probability distribution files for weak generalization - contrastive pre-trained model
wg_pt_path2 = './probability_output_files/Weak_Generalization/masked_encoder_64_16_128_3_0.05/'
wg_pt_output_files2 = [file for file in os.listdir(wg_pt_path2) if file.startswith('output_file_') and file.endswith('.csv')]

#Get probability distribution files for strong generalization - contrastive pre-trained model
sg_pt_path2 = './probability_output_files/Strong_Generalization/masked_encoder_64_16_128_3_0.05/'
sg_pt_output_files2 = [file for file in os.listdir(sg_pt_path2) if file.startswith('output_file_') and file.endswith('.csv')]

In [3]:
def concat(path, output_files):
    df_dict = {}
    for i, file in enumerate(output_files):
        df_dict[f'df_{i+1}'] = pd.read_csv(path+file)
        df_dict[f'df_{i+1}'].columns = df_dict[f'df_{i+1}'].columns.astype(int)
        
    return df_dict
    
wg_pt_df_dict1 = concat(wg_pt_path1, wg_pt_output_files1)
sg_pt_df_dict1 = concat(sg_pt_path1, sg_pt_output_files1)
wg_pt_df_dict2 = concat(wg_pt_path2, wg_pt_output_files2)
sg_pt_df_dict2 = concat(sg_pt_path2, sg_pt_output_files2)

In [4]:
# Create a 3D matrix from the DataFrames
wg_pt_data_matrix1 = np.array([df1.values for df1 in wg_pt_df_dict1.values()])
sg_pt_data_matrix1 = np.array([df1.values for df1 in sg_pt_df_dict1.values()])
wg_pt_data_matrix2 = np.array([df1.values for df1 in wg_pt_df_dict2.values()])
sg_pt_data_matrix2 = np.array([df1.values for df1 in sg_pt_df_dict2.values()])

In [5]:
print(wg_pt_data_matrix1.shape)
print(sg_pt_data_matrix1.shape)
print(wg_pt_data_matrix2.shape)
print(sg_pt_data_matrix2.shape)

(10, 75, 75)
(10, 75, 75)
(10, 75, 75)
(10, 75, 75)


In [6]:
# Calculate the mean along the first axis (axis=0)
# To combine all individual run
wg_pt_combined_matrix1 = np.mean(wg_pt_data_matrix1, axis=0)
sg_pt_combined_matrix1 = np.mean(sg_pt_data_matrix1, axis=0)
wg_pt_combined_matrix2 = np.mean(wg_pt_data_matrix2, axis=0)
sg_pt_combined_matrix2 = np.mean(sg_pt_data_matrix2, axis=0)

In [7]:
# replacing diagonal values with small number for normalization
np.fill_diagonal(wg_pt_combined_matrix1, 1e-9)
np.fill_diagonal(wg_pt_combined_matrix2, 1e-9)

In [8]:
# Get sum for normalization
wg_pt_matrix_sum1 = pd.DataFrame(wg_pt_combined_matrix1).sum().sum()
print(wg_pt_matrix_sum1)
sg_pt_matrix_sum1 = pd.DataFrame(sg_pt_combined_matrix1).sum().sum()
print(sg_pt_matrix_sum1)

wg_pt_matrix_sum2 = pd.DataFrame(wg_pt_combined_matrix2).sum().sum()
print(wg_pt_matrix_sum2)
sg_pt_matrix_sum2 = pd.DataFrame(sg_pt_combined_matrix2).sum().sum()
print(sg_pt_matrix_sum2)


19.994048784534293
75.00000055265839
3.655458825502211
75.00000047316911


In [9]:
# Matrix Normalization
wg_pt_mean_matrix_partial1 = wg_pt_combined_matrix1/wg_pt_matrix_sum1
sg_pt_mean_matrix_partial1 = sg_pt_combined_matrix1/sg_pt_matrix_sum1

wg_pt_mean_matrix_partial2 = wg_pt_combined_matrix2/wg_pt_matrix_sum2
sg_pt_mean_matrix_partial2 = sg_pt_combined_matrix2/sg_pt_matrix_sum2

In [10]:
# Flatten for KL Divergence
wg_pt_mean_matrix_partial_flatten1 = wg_pt_mean_matrix_partial1.flatten()
sg_pt_mean_matrix_partial_flatten1 = sg_pt_mean_matrix_partial1.flatten()

wg_pt_mean_matrix_partial_flatten2 = wg_pt_mean_matrix_partial2.flatten()
sg_pt_mean_matrix_partial_flatten2 = sg_pt_mean_matrix_partial2.flatten()

In [11]:
# Make sure there is no 0, as KL divergence will be NAN
wg_pt_mean_matrix_partial_flatten1[wg_pt_mean_matrix_partial_flatten1 < 1e-15] = 1e-9
sg_pt_mean_matrix_partial_flatten1[sg_pt_mean_matrix_partial_flatten1 < 1e-15] = 1e-9

wg_pt_mean_matrix_partial_flatten2[wg_pt_mean_matrix_partial_flatten2 < 1e-15] = 1e-9
sg_pt_mean_matrix_partial_flatten2[sg_pt_mean_matrix_partial_flatten2 < 1e-15] = 1e-9

In [12]:
# Calculate KL divergence
def kl_divergence(p, q, epsilon=1e-9):
    return np.sum(p * np.log(p / q))

In [13]:
# Calculate KL divergence - Standard Set Transformer
kl_divergence_value1 = np.sum([kl_divergence(p_row, q_row) for p_row, q_row in zip(wg_pt_mean_matrix_partial_flatten1, sg_pt_mean_matrix_partial_flatten1)])
print("KL Divergence:", kl_divergence_value1)

# Calculate KL divergence - Contrastive Pre-trained Model
kl_divergence_value2 = np.sum([kl_divergence(p_row, q_row) for p_row, q_row in zip(wg_pt_mean_matrix_partial_flatten2, sg_pt_mean_matrix_partial_flatten2)])
print("KL Divergence:", kl_divergence_value2)

KL Divergence: 6.60602694038697
KL Divergence: 6.254128400859809


In [14]:
# KL Divergence as suggested by the chatGPT usin gentropy
from scipy.stats import entropy

# Calculate KL divergence between the two distributions
kl_divergence_array1 = entropy(wg_pt_mean_matrix_partial_flatten1, sg_pt_mean_matrix_partial_flatten1, base=2)

kl_divergence_value1 = np.mean(kl_divergence_array1)  # or np.sum(kl_divergence_array)
print("Overall KL Divergence:", kl_divergence_value1)

kl_divergence_array2 = entropy(wg_pt_mean_matrix_partial_flatten2, sg_pt_mean_matrix_partial_flatten2, base=2)

kl_divergence_value2 = np.mean(kl_divergence_array2)  # or np.sum(kl_divergence_array)
print("Overall KL Divergence:", kl_divergence_value2)

Overall KL Divergence: 9.53048244104579
Overall KL Divergence: 9.022794066617676
