In [None]:
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance, entropy

def calculate_metrics(real_path, synthetic_path, discrete_columns=None):
    """Compare a real and a synthetic CSV dataset column by column.

    Both files are loaded with ``pd.read_csv``. Columns listed in
    ``discrete_columns`` are compared through their category frequencies;
    every other column of the real dataset is treated as continuous.

    Parameters
    ----------
    real_path : str or path-like
        CSV file holding the real (reference) data.
    synthetic_path : str or path-like
        CSV file holding the synthetic data. It must contain every column
        of the real dataset (a missing column raises ``KeyError``).
    discrete_columns : iterable of column names, optional
        Columns to treat as discrete. When ``None`` or empty, the list is
        obtained from ``auto_detect_discrete_columns(real)``.

    Returns
    -------
    dict
        Three entries, each the mean over the relevant columns (``0`` when
        no column contributed a value):

        * ``"Wasserstein Distance (Continuous)"``
        * ``"Jensen-Shannon Divergence (Discrete)"``
        * ``"Symmetric KL Divergence (Discrete)"`` -- ``0.5 * (KL(p||q) + KL(q||p))``

        Divergences use ``scipy.stats.entropy`` with its default (natural)
        logarithm.

    Notes
    -----
    Missing values are ignored in both kinds of column: ``value_counts``
    drops them for discrete columns, and they are dropped explicitly for
    continuous columns so a single NaN cannot turn the average into NaN.
    """
    real = pd.read_csv(real_path)
    synthetic = pd.read_csv(synthetic_path)

    # Materialise the argument once: this accepts tuples, arrays and pandas
    # Index objects (whose truth value is ambiguous) and avoids sharing a
    # mutable default between calls.
    discrete_columns = list(discrete_columns) if discrete_columns is not None else []
    if not discrete_columns:
        discrete_columns = auto_detect_discrete_columns(real)

    results = {
        "wd_continuous": [],
        "jsd_discrete": [],
        "kl_discrete": []
    }

    discrete_lookup = set(discrete_columns)
    continuous_cols = [col for col in real.columns if col not in discrete_lookup]
    for col in continuous_cols:
        real_values = real[col].dropna().to_numpy()
        syn_values = synthetic[col].dropna().to_numpy()
        # A distance needs at least one observation on each side; a column
        # that is entirely missing in either dataset is skipped.
        if real_values.size == 0 or syn_values.size == 0:
            continue
        results["wd_continuous"].append(wasserstein_distance(real_values, syn_values))

    for col in discrete_columns:
        real_p = real[col].value_counts(normalize=True).sort_index()
        syn_p = synthetic[col].value_counts(normalize=True).sort_index()

        # Align both distributions on the union of observed categories.
        # The small epsilon keeps KL finite when a category is absent from
        # one side; scipy's entropy() renormalises its inputs afterwards.
        all_cats = real_p.index.union(syn_p.index)
        real_p = real_p.reindex(all_cats, fill_value=0).values + 1e-10
        syn_p = syn_p.reindex(all_cats, fill_value=0).values + 1e-10

        m = 0.5 * (real_p + syn_p)
        jsd = 0.5 * (entropy(real_p, m) + entropy(syn_p, m))
        results["jsd_discrete"].append(jsd)

        kl_real_syn = entropy(real_p, syn_p)
        kl_syn_real = entropy(syn_p, real_p)
        results["kl_discrete"].append(0.5 * (kl_real_syn + kl_syn_real))

    # Average over the values that were actually computed so that skipped
    # columns never lead to the mean of an empty list.
    final_metrics = {
        "Wasserstein Distance (Continuous)": np.mean(results["wd_continuous"]) if results["wd_continuous"] else 0,
        "Jensen-Shannon Divergence (Discrete)": np.mean(results["jsd_discrete"]) if results["jsd_discrete"] else 0,
        "Symmetric KL Divergence (Discrete)": np.mean(results["kl_discrete"]) if results["kl_discrete"] else 0
    }

    return final_metrics

def auto_detect_discrete_columns(data, unique_ratio_threshold=0.05, unique_count_threshold=20):
    """Guess which columns of ``data`` hold discrete (categorical) values.

    Missing values are dropped from each column before it is inspected.
    A column is reported as discrete when any of these holds:

    * its dtype is categorical, boolean, object or a string dtype;
    * it is numeric with at most ``unique_count_threshold`` distinct values;
    * it is numeric, the share of distinct values among its non-missing
      rows is below ``unique_ratio_threshold``, and every value is a whole
      number.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are classified.
    unique_ratio_threshold : float, default 0.05
        Upper bound (exclusive) on ``n_unique / n_rows`` for whole-number
        columns that have more than ``unique_count_threshold`` values.
    unique_count_threshold : int, default 20
        Numeric columns with this many distinct values or fewer are always
        discrete.

    Returns
    -------
    list
        Names of the discrete columns, in the order they appear in ``data``.
    """
    types = pd.api.types
    discrete_cols = []
    for col in data.columns:
        col_data = data[col].dropna()
        dtype = col_data.dtype

        # Dtype predicates instead of ``dtype in [object, 'category', bool]``:
        # they also recognise pandas extension dtypes (e.g. "string",
        # "boolean") that would otherwise reach the numeric test and fail.
        if (
            isinstance(dtype, pd.CategoricalDtype)
            or types.is_bool_dtype(dtype)
            or types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
        ):
            discrete_cols.append(col)
            continue

        if not types.is_numeric_dtype(dtype):
            # Neither categorical-like nor numeric (e.g. datetimes).
            continue

        n_unique = col_data.nunique()
        n_rows = len(col_data)
        if n_unique <= unique_count_threshold:
            discrete_cols.append(col)
        elif n_rows and n_unique / n_rows < unique_ratio_threshold:
            # Whole-number test via the remainder rather than an int cast,
            # which can fail or overflow on non-finite / very large floats.
            if types.is_integer_dtype(dtype) or bool((col_data % 1 == 0).all()):
                discrete_cols.append(col)
    return discrete_cols

In [None]:
# Real (reference) dataset: the single uncommented assignment below selects it;
# the commented lines are alternative datasets.
#real_path = "../CTAB-GAN-main/Real_Datasets/Adult3.csv"
#real_path = '..C/CTGAN-main/CTGAN-main/examples/csv/train_clean.csv'
real_path = "../CTAB-GAN-main/Real_Datasets/Credit.csv"
#real_path = "../synthcity-main/tutorials/covertype_preprocessed.csv"

# Synthetic dataset to evaluate against the real one (same convention:
# exactly one uncommented assignment).
fake_path = "../synthcity-main/tutorials/TransCTGAN-Credit_31.csv"
#fake_path = "../synthcity-main/tutorials/OriginalCTGAN-Cover_31.csv"
#fake_path = "../synthcity-main/tutorials/CTGAN-Cover_31.csv"
#fake_path = "../synthcity-main/tutorials/CTABGAN-Credit_31.csv"
#fake_path = "../synthcity-main/tutorials/OriginalCTGAN-Adult_31.csv"
#fake_path = "../synthcity-main/tutorials/TransCTGAN-Titani_1.csv"
#fake_path = "../synthcity-main/tutorials/DDPM-Adult_31.csv"
# Discrete columns are inferred from the real data only.
real_data = pd.read_csv(real_path)
discrete_cols = auto_detect_discrete_columns(real_data)

# calculate_metrics takes file paths and loads both CSVs itself.
metrics = calculate_metrics(real_path, fake_path, discrete_cols)

# Print each averaged metric rounded to four decimal places.
print("Evaluation Metrics:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

Evaluation Metrics:
Wasserstein Distance (Continuous): 0.8977
Jensen-Shannon Divergence (Discrete): 0.0000
Symmetric KL Divergence (Discrete): 0.0000
