### Background 

This Jupyter notebook documents metrics related to Congruence discussed in the Zamzmi et al (2025) (Table 3). Congruence metrics include:
* Structural Similarity Index Measure (SSIM)
* Peak Signal-to-Noise Ratio (PSNR)
* Jensen-Shannon Divergence (JSD)
* Earth Mover’s Distance (EMD)
* Cosine Similarity

Zamzmi G, Subbaswamy A, Sizikova E, Margerrison E, Delfino JG, Badano A. Scorecard for synthetic medical data evaluation. Commun Eng. 2025 Jul 21;4(1):130. doi: 10.1038/s44172-025-00450-1. PMID: 40691520; PMCID: PMC12280076.

### Packages and Data-Loading

In [1]:
import os
import numpy as np
import pandas as pd

FEATURE_DIR = 'outputs/features/'
STATS_DIR = 'outputs/stats/'
os.makedirs (STATS_DIR, exist_ok=True)

In [2]:
# Define groups of synthetic and real datasets
datasets = {'synthetic': ['Mammo_medigan', 'HuggingFace'],
            'real': ['MIAS', 'VinDr', 'DDSM', 'InBreast']}

# Load features for each dataset
dataset_features = {}
for _, dataset_names in datasets.items():
    for dataset_name in dataset_names:
        # If the feature csv file already exists, pull them out
        feature_csv = FEATURE_DIR + f'/{dataset_name}_features.csv'
        if os.path.exists (feature_csv):
            dataset_df = pd.read_csv(feature_csv)
            dataset_features[dataset_name] = dataset_df  

# Print summary information 
for dataset_name, features_df in dataset_features.items():
    print ("+-------------------------------------------------")
    print(f"Extracted features for {dataset_name}:")
    print(features_df.head())  

### Metrics Functions

In [3]:
from scipy.spatial.distance import (
    cosine as sk_cosine_distance,
    jensenshannon
)
from scipy.stats import wasserstein_distance
from skimage.metrics import structural_similarity as ssim
from skimage.metrics import peak_signal_noise_ratio as psnr

def normalize_features (features):
    features = features[np.isfinite(features)]
    numerator = features - np.min(features, axis=0)
    denominator = np.max(features, axis=0) - np.min(features, axis=0) + 1e-8
    return numerator / denominator

def random_sampling (array, size, replace=False, seed=None):
    np.random.default_rng(seed)
    idx = np.random.choice(array.shape[0], size, replace=replace)
    return array[idx]

def get_cosine_similarity (r, s):
    """
    inputs
    ------
    r (1D array): 1 x n_features; each element is mean from all real images
    s (1D array): 1 x n_features; each element is mean from all synthetic images

    outputs
    -------
    cosine similarity (float): value of the metric
    """
    return float (1 - sk_cosine_distance(r, s))

def get_jensen_shannon_divergence (r, s):
    """
    inputs
    ------
    r (1D array): 1 x n_features; each element is mean from all real images
    s (1D array): 1 x n_features; each element is mean from all synthetic images

    outputs
    -------
    jensen_shannon_divergence (float): value of the metric
    """                 
    rn = r/r.sum() if r.sum() else r
    sn = s/s.sum() if s.sum() else s
    return float (jensenshannon (rn, sn, base=2.0)**2)

def get_earth_movers_distance (r, s): 
    """
    inputs
    ------
    r (1D array): 1 x n_features; each element is mean from all real images
    s (1D array): 1 x n_features; each element is mean from all synthetic images

    outputs
    -------
    earth movers distance (float): value of the metric
    """        
    return float (wasserstein_distance (r,s))

def compute_congruence (r, s, sampling=False, replace=False,
                        seed=42, is_image_feature=False):
    """
    For congruence, there must be real-synthetic pair.

    inputs
    ------
    r (1D array): 1 x n_features; each element is mean from all real images
    s (1D array): 1 x n_features; each element is mean from all synthetic images
    is_image_feature (bool): True if the feature is related to image pixel intensities

    outputs
    -------
    metric values (dict): values of all evaluation metrics
    """

    r = normalize_features (r)
    s = normalize_features (s)

    if sampling:
        size = min(r.shape[0], s.shape[0])
        r = random_sampling (r, size, replace=replace, seed=seed)
        s = random_sampling (s, size, replace=replace, seed=seed) 

    ssim_value, psnr_value = None, None
    if is_image_feature:
        ssim_value = ssim (r, s, data_range=1.0)
        psnr_value = psnr (r, s)
    
    return {'cosine_similarity': get_cosine_similarity (r, s),
            'jensen_shannon_divergence': get_jensen_shannon_divergence (r, s),
            'earth_movers_distance': get_earth_movers_distance (r, s),
            'ssim':ssim_value, 'psnr':psnr_value}

### Compute Metrics

In [6]:
FEATURE_COLUMNS = dataset_features['VinDr'].columns
IMAGE_FEATURES = ['mean', 'std', 'skewness', 'kurtosis', 'median']

# Compute congruence metrics for each feature in each synthetic-real dataset pair
congruence_metrics = {}
for real in datasets['real']:
    for synthetic in datasets['synthetic']:
        congruence_metrics[(synthetic, real)] = {}
        for feature in FEATURE_COLUMNS:
            congruence_metrics[(synthetic, real)][feature] = {}
            r = dataset_features[real][feature].values
            s = dataset_features[synthetic][feature].values
            is_image_feature = feature in IMAGE_FEATURES
            congruence_metrics[(synthetic, real)][feature] = compute_congruence(r, s, sampling=True, replace=False,
                                                                                is_image_feature=is_image_feature)


KeyError: 'VinDr'

In [None]:
# Save a summary DataFrame for the congruence metrics
summary_list = []

for (synthetic, real), metricdict in congruence_metrics.items():
    for feature, values in metricdict.items():
        summary_list.append({
            'Synthetic Dataset': synthetic,
            'Real Dataset': real,
            'Feature': feature,
            'EMD': values['jensen_shannon_divergence'],
            'Cosine Similarity': values['cosine_similarity'],
            'JSD': values['jensen_shannon_divergence'],
            'SSIM': values['ssim'],
            'PSNR': values['psnr']
        })

summary_df = pd.DataFrame(summary_list)

# Save the summary to a CSV file
summary_output_path = os.path.join(STATS_DIR, 'congruence_metrics_summary.csv')
summary_df.to_csv(summary_output_path, index=False)

# Print the summary table
print("Summary of Congruence Metrics by Synthetic-Real Dataset Pair:")
print(summary_df)


Summary of Congruence Metrics by Synthetic-Real Dataset Pair:
   Synthetic Dataset Real Dataset             Feature       EMD  \
0      Mammo_medigan         MIAS                mean  0.101217   
1      Mammo_medigan         MIAS                 std  0.121064   
2      Mammo_medigan         MIAS            skewness  0.101180   
3      Mammo_medigan         MIAS            kurtosis  0.174726   
4      Mammo_medigan         MIAS              median       NaN   
..               ...          ...                 ...       ...   
83       HuggingFace     InBreast  avg_edge_intensity  0.067638   
84       HuggingFace     InBreast     low_freq_energy  0.039387   
85       HuggingFace     InBreast    high_freq_energy  0.052083   
86       HuggingFace     InBreast             betti_0  0.047744   
87       HuggingFace     InBreast             betti_1  0.042275   

    Cosine Similarity       JSD      SSIM       PSNR  
0            0.789133  0.101217  0.022822  10.180968  
1            0.758030  

### Example Usage

In [5]:
# List of real dataset names
synthetic = 'MSYNTH'
real = 'DDSM'

# Sample usage: Calculate metrics between matched real and synthetic datasets
congruence = {}
for feature in FEATURE_COLUMNS:
    real = dataset_features[real][feature]
    synthetic = dataset_features[synthetic][feature]
    is_image_feature = feature in IMAGE_FEATURES
    congruence[feature] = compute_congruence (real, synthetic, sampling=True,
                                              replace=False,
                                              is_image_feature=is_image_feature)

    print ("+-----------------------------------------------")
    print (f"| {feature}:")
    for metric, value in congruence[feature].items():
        try:
            value = f"{value:.3f}"
        except: 
            value = 'n/a'
        print (f"|      - {metric}: {value}")

NameError: name 'FEATURE_COLUMNS' is not defined