### Background 

This Jupyter notebook documents metrics related to Coverage discussed in the Zamzmi et al (2025) (Table 3). Congruence metrics include:
* Variance
* Entropy
* Distance to centroid
* Convex hull volume

Zamzmi G, Subbaswamy A, Sizikova E, Margerrison E, Delfino JG, Badano A. Scorecard for synthetic medical data evaluation. Commun Eng. 2025 Jul 21;4(1):130. doi: 10.1038/s44172-025-00450-1. PMID: 40691520; PMCID: PMC12280076.

### Packages and Data-Loading

In [1]:
import os
import numpy as np
import pandas as pd

FEATURE_DIR = 'outputs/features/'
STATS_DIR = 'outputs/stats/'
os.makedirs (STATS_DIR, exist_ok=True)

In [2]:
# Define groups of synthetic and real datasets
datasets = {'synthetic': ['Mammo_medigan', 'HuggingFace'],
            'real': ['MIAS', 'VinDr', 'DDSM', 'InBreast']}

# Load features for each dataset
dataset_features = {}
for _, dataset_names in datasets.items():
    for dataset_name in dataset_names:
        # If the feature csv file already exists, pull them out
        feature_csv = FEATURE_DIR + f'/{dataset_name}_features.csv'
        if os.path.exists (feature_csv):
            dataset_df = pd.read_csv(feature_csv)
            dataset_features[dataset_name] = dataset_df  

# Print summary information 
for dataset_name, features_df in dataset_features.items():
    print ("+-------------------------------------------------")
    print(f"Extracted features for {dataset_name}:")
    print(features_df.head())  

+-------------------------------------------------
Extracted features for Mammo_medigan:
        mean        std  skewness  kurtosis  median  edge_density  \
0  54.769627  62.823553  0.690815 -0.692438     0.0      0.081314   
1  43.317745  66.921668  1.089029 -0.469780     0.0      0.061913   
2  65.704037  87.302317  0.726333 -1.249385     0.0      0.076012   
3  51.318485  66.329404  0.815744 -0.819561     0.0      0.088535   
4  47.165154  64.650650  1.006616 -0.316069     0.0      0.072632   

   avg_edge_intensity  low_freq_energy  high_freq_energy  betti_0  betti_1  
0          114.065585     17567.931486      16654.733806     5762     9612  
1          137.769686     11690.723656      10417.468763     3616     6303  
2          164.581502     20160.990139      22240.616477     4303     8249  
3          126.371020     14054.379624      14156.888580     4430     7846  
4          116.558666     13089.991773      13015.086145     5154     8842  
+---------------------------------

### Metrics Functions

In [3]:
from scipy.stats import entropy
from scipy.spatial import ConvexHull
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances

def normalize_features (features):
    numerator = features - np.min(features, axis=0)
    denominator = np.max(features, axis=0) - np.min(features, axis=0) + 1e-8
    return numerator / denominator

def get_variance (feature):

    grand_mean = np.mean(feature)
    # Sum of Squares Total (SS_Total)
    ss_total = np.sum((feature - grand_mean)**2)
    # Total degrees of freedom
    n_total = len(feature)
    df_total = n_total - 1
    # Total variance
    return ss_total / df_total

def get_entropy (feature):

    norm_features = feature / np.sum(feature)
    shannon_entropy = entropy(norm_features)  
    return shannon_entropy

def get_distance_to_centroid (feature):

    centroid = np.mean(feature, axis=0)
    distances = pairwise_distances(feature, [centroid])
    return np.mean(distances)

def get_convex_hull_volume (feature):

    pca = PCA(n_components=2)
    reduced_features = pca.fit_transform(feature)
    hull = ConvexHull(reduced_features)
    return hull.volume

def compute_coverage (feature):
    """
    Each dataset has a value for each coverage metric

    inputs
    ------
    feature (2D array): n_samples x n_features

    outputs
    -------
    metric values (dict): values of all evaluation metrics
    """
    
    feature = normalize_features (feature.to_numpy())
    vector = np.mean (feature, axis=0)

    return {'variance': get_variance (feature),
            'entropy': get_entropy (vector),
            'distance_to_centroid': get_distance_to_centroid (feature),
            'convex_hull_volume':get_convex_hull_volume (feature)}

### Compute Metrics

In [4]:
# Treat all handcrafted features as a 2D array.
# Compute coverage metrics for each feature in each dataset 
coverage_metrics = {}
for dataset_name, features_df in dataset_features.items():
        coverage_metrics[dataset_name] = compute_coverage(features_df)

In [5]:
# Save a summary DataFrame for the congruence metrics
summary_list = []

for dataset_name, metricdict in coverage_metrics.items():
    summary_list.append({
        'Dataset': dataset_name,        
        'Entropy': metricdict['entropy'],
        'Distance to Centroid': metricdict['distance_to_centroid'],
        'Convex Hull Volume': metricdict['convex_hull_volume'],
        'Variance': metricdict['variance']
    })

summary_df = pd.DataFrame(summary_list)

# Save the summary to a CSV file
summary_output_path = os.path.join(STATS_DIR, 'coverage_metrics_summary.csv')
summary_df.to_csv(summary_output_path, index=False)

# Print the summary table
print("Summary of Coverage Metrics by Dataset:")
print(summary_df)


Summary of Coverage Metrics by Dataset:
         Dataset   Entropy  Distance to Centroid  Convex Hull Volume  Variance
0  Mammo_medigan  2.256116              0.751953            2.732999  0.981819
1    HuggingFace  2.346368              0.434012            2.143059  0.365485
2           MIAS  2.274536              0.480027            2.361218  0.631394
3          VinDr  2.337731              0.550505            3.058191  0.559795
4           DDSM  2.058409              0.296223            1.365906  0.464890
5       InBreast  2.293621              0.488011            2.621619  0.536168


### Example Usage

In [6]:
dataset_name = 'Mammo_medigan'

# Sample usage: Calculate metrics of a given dataset
coverage = compute_coverage(dataset_features[dataset_name])

print ("+-----------------------------------------------")
print (f"| {dataset_name}:")
for mname, value in coverage.items():
    try:
        value = f"{value:.3f}"
    except: 
        value = 'n/a'
    print (f"|      - {mname}: {value}")

+-----------------------------------------------
| Mammo_medigan:
|      - variance: 0.982
|      - entropy: 2.256
|      - distance_to_centroid: 0.752
|      - convex_hull_volume: 2.733
