## Evaluation Metrics
This notebook contains the evaluation measures used in the paper "Evaluation of Synthetic Electronic Health Records: A Systematic Review and Experimental Assessment".

In [1]:
#libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from scipy.spatial import distance
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# Seed NumPy's global random number generator so repeated runs are reproducible
np.random.seed(0)

### Evaluation Measures

### Fidelity

In [2]:
## FIDELITY
def dws(real, syn, col):
    """
    Dimension-wise statistics (DWS) plot for a discrete variable.

    The relative frequency of every category of ``col`` is computed in both
    datasets and aligned on the union of the observed categories (a category
    missing from one dataset gets frequency 0). Two scatter panels are shown:
    real vs. real (reference, all points on the diagonal) and real vs.
    synthetic. The dashed red line marks perfect agreement.

    real: real dataset
    syn: synthetic dataset
    col: variable name

    Returns nothing; the figure is displayed with ``plt.show()``.
    """
    freq_real = real[col].value_counts(normalize=True).sort_index()
    freq_syn = syn[col].value_counts(normalize=True).sort_index()

    # Align both frequency tables on the same, sorted set of categories.
    categories = sorted(set(freq_real.index).union(freq_syn.index))
    freq_real = freq_real.reindex(categories, fill_value=0)
    freq_syn = freq_syn.reindex(categories, fill_value=0)

    fig, axes = plt.subplots(1, 2, figsize=(16, 8))

    # (axis, y-values, y-label, title) for each panel; x is always the real data.
    panels = [
        (axes[0], freq_real, 'Real', f'DWS for {col} (Real vs. Real)'),
        (axes[1], freq_syn, 'Synthetic', f'DWS for {col} (Real vs. Synthetic)'),
    ]
    for ax, y_freq, y_name, title in panels:
        ax.scatter(freq_real, y_freq, c='b', marker='o')
        ax.set_xlabel('Real')
        ax.set_ylabel(y_name)
        ax.set_title(title)
        ax.plot([0, 1], [0, 1], c='r', linestyle='--')

    plt.show()
    
    
def cosine_sim(real_df, syn_df, cols):
    """
    Cosine comparison of one variable between the real and synthetic data.

    real_df: real dataset
    syn_df: synthetic dataset
    cols: variable name used to index both datasets; the selection is passed
          straight to ``scipy.spatial.distance.cosine``, which works on 1-D
          vectors, so this should select a single column

    Returns the value of ``scipy.spatial.distance.cosine`` for the two
    selections.

    NOTE(review): scipy's ``cosine`` is the cosine *distance*
    (1 - cosine similarity), so 0 means identical direction -- confirm that
    callers interpret the result accordingly despite the function name.
    """
    real_vec = real_df[cols]
    syn_vec = syn_df[cols]
    return distance.cosine(real_vec, syn_vec)

def jaccard_sim(real_df, syn_df, cols):
    """
    Jaccard comparison of one variable between the real and synthetic data.

    real_df: real dataset
    syn_df: synthetic dataset
    cols: variable name used to index both datasets; the selection is passed
          straight to ``scipy.spatial.distance.jaccard``, which works on 1-D
          (boolean) vectors, so this should select a single column

    Returns the value of ``scipy.spatial.distance.jaccard`` for the two
    selections.

    NOTE(review): scipy's ``jaccard`` is the Jaccard *dissimilarity*
    (0 means identical) -- confirm that callers interpret the result
    accordingly despite the function name.
    """
    real_vec = real_df[cols]
    syn_vec = syn_df[cols]
    return distance.jaccard(real_vec, syn_vec)

def generate_bins(num_bins, real_df, syn_df, col):
    """
    Bin a variable of the real data and apply the same bins to the synthetic data.

    The real column is histogrammed into ``num_bins`` bins; the resulting bin
    edges are then reused for the synthetic column so that both histograms
    are directly comparable. Each histogram is shown in its own figure.

    num_bins: number of required bins
    real_df: real dataset
    syn_df: synthetic dataset
    col: variable name

    Returns a tuple ``(real, syn)`` of dicts, each with the keys
    ``'n'`` (density-normalised bin heights) and ``'bins'`` (bin edges).
    The edges are identical in both dicts.
    """
    # Histogram of the real data defines the bin edges.
    n_real, bins_real, _ = plt.hist(real_df[col], num_bins,
                                    density=True,
                                    color='green',
                                    alpha=0.7)
    plt.title(f'Real data; {col},Bins:{num_bins}')
    plt.show()

    # Reuse the real bin edges; synthetic values outside the real range
    # fall outside every bin and are therefore not counted.
    n_syn, bins_syn, _ = plt.hist(syn_df[col],
                                  bins=bins_real,
                                  density=True,
                                  color='red',
                                  alpha=0.7)
    plt.title(f'Syn data: {col},Bins:{num_bins}')
    plt.show()

    real = {'n': n_real, 'bins': bins_real}
    syn = {'n': n_syn, 'bins': bins_syn}
    return real, syn
    
    

### Utility

In [3]:
## Utility    
def tstr(real, syn, pred_col):
    """
    Train on Synthetic, Test on Real (TSTR).

    A random forest is trained on 80% of the synthetic data (fixed split,
    ``random_state=42``) and evaluated on the complete real dataset.
    All feature columns (every column except ``pred_col``) are min-max
    scaled, each dataset with a scaler fitted on its own values.

    real: real dataset
    syn: synthetic dataset
    pred_col: variable name (target)

    Returns the F1 score of the synthetic-trained model on the real data
    (the score is also printed). ``f1_score`` is called with its default
    averaging, so a binary target is expected.
    """
    # Work on copies so the caller's dataframes are not rescaled in place.
    real = real.copy()
    syn = syn.copy()

    cols_scale = [col for col in real.columns if col != pred_col]

    scaler_real, scaler_syn = MinMaxScaler(), MinMaxScaler()
    real[cols_scale] = scaler_real.fit_transform(real[cols_scale])
    # Scale the synthetic features from the synthetic data itself; using the
    # real features here would replace the synthetic inputs with real ones.
    syn[cols_scale] = scaler_syn.fit_transform(syn[cols_scale])

    y_real = real[pred_col]
    X_real = real.drop(pred_col, axis=1)
    y_synthetic = syn[pred_col]
    X_synthetic = syn.drop(pred_col, axis=1)

    # Only the training part of the synthetic split is used; testing is
    # done on the real data.
    X_train_syn, _, y_train_syn, _ = train_test_split(
        X_synthetic, y_synthetic, test_size=0.2, random_state=42)

    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train_syn, y_train_syn)

    y_pred_real = rf_model.predict(X_real)
    f1_real = f1_score(y_real, y_pred_real)
    print(f1_real)
    return f1_real


def _plot_conditional(ax, df, col1, col2, source_name):
    """Draw, on ``ax``, the distribution of ``col1`` for every value of ``col2`` in ``df``."""
    for val in set(df[col2]):
        conditional_data = [v1 for v1, v2 in zip(df[col1], df[col2]) if v2 == val]
        counts = Counter(conditional_data)
        total = len(conditional_data)
        xs = list(counts.keys())
        ys = [count / total for count in counts.values()]
        # Semi-transparent bars so the overlapping groups stay visible.
        ax.bar(xs, ys, alpha=0.5, label=f'{val}')

    ax.set_xlabel(f'{col1}')
    ax.set_ylabel('conditional probability')
    ax.set_title(f'CCD of {col1} conditioned on {col2} ({source_name} data)')
    ax.legend(loc='lower right')


def ccd(real_df, syn_df, col1, col2, save_path="ccd.png"):
    """
    Conditional distribution of one variable conditioned on another.

    For every distinct value of ``col2`` the relative frequencies of
    ``col1`` are plotted as bars, once for the real data (left panel) and
    once for the synthetic data (right panel).

    real_df: real dataset
    syn_df: synthetic dataset
    col1: variable 1 (the variable whose distribution is shown)
    col2: variable 2 (the conditioning variable)
    save_path: file the figure is written to (default "ccd.png")

    Returns nothing; the figure is saved to ``save_path``.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    _plot_conditional(axes[0], real_df, col1, col2, 'real')
    _plot_conditional(axes[1], syn_df, col1, col2, 'synthetic')

    plt.savefig(save_path)

    


### Privacy

In [4]:
##Privacy
def dbscan(reduced_real, reduced_syn, eps, min_samp):
    """
    Outlier detection with DBSCAN on real and synthetic data.

    DBSCAN is fitted separately on each dataset; points labelled ``-1``
    (noise) are treated as outliers. The first two dimensions of each
    dataset are plotted side by side, clustered points coloured by cluster
    label and outliers marked with red crosses.

    reduced_real: real dataset reduced with dimensionality reduction
                  (2-D array-like with at least two columns)
    reduced_syn: synthetic dataset reduced with dimensionality reduction
                 (2-D array-like with at least two columns)
    eps: DBSCAN neighbourhood radius (``eps``)
    min_samp: DBSCAN ``min_samples``

    Returns nothing; the figure is displayed with ``plt.show()``.
    """
    # Plain arrays so that boolean-mask and [:, i] indexing work for any
    # array-like input (e.g. a DataFrame).
    reduced_real = np.asarray(reduced_real)
    reduced_syn = np.asarray(reduced_syn)

    # DBSCAN clustering on reduced data, using the caller-supplied settings.
    clusterer = DBSCAN(eps=eps, min_samples=min_samp)
    labels_real = clusterer.fit_predict(reduced_real)
    # fit_predict refits from scratch, so the synthetic clustering is
    # independent of the real one.
    labels_syn = clusterer.fit_predict(reduced_syn)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    panels = [
        (ax1, reduced_real, labels_real, 'Static Real EHRs'),
        (ax2, reduced_syn, labels_syn, 'Static Synthetic EHRs'),
    ]
    for ax, points, labels, title in panels:
        is_outlier = labels == -1
        normal_points = points[~is_outlier]
        outliers = points[is_outlier]

        ax.scatter(normal_points[:, 0], normal_points[:, 1],
                   c=labels[~is_outlier], cmap='rainbow', label='Normal Points')
        ax.scatter(outliers[:, 0], outliers[:, 1],
                   c='red', marker='x', label='Outliers')
        ax.set_xlabel('X-axis')
        ax.set_ylabel('Y-axis')
        ax.set_title(title)
        ax.legend(loc='lower right')

    plt.show()