In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import re
import math
import pickle
from sklearn.preprocessing import LabelEncoder

In [None]:
def read_metrics(path, method_list, folder_name, n_runs=1000):
    """Load per-run utility-metric CSVs and split them into 'Single' and 'Multiple' rows.

    For every method in ``method_list`` and every run index ``0 .. n_runs-1`` the file
    ``<path><method>/<folder_name>_<i>-utility_metrics.csv`` is read. Each file is
    expected to provide the columns ``Method``, ``Metric``, ``Variable``, ``Run``,
    ``Type`` and ``Value``.

    Per file:
      * the dict-like wrapper ``{'-': ...}`` is stripped from ``Value`` and the column
        is cast to float;
      * the mean of all ``Type == 'Multiple'`` rows is computed per metric and appended
        as an extra ``Type == 'Single'`` row (``Variable='-'``, ``Run=1``);
      * ``cluster_measure`` values are replaced by their natural log;
      * a ``data_name`` column (``<folder_name>_<i>``) is added.

    Parameters
    ----------
    path : str
        Folder prefix that is concatenated directly with the method name, so it must
        end with ``'/'``.
    method_list : list of str
        Sub-folder names; also used to filter the ``Method`` column.
    folder_name : str
        File-name prefix of the per-run CSV files.
    n_runs : int, default 1000
        Number of consecutive run indices (starting at 0) to read per method.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(single_metrics_df, multiple_metrics_df)`` restricted to ``Run == 1`` rows of
        the requested methods. Both are empty frames when nothing was read.

    Any error raised by ``pd.read_csv`` (e.g. a missing run file) propagates.
    """
    print("Utility Metrics Folder Path: "+ path)

    # Collect the per-file pieces and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop is quadratic in the number of files.
    single_frames = []
    multiple_frames = []
    cols = ['Method', 'Metric', 'Variable', 'Run', 'Type', 'Value']

    for method in method_list:
        metrics_path = path + method + '/'
        print('-------------')
        print(method)

        for i in range(n_runs):
            dataset_path = metrics_path + folder_name + '_' + str(i) + '-utility_metrics.csv'
            data_name = dataset_path.split('/')[-1].split('-utility_metrics')[0]
            df = pd.read_csv(dataset_path)
            # The index column only exists when the CSV was written with its index.
            df = df.drop(columns=['Unnamed: 0'], errors='ignore')
            # Remove the special characters wrapping the attribute_disclosure value.
            df = df.replace({'Value': {"{'-': ": '', '}': ''}}, regex=True)
            df['Value'] = df['Value'].astype(float)

            # Collapse the per-variable ('Multiple') rows into one 'Single' row per metric.
            avg = df[df['Type'] == 'Multiple']
            avg = avg.groupby('Metric')['Value'].mean().reset_index()
            avg['Method'] = method
            avg['Variable'] = '-'
            avg['Run'] = 1
            avg['Type'] = 'Single'
            avg = avg[cols]

            df = pd.concat([df, avg], ignore_index=True)
            df['data_name'] = data_name

            # Take the log of the cluster_measure rows only; logging the whole column
            # emits warnings for zero/negative values of unrelated metrics.
            is_cluster = df['Metric'] == 'cluster_measure'
            df.loc[is_cluster, 'Value'] = np.log(df.loc[is_cluster, 'Value'])

            first_run = (df['Method'] == method) & (df['Run'] == 1)
            single_frames.append(df[first_run & (df['Type'] == 'Single')])
            multiple_frames.append(df[first_run & (df['Type'] == 'Multiple')])
        print("done")

    # pd.concat raises on an empty list, so fall back to an empty frame.
    single_metrics_df = pd.concat(single_frames, ignore_index=True) if single_frames else pd.DataFrame()
    multiple_metrics_df = pd.concat(multiple_frames, ignore_index=True) if multiple_frames else pd.DataFrame()

    return single_metrics_df, multiple_metrics_df

In [None]:
# CIs for single type metrics
def stats_summary(single_metrics, z=1.96):
    """Summarise 'Single'-type metric values per (Method, Metric).

    Parameters
    ----------
    single_metrics : pandas.DataFrame
        Must contain the columns ``Method``, ``Metric`` and ``Value``.
    z : float, default 1.96
        Multiplier applied to the standard deviation to obtain the interval bounds.

    Returns
    -------
    pandas.DataFrame
        Indexed by (Method, Metric) with the columns ``mean``, ``count``, ``std``,
        ``ci95_hi`` and ``ci95_lo``.

    Notes
    -----
    The bounds are ``mean ± z * std`` of the per-run values, i.e. the spread of the
    run distribution itself (the "standard error method" of
    https://moderndive.com/8-confidence-intervals.html#se-method), not
    ``mean ± z * std / sqrt(count)``.
    NOTE(review): confirm this is the interval intended for reporting.
    Groups with a single observation have ``std = NaN`` and therefore NaN bounds.
    """
    stats = single_metrics.groupby(['Method','Metric'])['Value'].agg(['mean', 'count', 'std'])

    # Vectorised column arithmetic replaces the former row-by-row loop.
    half_width = z * stats['std']
    stats['ci95_hi'] = stats['mean'] + half_width
    stats['ci95_lo'] = stats['mean'] - half_width

    return stats

In [None]:
def stats_plot(stats):
    """Plot the interval and mean of every metric for the three synthesis methods.

    Parameters
    ----------
    stats : pandas.DataFrame
        Indexed by (Method, Metric) with the columns ``mean``, ``ci95_lo`` and
        ``ci95_hi``. It must contain the methods 'CLGP', 'MC-MedGAN' and 'MPoM';
        a missing method raises ``KeyError``.

    Each metric occupies one horizontal row (metrics sorted in descending name
    order); per method a line joins the lower bound, upper bound and mean. The
    figure is shown with ``plt.show()`` and nothing is returned.
    """
    # Select the style before the figure exists so the figure itself picks it up.
    plt.style.use('default')
    plt.figure(figsize=(12,5))

    method_colors = (('CLGP', 'blue'), ('MC-MedGAN', 'green'), ('MPoM', 'red'))
    for method, color in method_colors:
        data = stats.loc[method].sort_index(ascending=False)
        rows = zip(data['ci95_lo'], data['ci95_hi'], data['mean'])
        for y, (ci95_lo, ci95_hi, mean) in enumerate(rows):
            # 'o-' carries no colour of its own: combining a colour in the format
            # string with the color keyword makes matplotlib warn about redundancy.
            # Only the first row is labelled so each method appears once in the legend.
            plt.plot((ci95_lo, ci95_hi, mean), (y, y, y), 'o-', color=color,
                     label=method if y == 0 else "")

    # Dashed reference line at x = 1, sized by the last method's row count.
    plt.vlines(x=[1], ymin=0, ymax=len(data), colors='grey', ls='--', lw=2)

    # Display names listed in ascending metric-name order and reversed below to match
    # the descending sort; assumes exactly the five metrics named here are present —
    # TODO confirm against the caller's filter.
    yvalues = ['Comb-CrCl (1)','Log-Cluster ↓','Supp. Coverage ↑','CrCl-RS (1)','PCD ↓']
    plt.xticks(np.arange(-6, 4, 1.0))
    plt.yticks(range(len(data)), yvalues[::-1])
    plt.legend(title='Method', loc='upper left')
    plt.show()

In [None]:
def boxplots_single(single_metrics, out_dir=None):
    """Draw and save notched boxplots of four 'Single'-type metrics, grouped by method.

    Parameters
    ----------
    single_metrics : pandas.DataFrame
        Must contain the columns ``Metric``, ``Value`` and ``Method``.
    out_dir : str, optional
        Folder prefix (ending in ``'/'``) for the saved image. When omitted, the
        notebook-level variable ``path`` is used, so that variable must be defined
        before the call.

    The figure is written to ``<out_dir>boxplot_singletype_metrics.png`` and shown.
    """
    if out_dir is None:
        out_dir = path  # notebook-level output folder

    # Select the style before the figure exists so the figure itself picks it up.
    plt.style.use('default')
    plt.figure(figsize=(16,10))

    shown_metrics = ['pairwise_correlation_difference', 'cross_classification', 'cca_accuracy', 'coverage']
    ax = sns.boxplot(x='Metric', y='Value', hue='Method', notch=True,
                     data=single_metrics[single_metrics['Metric'].isin(shown_metrics)])
    ax.set(xlabel='Metric', ylabel='Value')

    plt.savefig(out_dir+'boxplot_singletype_metrics.png')
    plt.show()

In [None]:
# box plot for multiple type metrics
def boxplots_metrics(multiple_metrics, out_dir=None):
    """Draw and save a 2x2 grid of per-variable notched boxplots, one panel per metric.

    Parameters
    ----------
    multiple_metrics : pandas.DataFrame
        Must contain the columns ``Metric``, ``Variable``, ``Value`` and ``Method``.
    out_dir : str, optional
        Folder prefix (ending in ``'/'``) for the saved image. When omitted, the
        notebook-level variable ``path`` is used, so that variable must be defined
        before the call.

    Panels: cross_classification and cca_accuracy on the top row (x tick labels
    hidden), kl_divergence and coverage on the bottom row (tick labels rotated).
    The figure is written to ``<out_dir>boxplot_multipletype_metrics.png`` and shown.
    """
    if out_dir is None:
        out_dir = path  # notebook-level output folder

    # Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
    # releases dropped the old name; prefer the new one when it is available.
    style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
    plt.style.use(style_name)

    figure, axis = plt.subplots(2, 2, figsize=(16,10))

    # (axes, metric value to plot, y-axis label, whether x tick labels are shown)
    panels = (
        (axis[0, 0], 'cross_classification', 'CrCl-RS', False),
        (axis[0, 1], 'cca_accuracy', 'Comb-CrCl', False),
        (axis[1, 0], 'kl_divergence', 'KL', True),
        (axis[1, 1], 'coverage', 'Supp.Coverage', True),
    )
    for ax, metric, ylabel, show_xticklabels in panels:
        sns.boxplot(ax=ax,
                    x='Variable',
                    y='Value',
                    hue='Method',
                    notch=True,
                    data=multiple_metrics[multiple_metrics['Metric'] == metric])
        if show_xticklabels:
            ax.set(xlabel='', ylabel=ylabel)
            # NOTE(review): without axis='x' this rotates the y tick labels as well.
            ax.tick_params(labelrotation=90)
        else:
            ax.set(xlabel='', xticklabels=[], ylabel=ylabel)

    plt.tight_layout()
    plt.savefig(out_dir+'boxplot_multipletype_metrics.png')
    plt.show()

## Breast survival 2010-2015 - step1_samples

In [None]:
# All utility-metric result files live under one results folder, with one
# sub-folder per synthesis method. Adjust the three settings below as needed.
folder_name = 'breast_survival_1000samples'
method_list = ['CLGP','MC-MedGAN','MPoM']
path = '/DigitalTwin/Results/'

single_metrics, multiple_metrics = read_metrics(path, method_list, folder_name)

# Keep only the five 'Single'-type metrics that are reported downstream.
reported_metrics = [
    'pairwise_correlation_difference',
    'cluster_measure',
    'cross_classification',
    'cca_accuracy',
    'coverage',
]
is_reported = single_metrics['Metric'].isin(reported_metrics)
single_metrics = single_metrics[is_reported]

In [None]:
# Per-(Method, Metric) mean, count and std plus the mean ± 1.96·std bounds.
stats = stats_summary(single_metrics)

In [None]:
# Show the interval/mean plot for CLGP, MC-MedGAN and MPoM.
stats_plot(stats)

In [None]:
# Draw the 2x2 per-variable boxplots; the image is saved under the folder held in `path`.
boxplots_metrics(multiple_metrics)