# Distance Measures between Datasets

In [1]:
import pandas as pd
import torch
import numpy as np

## Distance Functions

In [2]:
from scipy.stats import ks_2samp

In [None]:
from scipy.stats import entropy
def kl_divergence_pairwise(source, target, bins=100, epsilon=1e-10):
    """Estimate the KL divergence of ``source`` from ``target`` via histograms.

    Both samples are binned on one shared grid of equal-width bins spanning
    the joint value range of the two samples, so the two histograms are
    directly comparable bin by bin.

    Args:
        source: 1-D numeric sample exposing ``.min()`` / ``.max()`` (e.g. a
            NumPy array or pandas Series); used as the reference distribution.
        target: 1-D numeric sample of the same kind as ``source``.
        bins: Number of equal-width histogram bins (default 100).
        epsilon: Small constant added to every bin so that empty bins do not
            produce a division by zero / ``log(0)`` (default 1e-10).

    Returns:
        tuple: ``(kl, 0)``. The constant second element only mirrors the
        ``(statistic, p_value)`` shape that callers unpack; it carries no
        information.
    """
    lower = min(source.min(), target.min())
    upper = max(source.max(), target.max())

    hist_source, bin_edges = np.histogram(
        source, bins=bins, range=(lower, upper), density=True
    )
    # Reuse the exact edges from the source histogram for the target.
    hist_target, _ = np.histogram(target, bins=bin_edges, density=True)

    # Build new arrays instead of mutating the histogram outputs in place.
    hist_source = hist_source + epsilon
    hist_target = hist_target + epsilon

    # scipy's entropy(pk, qk) normalises both inputs before computing the
    # relative entropy.
    return entropy(hist_source, hist_target), 0

## Overarching Functions

In [None]:
def measure_distance(distance_function, df1, df2, df3=None):
    """Apply a two-sample distance to every column of the given frames.

    Args:
        distance_function: Callable taking two columns and returning a
            2-tuple; only the first element (the statistic) is kept.
        df1: Frame whose columns drive the iteration.
        df2: Frame compared against ``df1``; must contain ``df1``'s columns.
        df3: Optional third frame. When given, all three pairings are
            computed per column.

    Returns:
        dict: ``{column: {comparison_name: statistic}}``. Without ``df3`` the
        only comparison is ``"source_vs_target"``; with ``df3`` the keys are
        ``"source_train_vs_source_test"``, ``"source_train_vs_target_test"``
        and ``"source_test_vs_target_test"``.
    """
    three_way = df3 is not None
    per_column = {}

    for column in df1.columns:
        first = df1[column]
        second = df2[column]
        first_vs_second, _ = distance_function(first, second)

        if not three_way:
            per_column[column] = {"source_vs_target": first_vs_second}
            continue

        third = df3[column]
        first_vs_third, _ = distance_function(first, third)
        second_vs_third, _ = distance_function(second, third)
        per_column[column] = {
            "source_train_vs_source_test": first_vs_second,
            "source_train_vs_target_test": first_vs_third,
            "source_test_vs_target_test": second_vs_third,
        }

    return per_column

In [None]:
def measure_distance_classwise(distance_function, df1_class1, df1_class2, df2_class1, df2_class2, df3_class1=None, df3_class2=None):
    """Apply a two-sample distance per column, separately for two classes.

    Frames with the same class suffix are compared with each other; class 1
    is never compared against class 2.

    Args:
        distance_function: Callable taking two columns and returning a
            2-tuple; only the first element (the statistic) is kept.
        df1_class1, df1_class2: First dataset, split by class. The columns of
            ``df1_class1`` drive the iteration.
        df2_class1, df2_class2: Second dataset, split by class.
        df3_class1, df3_class2: Optional third dataset, split by class. The
            extra comparisons are only computed when BOTH are given.

    Returns:
        dict: ``{column: {comparison_name: statistic}}`` with two entries per
        column (``source_train_vs_source_test_class1/2``), plus four more
        (``source_train_vs_target_test_*`` and ``source_test_vs_target_test_*``)
        when the third dataset is supplied.
    """
    include_third = not (df3_class1 is None or df3_class2 is None)

    def statistic(left, right, column):
        # Keep only the statistic; the second returned element is discarded.
        value, _ = distance_function(left[column], right[column])
        return value

    per_column = {}
    for column in df1_class1.columns:
        entry = {
            "source_train_vs_source_test_class1": statistic(df1_class1, df2_class1, column),
            "source_train_vs_source_test_class2": statistic(df1_class2, df2_class2, column),
        }
        if include_third:
            entry["source_train_vs_target_test_class1"] = statistic(df1_class1, df3_class1, column)
            entry["source_train_vs_target_test_class2"] = statistic(df1_class2, df3_class2, column)
            entry["source_test_vs_target_test_class1"] = statistic(df2_class1, df3_class1, column)
            entry["source_test_vs_target_test_class2"] = statistic(df2_class2, df3_class2, column)
        per_column[column] = entry

    return per_column

In [None]:
def load_data(source_label, target_label, channel=''):
    """Load the cleaned train/test splits of a source and a target group.

    Args:
        source_label: Group name inserted into the source file names.
        target_label: Group name inserted into the target file names.
        channel: Optional suffix appended after ``_cleaned`` in every file
            name (default: no suffix).

    Returns:
        tuple: ``(source_train, source_test, target_train, target_test)`` as
        returned by ``torch.load``.
    """
    source_train_path = f'../data/train_{source_label}_cleaned{channel}.pt'
    source_test_path = f'../data/test_{source_label}_cleaned{channel}.pt'
    target_train_path = f'../data/train_{target_label}_cleaned{channel}.pt'
    target_test_path = f'../data/test_{target_label}_cleaned{channel}.pt'

    # NOTE: weights_only=False allows arbitrary pickled objects to be
    # deserialised; only use this with trusted data files.
    source_train = torch.load(source_train_path, weights_only=False)
    source_test = torch.load(source_test_path, weights_only=False)

    # NOTE(review): the original author remarked that the target splits
    # "look exactly the same" -- it is unclear compared to what; verify.
    target_train = torch.load(target_train_path, weights_only=False)
    target_test = torch.load(target_test_path, weights_only=False)

    return source_train, source_test, target_train, target_test

In [None]:
def _channel_time_means(samples, n_channels):
    """Return one row per sample with the mean of each channel over axis 0."""
    return [
        [sample[:, channel_idx].mean().item() for channel_idx in range(n_channels)]
        for sample in np.array(samples)
    ]


def _label_frame(label_rows, channel_names, dropped_columns):
    """Build a label frame from the first element of every label entry."""
    first_entries = [entry[0] for entry in label_rows]
    frame = pd.DataFrame(first_entries, columns=channel_names)
    return frame.drop(columns=dropped_columns)


def _distance_table(distances, prefix):
    """Turn ``{column: {comparison: value}}`` into a row-per-column table."""
    table = pd.DataFrame(distances)
    table.columns = [f"{prefix}_{col}" for col in table.columns]
    return table.transpose()


def _add_feature_aggregates(features_df):
    """Append sum/mean/max rows computed over the per-feature rows only."""
    # Snapshot first so no aggregate row leaks into a later aggregate.
    per_feature = features_df.copy()
    features_df.loc["features_sum"] = per_feature.sum()
    features_df.loc["features_mean"] = per_feature.mean()
    features_df.loc["features_max"] = per_feature.max()
    return features_df


def measure_distance_source_train_vs_source_test_vs_target_all(label_source, label_target, distance_function):
    """Compare source train, source test and the whole target set.

    Every sample is reduced to one value per channel by averaging over its
    first axis (the time axis, per the original author's comment). The
    target train and test splits are merged into one set.

    Args:
        label_source: Group name of the source data (passed to ``load_data``).
        label_target: Group name of the target data (passed to ``load_data``).
        distance_function: Two-sample function returning a 2-tuple whose
            first element is the distance statistic.

    Returns:
        pandas.DataFrame: One row per ``feature_<channel>``, the aggregate
        rows ``features_sum`` / ``features_mean`` / ``features_max``, one row
        per ``label_<name>`` (NBPMean, SpO2), and six combined rows
        ``features_<agg>_<label>_label_sum``. Columns are the three pairwise
        comparisons produced by ``measure_distance``.
    """
    channel_names = ["CVP", "HR", "NBPSys", "NBPDias", "NBPMean", "SpO2"]
    non_label_columns = ["CVP", "HR", "NBPSys", "NBPDias"]
    n_channels = len(channel_names)

    source_train, source_test, target_train, target_test = load_data(label_source, label_target)

    # --- features: per-sample channel averages --------------------------
    source_train_df = pd.DataFrame(
        _channel_time_means(source_train["samples"], n_channels), columns=channel_names
    )
    source_test_df = pd.DataFrame(
        _channel_time_means(source_test["samples"], n_channels), columns=channel_names
    )
    # Merge all target samples (test split first, then train split).
    target_rows = (
        _channel_time_means(target_test["samples"], n_channels)
        + _channel_time_means(target_train["samples"], n_channels)
    )
    target_all_df = pd.DataFrame(target_rows, columns=channel_names)

    feature_distances = measure_distance(distance_function, source_train_df, source_test_df, target_all_df)
    # Aggregating across channels is debatable, so the aggregate rows are
    # kept in addition to the individual channels rather than replacing them.
    distance_measures_features_df = _add_feature_aggregates(
        _distance_table(feature_distances, "feature")
    )

    # --- labels ------------------------------------------------------------
    # assumes entry[0] of every label entry is a length-6 row in channel
    # order -- TODO confirm against the data files.
    source_train_labels_df = _label_frame(
        source_train["labels"].tolist(), channel_names, non_label_columns
    )
    source_test_labels_df = _label_frame(
        source_test["labels"].tolist(), channel_names, non_label_columns
    )
    # Evaluate on the entire target set. Both splits go through .tolist()
    # so every row is a plain Python list, exactly like the source splits.
    target_label_rows = target_train["labels"].tolist() + target_test["labels"].tolist()
    target_all_labels_df = _label_frame(target_label_rows, channel_names, non_label_columns)

    label_distances = measure_distance(
        distance_function, source_train_labels_df, source_test_labels_df, target_all_labels_df
    )
    distance_measures_labels_df = _distance_table(label_distances, "label")

    # --- combine ---------------------------------------------------------
    distance_measures_all_df = pd.concat(
        [distance_measures_features_df, distance_measures_labels_df], axis=0
    )
    for label_name in ("SpO2", "NBPMean"):
        for aggregate in ("sum", "mean", "max"):
            distance_measures_all_df.loc[f"features_{aggregate}_{label_name}_label_sum"] = (
                distance_measures_features_df.loc[f"features_{aggregate}"]
                + distance_measures_labels_df.loc[f"label_{label_name}"]
            )
    return distance_measures_all_df


In [None]:
def _classwise_rows(samples, labels, label_idx, threshold, n_channels):
    """Split one data split into two classes by a label threshold.

    Returns ``(class1_samples, class1_labels, class2_samples, class2_labels)``
    where class 1 holds the entries whose ``labels[idx, 0, label_idx]`` is
    below ``threshold`` and class 2 holds all others.
    """
    class1_samples, class1_labels = [], []
    class2_samples, class2_labels = [], []
    for idx, sample in enumerate(samples):
        # Average each channel over the first axis of the sample.
        sample_avg = [sample[:, channel_idx].mean().item() for channel_idx in range(n_channels)]
        # Store label rows as plain Python lists so the resulting frames hold
        # ordinary numbers whatever container type the labels are stored in.
        label_row = labels[idx][0].tolist()
        if labels[idx, 0, label_idx] < threshold:
            class1_samples.append(sample_avg)
            class1_labels.append(label_row)
        else:
            class2_samples.append(sample_avg)
            class2_labels.append(label_row)
    return class1_samples, class1_labels, class2_samples, class2_labels


def _classwise_frames(rows, channel_names, dropped_columns):
    """Convert the 4-tuple from ``_classwise_rows`` into four DataFrames."""
    class1_samples, class1_labels, class2_samples, class2_labels = rows
    return (
        pd.DataFrame(class1_samples, columns=channel_names),
        pd.DataFrame(class1_labels, columns=channel_names).drop(columns=dropped_columns),
        pd.DataFrame(class2_samples, columns=channel_names),
        pd.DataFrame(class2_labels, columns=channel_names).drop(columns=dropped_columns),
    )


def _classwise_distance_table(distances, prefix):
    """Turn ``{column: {comparison: value}}`` into a row-per-column table."""
    table = pd.DataFrame(distances)
    table.columns = [f"{prefix}_{col}" for col in table.columns]
    return table.transpose()


def measure_distance_source_train_vs_source_test_vs_target_all_classwise(label_source, label_target, class_label, distance_function):
    """Class-wise comparison of source train, source test and whole target.

    Every split is divided into two classes by thresholding one label value,
    and distances are computed within each class separately.

    Args:
        label_source: Group name of the source data (passed to ``load_data``).
        label_target: Group name of the target data (passed to ``load_data``).
        class_label: ``"Hypotension"`` thresholds label index 4 at 65; any
            other value (e.g. ``"Hypoxemia"``) thresholds label index 5 at 90.
            Class 1 is "below threshold", class 2 is everything else.
        distance_function: Two-sample function returning a 2-tuple whose
            first element is the distance statistic.

    Returns:
        pandas.DataFrame: Rows ``feature_<channel>``, ``features_sum`` /
        ``features_mean`` / ``features_max``, ``label_<name>`` and six
        combined ``features_<agg>_<label>_label_sum`` rows; columns are the
        per-class comparisons produced by ``measure_distance_classwise``.
    """
    channel_names = ["CVP", "HR", "NBPSys", "NBPDias", "NBPMean", "SpO2"]
    non_label_columns = ["CVP", "HR", "NBPSys", "NBPDias"]
    n_channels = len(channel_names)

    # Same four files as before, loaded through the shared loader.
    source_train, source_test, target_train, target_test = load_data(label_source, label_target)

    if class_label == "Hypotension":
        threshold, label_idx = 65, 4
    else:
        threshold, label_idx = 90, 5

    source_train_rows = _classwise_rows(
        source_train["samples"], source_train["labels"], label_idx, threshold, n_channels
    )
    source_test_rows = _classwise_rows(
        source_test["samples"], source_test["labels"], label_idx, threshold, n_channels
    )
    # NOTE(review): only the target test samples are converted to a NumPy
    # array before averaging; the other splits are averaged in their stored
    # form. Kept as is -- confirm whether the difference is intended.
    target_test_rows = _classwise_rows(
        np.array(target_test["samples"]), target_test["labels"], label_idx, threshold, n_channels
    )
    target_train_rows = _classwise_rows(
        target_train["samples"], target_train["labels"], label_idx, threshold, n_channels
    )
    # The whole target set is evaluated: test entries first, then train.
    target_rows = tuple(
        test_part + train_part
        for test_part, train_part in zip(target_test_rows, target_train_rows)
    )

    (source_train_class1_df, source_train_labels_class1_df,
     source_train_class2_df, source_train_labels_class2_df) = _classwise_frames(
        source_train_rows, channel_names, non_label_columns)
    (source_test_class1_df, source_test_labels_class1_df,
     source_test_class2_df, source_test_labels_class2_df) = _classwise_frames(
        source_test_rows, channel_names, non_label_columns)
    (target_class1_df, target_labels_class1_df,
     target_class2_df, target_labels_class2_df) = _classwise_frames(
        target_rows, channel_names, non_label_columns)

    distance_measures_features = measure_distance_classwise(
        distance_function,
        df1_class1=source_train_class1_df,
        df1_class2=source_train_class2_df,
        df2_class1=source_test_class1_df,
        df2_class2=source_test_class2_df,
        df3_class1=target_class1_df,
        df3_class2=target_class2_df,
    )
    distance_measures_labels = measure_distance_classwise(
        distance_function,
        df1_class1=source_train_labels_class1_df,
        df1_class2=source_train_labels_class2_df,
        df2_class1=source_test_labels_class1_df,
        df2_class2=source_test_labels_class2_df,
        df3_class1=target_labels_class1_df,
        df3_class2=target_labels_class2_df,
    )

    distance_measures_features_df = _classwise_distance_table(distance_measures_features, "feature")
    # Snapshot the per-feature rows so each aggregate is computed over the
    # individual features only, never over another aggregate row.
    per_feature = distance_measures_features_df.copy()
    distance_measures_features_df.loc["features_sum"] = per_feature.sum()
    distance_measures_features_df.loc["features_mean"] = per_feature.mean()
    distance_measures_features_df.loc["features_max"] = per_feature.max()

    distance_measures_labels_df = _classwise_distance_table(distance_measures_labels, "label")

    # Stack feature and label rows, then add the combined rows.
    distance_measures_all_df = pd.concat(
        [distance_measures_features_df, distance_measures_labels_df], axis=0
    )
    for label_name in ("SpO2", "NBPMean"):
        for aggregate in ("sum", "mean", "max"):
            distance_measures_all_df.loc[f"features_{aggregate}_{label_name}_label_sum"] = (
                distance_measures_features_df.loc[f"features_{aggregate}"]
                + distance_measures_labels_df.loc[f"label_{label_name}"]
            )

    return distance_measures_all_df



In [None]:
def measure_distance_source_train_vs_source_test_vs_target_combinations(group_a, group_b, distance_measure):
    """Compute and save all distance tables for one pair of groups.

    Each group serves once as source and once as target. For both directions
    the overall table and the class-wise tables ("Hypoxemia", "Hypotension")
    are written as CSV files below ``./results/distances``.

    Args:
        group_a: Name of the first group.
        group_b: Name of the second group.
        distance_measure: ``"ks_2samp"`` or ``"kl_divergence"``; also used as
            a directory name in the output paths.

    Raises:
        ValueError: If ``distance_measure`` is not one of the supported names.
    """
    import os  # function-scope: ``os`` is not among the file-level imports

    if distance_measure == "ks_2samp":
        distance_function = ks_2samp
    elif distance_measure == "kl_divergence":
        distance_function = kl_divergence_pairwise
    else:
        # Fail early with a clear message instead of an unbound-variable
        # error further down.
        raise ValueError(
            f"Unknown distance_measure {distance_measure!r}; "
            "expected 'ks_2samp' or 'kl_divergence'."
        )

    directions = [(group_a, group_b), (group_b, group_a)]

    # Create the output folder up front so a missing directory cannot make
    # to_csv fail after the distances have been computed.
    overall_dir = f"./results/distances/{distance_measure}"
    os.makedirs(overall_dir, exist_ok=True)
    for source_group, target_group in directions:
        distance_measures_all_df = measure_distance_source_train_vs_source_test_vs_target_all(
            source_group, target_group, distance_function
        )
        distance_measures_all_df.to_csv(
            f"{overall_dir}/{source_group}_vs_{target_group}_distance_measures.csv"
        )

    # class wise
    for class_label in ["Hypoxemia", "Hypotension"]:
        class_dir = f"./results/distances/classwise/{class_label}/{distance_measure}"
        os.makedirs(class_dir, exist_ok=True)
        for source_group, target_group in directions:
            distance_measures_all_df = measure_distance_source_train_vs_source_test_vs_target_all_classwise(
                source_group, target_group, class_label, distance_function
            )
            distance_measures_all_df.to_csv(
                f"{class_dir}/{source_group}_vs_{target_group}_distance_measures.csv"
            )


# Application to Datasets

## Kolmogorov–Smirnov Two-Sample Test (`ks_2samp`)

In [10]:
# Run the ks_2samp comparison for each cohort pair; every call covers both
# source/target directions and writes its results as CSV files.
measure_distance_source_train_vs_source_test_vs_target_combinations(
    group_a="no_cardiac_surgery", group_b="cardiac_surgery", distance_measure="ks_2samp"
)
measure_distance_source_train_vs_source_test_vs_target_combinations(
    group_a="no_respiratory_surgery", group_b="respiratory_surgery", distance_measure="ks_2samp"
)
measure_distance_source_train_vs_source_test_vs_target_combinations(
    group_a="vasopressors", group_b="no_vasopressors", distance_measure="ks_2samp"
)
measure_distance_source_train_vs_source_test_vs_target_combinations(
    group_a="no_ventilation", group_b="ventilation", distance_measure="ks_2samp"
)

KeyboardInterrupt: 

## KL Divergence

In [None]:
# Run the kl_divergence comparison for each cohort pair; every call covers
# both source/target directions and writes its results as CSV files.
measure_distance_source_train_vs_source_test_vs_target_combinations(
    group_a="no_cardiac_surgery", group_b="cardiac_surgery", distance_measure="kl_divergence"
)
measure_distance_source_train_vs_source_test_vs_target_combinations(
    group_a="no_respiratory_surgery", group_b="respiratory_surgery", distance_measure="kl_divergence"
)
measure_distance_source_train_vs_source_test_vs_target_combinations(
    group_a="vasopressors", group_b="no_vasopressors", distance_measure="kl_divergence"
)
measure_distance_source_train_vs_source_test_vs_target_combinations(
    group_a="no_ventilation", group_b="ventilation", distance_measure="kl_divergence"
)