In [1]:
import os

import matplotlib.pyplot as plt

# import seaborn as sns
import numpy as np

import pandas as pd
from sklearn.manifold import TSNE

from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment

In [2]:
# Directory holding the HER2 data; the truth file with the scores is read from here.
HER2_PATH_BASEPATH = '/storage6T/HER2/'
# Image sub-directory below the HER2 base directory (not referenced by the cells shown here).
HER2_PATH = HER2_PATH_BASEPATH + 'Testing_fixed/categorized/combined_train/'

## Helper functions

In [3]:
def load_files(path):
    """
    Read the clustering results stored in an experiment directory.

    Parameters
    ----------
    path : str or os.PathLike
        Directory that contains ``clustering_results.csv``. A trailing path
        separator is accepted but not required.

    Returns
    -------
    tuple of numpy.ndarray
        ``(domain, vec_d, vec_y, img_locs)``, taken from the CSV columns
        ``predictions``, ``vec_d_labels``, ``vec_y_labels`` and
        ``image_id_labels`` respectively.

    Raises
    ------
    FileNotFoundError
        If ``clustering_results.csv`` is not present in ``path``.
    KeyError
        If one of the expected columns is missing from the CSV file.
    """
    # os.path.join instead of string concatenation, so that the directory may
    # be given with or without a trailing separator.
    df = pd.read_csv(os.path.join(path, 'clustering_results.csv'))

    domain = df['predictions'].values

    vec_d = df['vec_d_labels'].values
    vec_y = df['vec_y_labels'].values
    img_locs = df['image_id_labels'].values

    # The latent space file ("Z_space.npy") is intentionally not read here:
    # it is not part of the returned tuple, so loading it only cost I/O and
    # made the function fail for experiment directories without that file.

    return domain, vec_d, vec_y, img_locs

In [4]:
def mean_scores_per_experiment(scores, img_locs):
    """
    Parser to get the mean score per image from the csv file.

    The names of the images in the folders are slightly different from the
    names in the csv file, so every image name is cut down to a prefix and
    that prefix is searched (as a literal substring) in the "file name"
    column of ``scores``.

    Parameters
    ----------
    scores : pandas.DataFrame
        Table with a string column "file name" and numeric score columns.
    img_locs : iterable of str
        Image names or paths; only the part after the last "/" is used.

    Returns
    -------
    list of float
        One mean score per entry of ``img_locs``, in the same order.

    Raises
    ------
    ValueError
        If the prefix of an image name does not match exactly one row.
    """
    # NOTE(review): the mean is taken over *all* numeric columns of the matched
    # row. If the "num" column of the truth file is numeric it is averaged in
    # together with the scores -- confirm that this is intended.

    M = []

    for prediction in img_locs:
        # depending if the path is full or not, take the img name only
        name = str(prediction).rsplit("/", 1)[-1]

        # Names containing an upper-case "S" lose their last 9 characters
        # (written as "4 + 5" in earlier revisions), all other names lose
        # their last 6 characters.
        # NOTE(review): a lower-case "s" has never selected the 9-character
        # rule (that branch was always overwritten) -- confirm this is wanted.
        tail_length = 9 if "S" in name else 6
        prefix = name[: len(name) - tail_length]

        # regex=False: the prefix is a file name, not a pattern, so characters
        # such as "+", "(" or "." must be matched literally.
        is_match = scores["file name"].str.contains(prefix, regex=False, na=False)
        row_means = scores.loc[is_match].mean(axis=1, numeric_only=True)

        if len(row_means) != 1:
            raise ValueError(
                "Expected exactly one row of the score table to match "
                f"{prefix!r} (derived from {prediction!r}), found {len(row_means)}."
            )

        M.append(float(row_means.iloc[0]))
    return M

In [5]:
def correlation_with_her2_class_label(cluster_pred_scalar, cluster_true_scalar):
    """
    Print the agreement between cluster assignments and class labels under
    the best one-to-one matching of clusters to classes.

    A contingency table (rows: predicted clusters, columns: true classes) is
    built from the labels that actually occur, the assignment of clusters to
    classes that maximizes the number of agreeing samples is found with
    ``linear_sum_assignment``, and the fraction of samples covered by that
    assignment is printed. The two label sets do not need to use the same
    numbering or have the same number of distinct values.

    Parameters
    ----------
    cluster_pred_scalar : sequence
        Predicted cluster id per sample.
    cluster_true_scalar : sequence
        True class label per sample, same length as ``cluster_pred_scalar``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    pred = np.asarray(cluster_pred_scalar).ravel()
    true = np.asarray(cluster_true_scalar).ravel()
    if pred.size != true.size:
        raise ValueError(
            f"Predicted and true labels differ in length: {pred.size} vs {true.size}."
        )
    if pred.size == 0:
        raise ValueError("Cannot compute the agreement of empty label sequences.")

    # Re-index both label sets to 0..K-1 so that the table only depends on
    # which samples share a label, not on the label values themselves.
    pred_labels, pred_index = np.unique(pred, return_inverse=True)
    true_labels, true_index = np.unique(true, return_inverse=True)
    contingency = np.zeros((len(pred_labels), len(true_labels)), dtype=np.int64)
    np.add.at(contingency, (pred_index.ravel(), true_index.ravel()), 1)

    # What is the best permutation? linear_sum_assignment minimizes the cost,
    # hence the negated counts; rectangular tables are supported as well.
    row_ind, col_ind = linear_sum_assignment(-contingency)
    # Accuracy for best permutation:
    acc_d = contingency[row_ind, col_ind].sum() / contingency.sum()
    print('Average correlation with the HER2 class labels', acc_d)

## Load the experimental results

In [6]:
# Please point HER2_PATH_BASEPATH at the directory containing the csv file
# associated with the HER2 data (the one that contains the scores),
# e.g. "../../../DomId/HER2".
base_path = HER2_PATH_BASEPATH

# The column names are supplied explicitly, so pandas treats the first line of
# the file as data. Note that the last score column is named "s_7", not "s7".
scores = pd.read_csv(
    os.path.join(base_path, "truthfile_002.csv"),
    names=["num", "file name", *(f"s{k}" for k in range(1, 7)), "s_7"],
)

In [7]:
# Put the paths to the result directories of the VaDE, CDVaDE and DEC
# experiments here. Each directory is passed to load_files(), which reads
# "clustering_results.csv" from it.
# NOTE(review): the CDVaDE directory name also ends in "_her2_vade" -- confirm
# that it really points at the CDVaDE run.
ex_vade = './2024-04-16_17:15:40.338172_her2_vade/'  # vade
ex_cdvade = './2024-04-16_18:23:41.357723_her2_vade/'  # cdvade
ex_dec = './2024-04-16_17:49:57.887274_her2_dec/'   # dec

In [8]:
# VaDE: cluster predictions, labels and image ids stored by the experiment ...
(domain_vade, vec_d_vade, vec_y_vade, img_locs_vade) = load_files(path=ex_vade)
# ... and the mean pathologist-assigned HER2 score of each of those images.
M_vade = mean_scores_per_experiment(scores=scores, img_locs=img_locs_vade)

In [9]:
# CDVaDE: cluster predictions, labels and image ids stored by the experiment ...
(domain_cdvade, vec_d_cdvade, vec_y_cdvade, img_locs_cdvade) = load_files(path=ex_cdvade)
# ... and the mean pathologist-assigned HER2 score of each of those images.
M_cdvade = mean_scores_per_experiment(scores=scores, img_locs=img_locs_cdvade)

In [10]:
# DEC: cluster predictions, labels and image ids stored by the experiment ...
(domain_dec, vec_d_dec, vec_y_dec, img_locs_dec) = load_files(path=ex_dec)
# ... and the mean pathologist-assigned HER2 score of each of those images.
M_dec = mean_scores_per_experiment(scores=scores, img_locs=img_locs_dec)

# Correlation between predicted cluster and true HER2 scores

In [11]:
# Note that the predicted (i.e., clustered) domain integer ids have not been
# reordered yet to correspond to HER2 classes, so this correlates the raw
# cluster ids with the mean scores.
r = np.corrcoef(domain_vade, M_vade)
print('Direct Pearsons CC between predicted domain and mean score', r[0, 1])

Direct Pearsons CC between predicted domain and mean score -0.0324453619443627


In [12]:
# VaDE: agreement of the predicted clusters with the `vec_d` labels under the
# best one-to-one matching of clusters to labels (the value is printed).
correlation_with_her2_class_label(domain_vade, vec_d_vade)

Average correlation with the HER2 class labels 0.4583333333333333


In [13]:
# CDVaDE: agreement of the predicted clusters with the `vec_d` labels under the
# best one-to-one matching of clusters to labels (the value is printed).
correlation_with_her2_class_label(domain_cdvade, vec_d_cdvade)

Average correlation with the HER2 class labels 0.41369047619047616


In [14]:
# DEC: agreement of the predicted clusters with the `vec_d` labels under the
# best one-to-one matching of clusters to labels (the value is printed).
correlation_with_her2_class_label(domain_dec, vec_d_dec)

Average correlation with the HER2 class labels 0.46130952380952384


# Per-predicted-domain analysis (not used in the MICCAI submission)

In [15]:
# Candidate relabellings for the predicted domain / HER2 class correlation
def domain_class_mapping(domain):
    """
    Relabel the predicted domains with every permutation of the labels 1, 2, 3.

    This is an exhaustive enumeration of the six possible one-to-one mappings
    (not a run of the Hungarian algorithm); choosing the best one is left to
    the caller.

    Parameters
    ----------
    domain : iterable
        Predicted domain ids; every entry must be 1, 2 or 3, otherwise a
        ``KeyError`` is raised.

    Returns
    -------
    combos : list of list
        ``combos[k]`` is ``domain`` relabelled with ``dictionaries[k]``.
    dictionaries : list of dict
        The six mappings ``{predicted domain: class}``, in a fixed order.
    """
    # Targets of the domains (1, 2, 3) for each mapping. The order is part of
    # the result: callers index `dictionaries` with the position of the best
    # combination.
    targets = [
        (1, 2, 3),
        (1, 3, 2),
        (2, 1, 3),
        (2, 3, 1),
        (3, 2, 1),
        (3, 1, 2),
    ]
    dictionaries = [dict(zip((1, 2, 3), target)) for target in targets]
    combos = [[mapping[label] for label in domain] for mapping in dictionaries]
    return combos, dictionaries

In [16]:
combos_vade, dictionaries = domain_class_mapping(domain_vade)

# Pearson correlation between each relabelled prediction and the mean scores.
R_values_vade = []
for i in combos_vade:
    r = np.corrcoef(i, M_vade)
    R_values_vade.append(r[0, 1])

# The mapping is picked by trying all six relabellings (first maximum wins).
best_mapping_vade = dictionaries[np.argmax(R_values_vade)]
print('Predicted domain 1 has the highest correlation of', max(R_values_vade), 'with scores of HER2 class ', best_mapping_vade[1])
print('The following Hungarian mapping was used:', best_mapping_vade)

Predicted domain 1 has the highest correlation of 0.3128138151676808 with scores of HER2 class  3
The following Hungarian mapping was used: {1: 3, 2: 1, 3: 2}


In [17]:
combos_cdvade, dictionaries = domain_class_mapping(domain_cdvade)

# Pearson correlation between each relabelled prediction and the mean scores.
R_values_cdvade = []
for i in combos_cdvade:
    r = np.corrcoef(i, M_cdvade)
    R_values_cdvade.append(r[0, 1])

# The mapping is picked by trying all six relabellings (first maximum wins).
best_mapping_cdvade = dictionaries[np.argmax(R_values_cdvade)]
print('Predicted domain 2 has the highest correlation of', max(R_values_cdvade), 'with scores of HER2 class ', best_mapping_cdvade[2])
print('The following Hungarian mapping was used:', best_mapping_cdvade)

Predicted domain 2 has the highest correlation of 0.3343477687217265 with scores of HER2 class  3
The following Hungarian mapping was used: {1: 2, 2: 3, 3: 1}


In [18]:
combos_dec, dictionaries = domain_class_mapping(domain_dec)

# Pearson correlation between each relabelled prediction and the mean scores.
R_values_dec = []
for i in combos_dec:
    r = np.corrcoef(i, M_dec)
    R_values_dec.append(r[0, 1])

# The mapping is picked by trying all six relabellings (first maximum wins).
best_mapping_dec = dictionaries[np.argmax(R_values_dec)]
print('Predicted domain 2 has the highest correlation of', max(R_values_dec), 'with scores of HER2 class ', best_mapping_dec[2])
print('The following Hungarian mapping was used:', best_mapping_dec)

Predicted domain 2 has the highest correlation of 0.45818224716368666 with scores of HER2 class  1
The following Hungarian mapping was used: {1: 3, 2: 1, 3: 2}
