In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import pickle
import json
import torch

from PIL import Image
from matplotlib import patches
from pathlib import Path
from tqdm.autonotebook import tqdm
from typing import Dict, List, Tuple

from extract_features import get_all_annotations
from utils.dataset_adaptors import load_astma_df

  from tqdm.autonotebook import tqdm


In [4]:
# Relative directory for figures; it is not referenced again in the cells shown here.
figure_dir = Path('figure')

In [5]:
# CSV annotation tables for the three datasets evaluated in this notebook.
# NOTE(review): the Astma path is absolute and machine-specific, unlike the two
# relative paths above it; adjust it before running on another machine.
midog_dataset_file = 'annotations/midog_2022_test.csv'
lymph_dataset_file = 'annotations/lymphocyte_detection_2.csv'
astma_dataset_file = '/data/patho/Astma/cells_df.csv'

In [6]:
# Read the MIDOG test annotations and number the distinct 'tumortype' values
# in order of first appearance (code -> tumor type).
midog_dataset = pd.read_csv(midog_dataset_file)
midog_test_codes = dict(enumerate(midog_dataset['tumortype'].unique()))

In [7]:
# Keep only the rows whose 'split' column equals "test", then number the
# distinct 'tumor_id' values in order of first appearance.
lymph_dataset = pd.read_csv(lymph_dataset_file).query('split == "test"')
lymph_test_codes = dict(enumerate(lymph_dataset['tumor_id'].unique()))

In [8]:
# load_astma_df returns three values; only the middle one is kept
# (presumably the test split - TODO confirm against utils.dataset_adaptors).
_, astma_dataset, _ = load_astma_df(astma_dataset_file) 
# Single placeholder code: this dataset is handled with one entry named 'None'.
astma_test_codes = {0: 'None'}

In [9]:
def load_metrics(
        filename: str,
        metric: str,
        test_codes: dict,
        avg_metrics: bool=False
        ):
    """
    Load aggregate metrics from a JSON results file.

    The file must contain a top-level ``'aggregates'`` mapping of metric name
    to value.

    Args:
        filename (str): Path to the JSON file containing metrics.
        metric (str): Suffix used to select aggregate keys (e.g. ``'f1'``).
        test_codes (dict): Mapping of test code -> tumor name. Only consulted
            when ``avg_metrics`` is False; ``None`` is treated as an empty mapping.
        avg_metrics (bool): If False (default), select keys of the form
            ``tumor_<name>_...<metric>`` and key the result by the test code whose
            tumor name (with spaces removed) equals ``<name>``. If True, return
            every aggregate whose key ends with ``metric``, keyed by the original
            aggregate key.

    Returns:
        tuple: ``(metric_dict, all_metric_data)`` - the selected metrics and the
        full parsed JSON content.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(filename, 'rb') as handle:
        all_metric_data = json.load(handle)
    aggregates = all_metric_data['aggregates']

    if avg_metrics:
        # test_codes is not needed here, so it is deliberately not touched.
        metric_dict = {
            key: value for key, value in aggregates.items() if key.endswith(metric)
        }
        return metric_dict, all_metric_data

    # Aggregate keys carry tumor names without spaces, so strip them before
    # inverting the mapping (later codes win if two names collapse to the same key).
    code_by_tumor = {
        name.replace(' ', ''): code for code, name in (test_codes or {}).items()
    }

    metric_dict = {}
    for key, value in aggregates.items():
        if not (key.startswith('tumor') and key.endswith(metric)):
            continue
        parts = key.split('_')
        if len(parts) < 2:
            # e.g. a key like 'tumorf1' has no '<name>' segment to look up
            continue
        tumor = parts[1]
        if tumor in code_by_tumor:
            metric_dict[code_by_tumor[tumor]] = value

    return metric_dict, all_metric_data

In [10]:
import utils.constants as constants
import plotly.express as px
import plotly.graph_objects as go 
import itertools
import os
import pickle 

from plotly.subplots import make_subplots


def get_similarity(
        model_name: str,
        result_dir: str,
        metric_dir: str,
        detector: str='FCOS',
        performance_metric: str='f1',
        similarity_metric: str='hdv',
        all_models: bool=False,
        num_models: int = 5,
        plot: bool=False,
        test_codes: dict=None
        ) -> 'pd.DataFrame | None':
    """
    Pair per-layer similarity scores with a per-domain performance metric.

    For every run, the pickled scores at
    ``<result_dir>/<similarity_metric>_<run>.pkl`` are loaded into a DataFrame.
    Its index is mapped to an abbreviation via ``constants.MIDOG_ABBREVATIONS``
    ('Tumortype') and to the value returned by ``load_metrics`` for that index
    entry. The frame is then melted to one row per (index entry, layer).
    The pickled object is presumably ``{layer: {test code: score}}`` - TODO confirm.

    Args:
        model_name (str): Name of one run. With ``all_models=True`` its last
            character is replaced by the run indices ``0..num_models-1``.
        result_dir (str): Directory holding the pickled similarity scores.
        metric_dir (str): Directory holding ``<run>.json`` metric files.
        detector (str): Selects the layer-code table ('FCOS', 'RetinaNet',
            'yolov7_d6' or 'yolov7').
        performance_metric (str): Metric suffix passed to ``load_metrics``; also
            the name of the resulting column.
        similarity_metric (str): File-name prefix of the pickles. For 'gdv' the
            values are multiplied by -1.
        all_models (bool): Load all runs instead of only ``model_name``.
        num_models (int): Number of run indices tried when ``all_models`` is True.
        plot (bool): Show a faceted scatter plot instead of returning the table.
        test_codes (dict): Mapping of test code -> tumor name for ``load_metrics``.

    Returns:
        The long-format DataFrame with columns 'Tumortype', ``performance_metric``,
        'Layer', 'value' (plus 'run' when ``all_models`` is True), without
        'HAC' tumor types and the 'P1' layer; ``None`` when ``plot`` is True.

    Raises:
        NotImplementedError: For an unknown ``detector``.
        RuntimeError: If ``all_models`` is True and no run could be loaded.
    """
    abbrevs = constants.MIDOG_ABBREVATIONS

    if detector in ('FCOS', 'RetinaNet'):
        layer_codes = constants.FCOS_LAYER_CODES
    elif detector == 'yolov7_d6':
        layer_codes = constants.YOLO_D6_LAYER_CODES
    elif detector == 'yolov7':
        layer_codes = constants.YOLO_LAYER_CODES
    else:
        raise NotImplementedError(f"Unsupported detector: {detector!r}")

    def _load_run(run_name: str) -> pd.DataFrame:
        """Load one run's similarity pickle and metrics as a long-format frame."""
        scores_path = os.path.join(result_dir, similarity_metric + '_' + run_name + '.pkl')
        # NOTE: pickle.load can execute arbitrary code; only read trusted result files.
        with open(scores_path, 'rb') as handle:
            scores = pickle.load(handle)

        metrics_path = os.path.join(metric_dir, run_name + '.json')
        metric_data, _ = load_metrics(
            metrics_path, metric=performance_metric, test_codes=test_codes
        )

        run_df = pd.DataFrame(scores)
        run_df['Tumortype'] = run_df.index.map(abbrevs)
        run_df[performance_metric] = run_df.index.map(metric_data)
        return run_df.melt(id_vars=['Tumortype', performance_metric], var_name='Layer')

    if all_models:
        # Derive the prefix once; rewriting model_name on every iteration breaks
        # as soon as a run index has more than one digit.
        base_name = model_name[:-1]
        run_frames = []
        for i in range(num_models):
            try:
                run_df = _load_run(base_name + str(i))
            except Exception as e:
                # Best effort: report the missing/broken run and keep the others.
                print(e)
                continue
            run_df['run'] = i
            run_frames.append(run_df)

        if not run_frames:
            raise RuntimeError(
                f"No {similarity_metric} results could be loaded for "
                f"'{base_name}0'..'{base_name}{num_models - 1}' in {result_dir}"
            )
        # Concatenate once, after the loop, so a failing run cannot leave a
        # partially built frame behind.
        df = pd.concat(run_frames)
    else:
        df = _load_run(model_name)

    # transform aGDV
    if similarity_metric == 'gdv':
        df['value'] = df['value'] * -1

    # rename layers
    df['Layer'] = df['Layer'].map(layer_codes)

    # Filter with a boolean mask: after concat the index labels repeat across
    # runs, so label-based drop() is only correct when every run has the same layout.
    keep = (df['Tumortype'] != 'HAC') & (df['Layer'] != 'P1')
    df = df[keep].copy()

    if plot:
        fig = px.scatter(
            df, x=performance_metric, y='value',
            color='Tumortype', facet_col='Layer', facet_col_wrap=5,
            labels={'value': similarity_metric})
        fig.show()
        return None

    return df
        


def get_average_similarity(
        model_name: str,
        result_dir: str,
        metric_dir: str,
        test_codes: dict,
        performance_metric: str='AP',
        similarity_metric: str='hdv',
        detector: str='FCOS',
        num_models: int = 5,
        plot: bool=False
        ) -> 'pd.DataFrame | None':
    """
    Pair one averaged similarity score per layer with an overall performance metric.

    For each run index ``0..num_models-1`` the pickled scores at
    ``<result_dir>/<similarity_metric>_<run>.pkl`` are reduced to one value per
    layer: for 'hdv' the ``[0]['avg']`` entry of each layer, for 'gdv' the mean
    of the layer's values. Any other metric is used as loaded. The aggregates
    returned by ``load_metrics(..., avg_metrics=True)`` are added to that row.

    Args:
        model_name (str): Name of one run; its last character is replaced by
            the run indices.
        result_dir (str): Directory holding the pickled similarity scores.
        metric_dir (str): Directory holding ``<run>.json`` metric files.
        test_codes (dict): Forwarded to ``load_metrics``.
        performance_metric (str): Metric suffix passed to ``load_metrics``; a key
            with exactly this name must be among the returned aggregates.
        similarity_metric (str): File-name prefix of the pickles. For 'gdv' the
            values are multiplied by -1.
        detector (str): Selects the layer-code table ('FCOS', 'RetinaNet',
            'yolov7_d6' or 'yolov7').
        num_models (int): Number of run indices tried.
        plot (bool): Show a faceted scatter plot instead of returning the table.

    Returns:
        Long-format DataFrame with columns 'model', ``performance_metric``,
        'Layer', 'value'; ``None`` when ``plot`` is True.

    Raises:
        NotImplementedError: For an unknown ``detector``.
        RuntimeError: If no run could be loaded.
    """
    if detector in ('FCOS', 'RetinaNet'):
        layer_codes = constants.FCOS_LAYER_CODES
    elif detector == 'yolov7_d6':
        layer_codes = constants.YOLO_D6_LAYER_CODES
    elif detector == 'yolov7':
        layer_codes = constants.YOLO_LAYER_CODES
    else:
        raise NotImplementedError(f"Unsupported detector: {detector!r}")

    # Derive the prefix once; rewriting model_name on every iteration breaks
    # as soon as a run index has more than one digit.
    base_name = model_name[:-1]

    scores_list = []
    for i in range(num_models):
        run_name = base_name + str(i)
        try:
            sims_filename = os.path.join(result_dir, similarity_metric + '_' + run_name + '.pkl')
            # NOTE: pickle.load can execute arbitrary code; only read trusted result files.
            with open(sims_filename, 'rb') as handle:
                sims_scores = pickle.load(handle)

            if similarity_metric == 'hdv':
                sims_scores = {layer: scores[0]['avg'] for layer, scores in sims_scores.items()}
            elif similarity_metric == 'gdv':
                sims_scores = {layer: np.mean(list(scores.values())) for layer, scores in sims_scores.items()}

            metrics_filename = os.path.join(metric_dir, run_name + '.json')
            metric_data, _ = load_metrics(
                metrics_filename, metric=performance_metric,
                test_codes=test_codes, avg_metrics=True
            )

            sims_scores.update(metric_data)
            sims_scores.update({'model': i})
            scores_list.append(sims_scores)
        except Exception as e:
            # Best effort: report the missing/broken run and keep the others.
            print(e)
            continue

    if not scores_list:
        # Without this guard, melt() fails with an unrelated-looking KeyError.
        raise RuntimeError(
            f"No {similarity_metric} results could be loaded for "
            f"'{base_name}0'..'{base_name}{num_models - 1}' in {result_dir}"
        )

    df = pd.DataFrame(scores_list)
    df = df.melt(id_vars=['model', performance_metric], var_name='Layer')

    # rename layers
    df['Layer'] = df['Layer'].map(layer_codes)

    # transform aGDV
    if similarity_metric == 'gdv':
        df['value'] = df['value'] * -1

    if plot:
        fig = px.scatter(
            df, x=performance_metric, y='value', facet_col='Layer', facet_col_wrap=5
        )
        fig.show()
        return None

    return df
    


In [11]:
import os
import pandas as pd
import pickle
from utils.factory import ConfigCreator


def _first_label(text: str, rules, default=None):
    """Return the label of the first ``(substring, label)`` rule whose substring occurs in ``text``."""
    for needle, label in rules:
        if needle in text:
            return label
    return default


def _format_mean_std(row) -> str:
    """Format a row holding 'mean' and 'std' as ``'<mean> ± <std>'`` with two decimals."""
    return f"{row['mean']:.2f} ± {row['std']:.2f}"


def process_similarity_results(
        config_dir: str,
        performance_metric_dir: str,
        similarity_metric_dir: str,
        performance_metric: str,
        similarity_metric: str,
        model_names: list,
        model_codes: dict,
        test_codes: dict,
        avg_metrics: bool=False
    ) -> pd.DataFrame:
    """
    Process and analyze similarity results for models, generating a formatted table.

    Args:
        config_dir (str): Directory containing ``<model_name>.yaml`` configs; the
            ``detector`` attribute of each config selects the layer codes.
        performance_metric_dir (str): Directory containing metric results.
        similarity_metric_dir (str): Directory containing similarity results.
        performance_metric (str): Performance metric to evaluate (e.g., 'f1').
        similarity_metric (str): Similarity metric to evaluate (e.g., 'hdv').
        model_names (list): Run names to process. Each name minus its last two
            characters must be a key of ``model_codes``.
        model_codes (dict): Mapping of model names to human-readable descriptions.
            The Model/Backbone/Weights/Domains columns are derived from
            substrings of these descriptions.
        test_codes (dict): Mapping of test codes for the dataset.
        avg_metrics (bool): If False, use ``get_similarity`` (per-domain scores,
            all runs); if True, use ``get_average_similarity`` (one score per
            layer and run, 5 runs).

    Returns:
        pd.DataFrame: One row per model with the columns 'Model', 'Backbone',
        'Weights', 'Domains', 'F1-Score' and the layers 'C2', 'C3', 'C4', 'P3',
        'P4', 'O3', 'O4', each formatted as ``mean ± std``. The std is the sample
        standard deviation, so it is ``nan`` when only one observation exists.
    """
    dfs = []

    # Process each model
    for model_name in model_names:
        config_file = ConfigCreator.load(os.path.join(config_dir, model_name + '.yaml'))
        detector = config_file.detector

        if not avg_metrics:
            model_df = get_similarity(
                model_name=model_name,
                result_dir=similarity_metric_dir,
                detector=detector,
                performance_metric=performance_metric,
                similarity_metric=similarity_metric,
                metric_dir=performance_metric_dir,
                all_models=True,
                test_codes=test_codes
            )
        else:
            model_df = get_average_similarity(
                model_name=model_name,
                result_dir=similarity_metric_dir,
                detector=detector,
                test_codes=test_codes,
                performance_metric=performance_metric,
                similarity_metric=similarity_metric,
                metric_dir=performance_metric_dir,
                num_models=5
            )
        model_df['model_name'] = model_codes[model_name[:-2]]
        dfs.append(model_df)

    # Combine data
    df = pd.concat(dfs)

    # The long table repeats each performance value once per layer. Aggregating
    # the repeated rows leaves the mean unchanged but shrinks the sample std, so
    # reduce to one row per observation first. The metric value is part of the
    # key so that entries sharing a missing 'Tumortype' are not merged.
    id_columns = [
        col for col in ('model_name', 'run', 'model', 'Tumortype') if col in df.columns
    ]
    per_observation = df.drop_duplicates(subset=id_columns + [performance_metric])

    # Aggregate by model name for performance metric
    agg_df = (
        per_observation
        .groupby(['model_name'])[performance_metric]
        .agg(['mean', 'std'])
        .reset_index()
    )
    agg_df[performance_metric] = agg_df.apply(_format_mean_std, axis=1)
    metric_df = agg_df[['model_name', performance_metric]]

    # Aggregate by model name and layer for similarity metric
    layer_agg_df = df.groupby(['model_name', 'Layer'])['value'].agg(['mean', 'std']).reset_index()
    layer_agg_df['mean_std'] = layer_agg_df.apply(_format_mean_std, axis=1)
    pivot_df = layer_agg_df.pivot(index='model_name', columns='Layer', values='mean_std').reset_index()

    # Merge performance and similarity metrics
    comb_df = metric_df.merge(pivot_df, on='model_name')

    # Descriptive columns parsed from the model description; rule order matters
    # (e.g. 'Yolov7-D6' must be tested before 'Yolov7').
    described = comb_df['model_name']
    comb_df['Model'] = described.apply(lambda x: _first_label(x, (
        ('FCOS', 'FCOS'),
        ('RetinaNet', 'RetinaNet'),
        ('Yolov7-D6', 'Yolov7-D6'),
        ('Yolov7', 'Yolov7'),
    )))
    comb_df['Backbone'] = described.apply(lambda x: _first_label(x, (
        ('50', 'ResNet50'),
        ('18', 'ResNet18'),
        ('Yolov7', 'ELAN'),
    )))
    comb_df['Weights'] = described.apply(lambda x: _first_label(x, (
        ('ImageNet', 'ImageNet'),
        ('Barlow Twins', 'Barlow Twins'),
        ('MoCoV2', 'MoCoV2'),
        ('COCO', 'COCO'),
    )))
    comb_df['Domains'] = described.apply(lambda x: _first_label(x, (
        ('HBC', 'HBC'),
        ('HNSCC', 'HNSCC'),
        ('all', 'all'),
    ), default='n.a.'))
    # Computed for completeness; not part of final_columns below.
    comb_df['Finetuned'] = described.apply(lambda x: '\\checkmark' if 'finetuned' in x else '')
    comb_df['SA'] = described.apply(lambda x: '\\checkmark' if 'SA' in x else '')
    comb_df['FDA'] = described.apply(lambda x: '\\checkmark' if 'FDA' in x else '')

    # Rename and reorder columns
    comb_df = comb_df.rename(columns={performance_metric: 'F1-Score'})
    final_columns = [
        'Model', 'Backbone', 'Weights', 'Domains', 'F1-Score',
        'C2', 'C3', 'C4', 'P3', 'P4', 'O3', 'O4'
    ]
    comb_df = comb_df.sort_values(by='model_name')[final_columns]

    return comb_df


# Get MIDOG results


In [12]:
# Run names to evaluate; the trailing digit is the run index that the loaders
# replace with 0..4 to collect all runs.
model_names = [
    'yolov7_all_0',
    'yolov7_d6_ALL_0'
]

# Description per model, keyed by the run name without its last two characters.
# process_similarity_results derives the Model/Backbone/Weights/Domains columns
# from substrings of these descriptions, so the wording is significant.
model_codes = {
    'yolov7_all': 'Yolov7 COCO (finetuned, all domains)',
    'yolov7_d6_ALL': 'Yolov7-D6 COCO (finetuned, all domains)'
}

## Get MIDOG HDV results

In [13]:
# Inputs for the MIDOG HDV table: model configs are read from `config_dir`,
# metric JSON files from `metric_dir`, pickled similarity scores from `result_dir`.
config_dir = 'optimized_models'
metric_dir = 'results/'
result_dir = 'results/midog'
performance_metric = 'f1'
similarity_metric = 'hdv'

# Uses whichever `model_names` / `model_codes` were assigned most recently in the kernel.
# Runs whose files cannot be loaded are reported on stdout and skipped, so a model
# with a missing run is aggregated over fewer runs than the others.
midog_hdv_results = process_similarity_results(
    config_dir=config_dir,
    performance_metric_dir=metric_dir,
    similarity_metric_dir=result_dir,
    performance_metric=performance_metric,
    similarity_metric=similarity_metric,
    model_names=model_names,
    model_codes=model_codes,
    test_codes=midog_test_codes
)

midog_hdv_results

[Errno 2] No such file or directory: 'results/midog/hdv_yolov7_d6_ALL_4.pkl'


Unnamed: 0,Model,Backbone,Weights,Domains,F1-Score,...,C4,P3,P4,O3,O4
0,Yolov7,ELAN,COCO,all,0.53 ± 0.08,...,0.15 ± 0.12,0.26 ± 0.07,0.27 ± 0.10,0.63 ± 0.03,0.40 ± 0.05
1,Yolov7-D6,ELAN,COCO,all,0.71 ± 0.05,...,0.49 ± 0.06,0.46 ± 0.08,0.48 ± 0.11,0.60 ± 0.03,0.54 ± 0.06


In [14]:
# Print the MIDOG HDV table as LaTeX source, without the row index.
print(midog_hdv_results.to_latex(index=False))

\begin{tabular}{llllllllllll}
\toprule
Model & Backbone & Weights & Domains & F1-Score & C2 & C3 & C4 & P3 & P4 & O3 & O4 \\
\midrule
Yolov7 & ELAN & COCO & all & 0.53 ± 0.08 & 0.19 ± 0.03 & 0.30 ± 0.06 & 0.15 ± 0.12 & 0.26 ± 0.07 & 0.27 ± 0.10 & 0.63 ± 0.03 & 0.40 ± 0.05 \\
Yolov7-D6 & ELAN & COCO & all & 0.71 ± 0.05 & 0.16 ± 0.03 & 0.38 ± 0.06 & 0.49 ± 0.06 & 0.46 ± 0.08 & 0.48 ± 0.11 & 0.60 ± 0.03 & 0.54 ± 0.06 \\
\bottomrule
\end{tabular}



## Get MIDOG GDV results

In [15]:
# Same inputs as the MIDOG HDV table, but reading the 'gdv' pickles.
# get_similarity multiplies GDV values by -1 before they are aggregated.
config_dir = 'optimized_models'
metric_dir = 'results/'
result_dir = 'results/midog'
performance_metric = 'f1'
similarity_metric = 'gdv'

# Uses whichever `model_names` / `model_codes` were assigned most recently in the kernel.
midog_gdv_results = process_similarity_results(
    config_dir=config_dir,
    performance_metric_dir=metric_dir,
    similarity_metric_dir=result_dir,
    performance_metric=performance_metric,
    similarity_metric=similarity_metric,
    model_names=model_names,
    model_codes=model_codes,
    test_codes=midog_test_codes
)

midog_gdv_results

[Errno 2] No such file or directory: 'results/midog/gdv_yolov7_d6_ALL_4.pkl'


Unnamed: 0,Model,Backbone,Weights,Domains,F1-Score,...,C4,P3,P4,O3,O4
0,Yolov7,ELAN,COCO,all,0.53 ± 0.08,...,0.03 ± 0.05,0.07 ± 0.03,0.09 ± 0.03,0.05 ± 0.02,0.08 ± 0.03
1,Yolov7-D6,ELAN,COCO,all,0.71 ± 0.05,...,0.21 ± 0.05,0.15 ± 0.05,0.24 ± 0.05,0.11 ± 0.03,0.18 ± 0.04


In [16]:
# Print the MIDOG GDV table as LaTeX source, without the row index.
print(midog_gdv_results.to_latex(index=False))

\begin{tabular}{llllllllllll}
\toprule
Model & Backbone & Weights & Domains & F1-Score & C2 & C3 & C4 & P3 & P4 & O3 & O4 \\
\midrule
Yolov7 & ELAN & COCO & all & 0.53 ± 0.08 & 0.03 ± 0.01 & 0.07 ± 0.04 & 0.03 ± 0.05 & 0.07 ± 0.03 & 0.09 ± 0.03 & 0.05 ± 0.02 & 0.08 ± 0.03 \\
Yolov7-D6 & ELAN & COCO & all & 0.71 ± 0.05 & 0.02 ± 0.01 & 0.09 ± 0.03 & 0.21 ± 0.05 & 0.15 ± 0.05 & 0.24 ± 0.05 & 0.11 ± 0.03 & 0.18 ± 0.04 \\
\bottomrule
\end{tabular}



# Get Lymph results

In [17]:
# Rebinds `model_names` / `model_codes` for the lymphocyte dataset; the cells
# below read these names, so the MIDOG values are no longer available afterwards.
model_names = [
    'yolov7_HNSCC_0',
]

# Keyed by the run name without its last two characters; the description's
# substrings determine the Model/Backbone/Weights/Domains columns.
model_codes = {
    'yolov7_HNSCC': 'Yolov7 COCO (finetuned, only HNSCC)',
}

## Get Lymph HDV results

In [18]:
# Inputs for the lymphocyte HDV table; metrics and similarity pickles share one directory.
config_dir = 'optimized_models'
metric_dir = 'results/lymph'
result_dir = 'results/lymph'
performance_metric = 'micro_f1_score'
similarity_metric = 'hdv'

# avg_metrics=True routes through get_average_similarity: one averaged similarity
# value per layer and run, paired with the run's overall metric.
lymph_hdv_results = process_similarity_results(
    config_dir=config_dir,
    performance_metric_dir=metric_dir,
    similarity_metric_dir=result_dir,
    performance_metric=performance_metric,
    similarity_metric=similarity_metric,
    model_names=model_names,
    model_codes=model_codes,
    test_codes=lymph_test_codes,
    avg_metrics=True
)

lymph_hdv_results

Unnamed: 0,Model,Backbone,Weights,Domains,F1-Score,...,C4,P3,P4,O3,O4
0,Yolov7,ELAN,COCO,HNSCC,0.63 ± 0.01,...,0.33 ± 0.10,0.41 ± 0.02,0.37 ± 0.04,0.41 ± 0.02,0.35 ± 0.02


In [19]:
# Print the lymphocyte HDV table as LaTeX source, without the row index.
print(lymph_hdv_results.to_latex(index=False))

\begin{tabular}{llllllllllll}
\toprule
Model & Backbone & Weights & Domains & F1-Score & C2 & C3 & C4 & P3 & P4 & O3 & O4 \\
\midrule
Yolov7 & ELAN & COCO & HNSCC & 0.63 ± 0.01 & 0.34 ± 0.03 & 0.36 ± 0.06 & 0.33 ± 0.10 & 0.41 ± 0.02 & 0.37 ± 0.04 & 0.41 ± 0.02 & 0.35 ± 0.02 \\
\bottomrule
\end{tabular}



## Get Lymph GDV results

In [20]:
# Inputs for the lymphocyte GDV table; metrics and similarity pickles share one directory.
config_dir = 'optimized_models'
metric_dir = 'results/lymph'
result_dir = 'results/lymph'
performance_metric = 'micro_f1_score'
similarity_metric = 'gdv'

# avg_metrics=True routes through get_average_similarity, which averages the GDV
# values of each layer and multiplies the result by -1.
lymph_gdv_results = process_similarity_results(
    config_dir=config_dir,
    performance_metric_dir=metric_dir,
    similarity_metric_dir=result_dir,
    performance_metric=performance_metric,
    similarity_metric=similarity_metric,
    model_names=model_names,
    model_codes=model_codes,
    test_codes=lymph_test_codes,
    avg_metrics=True
)

lymph_gdv_results

Unnamed: 0,Model,Backbone,Weights,Domains,F1-Score,...,C4,P3,P4,O3,O4
0,Yolov7,ELAN,COCO,HNSCC,0.63 ± 0.01,...,0.09 ± 0.03,0.16 ± 0.01,0.12 ± 0.01,0.23 ± 0.01,0.15 ± 0.03


In [21]:
# Print the lymphocyte GDV table as LaTeX source, without the row index.
print(lymph_gdv_results.to_latex(index=False))

\begin{tabular}{llllllllllll}
\toprule
Model & Backbone & Weights & Domains & F1-Score & C2 & C3 & C4 & P3 & P4 & O3 & O4 \\
\midrule
Yolov7 & ELAN & COCO & HNSCC & 0.63 ± 0.01 & 0.13 ± 0.01 & 0.10 ± 0.02 & 0.09 ± 0.03 & 0.16 ± 0.01 & 0.12 ± 0.01 & 0.23 ± 0.01 & 0.15 ± 0.03 \\
\bottomrule
\end{tabular}



# Get Astma results

In [22]:
# Rebinds `model_names` / `model_codes` for the Astma dataset; the cells below
# read these names.
model_names = [
    'yolov7_astma_0',
]

# The description contains none of 'HBC', 'HNSCC' or 'all', so the Domains
# column of the resulting table falls back to 'n.a.'.
model_codes = {
    'yolov7_astma': 'Yolov7 COCO (finetuned, None)',
}

## Get Astma HDV results 

In [23]:
# Inputs for the Astma HDV table; metrics and similarity pickles share one directory.
config_dir = 'optimized_models'
metric_dir = 'results/astma'
result_dir = 'results/astma'
performance_metric = 'micro_f1_score'
similarity_metric = 'hdv'

# avg_metrics=True routes through get_average_similarity: one averaged similarity
# value per layer and run, paired with the run's overall metric.
astma_hdv_results = process_similarity_results(
    config_dir=config_dir,
    performance_metric_dir=metric_dir,
    similarity_metric_dir=result_dir,
    performance_metric=performance_metric,
    similarity_metric=similarity_metric,
    model_names=model_names,
    model_codes=model_codes,
    test_codes=astma_test_codes,
    avg_metrics=True
)

astma_hdv_results

Unnamed: 0,Model,Backbone,Weights,Domains,F1-Score,...,C4,P3,P4,O3,O4
0,Yolov7,ELAN,COCO,n.a.,0.94 ± 0.01,...,0.46 ± 0.09,0.41 ± 0.04,0.46 ± 0.02,0.61 ± 0.02,0.49 ± 0.02


In [24]:
# Print the Astma HDV table as LaTeX source, without the row index.
print(astma_hdv_results.to_latex(index=False))

\begin{tabular}{llllllllllll}
\toprule
Model & Backbone & Weights & Domains & F1-Score & C2 & C3 & C4 & P3 & P4 & O3 & O4 \\
\midrule
Yolov7 & ELAN & COCO & n.a. & 0.94 ± 0.01 & 0.43 ± 0.01 & 0.44 ± 0.03 & 0.46 ± 0.09 & 0.41 ± 0.04 & 0.46 ± 0.02 & 0.61 ± 0.02 & 0.49 ± 0.02 \\
\bottomrule
\end{tabular}



## Get Astma GDV results

In [25]:
# Inputs for the Astma GDV table; metrics and similarity pickles share one directory.
config_dir = 'optimized_models'
metric_dir = 'results/astma'
result_dir = 'results/astma'
performance_metric = 'micro_f1_score'
similarity_metric = 'gdv'

# avg_metrics=True routes through get_average_similarity, which averages the GDV
# values of each layer and multiplies the result by -1.
astma_gdv_results = process_similarity_results(
    config_dir=config_dir,
    performance_metric_dir=metric_dir,
    similarity_metric_dir=result_dir,
    performance_metric=performance_metric,
    similarity_metric=similarity_metric,
    model_names=model_names,
    model_codes=model_codes,
    test_codes=astma_test_codes,
    avg_metrics=True
)

astma_gdv_results

Unnamed: 0,Model,Backbone,Weights,Domains,F1-Score,...,C4,P3,P4,O3,O4
0,Yolov7,ELAN,COCO,n.a.,0.94 ± 0.01,...,0.53 ± 0.19,0.49 ± 0.05,0.45 ± 0.02,0.40 ± 0.06,0.56 ± 0.03


In [26]:
# Print the Astma GDV table as LaTeX source, without the row index.
print(astma_gdv_results.to_latex(index=False))

\begin{tabular}{llllllllllll}
\toprule
Model & Backbone & Weights & Domains & F1-Score & C2 & C3 & C4 & P3 & P4 & O3 & O4 \\
\midrule
Yolov7 & ELAN & COCO & n.a. & 0.94 ± 0.01 & 0.55 ± 0.03 & 0.53 ± 0.02 & 0.53 ± 0.19 & 0.49 ± 0.05 & 0.45 ± 0.02 & 0.40 ± 0.06 & 0.56 ± 0.03 \\
\bottomrule
\end{tabular}

