# Import & config

In [1]:
%load_ext autoreload
%autoreload 2
import os
os.chdir('C:\\Users\\Usuario\\TFG\\digipanca\\')

In [2]:
import torch
from scripts.neweval import load_trained_model
from src.utils.config import load_config
import csv
from tqdm.notebook import tqdm
from src.data.dataset2d import PancreasDataset2D
from src.data.dataset3d import PancreasDataset3D
from src.metrics.sma import SegmentationMetricsAccumulator as SMA
from src.training.setup.transforms_factory import get_transforms
from src.training.setup.dataset_factory import get_dataset
from torch.utils.data import DataLoader
from src.utils.data import get_patients_in_processed_folder
import torch.nn.functional as F

# __Classes__

## __Base class: Evaluator__

In [23]:
class Evaluator:
    def __init__(self, model, config, test_dir, device):
        self.model = model.to(device)
        self.config = config
        self.device = device
        self.test_dir = test_dir
        self.transform = get_transforms(config)
        self.sma_patient = SMA(include_background=False)
        self.sma_global = SMA(include_background=False)

    def _export_results_to_csv(self, output_folder, metrics, cm):
        os.makedirs(output_folder, exist_ok=True)
        def write_csv(file_path, headers, data_dict, fields):
            with open(file_path, mode="w", newline="") as file:
                writer = csv.writer(file)
                writer.writerow(headers)
                for pid, pdata in data_dict.items():
                    writer.writerow([pid] + [pdata[field] for field in fields])
    
        # Export metrics
        metrics_file = os.path.join(output_folder, 'evaluation_metrics.csv')
        metric_headers = [
            'patient_id',
            'dice_class_1', 'dice_class_2', 'dice_class_3', 'dice_class_4', 'dice_mean'
        ]
        metric_fields = ['dice_class_1', 'dice_class_2', 'dice_class_3', 'dice_class_4', 'dice']
        write_csv(metrics_file, metric_headers, metrics, metric_fields)
    
        # Export confusion matrix
        cm_file = os.path.join(output_folder, 'evaluation_cm.csv')
        cm_headers = [
            'patient_id',
            'tp_class_1', 'tp_class_2', 'tp_class_3', 'tp_class_4', 'tp',
            'fp_class_1', 'fp_class_2', 'fp_class_3', 'fp_class_4', 'fp',
            'fn_class_1', 'fn_class_2', 'fn_class_3', 'fn_class_4', 'fn'
        ]
        cm_fields = [
            'tp_class_1', 'tp_class_2', 'tp_class_3', 'tp_class_4', 'tp',
            'fp_class_1', 'fp_class_2', 'fp_class_3', 'fp_class_4', 'fp',
            'fn_class_1', 'fn_class_2', 'fn_class_3', 'fn_class_4', 'fn'
        ]
        write_csv(cm_file, cm_headers, cm, cm_fields)

        print(f"Results saved in: {output_folder}")

    def evaluate(self, patient_ids=None, csv_folder=None):

        if patient_ids is None:
            patient_ids = get_patients_in_processed_folder(self.test_dir)
        
        loop = tqdm(
            patient_ids,
            colour="red",
            leave=True
        )
        loop.set_description(f"Evaluating patients")

        all_metrics, all_cms = {}, {}
        
        for patient_id in loop:
            p_metrics, p_cm = self.evaluate_patient(patient_id)
            all_metrics[patient_id] = p_metrics
            all_cms[patient_id] = p_cm

        # Global results
        all_metrics['global'] = self.sma_global.aggregate()
        all_cms['global'] = self.sma_global.aggregate_global_cm()

        # Save data on CSV file if specified
        if csv_folder is not None:
            self._export_results_to_csv(csv_folder, all_metrics, all_cms)

        return all_metrics, all_cms

    def evaluate_patient(self, patient_id):
        raise NotImplementedError("Implemented in subclasses")

## __Subclasses: 2D and 3D__

In [24]:
class Evaluator2D(Evaluator):
    def evaluate_patient(self, patient_id):
        self.model.eval()

        p_dataset = PancreasDataset2D(
            data_dir=self.test_dir,
            transform=self.transform,
            load_into_memory=False,
            patient_ids=[patient_id],
            verbose=False
        );

        p_dl = DataLoader(
            p_dataset,
            batch_size=self.config['data']['batch_size'],
            shuffle=False,
            num_workers=self.config['data']['num_workers'],
            pin_memory=True
        )

        patient_loop = tqdm(
            p_dl,
            leave=False,
            colour="blue"
        )
        patient_loop.set_description(f"Patient {patient_id}")

        all_preds = []
        all_gts = []

        with torch.no_grad():
            for images, masks, _ in patient_loop:
                images, masks = images.to(self.device), masks.to(self.device)

                outputs = self.model(images)
                
                if isinstance(outputs, dict):
                    outputs = outputs["out"]

                all_preds.append(outputs)
                all_gts.append(masks)

        # Concatenate predictions and ground truths (from 2D slices to a single 3D volume)
        all_preds = torch.cat(all_preds, dim=0).permute(1, 0, 2, 3).unsqueeze(0)
        all_gts = torch.cat(all_gts, dim=0).unsqueeze(0)

        # Update metrics
        _ = self.sma_patient.update(all_preds, all_gts) # Patient accumulator
        _ = self.sma_global.update(all_preds, all_gts)  # Global accumulator

        # Get aggregated scores and confusion matrix
        p_metrics = self.sma_patient.aggregate()
        p_cm = self.sma_patient.aggregate_global_cm()
        
        self.sma_patient.reset() # Reset patient accumulator

        return p_metrics, p_cm

In [25]:
class Evaluator3D(Evaluator):
    def evaluate_patient(self, patient_id):
        self.model.eval()

        p_dataset = PancreasDataset3D(
            data_dir=self.test_dir,
            transform=self.transform,
            load_into_memory=False,
            patient_ids=[patient_id],
            verbose=False
        );

        p_dl = DataLoader(
            p_dataset,
            batch_size=self.config['data']['batch_size'],
            shuffle=False,
            num_workers=self.config['data']['num_workers'],
            pin_memory=True
        )

        patient_loop = tqdm(
            p_dl,
            leave=False,
            colour="blue"
        )
        patient_loop.set_description(f"Patient {patient_id}")

        all_preds = []
        all_slices = p_dataset.get_patient_subvolumes_slices(patient_id)
        D = all_slices[-1][1] + 1 # add 1 to the total number of slices

        with torch.no_grad():
            for images, masks, _ in patient_loop:
                images, masks = images.to(self.device), masks.to(self.device)

                outputs = self.model(images)
                
                if isinstance(outputs, dict):
                    outputs = outputs["out"]

                all_preds.append(F.softmax(outputs, dim=1))

        # Post-process: get a single 3D volume for the patient
        all_preds = torch.cat(all_preds, dim=0)
        print(all_preds.shape)
        C, _, H, W = all_preds.shape[1:]
        sum_probs = torch.zeros((C, D, H, W), dtype=torch.float64)
        count = torch.zeros((D, H, W), dtype=torch.int8)

        for i, (start, end) in enumerate(all_slices):
            sum_probs[:, start:end+1, :, :] += all_preds[i]
            count[start:end+1, :, :] += 1

        avg_probs = sum_probs / count.unsqueeze(0)
        pred_vol = avg_probs.unsqueeze(0) # B, C, D, H, W

        # Get mask reconstruction
        _, recon_mask = p_dataset.get_patient_volume(patient_id)

        # Update metrics
        _ = self.sma_patient.update(pred_vol, recon_mask) # Patient accumulator
        _ = self.sma_global.update(pred_vol, recon_mask)  # Global accumulator

        # Get aggregated scores and confusion matrix
        p_metrics = self.sma_patient.aggregate()
        p_cm = self.sma_patient.aggregate_global_cm()
        
        self.sma_patient.reset() # Reset patient accumulator

        return p_metrics, p_cm

# __Testing__

## __2D__

In [4]:
config = load_config('configs/experiments/deep_aug_5.yaml')
model_path = 'experiments/deep_aug/deep_aug_20250415_215856/checkpoints/best_model_epoch60.pth'
model = load_trained_model(config, model_path)
config_device = config['training']['device']
device = torch.device(config_device if torch.cuda.is_available() else "cpu")
test_dir = 'data/processed/2d/train/'
patient_ids = ["rtum79", "rtum1", "rtum33", "rtum3", "rtum20", "rtum70", "rtum19", "rtum26", "rtum13", "rtum71", "rtum87", "rtum69", "rtum58", "rtum82", "rtum86", "rtum68", "rtum4", "rtum81"]

In [26]:
evaluator = Evaluator2D(model, config, test_dir, device)

In [16]:
metrics_2d, cm_2d = evaluator.evaluate_patient('rtum1')

  0%|          | 0/23 [00:00<?, ?it/s]

In [10]:
print(metrics_2d)
print(cm_2d)

{'dice_class_1': 0.6346235275268555, 'dice_class_2': 0.0, 'dice_class_3': 0.7477684617042542, 'dice_class_4': 0.73532634973526, 'iou_class_1': 0.4647974371910095, 'iou_class_2': 0.0, 'iou_class_3': 0.5971487164497375, 'iou_class_4': 0.5814356207847595, 'precision_class_1': 0.5534848570823669, 'precision_class_2': 0.0, 'precision_class_3': 0.6505431532859802, 'precision_class_4': 0.7012190818786621, 'recall_class_1': 0.7436378598213196, 'recall_class_2': 0.0, 'recall_class_3': 0.8791614174842834, 'recall_class_4': 0.7729211449623108, 'dice': 0.52942955493927, 'iou': 0.41084545850753784, 'precision': 0.4763117730617523, 'recall': 0.5989301204681396}
{'tp_class_1': 21799.0, 'tp_class_2': 0.0, 'tp_class_3': 36232.0, 'tp_class_4': 40954.0, 'fp_class_1': 17586.0, 'fp_class_2': 69.0, 'fp_class_3': 19463.0, 'fp_class_4': 17450.0, 'fn_class_1': 7515.0, 'fn_class_2': 3250.0, 'fn_class_3': 4980.0, 'fn_class_4': 12032.0, 'tn_class_1': 5916876.0, 'tn_class_2': 5960457.0, 'tn_class_3': 5903101.0, 't

In [27]:
export_folder = 'test_evaluator'
all_metrics_2d, all_cms_2d = evaluator.evaluate(patient_ids, export_folder)

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

Results saved in: test_evaluator


In [40]:
print(all_metrics_2d)

{'rtum79': {'dice_class_1': 0.823541522026062, 'dice_class_2': 0.39977914094924927, 'dice_class_3': 0.8402805924415588, 'dice_class_4': 0.7739779949188232, 'iou_class_1': 0.700017511844635, 'iou_class_2': 0.24982747435569763, 'iou_class_3': 0.7245551347732544, 'iou_class_4': 0.6312920451164246, 'precision_class_1': 0.8946693539619446, 'precision_class_2': 0.2894676625728607, 'precision_class_3': 0.8961020112037659, 'precision_class_4': 0.868422269821167, 'recall_class_1': 0.7628903388977051, 'recall_class_2': 0.6459342241287231, 'recall_class_3': 0.7910060286521912, 'recall_class_4': 0.6980611681938171, 'dice': 0.7093948125839233, 'iou': 0.5764230489730835, 'precision': 0.7371653318405151, 'recall': 0.7244729399681091}, 'rtum1': {'dice_class_1': 0.6346235275268555, 'dice_class_2': 0.0, 'dice_class_3': 0.7477684617042542, 'dice_class_4': 0.73532634973526, 'iou_class_1': 0.4647974371910095, 'iou_class_2': 0.0, 'iou_class_3': 0.5971487164497375, 'iou_class_4': 0.5814356207847595, 'precisi

## __Process output__

In [46]:
def mostrar_dice_pacientes(resultados):
    encabezado = f"{'Paciente':<10} {'Class 1':>10} {'Class 2':>10} {'Class 3':>10} {'Class 4':>10} {'Mean Dice':>12}"
    print(encabezado)
    print("-" * len(encabezado))
    for paciente, valores in resultados.items():
        c1 = valores['dice_class_1']
        c2 = valores['dice_class_2']
        c3 = valores['dice_class_3']
        c4 = valores['dice_class_4']
        mean_dice = valores['dice']
        print(f"{paciente:<10} {c1:10.4f} {c2:10.4f} {c3:10.4f} {c4:10.4f} {mean_dice:12.4f}")

def mostrar_estadisticas(resultados):
    import math
    claves = ['dice_class_1', 'dice_class_2', 'dice_class_3', 'dice_class_4', 'dice']
    datos = {clave: [] for clave in claves}

    for valores in resultados.values():
        for clave in claves:
            datos[clave].append(valores[clave])

    def media(lista):
        return sum(lista) / len(lista)

    def std(lista):
        m = media(lista)
        return math.sqrt(sum((x - m) ** 2 for x in lista) / len(lista))

    encabezado = f"{'Métrica':<15} {'Media':>10} {'Desviación Std':>20}"
    print("\n" + encabezado)
    print("-" * len(encabezado))
    for clave in claves:
        m = media(datos[clave])
        s = std(datos[clave])
        print(f"{clave:<15} {m:10.4f} {s:20.4f}")

In [44]:
mostrar_dice_pacientes(all_metrics_2d)

Paciente      Class 1    Class 2    Class 3    Class 4    Mean Dice
-------------------------------------------------------------------
rtum79         0.8235     0.3998     0.8403     0.7740       0.7094
rtum1          0.6346     0.0000     0.7478     0.7353       0.5294
rtum33         0.7226     0.7411     0.7933     0.8568       0.7784
rtum3          0.8011     0.5883     0.8437     0.7235       0.7391
rtum20         0.6590     0.4207     0.8256     0.8223       0.6819
rtum70         0.5661     0.0000     0.8074     0.7640       0.5344
rtum19         0.6933     0.0000     0.7513     0.7544       0.5497
rtum26         0.8797     0.8762     0.8856     0.8745       0.8790
rtum13         0.8590     0.0000     0.8214     0.8076       0.6220
rtum71         0.8004     0.0000     0.8434     0.7481       0.5980
rtum87         0.7669     0.0000     0.8061     0.8165       0.5974
rtum69         0.8496     0.1628     0.8490     0.7808       0.6606
rtum58         0.5213     0.2777     0.8670     

In [47]:
mostrar_estadisticas(all_metrics_2d)


Métrica              Media       Desviación Std
-----------------------------------------------
dice_class_1        0.7331               0.1056
dice_class_2        0.3090               0.2740
dice_class_3        0.8205               0.0358
dice_class_4        0.7812               0.0453
dice                0.6610               0.0858


# __3D__

In [52]:
config_3d = load_config('configs/experiments/unet3d_4.yaml')
model_path_3d = 'experiments/unet3d/unet3d_20250423_064027/fold_1/checkpoints/best_model_epoch86.pth'
model_3d = load_trained_model(config_3d, model_path_3d)
config_device_3d = config_3d['training']['device']
test_dir_3d = 'data/processed/3d/train/'
device_3d = torch.device(config_device_3d if torch.cuda.is_available() else "cpu")

In [50]:
patient_ids_3d = [
    "rtum1",
    "rtum13",
    "rtum19",
    "rtum20",
    "rtum26",
    "rtum3",
    "rtum33",
    "rtum4",
    "rtum58",
    "rtum68",
    "rtum69",
    "rtum70",
    "rtum71",
    "rtum79",
    "rtum81",
    "rtum82",
    "rtum86",
    "rtum87"
]

In [53]:
evaluator_3d = Evaluator3D(model_3d, config_3d, test_dir_3d, device_3d)

In [54]:
metrics_3d, cm_3d = evaluator_3d.evaluate_patient('rtum1')

  0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([2, 5, 64, 256, 256])


In [55]:
print(metrics_3d)
print(cm_3d)

{'dice_class_1': 0.7761029601097107, 'dice_class_2': 0.6096978187561035, 'dice_class_3': 0.8397595882415771, 'dice_class_4': 0.7756595611572266, 'iou_class_1': 0.6341243386268616, 'iou_class_2': 0.4385361969470978, 'iou_class_3': 0.7237806916236877, 'iou_class_4': 0.6335326433181763, 'precision_class_1': 0.6756852269172668, 'precision_class_2': 0.5611901879310608, 'precision_class_3': 0.8118982315063477, 'precision_class_4': 0.7118887305259705, 'recall_class_1': 0.9115780591964722, 'recall_class_2': 0.6673846244812012, 'recall_class_3': 0.8696010708808899, 'recall_class_4': 0.8519797921180725, 'dice': 0.7503049969673157, 'iou': 0.60749351978302, 'precision': 0.690165638923645, 'recall': 0.8251358866691589}
{'tp_class_1': 26722.0, 'tp_class_2': 2169.0, 'tp_class_3': 35838.0, 'tp_class_4': 45143.0, 'fp_class_1': 12826.0, 'fp_class_2': 1696.0, 'fp_class_3': 8303.0, 'fp_class_4': 18270.0, 'fn_class_1': 2592.0, 'fn_class_2': 1081.0, 'fn_class_3': 5374.0, 'fn_class_4': 7843.0, 'tn_class_1': 

In [56]:
all_metrics_3d, all_cms_3d = evaluator_3d.evaluate(patient_ids)

  0%|          | 0/18 [00:00<?, ?it/s]

0it [00:00, ?it/s]

ValueError: No sub-volumes found for patient ID: rtum79

# __Implementation test__

In [18]:
from src.evaluation import Evaluator2D
from src.utils.export import write_csv_from_dict

In [19]:
my_evaluator2d = Evaluator2D(model, config, test_dir, device)

In [7]:
metrics_2d, cm_2d = my_evaluator2d.evaluate_patient('rtum1')

                                                                                                                                                           2.03s/it][0m

In [8]:
print(metrics_2d)
print(cm_2d)

{'dice_class_1': 0.6346235275268555, 'dice_class_2': 0.0, 'dice_class_3': 0.7477684617042542, 'dice_class_4': 0.73532634973526, 'iou_class_1': 0.4647974371910095, 'iou_class_2': 0.0, 'iou_class_3': 0.5971487164497375, 'iou_class_4': 0.5814356207847595, 'precision_class_1': 0.5534848570823669, 'precision_class_2': 0.0, 'precision_class_3': 0.6505431532859802, 'precision_class_4': 0.7012190818786621, 'recall_class_1': 0.7436378598213196, 'recall_class_2': 0.0, 'recall_class_3': 0.8791614174842834, 'recall_class_4': 0.7729211449623108, 'dice': 0.52942955493927, 'iou': 0.41084545850753784, 'precision': 0.4763117730617523, 'recall': 0.5989301204681396}
{'tp_class_1': 21799.0, 'tp_class_2': 0.0, 'tp_class_3': 36232.0, 'tp_class_4': 40954.0, 'fp_class_1': 17586.0, 'fp_class_2': 69.0, 'fp_class_3': 19463.0, 'fp_class_4': 17450.0, 'fn_class_1': 7515.0, 'fn_class_2': 3250.0, 'fn_class_3': 4980.0, 'fn_class_4': 12032.0, 'tn_class_1': 5916876.0, 'tn_class_2': 5960457.0, 'tn_class_3': 5903101.0, 't

In [20]:
export_folder = 'test_evaluator'
all_metrics_2d, all_cms_2d = my_evaluator2d.evaluate(patient_ids, export_folder)

Evaluating patients:   0%|[31m                                                                                                          [0m| 0/18 [00:00<?, ?it/s][0m
[A%|[34m                                                                                                                               [0m| 0/26 [00:00<?, ?it/s][0m
[Aient rtum79:   0%|[34m                                                                                                               [0m| 0/26 [00:00<?, ?it/s][0m
[Aient rtum79:   4%|[34m███▉                                                                                                   [0m| 1/26 [00:05<02:13,  5.35s/it][0m
[Aient rtum79:   8%|[34m███████▉                                                                                               [0m| 2/26 [00:07<01:18,  3.25s/it][0m
[Aient rtum79:  12%|[34m███████████▉                                                                                           [0m| 3/26 [00:09<01:07,  

Results saved in: test_evaluator



