In [1]:
from pathlib import Path
import random
import h5py
from PIL import Image
import pandas as pd
import os
import shutil
import numpy as np
import matplotlib.pyplot as plt

# Seed Python's built-in `random` module so any use of it is repeatable.
random.seed(42)
# Do not truncate long cell values (e.g. file paths) when displaying DataFrames.
pd.set_option("display.max_colwidth", None)

In [28]:
def _find_pcam_h5(h5_paths, tag, search_root):
    """Return the ``.h5`` path whose file name contains ``tag``.

    Matching is done on the file name only, so a parent directory that happens
    to contain ``tag`` cannot produce a false match. Candidates are sorted so
    the pick is deterministic when several files match.

    Raises:
        FileNotFoundError: if no file name in ``h5_paths`` contains ``tag``.
    """
    matches = sorted(p for p in h5_paths if tag in p.name)
    if not matches:
        raise FileNotFoundError(
            f"No .h5 file with '{tag}' in its name found under {search_root}"
        )
    return matches[0]


def make_pcam_data(input_dir, output_dir):
    """Export the PCAM test split from HDF5 to PNG files plus a sample CSV.

    The function searches ``input_dir`` recursively for two ``.h5`` files, one
    whose name contains ``test_x`` (dataset ``"x"``, the images) and one whose
    name contains ``test_y`` (dataset ``"y"``, the labels). Every image is
    written as ``<label>-PCAM-<idx>.png`` into a ``full_imgs`` directory that
    is a sibling of ``input_dir``. A table describing the samples is written to
    ``<output_dir>/pcam_full_samples.csv``.

    WARNING: an existing ``full_imgs`` directory next to ``input_dir`` is
    deleted and recreated on every call.

    Args:
        input_dir: Directory (str or Path) searched recursively for the
            ``.h5`` files.
        output_dir: Directory (str or Path) that receives the CSV; created if
            missing. An empty string means the current working directory.

    Returns:
        None. Results are written to disk.

    Raises:
        FileNotFoundError: if the ``test_x`` or ``test_y`` file is missing.
        ValueError: if the number of images and labels differ (``zip`` with
            ``strict=True``).
    """
    input_dir = Path(input_dir).resolve()
    output_dir = Path(output_dir)

    h5_paths = list(input_dir.rglob("*.h5"))
    # Locate both files before touching the filesystem so a missing input
    # never leaves a half-reset image directory behind.
    images_path = _find_pcam_h5(h5_paths, "test_x", input_dir)
    labels_path = _find_pcam_h5(h5_paths, "test_y", input_dir)

    with h5py.File(images_path, "r") as h5imgs:
        images = h5imgs["x"][:]

    with h5py.File(labels_path, "r") as h5labels:
        labels = h5labels["y"][:]

    # Collapse the label array to 1-D so it can be zipped with the images.
    labels = labels.flatten()

    # Start from an empty image directory: any previous export is removed.
    img_dir = input_dir.parent / "full_imgs"
    if img_dir.exists():
        shutil.rmtree(img_dir)
    img_dir.mkdir(parents=True)

    print(images.shape, type(images))
    print(labels.shape, type(labels))

    rows = []
    for idx, (img, label) in enumerate(zip(images, labels, strict=True)):
        fname = f"{label}-PCAM-{idx}"
        save_name = f"{img_dir}/{fname}.png"
        Image.fromarray(img).save(save_name)
        # "path" is stored as a one-element list, as in the original export
        # format -- presumably consumers expect a list of paths per sample.
        rows.append([fname, label, [save_name]])

    samples = pd.DataFrame(rows, columns=["fname", "orig_label", "path"])
    # Labels other than 0/1 map to NaN.
    samples["label"] = samples["orig_label"].map({1: "TUM", 0: "NORM"})

    output_dir.mkdir(parents=True, exist_ok=True)
    samples.to_csv(output_dir / "pcam_full_samples.csv", index=False)


In [None]:
### Set variables
# Fill in both paths before running the conversion below.

# Directory searched recursively for the PCAM ".h5" files; one file must match
# "test_x" (images) and one must match "test_y" (labels).
h5paths = ""
# Directory that receives "pcam_full_samples.csv".
output_dir = ""

# Uncomment to run. Note: the PNGs are written to a "full_imgs" folder next to
# the input directory, and an existing folder of that name is deleted first.
# make_pcam_data(h5paths, output_dir)