In [88]:
import h5py
from torch.utils.data import Dataset
import h5py
from skimage import color
import numpy as np

In [89]:
class HDF5Dataset(Dataset):
    """Dataset that serves items from one named dataset inside an HDF5 file.

    No file handle is kept between calls: the file is opened once in the
    constructor to read the number of items, and opened again (read-only)
    for every single item lookup.
    """

    def __init__(self, file_path, dataset_name, transform=None):
        """Store the lookup settings and cache the dataset length.

        Args:
            file_path: Path handed to ``h5py.File``.
            dataset_name: Key of the dataset inside the HDF5 file.
            transform: Optional callable applied to each item after it has
                been converted to a numpy array; skipped when falsy.
        """
        self.file_path = file_path
        self.dataset_name = dataset_name
        self.transform = transform

        # Open the file only long enough to find out how many items exist.
        with h5py.File(self.file_path, 'r') as handle:
            self.dataset_len = len(handle[self.dataset_name])

    def __len__(self):
        """Return the item count that was read when the object was built."""
        return self.dataset_len

    def __getitem__(self, idx):
        """Read item ``idx`` from the file and return it, transformed if requested.

        Args:
            idx: Index forwarded unchanged to the underlying HDF5 dataset.

        Returns:
            The item as a numpy array, or whatever ``transform`` returns for it.
        """
        # np.array(...) materialises the data before the file is closed.
        with h5py.File(self.file_path, 'r') as handle:
            sample = np.array(handle[self.dataset_name][idx])

        if not self.transform:
            return sample
        return self.transform(sample)

# Instantiate the custom dataset on the 'x' dataset of the PCam training file.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from a configurable data directory before sharing the notebook.
train_x_path = '/Users/daviddrexlin/Code/Master/data/pcam/camelyonpatch_level_2_split_train_x.h5-002'
hdf5_dataset = HDF5Dataset(train_x_path, dataset_name='x')

In [90]:
def getavgstd(image):
    """Compute the per-channel mean and standard deviation of an image.

    Args:
        image: Array indexable as ``image[:, :, c]`` for ``c`` in 0, 1, 2
            (presumably height x width x channel — verify against caller).

    Returns:
        tuple: ``(avg, std)``, two lists of length three that hold the
        ``np.mean`` and ``np.std`` of channels 0, 1 and 2, in that order.
    """
    # Only the first three channels are considered.
    planes = [image[:, :, c] for c in range(3)]
    avg = [np.mean(plane) for plane in planes]
    std = [np.std(plane) for plane in planes]
    return (avg, std)


In [91]:
# Colour space in which the per-image statistics are computed.
# It is set once, outside the loop, so the name is defined even when the
# dataset is empty (the export cell indexes into it to label the channels).
color_space = "HSV"

# Converter for every supported colour space. "RGB" keeps the image as loaded.
rgb_converters = {
    "RGB": lambda rgb: rgb,
    "LAB": color.rgb2lab,
    "HED": color.rgb2hed,
    "HSV": color.rgb2hsv,
}
if color_space not in rgb_converters:
    # Fail loudly instead of silently producing statistics in the wrong space.
    raise ValueError(
        f"Unsupported color_space {color_space!r}; "
        f"expected one of {sorted(rgb_converters)}"
    )
convert_from_rgb = rgb_converters[color_space]

# One list per channel. Despite the "lab_" prefix these hold the statistics of
# whichever colour space is selected above; the names are kept because the
# export cell reads them.
lab_avg_list = [[] for _ in range(3)]  # per-image channel means
lab_std_list = [[] for _ in range(3)]  # per-image channel standard deviations

for idx in range(len(hdf5_dataset)):
    img = convert_from_rgb(hdf5_dataset[idx])

    avg, std = getavgstd(img)
    for i in range(3):
        lab_avg_list[i].append(avg[i])
        lab_std_list[i].append(std[i])

In [92]:
import scipy.stats as scipy_stats  # Renamed to avoid conflict

# NOTE(review): get_best_fit is defined a second time in the following cell;
# the later definition is the one in effect after a full run.
def get_best_fit(data, distributions, bins=None):
    """Pick the candidate distribution whose fitted PDF best matches the data.

    Every candidate is fitted with its ``scipy.stats`` ``fit`` method. The
    fitted PDF is evaluated at the centres of the histogram bins and compared
    with the density histogram of ``data`` via the sum of squared errors (SSE).

    Args:
        data: Sequence of numbers; it is converted to a flat float array.
        distributions: Iterable of attribute names of ``scipy.stats``
            (for example ``"norm"``).
        bins: Forwarded to ``np.histogram``. ``None`` (default) uses one bin
            per data point.

    Returns:
        The name with the smallest SSE, or ``None`` when no candidate yields
        an SSE below infinity (for example when every SSE is NaN).

    Raises:
        ValueError: If ``data`` is empty.
    """
    samples = np.asarray(data, dtype=float).ravel()
    if samples.size == 0:
        raise ValueError("get_best_fit requires at least one data point")

    # The histogram does not depend on the candidate, so build it only once.
    n_bins = samples.size if bins is None else bins
    density, edges = np.histogram(samples, bins=n_bins, density=True)
    # Each histogram value describes a whole bin, so the PDF has to be sampled
    # at the bin centres to be comparable with it.
    centers = (edges[:-1] + edges[1:]) / 2.0

    best_distribution = None
    best_sse = np.inf  # any finite SSE beats this

    for distribution in distributions:
        dist = getattr(scipy_stats, distribution)
        # fit() returns (*shape_params, loc, scale)
        params = dist.fit(samples)
        fitted = dist.pdf(centers, *params[:-2], loc=params[-2], scale=params[-1])
        sse = np.sum((density - fitted) ** 2)

        if sse < best_sse:
            best_sse = sse
            best_distribution = distribution

    return best_distribution

# Inputs for the export step. lab_avg_list, lab_std_list and color_space are
# expected to exist already (they are produced by the statistics loop).

# Names of the scipy.stats distributions that get_best_fit may choose from.
distributions = ["norm", "laplace"]
# Filled with one summary entry per colour channel.
stats = dict()

In [93]:
import numpy as np
import scipy.stats as scipy_stats
import yaml

# NOTE(review): get_best_fit is also defined in the preceding cell; this later
# definition is the one in effect after a full run.
def get_best_fit(data, distributions, bins=None):
    """Pick the candidate distribution whose fitted PDF best matches the data.

    Every candidate is fitted with its ``scipy.stats`` ``fit`` method. The
    fitted PDF is evaluated at the centres of the histogram bins and compared
    with the density histogram of ``data`` via the sum of squared errors (SSE).

    Args:
        data: Sequence of numbers; it is converted to a flat float array.
        distributions: Iterable of attribute names of ``scipy.stats``
            (for example ``"norm"``).
        bins: Forwarded to ``np.histogram``. ``None`` (default) uses one bin
            per data point.

    Returns:
        The name with the smallest SSE, or ``None`` when no candidate yields
        an SSE below infinity (for example when every SSE is NaN).

    Raises:
        ValueError: If ``data`` is empty.
    """
    samples = np.asarray(data, dtype=float).ravel()
    if samples.size == 0:
        raise ValueError("get_best_fit requires at least one data point")

    # The histogram does not depend on the candidate, so build it only once.
    n_bins = samples.size if bins is None else bins
    density, edges = np.histogram(samples, bins=n_bins, density=True)
    # Each histogram value describes a whole bin, so the PDF has to be sampled
    # at the bin centres to be comparable with it.
    centers = (edges[:-1] + edges[1:]) / 2.0

    best_distribution = None
    best_sse = np.inf  # any finite SSE beats this

    for distribution in distributions:
        dist = getattr(scipy_stats, distribution)
        # fit() returns (*shape_params, loc, scale)
        params = dist.fit(samples)
        fitted = dist.pdf(centers, *params[:-2], loc=params[-2], scale=params[-1])
        sse = np.sum((density - fitted) ** 2)

        if sse < best_sse:
            best_sse = sse
            best_distribution = distribution

    return best_distribution

# Inputs for the export step. lab_avg_list, lab_std_list and color_space are
# expected to exist already (they are produced by the statistics loop).

# Names of the scipy.stats distributions that get_best_fit may choose from.
distributions = ["norm", "laplace"]
# Filled with one summary entry per colour channel.
stats = dict()

In [94]:
# Summarise every colour channel and write the result to a YAML file.
for i, (avg_list, std_list) in enumerate(zip(lab_avg_list, lab_std_list)):
    # A channel is labelled with the matching letter of the colour-space name.
    channel = color_space[i]

    # Best-fitting distribution for the per-image means and for the
    # per-image standard deviations of this channel.
    best_fit = {
        "avg": get_best_fit(avg_list, distributions),
        "std": get_best_fit(std_list, distributions),
    }

    stats[channel] = {
        key: {
            # float() converts numpy scalars to native Python floats for YAML.
            "mean": round(float(np.mean(values)), 3),
            "std": round(float(np.std(values)), 3),
            "distribution": best_fit[key],
        }
        for key, values in (("avg", avg_list), ("std", std_list))
    }

yaml_save_path = "./dataset_statistics.yaml"
with open(yaml_save_path, "w") as f:
    yaml.dump(stats, f)

print(f"Dataset statistics saved in {yaml_save_path}")


Dataset statistics saved in ./dataset_statistics.yaml
