In [None]:
import os
import numpy as np

input_dir = "D:/BBBC021/singlecell/singh_cp_pipeline_singlecell_images"


def find_first_npy(search_dir):
    """Return the path of the first ``.npy`` file found below ``search_dir``.

    Directories are visited in ``os.walk`` order and the search stops at the
    first match, so only one file is ever touched.

    Args:
        search_dir: Directory to search recursively.

    Returns:
        The joined path of the first file whose name ends with ``.npy``, or
        ``None`` when there is no such file (``os.walk`` yields nothing for a
        missing directory, so that case also returns ``None``).
    """
    for root, _dirs, files in os.walk(search_dir):
        for candidate in files:
            if candidate.endswith(".npy"):
                return os.path.join(root, candidate)
    return None


# Locate one sample image so its layout can be inspected before bulk processing.
image_path = find_first_npy(input_dir)
found_image = image_path is not None

if found_image:
    image = np.load(image_path)
    file = os.path.basename(image_path)
    # NOTE: the search does not check folder depth; the file may live at any level.
    print(f"\nFound .npy in second-level folder: {file}")
    print(f"Image Path: {image_path}")
    print(f"Image Shape: {image.shape}")
    print(f"Image Data Type: {image.dtype}")
    print(f"Min Pixel Value: {image.min()}")
    print(f"Max Pixel Value: {image.max()}")
else:
    # Without this message a wrong path looks the same as a cell that never ran.
    print(f"No .npy files found under {input_dir}; check that the path exists.")


Found .npy in second-level folder: B02_s1_w16F89C55C-7808-4136-82E4-E066F8E3CB10_0.npy
Image Path: D:/BBBC021/singlecell/singh_cp_pipeline_singlecell_images\B02_s1_w16F89C55C-7808-4136-82E4-E066F8E3CB10\B02_s1_w16F89C55C-7808-4136-82E4-E066F8E3CB10_0.npy
Image Shape: (68, 68, 3)
Image Data Type: uint16
Min Pixel Value: 608
Max Pixel Value: 8352


In [None]:
import os
import h5py
import numpy as np
from tqdm import tqdm

input_dir = "D:/BBBC021/singlecell/singh_cp_pipeline_singlecell_images"
# save in BBBC021 directory
output_file = "D:/BBBC021/BBBC021_dataset.h5"


def collect_npy_files(search_dir):
    """Return ``(paths, names)`` for every ``.npy`` file below ``search_dir``.

    The walk is sorted (directories and file names) so the resulting order,
    and therefore the row order in the HDF5 file, does not depend on the
    order in which the filesystem happens to list entries.

    Args:
        search_dir: Directory to search recursively.

    Returns:
        A pair of equally long lists: full paths, and the matching bare file
        names. Both are empty when nothing is found.
    """
    paths = []
    names = []
    for root, dirs, files in os.walk(search_dir):
        dirs.sort()  # in-place sort controls the order os.walk descends in
        for file in sorted(files):
            if file.endswith('.npy'):
                paths.append(os.path.join(root, file))
                names.append(file)  # only the file name
    return paths, names


def check_image(data, file_path, expected_shape, expected_dtype):
    """Raise ``ValueError`` if ``data`` cannot be stored losslessly as one row.

    Args:
        data: Array loaded from ``file_path``.
        file_path: Path used in the error message.
        expected_shape: Shape every stored image must have.
        expected_dtype: Dtype of the target dataset; ``data`` must be safely
            castable to it.

    Raises:
        ValueError: On a shape mismatch or a lossy dtype conversion.
    """
    if data.shape != tuple(expected_shape):
        raise ValueError(
            f"{file_path}: expected shape {tuple(expected_shape)}, got {data.shape}"
        )
    if not np.can_cast(data.dtype, expected_dtype, casting='safe'):
        raise ValueError(
            f"{file_path}: dtype {data.dtype} cannot be safely stored as "
            f"{np.dtype(expected_dtype)}"
        )


# Build the full list of .npy paths and names up front so the count is known.
file_paths, file_names = collect_npy_files(input_dir)

total_files = len(file_paths)
print(f"Found {total_files} .npy files.")

# Define expected image shape and data type (from your example)
img_shape = (68, 68, 3)
img_dtype = np.uint16

index = 0  # number of images written so far
if total_files == 0:
    # Opening the output in 'w' mode would truncate an existing dataset,
    # so skip the write entirely when there is nothing to store.
    print(f"No .npy files under {input_dir}; {output_file} was not written.")
else:
    with h5py.File(output_file, 'w') as h5f:
        # The file count is known, so allocate both datasets at full size
        # instead of resizing them once per image.
        image_dataset = h5f.create_dataset(
            'images',
            shape=(total_files, *img_shape),
            dtype=img_dtype,
            chunks=(1, *img_shape),  # one chunk per image
            compression='gzip'
        )

        name_dataset = h5f.create_dataset(
            'image_names',
            shape=(total_files,),
            dtype=h5py.string_dtype(),
            compression='gzip'
        )

        # Load, validate and write the files one by one to keep memory use flat.
        for file_path, file_name in tqdm(zip(file_paths, file_names), total=total_files, desc="Processing Images", unit="file"):
            data = np.load(file_path)
            check_image(data, file_path, img_shape, img_dtype)

            # Store the image and its name in the same row
            image_dataset[index] = data
            name_dataset[index] = file_name

            index += 1

    print(f"Stored {index} images in {output_file}")

Found 488396 .npy files.


Processing Images: 100%|██████████| 488396/488396 [29:15<00:00, 278.14file/s]


Stored 488396 images in BBBC021_dataset.h5
