In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Download the COCO 2017 training image archive (train2017.zip) into the current working directory.
!wget http://images.cocodataset.org/zips/train2017.zip

In [None]:
# Download the COCO 2017 validation image archive (val2017.zip) into the current working directory.
!wget http://images.cocodataset.org/zips/val2017.zip

In [None]:
# Download the COCO 2017 test image archive (test2017.zip) into the current working directory.
!wget http://images.cocodataset.org/zips/test2017.zip

In [None]:
# Extract the three downloaded archives into the current working directory.
# -q keeps unzip quiet (no per-file listing).
# NOTE(review): without -n or -o, unzip asks before overwriting existing files, so
# re-running this cell after a successful extraction may stall - confirm and consider -n.
!unzip -q train2017.zip
!unzip -q val2017.zip
!unzip -q test2017.zip

In [None]:
# Long-format listing of the current working directory, including hidden entries.
!ls -la

In [None]:
import h5py
import numpy as np
import cv2
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
from google.colab import drive
from sklearn.model_selection import train_test_split



# # Create output directories in Drive
# output_base = '/content/drive/MyDrive/coco_colorization'
# os.makedirs(f'{output_base}/train', exist_ok=True)
# os.makedirs(f'{output_base}/val', exist_ok=True)
# os.makedirs(f'{output_base}/test', exist_ok=True)

# print("Output directories created!")

In [None]:
def process_image_corrected(image_path, target_size=(128, 128)):
    """
    Load one image and return it as a normalized float32 LAB array.

    Steps: read with OpenCV (BGR), convert to RGB, resize to ``target_size``
    using area interpolation, convert to LAB, then rescale each channel:

    * L:    value / 255            -> roughly [0, 1]
    * A, B: (value - 128) / 128    -> roughly [-1, 1)

    The divisors assume OpenCV's 8-bit LAB encoding (all channels in 0..255).

    Args:
        image_path: Path of the image file to read.
        target_size: Size tuple handed to ``cv2.resize`` (defaults to 128x128).

    Returns:
        The normalized float32 LAB image, or ``None`` when the file cannot be
        read or any processing step raises (the error is printed, not raised).
    """
    try:
        bgr = cv2.imread(image_path)
        # imread signals an unreadable/missing file by returning None.
        if bgr is None:
            return None

        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        rgb_small = cv2.resize(rgb, target_size, interpolation=cv2.INTER_AREA)

        # Work in float32 from here on so the in-place scaling below is exact
        # with respect to the channel-by-channel arithmetic it replaces.
        lab = cv2.cvtColor(rgb_small, cv2.COLOR_RGB2LAB).astype(np.float32)

        # Lightness channel: 0..255 -> 0..1
        lab[..., 0] /= 255.0
        # Both chroma channels share the same affine map: 0..255 -> -1..~1
        lab[..., 1:] = (lab[..., 1:] - 128.0) / 128.0

        return lab

    except Exception as e:
        # Best-effort by design: a bad file must not abort a long batch run.
        print(f"Error processing {image_path}: {e}")
        return None

In [None]:
# Install h5py into the runtime.
# NOTE(review): h5py is already imported by an earlier cell, so on a fresh kernel this
# install runs after the import - confirm whether this cell is still needed or should move up.
!pip install h5py

In [None]:
# Create the Drive folders used for HDF5 output.
# The processing and verification cells write to / read from
# 'Preprocessed_data_Image_colorizer', so that folder must exist before h5py opens
# files inside it; the original 'coco_colorization_hdf5' folder is still created
# so nothing that relied on it changes.
os.makedirs('/content/drive/MyDrive/coco_colorization_hdf5', exist_ok=True)
os.makedirs('/content/drive/MyDrive/Preprocessed_data_Image_colorizer', exist_ok=True)

In [None]:
def process_and_save_hdf5_optimized(image_paths, output_path, split_name, max_images=None):
    """
    Preprocess images and stream them into a gzip-compressed float16 HDF5 file.

    Every path is run through ``process_image_corrected``. Successful results
    are cast to float16, buffered in memory, and appended in batches to a
    resizable dataset named ``'images'``. Images that fail to process are
    skipped, so the dataset may hold fewer rows than ``len(image_paths)``.

    Args:
        image_paths: Sequence of image file paths to process.
        output_path: Destination HDF5 file. It is opened in ``'w'`` mode
            (an existing file is replaced). A missing parent directory is
            created first.
        split_name: Label used in the progress bar and summary messages.
        max_images: Optional cap on how many paths are processed. A falsy
            value (``None`` or ``0``) means "no cap".

    Returns:
        int: Number of images successfully processed and written.

    Raises:
        ValueError: If ``image_paths`` is empty, or none of the first ten
            paths can be processed.
    """
    if max_images:
        image_paths = image_paths[:max_images]

    # Fail with a clear message instead of an IndexError on image_paths[0].
    if len(image_paths) == 0:
        raise ValueError("image_paths is empty; nothing to process")

    # Probe up to the first ten paths for one usable image. This validates the
    # input early and supplies the per-image shape for the dataset layout.
    sample = None
    for img_path in image_paths[:10]:
        sample = process_image_corrected(img_path)
        if sample is not None:
            break
    if sample is None:
        raise ValueError("Could not process any sample images")

    image_shape = tuple(sample.shape)  # (128, 128, 3) with the default target size

    # h5py cannot create missing parent directories, so make sure it exists.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    batch_size = 2000  # images buffered in memory between writes
    chunk_rows = 500   # images per HDF5 chunk
    success_count = 0

    with h5py.File(output_path, 'w') as f:
        # Start empty and grow along axis 0 as batches are flushed.
        images_dset = f.create_dataset(
            'images',
            shape=(0,) + image_shape,
            maxshape=(None,) + image_shape,
            dtype=np.float16,
            compression='gzip',
            compression_opts=9,
            chunks=(chunk_rows,) + image_shape
        )

        batch = []
        last_index = len(image_paths) - 1

        for i, img_path in enumerate(tqdm(image_paths, desc=f"Processing {split_name}")):
            processed_img = process_image_corrected(img_path)
            if processed_img is not None:
                # float16 halves the storage footprint of the float32 result.
                batch.append(processed_img.astype(np.float16))
                success_count += 1

            # Flush when the buffer is full, and once more at the very end so a
            # partial final batch is not lost.
            if (len(batch) >= batch_size or i == last_index) and batch:
                current_size = images_dset.shape[0]
                new_size = current_size + len(batch)
                images_dset.resize(new_size, axis=0)
                images_dset[current_size:new_size] = np.array(batch)
                print(f"ðŸ’¾ Saved batch of {len(batch)} images (total: {new_size})")
                batch = []

        # Capture the shape while the dataset handle is still valid.
        final_shape = images_dset.shape

    # Report only after the file is closed: the on-disk size is not final
    # until h5py has flushed and closed the file.
    print(f"âœ… Completed {split_name}: {success_count}/{len(image_paths)} images processed")
    print(f"ðŸ“Š Final dataset shape: {final_shape}")
    print(f"ðŸ’¾ File size: {os.path.getsize(output_path) / (1024**3):.2f} GB")

    return success_count

In [None]:
# Get all image paths
def get_image_paths(directory):
    """
    Recursively collect image file paths under ``directory``.

    A file counts as an image when its name ends (case-insensitively) with
    one of: .jpg, .jpeg, .png, .bmp.

    Args:
        directory: Root directory to search.

    Returns:
        list[str]: Matching paths (each joined onto the walked root), sorted
        so the order is the same on every run and filesystem. ``os.walk``
        alone yields entries in an arbitrary, filesystem-dependent order,
        which would make any "first N images" subset non-reproducible.
        A missing or empty directory yields an empty list.
    """
    # str.endswith accepts a tuple, so no inner any() loop is needed.
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

    image_paths = [
        os.path.join(root, file)
        for root, _, files in os.walk(directory)
        for file in files
        if file.lower().endswith(image_extensions)
    ]

    image_paths.sort()
    return image_paths

# Gather the image files of each extracted COCO split (folders are relative to
# the current working directory), then report how many were found per split.
train_paths, val_paths, test_paths = (
    get_image_paths('train2017'),
    get_image_paths('val2017'),
    get_image_paths('test2017'),
)

print(f"Found {len(train_paths)} training images")
print(f"Found {len(val_paths)} validation images")
print(f"Found {len(test_paths)} test images")

In [None]:
# Convert the validation split and write it to Drive as val.h5.
print("\nðŸ“¥ Processing validation images...")
val_count = process_and_save_hdf5_optimized(
    image_paths=val_paths,
    output_path='/content/drive/MyDrive/Preprocessed_data_Image_colorizer/val.h5',
    split_name='val',
)

# Convert the test split and write it to Drive as test.h5.
print("\nðŸ“¥ Processing test images...")
test_count = process_and_save_hdf5_optimized(
    image_paths=test_paths,
    output_path='/content/drive/MyDrive/Preprocessed_data_Image_colorizer/test.h5',
    split_name='test',
)

# Summary of how many images were actually written per split.
print("=" * 60)
print(f"Validation: {val_count} images")
print(f"Test: {test_count} images")

In [None]:
print("ðŸš€ PROCESSING WITH HDF5 (OPTIMIZED STORAGE)")
print("=" * 60)

# Convert the training split and write it to Drive as train.h5.
# The validation and test splits are handled by the preceding cell, so only
# the training set is processed (and reported) here.
print("\nðŸ“¥ Processing training images...")
train_count = process_and_save_hdf5_optimized(
    image_paths=train_paths,
    output_path='/content/drive/MyDrive/Preprocessed_data_Image_colorizer/train.h5',
    split_name='train',
)

print("=" * 60)
print("ðŸŽ‰ ALL PROCESSING COMPLETED!")
print(f"Training: {train_count} images")

In [None]:
#Check just one file quickly
def quick_check(file_path):
    """
    Print basic facts about an HDF5 image file and display its first image.

    Reads the ``'images'`` dataset, prints its shape and dtype, prints the
    min/max of each channel of the first image, then converts that image
    back to RGB and shows it with matplotlib.

    The de-normalization is the exact inverse of ``process_image_corrected``:

    * L:    stored as value / 255          -> multiply by 255
    * A, B: stored as (value - 128) / 128  -> multiply by 128, add 128

    Args:
        file_path: Path to an HDF5 file containing an ``'images'`` dataset.

    Returns:
        None. Output is printed and plotted.
    """
    with h5py.File(file_path, 'r') as f:
        images = f['images']
        print(f"Shape: {images.shape}")
        print(f"Data type: {images.dtype}")

        # Nothing to index or plot if no image was written.
        if images.shape[0] == 0:
            print("Dataset is empty - no sample to display")
            return

        # Inspect a single image only; slicing reads just that row from disk.
        sample = images[0]
        print(f"Sample ranges - L: [{sample[...,0].min():.3f}, {sample[...,0].max():.3f}]")
        print(f"Sample ranges - A: [{sample[...,1].min():.3f}, {sample[...,1].max():.3f}]")
        print(f"Sample ranges - B: [{sample[...,2].min():.3f}, {sample[...,2].max():.3f}]")

        # Undo the normalization back to OpenCV's 8-bit LAB encoding.
        lab_denorm = sample.astype(np.float32)
        # L was divided by 255 when stored, so it must be scaled by 255 here;
        # scaling by 100 would render the image far too dark.
        lab_denorm[..., 0] = lab_denorm[..., 0] * 255.0
        lab_denorm[..., 1] = lab_denorm[..., 1] * 128.0 + 128.0
        lab_denorm[..., 2] = lab_denorm[..., 2] * 128.0 + 128.0
        # Round before the uint8 cast: float16 storage leaves small errors
        # (e.g. 199.95), which a plain cast would truncate to the wrong code.
        lab_uint8 = np.clip(np.rint(lab_denorm), 0, 255).astype(np.uint8)
        rgb = cv2.cvtColor(lab_uint8, cv2.COLOR_LAB2RGB)

        plt.imshow(rgb)
        plt.title("Sample Image (RGB)")
        plt.axis('off')
        plt.show()

# Run the sanity check on every generated file, printing a heading before each.
for heading, h5_path in (
    ("TRAINING SET:", '/content/drive/MyDrive/Preprocessed_data_Image_colorizer/train.h5'),
    ("\nVALIDATION SET:", '/content/drive/MyDrive/Preprocessed_data_Image_colorizer/val.h5'),
    ("\nTEST SET:", '/content/drive/MyDrive/Preprocessed_data_Image_colorizer/test.h5'),
):
    print(heading)
    quick_check(h5_path)