# Write DHS images as HDF5

In order to speed up loading, and avoid this from being a bottleneck during training, we save the DHS images as a HDF5 file. This will be much faster than reading an individual .np file for each image during training. For more information about this, see the [h5py website](https://www.h5py.org/).

In [1]:
import h5py
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import configparser

# Read config file
config = configparser.ConfigParser()
config.read('../config.ini')

DATA_DIR = config['PATHS']['DATA_DIR']

df = pd.read_csv(os.path.join(DATA_DIR, 'dhs_with_imgs.csv'))
img_dir = os.path.join(DATA_DIR, 'dhs_images')

hdf5_path = os.path.join(DATA_DIR, 'dhs_images.h5')

Write as .h5-file. This took me about an hour and a half on Alvis.

In [None]:
with h5py.File(hdf5_path, 'w') as h5f:
    for i, row in tqdm(df.iterrows(), total=len(df)):
        cluster_id = row['cluster_id']
        img_path = os.path.join(img_dir, cluster_id, 'landsat.np')
        try:
            img = np.load(img_path)  # shape: (H, W, 6)
            img = img.astype(np.uint16)  # Convert to uint16
            h5f.create_dataset(str(i), data=img, compression="gzip")
        except Exception as e:
            print(f"Skipping {img_path}: {e}")


100%|██████████| 10000/10000 [03:44<00:00, 44.47it/s]


Sanity check

In [3]:
with h5py.File(hdf5_path, 'r') as h5f:
    print(f"Number of datasets in HDF5 file: {len(h5f)}")
    print(f"Keys in HDF5 file: {list(h5f.keys())[:5]}")  # Show first 5 keys
    print(f"Shape of first dataset: {h5f[list(h5f.keys())[0]].shape}")  # Shape of the first dataset
    print(f"Data type of first dataset: {h5f[list(h5f.keys())[0]].dtype}")  # Data type of the first dataset

Number of datasets in HDF5 file: 68619
Keys in HDF5 file: ['AO.Bengo.71.135', 'AO.Bengo.71.158', 'AO.Bengo.71.169', 'AO.Bengo.71.203', 'AO.Bengo.71.208']
Shape of first dataset: (224, 224, 6)
Data type of first dataset: uint16


In [6]:
from torch.utils.data import Dataset

class RegressionDataset(Dataset):
    def __init__(self, df, hdf5_path, transform=None):
        self.df = df.reset_index(drop=True)
        self.hdf5_path = hdf5_path
        self.transform = transform
        self.h5_file = None  # Will be initialized lazily in __getitem__

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if self.h5_file is None:
            self.h5_file = h5py.File(self.hdf5_path, 'r')

        cluster = self.df.iloc[idx]
        img = self.h5_file[cluster['cluster_id']][:].astype(np.float32)
        target = cluster['iwi'] / 100

        if self.transform:
            img = self.transform(img)

        return img, target

    def __del__(self):
        if self.h5_file is not None:
            self.h5_file.close()


In [7]:
ds = RegressionDataset(df, hdf5_path)

for img, target in tqdm(ds):
    pass  # Just iterating through the dataset to ensure it works

  5%|▌         | 3772/68619 [00:18<05:14, 205.95it/s]
Exception ignored in: <function RegressionDataset.__del__ at 0x15321ac27f70>
Traceback (most recent call last):
  File "/local/tmp.4735263/ipykernel_3048466/2471170545.py", line 27, in __del__
AttributeError: 'RegressionDataset' object has no attribute 'h5_file'


KeyboardInterrupt: 

In [8]:
batch_size = 128
num_workers = 32
from torch.utils.data import DataLoader

dataloader = DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, persistent_workers=True)



In [9]:
for batch in tqdm(dataloader):
    imgs, targets = batch

100%|██████████| 537/537 [00:42<00:00, 12.63it/s]


In [10]:
batch_size = 128
num_workers = 16
from torch.utils.data import DataLoader

dataloader = DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, persistent_workers=True)

for batch in tqdm(dataloader):
    imgs, targets = batch

  0%|          | 0/537 [00:00<?, ?it/s]

100%|██████████| 537/537 [00:33<00:00, 16.23it/s]


In [11]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch

class RegressionDataset(Dataset):
    def __init__(self, df, hdf5_path, transform=None):
        self.df = df.reset_index(drop=True)
        self.hdf5_path = hdf5_path
        self.transform = transform
        self.h5_file = None  # Will be initialized lazily in __getitem__

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if self.h5_file is None:
            self.h5_file = h5py.File(self.hdf5_path, 'r')

        cluster = self.df.iloc[idx]
        img = self.h5_file[cluster['cluster_id']][:].astype(np.float32)
        target = cluster['iwi']
        if self.transform:
            img = self.transform(img)
        return img, target

    def __del__(self):
        if self.h5_file is not None:
            self.h5_file.close()

def get_dataloaders(df, hdf5_path, train_folds, val_fold, test_fold, batch_size=128, num_workers=16):

    # Get the indices for each fold
    train_folds = df[df['cv_fold'].isin(train_folds)].index.tolist()
    val_fold = df[df['cv_fold'] == val_fold].index.tolist()
    test_fold = df[df['cv_fold'] == test_fold].index.tolist()

    train_dataset = RegressionDataset(df=df.iloc[train_folds], hdf5_path=hdf5_path, transform=None)
    val_dataset = RegressionDataset(df=df.iloc[val_fold], hdf5_path=hdf5_path, transform=None)
    test_dataset = RegressionDataset(df=df.iloc[test_fold], hdf5_path=hdf5_path, transform=None)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, persistent_workers=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, persistent_workers=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_dataloader, val_dataloader, test_dataloader

In [12]:
# Create equal-sized folds
folds = ['A', 'B', 'C', 'D', 'E']
df['cv_fold'] = np.nan  # Initialize the cv_fold column

# Generate and shuffle indices
indices = np.arange(len(df))
np.random.shuffle(indices)

# Split indices into equal-sized groups and assign folds
fold_indices = np.array_split(indices, len(folds))
for fold, idx in zip(folds, fold_indices):
    df.loc[idx, 'cv_fold'] = fold

In [13]:
fold = 'A'  # Specify the fold you want to use for testing
batch_size = 128

# Define train, validation, and test folds
test_fold = fold
val_fold = folds[(folds.index(fold) + 1) % len(folds)]
train_folds = [f for f in folds if f not in [test_fold, val_fold]]
print(f"Train folds: {train_folds}, Validation fold: {val_fold}, Test fold: {test_fold}")

# Get dataloaders
train_dataloader, val_dataloader, test_dataloader = get_dataloaders(df, hdf5_path, train_folds, val_fold, test_fold, batch_size=batch_size)

Train folds: ['C', 'D', 'E'], Validation fold: B, Test fold: A


In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

landsat_transform = transforms.Compose([
    transforms.Lambda(lambda x: x * 0.0000275 - 0.2),
    transforms.Lambda(lambda x: torch.clamp(x, 0.0, 0.3)),
    transforms.Lambda(lambda x: x / 0.3)
])

train_transform = transforms.Compose([
    landsat_transform,
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip()
])

for batch in tqdm(train_dataloader):
    imgs, targets = batch
    imgs = train_transform(imgs.to(device, dtype=torch.float32, memory_format=torch.channels_last))

100%|██████████| 322/322 [00:19<00:00, 16.36it/s]
