# Write DHS images as MMAP

To speed up loading and keep it from becoming a bottleneck during training, we save the DHS images as a memory map (MMAP). Reading from the MMAP is much faster than reading an individual .np file for each image during training. The MMAP takes up more storage, but given the speed-up, it is a worthwhile trade-off. For more information, see the [MMAP Ninja library](https://github.com/hristo-vrigazov/mmap.ninja?tab=readme-ov-file).

In [3]:
import mmap_ninja
from mmap_ninja.ragged import RaggedMmap
import numpy as np
import os
import pandas as pd
import configparser
from tqdm import tqdm

# Load the project settings; DATA_DIR is read from the [PATHS] section
config = configparser.ConfigParser()
config.read('../config.ini')
DATA_DIR = config['PATHS']['DATA_DIR']

# Load the DHS table that lives directly under DATA_DIR
dhs_csv_path = os.path.join(DATA_DIR, 'dhs_with_imgs.csv')
df = pd.read_csv(dhs_csv_path)

Get the paths to the DHS images

In [2]:
# One image file per cluster: <DATA_DIR>/dhs_images/<cluster_id>/landsat.np,
# listed in the same order as the rows of `df`
dhs_img_paths = [
    os.path.join(DATA_DIR, 'dhs_images', cluster_id, 'landsat.np')
    for cluster_id in df['cluster_id']
]
print(f'Found {len(dhs_img_paths)} DHS images')

Found 68619 DHS images


Write the images as a memory map. This will take a couple of minutes.

In [None]:
# One-off conversion (once per project): stream every image file into a ragged memory map
ragged_mmap_dir = os.path.join(DATA_DIR, 'dhs_images_ragged_mmap')  # where the memory map is persisted
image_stream = (np.load(img_path) for img_path in dhs_img_paths)  # lazy: one image loaded at a time

RaggedMmap.from_generator(
    out_dir=ragged_mmap_dir,
    sample_generator=image_stream,
    batch_size=1024,  # maximum number of samples kept in memory before flushing to disk
    verbose=True,
)

print('Memory map created successfully.')

0it [00:00, ?it/s]

29681it [04:47, 216.39it/s]

Test loading and iterating over the MMAP. It should now take less than a second to iterate through the whole dataset.

In [13]:
import torch
# `np_ninja` must be imported here: this is the first cell that uses it, and
# without the import a fresh kernel fails with NameError on the next statement.
from mmap_ninja import numpy as np_ninja

# Open the memory map
# NOTE(review): 'dhs_images_mmap' is not the directory written by the RaggedMmap
# cell above ('dhs_images_ragged_mmap'); it is assumed to be a fixed-shape
# mmap_ninja numpy memory map created outside this notebook -- confirm.
images_mmap = np_ninja.open_existing(os.path.join(DATA_DIR, 'dhs_images_mmap'))

# Touch every image once, in storage order, to time a full sequential pass
for i in tqdm(range(len(images_mmap))):
    img: np.ndarray = images_mmap[i]
    img_tensor = torch.from_numpy(img)

  img_tensor = torch.from_numpy(img)
100%|██████████| 68619/68619 [00:00<00:00, 321951.23it/s]


In [16]:
import torch
from mmap_ninja import numpy as np_ninja

# Open the memory map
images_mmap = np_ninja.open_existing(os.path.join(DATA_DIR, 'dhs_images_mmap'))

# Shuffle the rows for demonstration, but KEEP the original index labels: each
# label is used below as the row's position in the memory map (this assumes the
# memory map was written in `df` row order -- confirm). Dropping the labels with
# reset_index(drop=True) would turn the loop into a plain sequential read.
s_df = df.sample(frac=1)

batch_size = 32  # Define your batch size
batch_list = []

batch_tensor = torch.empty((batch_size, 224, 224, 6), dtype=torch.float32)  # adjust dtype & shape

# df_i: position in the shuffled frame; img_i: slot of that row's image in the memory map
for df_i, img_i in enumerate(tqdm(s_df.index)):
    img: np.ndarray = images_mmap[img_i]
    img_tensor = torch.from_numpy(img)

    # Fill the batch buffer slot by slot (values are cast to the buffer's dtype)
    batch_tensor[df_i % batch_size] = img_tensor
    if (df_i + 1) % batch_size == 0:
        batch_tensor.zero_()  # Reset the batch tensor for the next batch


  0%|          | 0/68619 [00:00<?, ?it/s]

 54%|█████▎    | 36828/68619 [00:32<00:28, 1133.49it/s]


KeyboardInterrupt: 

In [None]:
import numpy as np
from mmap_ninja import numpy as np_ninja

class SimpleDataLoader:
    """Minimal batch iterator over memory-mapped images and their targets.

    Each row of ``df`` is matched to its image through the row's index label,
    which is used as the position in ``img_mmap``. The labels are captured
    before the frame is re-indexed, so a shuffled or filtered frame still
    fetches the image that belongs to each row. For a frame with the default
    0..n-1 index this is identical to a positional lookup.

    Args:
        df: DataFrame with an ``'iwi'`` column; its index labels must be
            integer positions in ``img_mmap``.
        img_mmap: Indexable image store; ``img_mmap[i]`` returns one NumPy image.
        transform: Optional callable applied to each image tensor.
        shuffle: If true, the visiting order is reshuffled on every ``iter()``.
        batch_size: Number of samples per batch; the last batch may be shorter.

    Iterating yields ``(images, targets)``:
        * ``batch_size == 1``: one image tensor and one target value;
        * otherwise: ``np.stack`` of the images and a NumPy array of targets.
    """

    def __init__(self, df, img_mmap, transform=None, shuffle=False, batch_size=1):
        # Capture the memory-map position of every row BEFORE the index is reset;
        # after the reset only the row order is left.
        self.img_indices = df.index.to_numpy()
        self.df = df.reset_index(drop=True)
        self.img_mmap = img_mmap
        self.transform = transform
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.indices = np.arange(len(self.df))
        # Defined up front so next() also works on a loader that was never passed to iter()
        self.ptr = 0

    def __iter__(self):
        """Restart iteration, reshuffling the visiting order when ``shuffle`` is set."""
        if self.shuffle:
            np.random.shuffle(self.indices)
        self.ptr = 0
        return self

    def __next__(self):
        """Return the next batch, or raise ``StopIteration`` when all rows were served."""
        if self.ptr >= len(self.indices):
            raise StopIteration

        # Select batch indices (row positions in self.df)
        batch_indices = self.indices[self.ptr:self.ptr + self.batch_size]
        self.ptr += self.batch_size

        # Fetch batch
        images = []
        targets = []

        for idx in batch_indices:
            row = self.df.iloc[idx]
            # Look the image up by the row's original label, not by its position in the frame
            img = torch.from_numpy(self.img_mmap[int(self.img_indices[idx])])
            if self.transform:
                img = self.transform(img)
            images.append(img)
            # 'iwi' divided by 100 (presumably rescaling a 0-100 index to [0, 1] -- confirm)
            targets.append(row['iwi'] / 100)

        # Stack if batch size > 1, else return single items
        if self.batch_size == 1:
            return images[0], targets[0]
        else:
            return np.stack(images), np.array(targets)

    def __len__(self):
        """Number of batches per pass (ceiling of rows / batch_size)."""
        return (len(self.df) + self.batch_size - 1) // self.batch_size

# Wrap the shuffled frame and the open memory map in a loader that reshuffles on each pass
loader_kwargs = dict(transform=None, shuffle=True, batch_size=128)
dl = SimpleDataLoader(df=s_df, img_mmap=images_mmap, **loader_kwargs)

# Drain the loader once to time batch assembly
for batch_images, batch_targets in tqdm(dl):
    pass  # Process your batch here, e.g., pass to a model

  7%|▋         | 36/537 [00:36<08:30,  1.02s/it]

In [16]:
from torch.utils.data import Dataset

class RegressionDatasetMMAP(Dataset):
    """Map-style dataset yielding ``(image, target)`` pairs from an open memory map.

    Each row of ``df`` is matched to its image through the row's index label,
    which is used as the position in ``images_mmap``. The labels are captured
    before the frame is re-indexed, so a shuffled or filtered frame still
    fetches the image that belongs to each row. For a frame with the default
    0..n-1 index this is identical to a positional lookup.

    Args:
        df: DataFrame with an ``'iwi'`` column; its index labels must be
            integer positions in ``images_mmap``.
        images_mmap: Indexable image store; ``images_mmap[i]`` returns one image.
        transform: Optional callable applied to each image before it is returned.
    """

    def __init__(self, df, images_mmap, transform=None):
        # Capture the memory-map position of every row BEFORE the index is reset;
        # after the reset, a row's `.name` is just its position in the frame.
        self.img_indices = df.index.to_numpy()
        self.df = df.reset_index(drop=True)
        self.img_mmap = images_mmap
        self.transform = transform

    def __len__(self):
        """Number of samples (rows of the frame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for row position ``idx``.

        Raises:
            IndexError: if ``idx`` is outside the frame (this also ends plain
                ``for`` iteration over the dataset).
        """
        # Row lookup first, so an out-of-range idx raises IndexError from the frame
        cluster = self.df.iloc[idx]
        img = self.img_mmap[int(self.img_indices[idx])]
        # 'iwi' divided by 100 (presumably rescaling a 0-100 index to [0, 1] -- confirm)
        target = cluster['iwi'] / 100
        if self.transform:
            img = self.transform(img)
        return img, target

# Wrap the shuffled frame and the already-open memory map; no transforms for this timing run
ds = RegressionDatasetMMAP(df=s_df, images_mmap=images_mmap, transform=None)

# Walk the whole dataset once, converting every image to a tensor
for img, target in tqdm(ds):
    img_tensor = torch.from_numpy(img)


100%|██████████| 68619/68619 [00:04<00:00, 14699.70it/s]


In [21]:
import torch
from torch.utils.data import Dataset
from mmap_ninja.ragged import RaggedMmap

class RegressionDatasetMMAP(Dataset):
    """Map-style dataset yielding ``(image, target)`` pairs from a ragged memory map.

    Unlike the earlier definition of the same name in this notebook (which it
    shadows), this variant opens the memory map itself from a directory path.

    Each row of ``df`` is matched to its image through the row's index label,
    which is used as the position in the memory map. The labels are captured
    before the frame is re-indexed, so a shuffled or filtered frame still
    fetches the image that belongs to each row. For a frame with the default
    0..n-1 index this is identical to a positional lookup.

    Args:
        df: DataFrame with an ``'iwi'`` column; its index labels must be
            integer positions in the memory map.
        img_mmap_path: Directory passed to ``RaggedMmap``.
        transform: Optional callable applied to each image before it is returned.
    """

    def __init__(self, df, img_mmap_path, transform=None):
        # Capture the memory-map position of every row BEFORE the index is reset;
        # after the reset, a row's `.name` is just its position in the frame.
        self.img_indices = df.index.to_numpy()
        self.df = df.reset_index(drop=True)
        # wrapper_fn is applied to every sample read, so items come back as tensors
        self.img_mmap = RaggedMmap(img_mmap_path, wrapper_fn=torch.tensor)
        self.transform = transform

    def __len__(self):
        """Number of samples (rows of the frame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for row position ``idx``.

        Raises:
            IndexError: if ``idx`` is outside the frame (this also ends plain
                ``for`` iteration over the dataset).
        """
        # Row lookup first, so an out-of-range idx raises IndexError from the frame
        cluster = self.df.iloc[idx]
        img = self.img_mmap[int(self.img_indices[idx])]
        # 'iwi' divided by 100 (presumably rescaling a 0-100 index to [0, 1] -- confirm)
        target = cluster['iwi'] / 100
        if self.transform:
            img = self.transform(img)
        return img, target

# Example usage
ds = RegressionDatasetMMAP(
    df,
    os.path.join(DATA_DIR, 'dhs_images_mmap'),
)

# Loop through the dataset
for img, target in tqdm(ds):
    # Process the image and target as needed
    pass


  0%|          | 0/68619 [00:00<?, ?it/s]


ValueError: cannot reshape array of size 20658161664 into shape (224,224,6)

In [17]:
# Inspect the element dtype of `img`; the variable is not defined in this cell
# and is whatever image the most recently executed loop left behind.
img.dtype

dtype('float64')