In [1]:
!pip install -e ..

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Obtaining file:///netscratch2/alontke/master_thesis/code/ssl-ddpm-rs
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
[?25hCollecting remote-sensing-core@ git+https://github.com/nicikess/remote-sensing-core.git
  Cloning https://github.com/nicikess/remote-sensing-core.git to /tmp/pip-install-10w4fid9/remote-sensing-core_723c0b8b004248aeaf14d6eaf705a781
  Running command git clone -q https://github.com/nicikess/remote-sensing-core.git /tmp/pip-install-10w4fid9/remote-sensing-core_723c0b8b004248aeaf14d6eaf705a781
  Resolved https://github.com/nicikess/remote-sensing-core.git to commit b4274b9c415d0145cd78cc8d1efb9b811653a693
  Installing build dependencies ... [?25ldone
[?25h  

In [1]:
import time
import yaml
from dataclasses import replace
from typing import Callable, Tuple, Optional
from functools import partial

# Numpy
import numpy as np
from tqdm import tqdm

from ffcv.loader import Loader, OrderOption
from ffcv.fields.decoders import NDArrayDecoder

In [8]:
from lit_diffusion.util import instantiate_python_class_from_string_config

with open("../config/model_configs/backbones/s2_era5.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

data_loader = instantiate_python_class_from_string_config(
    class_config=config["train_torch_data_loader"]
)

for ims in data_loader: 
    print(ims)
    break

{'era_5': tensor([[[[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          ...,
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000]],

         [[-0.1803, -0.1803, -0.1803,  ..., -0.1803, -0.1803, -0.1803],
          [-0.1803, -0.1803, -0.1803,  ..., -0.1803, -0.1803, -0.1803],
          [-0.1803, -0.1803, -0.1803,  ..., -0.1803, -0.1803, -0.1803],
          ...,
          [-0.1803, -0.1803, -0.1803,  ..., -0.1803, -0.1803, -0.1803],
          [-0.1803, -0.1803, -0.1803,  ..., -0.1803, -0.1803, -0.1803],
          [-0.1803, -0.1803, -0.1803,  ..., -0.1803, -0.1803, -0.1803]],

         [[-0.4040, -0.4040, -0.4040,  ..., -0.4040, -0.4040, -0.4040],
          [-0.4040, 

In [12]:
ims["sentinel_2"].shape

torch.Size([24, 4, 128, 128])

In [4]:
ims_1, ims_2 = ims
ims.shape

ValueError: too many values to unpack (expected 1)

In [17]:
from typing import Callable, List, Optional, Tuple

# Nmpy
import numpy as np

# FFCV
from ffcv.pipeline.operation import Operation
from ffcv.pipeline.allocation_query import AllocationQuery
from ffcv.pipeline.state import State
from ffcv.pipeline.compiler import Compiler


class ChannelWiseMinMaxScaler(Operation):
    def __init__(
        self,
        minimum_value: List[float],
        maximum_value: List[float],
        interval_min: List[float] = None,
        interval_max: List[float] = None,
        two_dims: bool = False,
    ):
        self.two_dims = two_dims
        assert len(minimum_value) == len(maximum_value)
        # Store Scaler values
        self.minimum_value = minimum_value
        self.maximum_value = maximum_value
        self.interval_min = (
            interval_min if interval_min else [-1 for _ in range(len(maximum_value))]
        )
        self.interval_max = (
            interval_max if interval_min else [1 for _ in range(len(maximum_value))]
        )
        assert len(self.interval_min) == len(self.interval_max)
        assert len(self.interval_max) == len(minimum_value)
        self.minimum_value, self.maximum_value, self.interval_min, self.interval_max = (np.array(x) for x in [self.minimum_value, self.maximum_value, self.interval_min, self.interval_max])

    def generate_code(self) -> Callable:
        parallel_range = Compiler.get_iterator()
        # get local variables to use in return function
        two_dims = self.two_dims
        n_channels = len(self.maximum_value)
        minimum = self.minimum_value
        maximum = self.maximum_value
        interval_minimum = self.interval_min
        interval_maximum = self.interval_max

        def scale_images(images, *args):
            results = np.zeros_like(images)
            for i in parallel_range(n_channels):
                current_minimum = minimum[i]
                current_maximum = maximum[i]
                current_interval_minimum = interval_minimum[i]
                current_interval_maximum = interval_maximum[i]

                current_channel = images[:, i] if two_dims else images[:, i, ::]
                current_channel = (
                        (
                            (current_channel - current_minimum)
                            / (current_maximum - current_minimum)
                        )
                        * (current_interval_maximum - current_interval_minimum)
                        + current_interval_minimum
                )
                if two_dims:
                    results[:, i] = current_channel
                else:
                    results[:, i, ::] = current_channel
            return results

        scale_images.is_parallel = True
        return scale_images

    def declare_state_and_memory(
        self, previous_state: State
    ) -> Tuple[State, Optional[AllocationQuery]]:
        return previous_state, None


In [16]:
image_pipelines = {
    "climate_zone": None, 
    "elevation_differ": None,
    "era_5": [ChannelWiseMinMaxScaler(
        minimum_value=[700, 250, 250, -20, -20, -20, -20, 0, 0],
        maximum_value=[1000, 310, 310, 25, 25, 20, 20, 110, 110],
        interval_min=[-1, -1, -1, -1, -1, -1, -1, -1, -1],
        interval_max=[1, 1, 1, 1, 1, 1, 1, 1, 1],
        two_dims=True,
    )], # [BlowUp((9,2,2))],
    "esa_worldcover": None,
    "glo_30_dem": None,
    "multiclass_numer": None,
    "multiclass_one_h": None,
    "season_s1": None,
    "season_s2": None,
    "sentinel_1": None,
    "sentinel_2": None,
    "field_names": None,
}

loader = Loader(
    "/ds2/remote_sensing/ben-ge/ffcv/ben-ge-60-train.beton", 
    batch_size=4, num_workers=4, order=OrderOption.SEQUENTIAL, pipelines=image_pipelines,
)

# First epoch includes compilation time
for ims in loader: 
    print(ims)
    break
start_time = time.time()
    
# for _ in tqdm(range(399)):
#     for ims, in loader: pass
# print(f'Method: {name} | Shape: {ims.shape} | Time per epoch: {(time.time() - start_time) / 100:.4f}s')

(array([[ 1.        ,  0.10883382,  0.30386657, -0.13902222, -0.09004445,
        -0.0816    ,  0.01785   ,  0.61594546, -0.01460003],
       [ 1.        ,  0.21726684, -0.20686646, -0.0556    ,  0.07168888,
         0.41645002, -0.12405   , -0.16556361,  0.8143091 ],
       [ 1.        ,  0.17690022,  0.5170664 , -0.2804889 , -0.19813333,
         0.1227    , -0.17899999, -0.4333091 , -0.375     ],
       [ 1.        ,  0.01809997,  0.05680033, -0.09017778, -0.19591111,
        -0.04705   ,  0.23555   ,  0.72689086,  0.48596358]],
      dtype=float32),)


In [71]:
loader.reader.handlers

{'climate_zone': <ffcv.fields.basics.FloatField at 0x7f587cc41910>,
 'elevation_differ': <ffcv.fields.basics.FloatField at 0x7f587cc41cd0>,
 'era_5': <ffcv.fields.ndarray.NDArrayField at 0x7f587cc41190>,
 'esa_worldcover': <ffcv.fields.ndarray.NDArrayField at 0x7f587cd7c3d0>,
 'glo_30_dem': <ffcv.fields.ndarray.NDArrayField at 0x7f587cd7ceb0>,
 'multiclass_numer': <ffcv.fields.ndarray.NDArrayField at 0x7f587ccdbca0>,
 'multiclass_one_h': <ffcv.fields.ndarray.NDArrayField at 0x7f591166b160>,
 'season_s1': <ffcv.fields.basics.FloatField at 0x7f587cd0afd0>,
 'season_s2': <ffcv.fields.basics.FloatField at 0x7f587cd0abe0>,
 'sentinel_1': <ffcv.fields.ndarray.NDArrayField at 0x7f587cd0aa60>,
 'sentinel_2': <ffcv.fields.ndarray.NDArrayField at 0x7f587cd0a7c0>,
 'field_names': <ffcv.fields.bytes.BytesField at 0x7f587ccd9460>}

In [70]:
for k in loader.reader.handlers.keys():
    if k not in image_pipelines.keys():
        print(k)

elevation_differ
multiclass_numer
multiclass_one_h


In [103]:
import torch
a = torch.rand(size=(4, 2, 2))

In [114]:
[a,][0]

tensor([[[0.9030, 0.7439],
         [0.2211, 0.3594]],

        [[0.5916, 0.2383],
         [0.1266, 0.2170]],

        [[0.1631, 0.0768],
         [0.2557, 0.5864]],

        [[0.8731, 0.6418],
         [0.4523, 0.3200]]])

In [115]:
len(a)

4

In [119]:
from ffcv.fields import JSONField

In [121]:
JSONField.unpack(ims[0][1])

['climate_zone',
 'elevation_difference_label',
 'era_5',
 'esa_worldcover',
 'glo_30_dem',
 'multiclass_numeric_label',
 'multiclass_one_hot_label',
 'season_s1',
 'season_s2',
 'sentinel_1',
 'sentinel_2']

## Create Benge60-Delta ##

In [28]:
import os
import pandas as pd
from tqdm import tqdm

In [3]:
os.listdir("/netscratch2/nkesseli/master-thesis-benge/src/master_thesis_benge/scripts/data-split")

['ben-ge-test40.csv',
 'data-split.py',
 'ben-ge-test80.csv',
 '__init__.py',
 'ben-ge-validation60.csv',
 'ben-ge-train60.csv',
 'ben-ge-validation20.csv',
 'ben-ge-train20.csv',
 'ben-ge-test.csv',
 'ben-ge-train.csv',
 'ben-ge-validation80.csv',
 'ben-ge-train80.csv',
 'ben-ge-validation40.csv',
 'ben-ge-train40.csv',
 'ben-ge-validation.csv',
 'ben-ge-test60.csv',
 'ben-ge-test20.csv']

In [13]:
ben_ge_60_index = pd.read_csv("/netscratch2/nkesseli/master-thesis-benge/src/master_thesis_benge/scripts/data-split/ben-ge-train60.csv", index_col="patch_id")

In [14]:
len(ben_ge_60_index)

279513

In [15]:
ben_ge_40_index = pd.read_csv("/netscratch2/nkesseli/master-thesis-benge/src/master_thesis_benge/scripts/data-split/ben-ge-train40.csv", index_col="patch_id")

In [16]:
len(ben_ge_40_index)

186342

In [26]:
279513 - 186342

93171

In [22]:
benge_60_delta_index = set(ben_ge_60_index.index) - set(ben_ge_40_index.index)
assert len(benge_60_delta_index) == (279513 - 186342)

In [27]:
benge_60_delta_train = ben_ge_60_index.loc[list(benge_60_delta_index), ::]

In [30]:
base_path = "/ds2/remote_sensing/ben-ge/ffcv/write_indices/"
for split in tqdm(["train", "validation", "test"]):
    ben_ge_60_index = pd.read_csv(f"/netscratch2/nkesseli/master-thesis-benge/src/master_thesis_benge/scripts/data-split/ben-ge-{split}60.csv", index_col="patch_id")
    ben_ge_40_index = pd.read_csv(f"/netscratch2/nkesseli/master-thesis-benge/src/master_thesis_benge/scripts/data-split/ben-ge-{split}40.csv", index_col="patch_id")
    benge_60_delta_index = list(set(ben_ge_60_index.index) - set(ben_ge_40_index.index))
    assert len(benge_60_delta_index) == (len(ben_ge_60_index) - len(ben_ge_40_index))
    benge_60_delta = ben_ge_60_index.loc[list(benge_60_delta_index), ::]
    benge_60_delta.to_csv(base_path + f"ben-ge-{split}60-delta.csv")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:04<00:00,  1.55s/it]
