# Embed2Scale challenge demo notebook

In this notebook we show two ways of loading the challenge task data: first through our custom challenge dataloader, and then with a dataloader from the updated SSL4EO-S12 V1.1 dataset.

We then give an example of how to create a submission file by creating embeddings through random sampling.

In [1]:
import numpy as np
import pandas as pd

from torchvision import transforms
from torch.utils.data import DataLoader
from challenge_dataset import E2SChallengeDataset, collate_fn
from ssl4eos12_dataset import SSL4EOS12Dataset, S2L1C_MEAN, S2L1C_STD, S2L2A_MEAN, S2L2A_STD, S1GRD_MEAN, S1GRD_STD

# Configurations

In [2]:
# Root folder holding the challenge data (placeholder, point this at your local copy)
path_to_data = '/path/to/challenge/data/'

# Modalities to load
modalities = ['s2l2a', 's2l1c', 's1']

# Normalization statistics, joined in the same order as `modalities`
mean_data = S2L2A_MEAN + S2L1C_MEAN + S1GRD_MEAN
std_data = S2L2A_STD + S2L1C_STD + S1GRD_STD

# Transform applied to the loaded data: normalization with the statistics above
data_transform = transforms.Compose(
    [transforms.Normalize(mean=mean_data, std=std_data)]
)

# Load data with custom dataloader

In [3]:
# Keep the modalities separate (no concatenation).
# With output_file_name=False each sample is a dict keyed by modality (see the printed output):
#   {'s2l2a': s2l2a_data, 's2l1c': s2l1c_data, 's1': s1_data}
# Each tensor has shape [n_samples, n_seasons, n_channels, height, width] (for s2l2a [1, 4, 12, 264, 264])

concatenate_modalities = False
dataset_e2s = E2SChallengeDataset(
    path_to_data,
    modalities=modalities,
    dataset_name='bands',
    transform=data_transform,
    concat=concatenate_modalities,
    output_file_name=False,
)

# Print dataset length
print(f"Length of train dataset: {len(dataset_e2s)}")

# Print the shape of every modality in the first sample
for modality, tensor in dataset_e2s[0].items():
    print(f'Modality {modality} shape:', tensor.shape)

Length of train dataset: 1659
Modality s2l2a shape: torch.Size([1, 4, 12, 264, 264])
Modality s2l1c shape: torch.Size([1, 4, 13, 264, 264])
Modality s1 shape: torch.Size([1, 4, 2, 264, 264])


In [4]:
# Concatenate the modalities along the channel dimension.
# Each sample is {'data': concatenated_data, 'file_name': file_name}
# The data has shape [n_samples, n_seasons, n_channels, height, width] (for concatenated_data [1, 4, 27, 264, 264])

concatenate_modalities = True
dataset_e2s = E2SChallengeDataset(
    path_to_data,
    modalities=modalities,
    dataset_name='bands',
    transform=data_transform,
    concat=concatenate_modalities,
    output_file_name=True,
)

# Print dataset length
print(f"Length of train dataset: {len(dataset_e2s)}")

# Print shape of the concatenated data of the first sample
print(dataset_e2s[0]['data'].shape)

Length of train dataset: 1659
torch.Size([1, 4, 27, 264, 264])


# Load with SSL4EOS12 V1.1 dataloader

Note that we have modified the code to allow for a different number of samples per zarr file. The challenge task data consists of a single sample per file, while SSL4EO-S12 V1.1 has 64 samples per zarr file.

In [5]:
dataset_ssl4eo = SSL4EOS12Dataset(
    data_dir=path_to_data,
    # optional, list of modality folders.
    modalities=modalities,
    # optional, torchvision transforms. Returns tensors if not provided.
    transform=data_transform,
    # Concatenate all modalities along the band dimension.
    concat=True,
    # Load single timestamps rather than time series.
    single_timestamp=False,
    # optional, subsample samples in each zarr file.
    num_batch_samples=1,
    # The challenge task data holds a single sample per zarr file.
    samples_per_zarr=1,
)

# Print dataset length
print(f"Length of train dataset: {len(dataset_ssl4eo)}")

# Print shape of first sample
print(dataset_ssl4eo[0].shape)

Length of train dataset: 1659
torch.Size([1, 4, 27, 264, 264])


In [6]:
# Element-wise comparison of the first sample returned by each dataset
print("The two datasets' first sample is the same:", (dataset_e2s[0]['data'] == dataset_ssl4eo[0]).numpy().all())

The two datasets' first sample is the same: True


# Dataloader

In [7]:
train_loader = DataLoader(
    dataset=dataset_e2s,
    # Note that each challenge task zarr file contains a single sample.
    batch_size=2,
    shuffle=True,
    # Data needs to be concatenated along the sample dimension instead of being stacked.
    collate_fn=collate_fn,
)

# Inspect the first batch only: truncated file names followed by the batch shape
for data_file_name in train_loader:
    for file_idx, file_name in enumerate(data_file_name['file_name']):
        print(f'File name {file_idx}:', file_name[:10] + '...')
    print(data_file_name['data'].shape)
    break

File name 0: 277a989511...
File name 1: bcb433a384...
torch.Size([2, 4, 27, 264, 264])


# Create submission file

In this section, we create a submission by randomly generating embeddings of the correct size.
Finally, we create a submission file.

We use the E2SChallengeDataset since we can easily get the sample ID (file name) from the dataloader.

In [8]:
def str_format_np_array(arr):
    """Serialize the elements of `arr` as a bracketed, comma-separated string.

    Every element is converted with ``str`` and the pieces are joined with a
    bare comma (no space), giving ``'[val1,val2,...]'``.
    """
    values_as_text = map(str, arr)
    return '[{}]'.format(','.join(values_as_text))

def create_submission_from_dict(emb_dict):
    """Build the submission table from a mapping of sample ID to embedding.

    Expects ``{hash-id0: embedding0, hash-id1: embedding1, ...}`` and returns a
    DataFrame with the string columns 'id' and 'embedding', where every
    embedding is serialized with ``str_format_np_array``.
    """
    rows = []
    for sample_id, embedding in emb_dict.items():
        rows.append([sample_id, str_format_np_array(embedding)])

    return pd.DataFrame(data=rows, columns=['id', 'embedding'], dtype=str)
        

In [9]:
# Randomly generate embeddings from normal distribution.

# Total number of embeddings (samples) to create in this example.
create_n_embeddings = 10

embedding_dim = 1024
embeddings = {}
# seed=None draws fresh entropy on every run; pass an integer for reproducible embeddings.
rng = np.random.default_rng(seed=None)
for data_file_name in train_loader:
    # -------------------------
    # Do compression magic here
    # -------------------------

    # Randomly generate one embedding per sample in the batch.
    # A batch can hold several samples, so count the embeddings themselves
    # (not the batches) to end up with exactly create_n_embeddings entries.
    for fn in data_file_name['file_name']:
        if len(embeddings) >= create_n_embeddings:
            break
        embeddings[fn] = rng.normal(0, 1, size=(embedding_dim,))

    # Stop early in this example
    if len(embeddings) >= create_n_embeddings:
        break

# Create submission file
submission_file = create_submission_from_dict(embeddings)


In [10]:
# Preview the first rows of the submission table
submission_file.head()

Unnamed: 0,id,embedding
0,4b8e5bda5684f2e7474c66e139275ddf61dce40b1d3377...,"[-1.187911937493888,0.8601787852627838,0.42074..."
1,d98a6d88b825cf1925436102c4739cc25ea30014fc9fb2...,"[0.21094484112814646,0.4052598312097374,-0.779..."
2,c4dd0056a2f815bf826dc867cc18fe16a66444cdd8e317...,"[0.28992485348822905,-0.4125092913214582,0.847..."
3,73d6f99ecabe4cb3afcabd2eb3f9ce39ea4117537c0ca5...,"[0.17379557544147067,-0.01019416285788091,1.15..."
4,aaa7d6eb29e40ae32b716c6e9086b88a179d1f2d9d9624...,"[0.0954283316333879,0.6574380779154033,0.18032..."


In [11]:
# Write submission
# The constant-False guard keeps this cell from writing anything when the notebook is run;
# change the condition to True to save the submission as a CSV without the index column.
if False:
    submission_file.to_csv('./random_embeddings.csv', index=False)