# Embed2Scale challenge demo notebook

In this notebook we show two ways of loading the challenge task data: first through our custom challenge dataloader, and then with a dataloader from the updated SSL4EO-S12 V1.1 dataset.

We then give an example of how to create a submission file by creating embeddings through random sampling.

In [42]:
import numpy as np
import pandas as pd

from torchvision import transforms
from torch.utils.data import DataLoader

from challenge_dataset import E2SChallengeDataset, collate_fn
from ssl4eos12_dataset import SSL4EOS12Dataset, S2L1C_MEAN, S2L1C_STD, S2L2A_MEAN, S2L2A_STD, S1GRD_MEAN, S1GRD_STD

# Configurations

In [43]:
modalities = ['s2l2a', 's2l1c', 's1']

# Location of the challenge data and destination of the submission csv.
path_to_data = '/path/to/challenge/data/'
path_to_output_file = 'path/to/output/file.csv'

write_result_to_file = False  # Set to True to trigger saving of the csv at the end.

# Create data transformation
# Per-modality normalization statistics as (mean, std) pairs.
# The concatenated mean/std are looked up through `modalities`, so they always follow
# the order of the modality list, also when modalities are removed or reordered.
normalization_stats = {
    's2l2a': (S2L2A_MEAN, S2L2A_STD),
    's2l1c': (S2L1C_MEAN, S2L1C_STD),
    's1': (S1GRD_MEAN, S1GRD_STD),
}
mean_data = [value for modality in modalities for value in normalization_stats[modality][0]]
std_data = [value for modality in modalities for value in normalization_stats[modality][1]]

data_transform = transforms.Compose([
    # Add additional transformation here
    transforms.Normalize(mean=mean_data, std=std_data)
])

# Note that both E2SChallengeDataset and SSL4EOS12Dataset output torch tensors, so there is no need for a ToTensor transform.

# Load data with custom dataloader

In [44]:
# Keep the modalities separate (no concatenation).
# With output_file_name=False, a sample is used below as a dict mapping each modality to its data,
# i.e. {'s2l2a': s2l2a_data, 's2l1c': s2l1c_data, 's1': s1_data}.
# Each entry has shape [n_samples, n_seasons, n_channels, height, width] (for s2l2a [1, 4, 12, 264, 264])

concatenate_modalities = False
dataset_e2s = E2SChallengeDataset(
    path_to_data,
    modalities=modalities,
    dataset_name='bands',
    transform=data_transform,
    concat=concatenate_modalities,
    output_file_name=False,
)

# Report how many samples the dataset holds
print(f"Length of train dataset: {len(dataset_e2s)}")

# Report the shape of every modality in the first sample
first_sample = dataset_e2s[0]
for modality_name, modality_data in first_sample.items():
    print(f'Modality {modality_name} shape:', modality_data.shape)

Length of train dataset: 5537
Modality s2l2a shape: torch.Size([1, 4, 12, 264, 264])
Modality s2l1c shape: torch.Size([1, 4, 13, 264, 264])
Modality s1 shape: torch.Size([1, 4, 2, 264, 264])


In [45]:
# Concatenate the modalities into a single tensor.
# A sample has the format {'data': concatenated_data, 'file_name': file_name}
# The data has shape [n_samples, n_seasons, n_channels, height, width] (for concatenated_data [1, 4, 27, 264, 264])

concatenate_modalities = True
dataset_e2s = E2SChallengeDataset(
    path_to_data,
    modalities=modalities,
    dataset_name='bands',
    transform=data_transform,
    concat=concatenate_modalities,
    output_file_name=True,
)

# Report how many samples the dataset holds
print(f"Length of train dataset: {len(dataset_e2s)}")

# Report the shape of the concatenated data of the first sample
first_sample = dataset_e2s[0]
print(first_sample['data'].shape)

Length of train dataset: 5537
torch.Size([1, 4, 27, 264, 264])


# Load with SSL4EOS12 V1.1 dataloader

Note that we have modified the code to allow for a different number of samples per zarr file. The challenge task data consists of a single sample per file, while SSL4EO-S12 V1.1 has 64 samples per zarr file.

In [46]:
# Collect the dataset options first so each one can be documented on its own line.
ssl4eo_options = dict(
    data_dir=path_to_data,
    modalities=modalities,  # optional, list of modality folders.
    transform=data_transform,  # optional, torchvision transforms. Returns tensors if not provided.
    concat=True,  # Concatenate all modalities along the band dimension.
    single_timestamp=False,  # Load single timestamps rather than time series.
    num_batch_samples=1,  # optional, subsample samples in each zarr file.
    num_timestamps=4,  # optional, number of seasons to include
)
dataset_ssl4eo = SSL4EOS12Dataset(**ssl4eo_options)

# Report how many samples the dataset holds
print(f"Length of train dataset: {len(dataset_ssl4eo)}")

# Report the shape of the first sample
first_ssl4eo_sample = dataset_ssl4eo[0]
print(first_ssl4eo_sample.shape)

Length of train dataset: 5537
torch.Size([1, 4, 27, 264, 264])


In [47]:
# Compare the first sample of both datasets element-wise and reduce the result to a single flag.
first_samples_match = (dataset_e2s[0]['data'] == dataset_ssl4eo[0]).all().item()
print("The two datasets' first sample is the same:", first_samples_match)

The two datasets' first sample is the same: True


# Dataloader

In [48]:
train_loader = DataLoader(
    dataset_e2s,
    batch_size=2,  # Note that each challenge task zarr file contains a single sample.
    shuffle=True,
    collate_fn=collate_fn,  # Data needs to be concatenated along sample dimension instead of being stacked.
)

# Inspect the first batch only: shortened file names followed by the batch shape.
for batch in train_loader:
    for file_idx, batch_file_name in enumerate(batch['file_name']):
        print(f'File name {file_idx}:', batch_file_name[0:10] + '...')
    print(batch['data'].shape)
    break

File name 0: 58c0234746...
File name 1: 1caa53021e...
torch.Size([2, 4, 27, 264, 264])


# Create submission file

In this section, we create a submission by randomly generating embeddings of the correct size.
Finally, we create a submission file.

We use the E2SChallengeDataset since we can easily get the sample ID (file name) from the dataloader.

In [49]:
def str_format_np_array(arr):
    """Serialize the elements of an array as a bracketed string.

    Every element of `arr` is converted with `str` and the results are joined
    with a bare comma (no space), giving '[val1,val2,...]'.
    """
    return '[{}]'.format(','.join(map(str, arr)))

def create_submission_from_dict(emb_dict):
    """Build the submission table from a mapping of sample id to embedding.

    The dictionary is assumed to have the format
    {hash-id0: embedding0, hash-id1: embedding1, ...}. Each embedding is
    serialized with `str_format_np_array`; the result is a DataFrame with the
    string columns 'id' and 'embedding', one row per dictionary entry.
    """
    rows = []
    for sample_id, embedding in emb_dict.items():
        rows.append([sample_id, str_format_np_array(embedding)])

    return pd.DataFrame(data=rows, columns=['id', 'embedding'], dtype=str)
        

In [50]:
# Fill the submission with embeddings drawn at random from a normal distribution.

# Upper bound on the number of embeddings to create; np.inf means no early stop
# (set to a small integer such as 10 for a quick run).
create_n_embeddings = np.inf

embedding_dim = 1024
rng = np.random.default_rng(seed=None)

# Draw all random embeddings once, before the loop, to speed up this example
rand_embds = rng.normal(0, 1, size=(len(dataset_e2s), embedding_dim))

embeddings = {}
for sample_idx, sample in enumerate(dataset_e2s):
    # -------------------------
    # Do compression magic here
    # -------------------------
    # Store the random embedding under the sample's file name
    embeddings[sample['file_name']] = rand_embds[sample_idx, :]

    # Stop early in this example
    if sample_idx + 1 >= create_n_embeddings:
        break

# Turn the collected embeddings into the submission table
submission_file = create_submission_from_dict(embeddings)


In [51]:
# Sanity check: number of rows in the submission table.
print('Number of embeddings:', len(submission_file))

Number of embeddings: 5537


In [52]:
# Preview the first rows of the submission table.
submission_file.head()

Unnamed: 0,id,embedding
0,0002c11ab0bad1ae6efff695891f5713b95101063eaca5...,"[-2.865617372338198,1.677732491586853,-0.60472..."
1,0002c8e787ba871d725f57833996ef31a4d60f370180a9...,"[-0.3904162336796967,-0.24518438950491572,0.22..."
2,0017ed83b4548be10f6e12e84e55b0a57ee6f67eae30ea...,"[0.731157728271667,0.8078014141432319,0.175666..."
3,0026f8260255069e3f4b29862368abe752dd7659ab2c0e...,"[0.27152057155809795,-0.3736780542052399,-0.33..."
4,0031471ca1c1a720c95103d8ca90d2142abd95e27d82f6...,"[0.030451769036419937,-0.7073207885596251,1.12..."


In [53]:
# Write submission
# Nothing is saved unless write_result_to_file is True; the csv is written to
# path_to_output_file without the DataFrame index column.
if write_result_to_file:
    submission_file.to_csv(path_to_output_file, index=False)