# Embed2Scale challenge demo notebook

In this notebook we show two ways of loading the challenge task data; first through our custom challenge dataloader, and then with a dataloader from the updated SSL4EO-S12 V1.1 dataset.

We then give an example of how to create a submission file by creating embeddings through random sampling.

Finally, a function for testing a submission for standard errors is provided.

Note that parts of this notebook are simplified for demonstration purposes. However, the datasets and dataloaders, as well as the verification of the submission file, are intended to be directly usable and true to the data and the expected submission file formats.

In [1]:
import numpy as np
import pandas as pd

from torchvision import transforms
from torch.utils.data import DataLoader

from challenge_dataset import E2SChallengeDataset, collate_fn
from ssl4eos12_dataset import SSL4EOS12Dataset, S2L1C_MEAN, S2L1C_STD, S2L2A_MEAN, S2L2A_STD, S1GRD_MEAN, S1GRD_STD

# Configurations

In [2]:
# Order of modalities.
# In this demo, modalities are ordered the same as the default order in the SSL4EOS12 dataset class.
# Modalities are loaded in the order provided here.
# Change the order based on your needs; the normalization statistics below follow this order.
modalities = ['s2l1c', 's2l2a', 's1']

# Path to challenge data folder, i.e. the folder containing the s1, s2l1c and s2l2a subfolders.
path_to_data = '/path/to/challenge/data/'

# Path to where the submission file should be written.
path_to_output_file = 'path/to/output/file.csv'

write_result_to_file = True  # Set to True to trigger saving of the csv at the end.

# Create data transformation
# Per-modality (mean, std) statistics, keyed by modality name.
modality_statistics = {
    's2l1c': (S2L1C_MEAN, S2L1C_STD),
    's2l2a': (S2L2A_MEAN, S2L2A_STD),
    's1': (S1GRD_MEAN, S1GRD_STD),
}

# Concatenate the means and standard deviations in the same order as the modalities,
# so that reordering `modalities` keeps the statistics aligned with the channels.
# Assumes each statistic is a flat sequence of per-band values.
mean_data = [value for modality in modalities for value in modality_statistics[modality][0]]
std_data = [value for modality in modalities for value in modality_statistics[modality][1]]

data_transform = transforms.Compose([
    # Add additional transformation here
    transforms.Normalize(mean=mean_data, std=std_data)
])

# Note that both E2SChallengeDataset and SSL4EOS12Dataset output torch tensors, so there is no need for a ToTensor transform.

# Load data with custom dataloader

In [None]:
# Keep the modalities separate (no concatenation).
# With output_file_name=False, a sample is a dict keyed by modality:
# {'s2l1c': s2l1c_data, 's2l2a': s2l2a_data, 's1': s1_data}
# Each entry has shape [n_samples, n_seasons, n_channels, height, width] (for s2l2a [1, 4, 12, 264, 264])

concatenate_modalities = False
dataset_e2s = E2SChallengeDataset(
    path_to_data,
    modalities=modalities,
    dataset_name='bands',
    transform=data_transform,
    concat=concatenate_modalities,
    output_file_name=False,
)

# Report how many samples the dataset holds
print(f"Length of dataset: {len(dataset_e2s)}")

# Report the shape of every modality in the first sample
first_sample = dataset_e2s[0]
for modality_name, modality_data in first_sample.items():
    print(f'Modality {modality_name} shape:', modality_data.shape)

Length of dataset: 5537
Modality s2l1c shape: torch.Size([1, 4, 13, 264, 264])
Modality s2l2a shape: torch.Size([1, 4, 12, 264, 264])
Modality s1 shape: torch.Size([1, 4, 2, 264, 264])


In [4]:
# Concatenate the modalities into a single tensor.
# With output_file_name=True, a sample is {'data': concatenated_data, 'file_name': file_name}
# The data has shape [n_samples, n_seasons, n_channels, height, width] (for concatenated_data [1, 4, 27, 264, 264])

concatenate_modalities = True
dataset_e2s = E2SChallengeDataset(
    path_to_data,
    modalities=modalities,
    dataset_name='bands',
    transform=data_transform,
    concat=concatenate_modalities,
    output_file_name=True,
)

# Report how many samples the dataset holds
print(f"Length of dataset: {len(dataset_e2s)}")

# Report the shape of the concatenated data in the first sample
first_sample_data = dataset_e2s[0]['data']
print(first_sample_data.shape)

Length of dataset: 5537
torch.Size([1, 4, 27, 264, 264])


# Load with SSL4EOS12 V1.1 dataloader

Note that we have modified the code to allow for a different number of samples per zarr file. The challenge task data consists of a single sample per file, while SSL4EO-S12 V1.1 has 64 samples per zarr file.

In [5]:
# Options for the SSL4EO-S12 V1.1 dataset class.
ssl4eo_options = {
    'data_dir': path_to_data,
    'modalities': modalities,  # optional, list of modality folders.
    'transform': data_transform,  # optional, torchvision transforms. Returns tensors if not provided.
    'concat': True,  # Concatenate all modalities along the band dimension.
    'single_timestamp': False,  # Load single timestamps rather than time series.
    'num_batch_samples': 1,  # optional, subsample samples in each zarr file.
    'num_timestamps': 4,  # optional, number of seasons to include
}
dataset_ssl4eo = SSL4EOS12Dataset(**ssl4eo_options)

# Report how many samples the dataset holds
print(f"Length of dataset: {len(dataset_ssl4eo)}")

# Report the shape of the first sample
first_ssl4eo_sample = dataset_ssl4eo[0]
print(first_ssl4eo_sample.shape)

Length of dataset: 5537
torch.Size([1, 4, 27, 264, 264])


In [6]:
# Check element-wise that both dataset classes return the same first sample
first_e2s_data = dataset_e2s[0]['data']
first_ssl4eo_data = dataset_ssl4eo[0]
elementwise_equal = (first_e2s_data == first_ssl4eo_data).numpy()
print("The two datasets' first sample is the same:", np.all(elementwise_equal))

The two datasets' first sample is the same: True


# Dataloader

In [7]:
train_loader = DataLoader(
    dataset_e2s,
    batch_size=2,  # Note that each challenge task zarr file contains a single sample.
    shuffle=True,
    collate_fn=collate_fn,  # Data needs to be concatenated along sample dimension instead of being stacked.
)

# Inspect only the first batch: print the shortened file names and the batch shape.
for batch in train_loader:
    for position, sample_id in enumerate(batch['file_name']):
        print(f'File name {position}:', sample_id[:10] + '...')
    print(batch['data'].shape)
    break

File name 0: 31bb826a45...
File name 1: 4be6443115...
torch.Size([2, 4, 27, 264, 264])


# Create submission file

In this section, we create a submission by randomly generating embeddings of the correct size.
Finally, we create a submission file.

We use the E2SChallengeDataset since we can easily get the sample ID (file name) from the dataloader.

In [8]:
def create_submission_from_dict(emb_dict):
    """Build the submission table from a mapping of sample IDs to embeddings.

    Assumes the dictionary has the format
    {hash-id0: embedding0, hash-id1: embedding1, ...}.

    Returns a DataFrame with one row per dictionary key: the key is stored in
    the column 'id', followed by one column per embedding dimension.
    """
    return (
        pd.DataFrame.from_dict(emb_dict, orient='index')
        .rename_axis('id')          # name the index so it becomes the 'id' column
        .reset_index(drop=False)    # move the IDs from the index into a regular column
    )
        

In [9]:
# Randomly generate embeddings from normal distribution.

embedding_dim = 1024
embeddings = {}

# Fixed seed so that re-running the notebook reproduces the same demo embeddings.
random_seed = 42
rng = np.random.default_rng(seed=random_seed)

# Allow stopping early for demonstration purposes, e.g. set to 10 to create only ten embeddings.
# np.inf means that the full dataset is processed.
create_n_embeddings = np.inf

# Create random embeddings before the loop to speed up this example
rand_embds = rng.normal(0, 1, size=(len(dataset_e2s), embedding_dim))

for ind, data_file_name in enumerate(dataset_e2s):
    # -------------------------
    # Do compression magic here
    # -------------------------

    file_name = data_file_name['file_name']

    # Insert the random embedding for this sample, keyed by its file name
    embeddings[file_name] = rand_embds[ind, :]

    # Stop early in this example
    if ind >= create_n_embeddings - 1:
        break

In [10]:
# Turn the {sample ID: embedding} dictionary into the submission table
submission_file = create_submission_from_dict(emb_dict=embeddings)

In [11]:
# The submission table has one row per embedding
n_embeddings = len(submission_file)
print('Number of embeddings:', n_embeddings)

Number of embeddings: 5537


In [12]:
# Preview the first five rows of the submission table
submission_file.head(n=5)

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,fec24d0cda8793ff55e1059c7b88763fee8d58d3decf78...,-0.006489,0.925561,-0.205256,0.403202,0.539068,0.302692,0.544578,0.073934,-1.571672,...,-0.320863,-0.946073,0.929955,-0.086056,0.20706,-2.296272,0.54387,1.623381,0.577205,0.042881
1,67960f4c8870a8aa52f295da0f0fea6d708c3cee2555a4...,-0.573994,1.615674,0.280768,-2.057713,-0.330985,2.795643,0.256959,1.724288,1.10852,...,-0.707276,0.049364,-0.58041,0.013492,1.455911,0.189114,0.639222,0.673259,0.151533,-1.848105
2,9688abfaebaea5dca2ec8bde771a7bf1e2bba8e661b777...,-0.22494,0.26741,0.119751,1.129929,0.52394,-0.115578,-0.281985,-0.38409,1.541275,...,0.621671,-0.846401,0.409987,0.367025,-1.406988,-0.136495,-0.395851,0.375316,0.299588,-0.457385
3,fa3ae237ee6e2ee569c20a1e088112cf2105300d9272cc...,0.495171,1.02883,0.724903,1.863715,-1.147544,-0.227417,-0.441443,0.48872,-0.212321,...,-0.939947,-0.412391,0.02979,-0.174632,-1.031126,-0.194826,-0.099551,-1.123242,-1.26815,1.292612
4,430590d31e38c5b345a92dc7d9eb8d126c01abced0cf1a...,-0.508994,-1.416999,2.414253,-1.195625,-0.722276,-1.979099,2.155417,1.318081,-0.086528,...,0.362877,-0.201678,0.788398,-0.441496,1.020406,0.859524,0.433693,1.231648,-1.031058,-0.645724


In [13]:
# Save the submission as csv (without the DataFrame index) when enabled in the configuration
if write_result_to_file:
    submission_file.to_csv(path_or_buf=path_to_output_file, index=False)

# Verify submission file integrity

Below we provide a snippet from a function which will read your embeddings and test for the same errors that the evaluation will check for. The function is similar to how the submission files are loaded.

The intention of this function is to help verify that a submission has the right structure and contents, and to check for missing embeddings or NaN values, prior to submission.

The function is intended to be a support. Successfully completing this function does not guarantee fault-free submission files, but is an indication that the most common errors are not present.

In [14]:
def test_submission(path_to_submission: str, 
                    expected_embedding_ids: set, 
                    embedding_dim: int = 1024):
    # Load data
    df = pd.read_csv(path_to_submission, header=0)

    # Verify that id is in columns
    if 'id' not in df.columns:
        raise ValueError(f"""Submission file must contain column 'id'.""")

    # Temporarily set index to 'id'
    df.set_index('id', inplace=True)

    # Check that all samples are included
    submitted_embeddings = set(df.index.to_list())
    n_missing_embeddings = len(expected_embedding_ids.difference(submitted_embeddings))
    if n_missing_embeddings > 0:
        raise ValueError(f"""Submission is missing {n_missing_embeddings} embeddings.""")
    
    # Check that embeddings have the correct length
    if len(df.columns) != embedding_dim:
        raise ValueError(f"""{embedding_dim} embedding dimensions, but provided embeddings have {len(df.columns)} dimensions.""")

    # Convert columns to float
    try:
        for col in df.columns:
            df[col] = df[col].astype(float)
    except Exception as e:
        raise ValueError(f"""Failed to convert embedding values to float.
    Check embeddings for any not-allowed character, for example empty strings, letters, etc.
    Original error message: {e}""")

    # Check if any NaNs 
    if df.isna().any().any():
        raise ValueError(f"""Embeddings contain NaN values.""")
    
    # Successful completion of the function
    return True

In [15]:
# We use the created embeddings as the list of all samples.
# This can be done since we are sure to have fully looped through the dataset.
# A better way would be to find all the IDs in the challenge data separately, e.g. from the dataloader.
embedding_ids = set(embeddings)

# Test submission.
# The check reads the csv from disk, so it is only run when the file was written by this notebook run;
# otherwise it would fail on a missing file or validate a stale file from an earlier run.
if write_result_to_file:
    assert test_submission(path_to_output_file, embedding_ids, embedding_dim)
else:
    print('Submission file was not written (write_result_to_file is False); skipping verification.')