# Embed2Scale challenge "mean" baseline

This notebook creates baseline embeddings by bilinear interpolation and averaging of the modalities.

We use the E2SChallengeDataset to load the data. The datacubes of the challenge data are of shapes (1, 4, 27, 264, 264), (number of samples, number of timesteps, number of channels, height, width).

The embedding works as follows:
1. Subsample each channel to 9x9 pixels using bilinear interpolation -> shape (1, 4, 27, 9, 9)
2. Average each modality (S1, S2 L1C, S2 L2A) separately in the channel dimension -> shape (1, 4, 3, 9, 9)
3. Flatten into 972 element vector -> shape (972,)
4. Append 52 zeros to the end to make the embedding 1024 element long -> shape (1024,)

At the end a submission file is created in the expected format for the embed2scale eval.ai challenge.

In [2]:
import numpy as np
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from scipy.ndimage import zoom
from torchvision import transforms

from challenge_dataset import E2SChallengeDataset
from ssl4eos12_dataset import S2L1C_MEAN, S2L1C_STD, S2L2A_MEAN, S2L2A_STD, S1GRD_MEAN, S1GRD_STD



# Configurations

In [3]:
# Modalities to load. The order here must match the order in which the
# normalization statistics are concatenated below.
modalities = ['s2l2a', 's2l1c', 's1']

# Placeholder locations of the challenge data and of the submission csv; edit before running.
path_to_data = '/path/to/challenge/data/'
path_to_output_file = 'path/to/output/file.csv'

write_result_to_file = False  # Set to True to trigger saving of the csv at the end.

# Create data transformation
# Get means and standard deviations for the modalities in the same order as `modalities`
# (S2 L2A, then S2 L1C, then S1).
mean_data = S2L2A_MEAN + S2L1C_MEAN + S1GRD_MEAN
std_data = S2L2A_STD + S2L1C_STD + S1GRD_STD

data_transform = transforms.Compose([
    # Add additional transformations here
    transforms.Normalize(mean=mean_data, std=std_data)
])

# Note that both E2SChallengeDataset and SSL4EOS12Dataset output torch tensors, so there is no need for a ToTensor transform.

# Load data

In [4]:
# Build the challenge dataset with all modalities concatenated along the channel axis.
# With output_file_name=True every sample is a dict: {'data': concatenated_data, 'file_name': file_name}.
# The data has shape [n_samples, n_seasons, n_channels, height, width];
# for the concatenated data this is [1, 4, 27, 264, 264].
dataset_e2s = E2SChallengeDataset(
    path_to_data,
    modalities=modalities,
    dataset_name='bands',
    transform=data_transform,
    concat=True,
    output_file_name=True,
)

# Sanity check: number of samples found on disk
print(f"Length of train dataset: {len(dataset_e2s)}")

# Sanity check: shape of the first sample's data cube
print(dataset_e2s[0]['data'].shape)

Length of train dataset: 5537
torch.Size([1, 4, 27, 264, 264])


# Create submission file

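In this section, we define helper functions that format each embedding as a string and assemble all embeddings into a submission table with the columns `id` and `embedding`.
The embeddings themselves are computed in the next section, after which the submission file is created.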

We use the E2SChallengeDataset since we can easily get the sample ID (file name) from it.

In [5]:
def str_format_np_array(arr):
    """Serialize an iterable of numbers as a bracketed, comma-separated string.

    Each element is converted with ``str`` and the elements are joined with a
    bare comma (no space), e.g. ``[1.0, 2.5]`` becomes ``'[1.0,2.5]'``.
    An empty input yields ``'[]'``.
    """
    joined_values = ','.join(map(str, arr))
    return f"[{joined_values}]"

def create_submission_from_dict(emb_dict):
    """Build the submission table from a mapping of sample IDs to embeddings.

    Parameters
    ----------
    emb_dict : dict
        Mapping of the form {hash-id0: embedding0, hash-id1: embedding1, ...}.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``emb_dict`` with the string columns ``id`` (the key)
        and ``embedding`` (the embedding formatted by ``str_format_np_array``).
    """
    rows = []
    for sample_id, embedding in emb_dict.items():
        rows.append([sample_id, str_format_np_array(embedding)])

    return pd.DataFrame(data=rows, columns=['id', 'embedding'], dtype=str)
        

# Compress by bilinear transform and modality averaging

In this section, we create a submission file by processing each sample accordingly:
1. Subsampling each channel to 9x9 pixels using bilinear interpolation
2. Average each modality (S1, S2 L1C, S2 L2A) in the channel dimension.
3. Flatten into 972 element vector
4. Append 52 zeros to the end to make the embedding 1024 element long

We iterate over the E2SChallengeDataset directly since each sample carries its sample ID (file name) alongside the data.

In [6]:
def embed(data, file_name, n_missing_numbers):
    """Compute the "mean" baseline embedding of a single sample.

    Steps:
      1. Resize the two trailing (spatial) axes to 9x9 with first-order
         (bilinear) interpolation.
      2. Average the channel axis (axis 2) separately over the channel ranges
         0:12 (S2-L2A), 12:25 (S2-L1C) and 25: (S1).
      3. Concatenate the three means along axis 1 and flatten.
      4. Append ``n_missing_numbers`` zeros.

    NOTE(review): because the means are concatenated along axis 1, an input of
    shape (1, 4, 27, H, W) becomes (1, 12, 1, 9, 9) before flattening, i.e. the
    flattened vector is ordered modality-major (all timesteps of S2-L2A, then
    S2-L1C, then S1), not as a (1, 4, 3, 9, 9) array. Confirm this ordering is
    intended before changing it, since it affects the element order of every
    embedding.

    Parameters
    ----------
    data : array-like with 5 axes
        Data cube; axis 2 is the channel axis, axes 3 and 4 are resized.
    file_name : str
        Sample ID, passed through unchanged.
    n_missing_numbers : int
        Number of zeros appended to the flattened means.

    Returns
    -------
    dict
        {'file_name': file_name, 'embedding': 1-D numpy array}.
    """
    target_size = 9
    zoom_factors = (1, 1, 1, target_size / data.shape[3], target_size / data.shape[4])
    downsampled = zoom(data, zoom_factors, order=1)

    # Channel ranges of S2-L2A, S2-L1C and S1 respectively
    modality_channels = (slice(0, 12), slice(12, 25), slice(25, None))
    modality_means = [
        np.mean(downsampled[:, :, channels, :, :], axis=2, keepdims=True)
        for channels in modality_channels
    ]
    flat_means = np.concatenate(modality_means, axis=1).flatten()

    # Zero padding up to the required embedding length
    zero_padding = np.zeros(max(n_missing_numbers, 0), dtype=np.float64)
    embedding = np.concatenate((flat_means, zero_padding))
    return {'file_name': file_name, 'embedding': embedding}


def mean_embedding_parallel(dataset, n_missing_numbers, n_workers=4, n_samples=None):
    """Embed the samples of ``dataset`` with ``embed`` using a thread pool.

    Parameters
    ----------
    dataset : iterable of dict
        Each item must provide the keys 'data' and 'file_name'.
    n_missing_numbers : int
        Number of zeros appended to each embedding (forwarded to ``embed``).
    n_workers : int, optional
        Maximum number of worker threads.
    n_samples : int or None, optional
        If given, only the first ``n_samples`` samples are embedded (useful for
        quick test runs). ``None`` embeds the whole dataset; values <= 0 yield
        an empty result.

    Returns
    -------
    dict
        Mapping {file_name: embedding}, in the order the samples were read.
    """
    # Nothing to do when the caller explicitly asks for no samples; return
    # before touching the dataset so no sample is loaded needlessly.
    if n_samples is not None and n_samples <= 0:
        return {}

    # Initialize result embeddings
    embeddings = {}

    # Run embedding in parallel
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        futures = []

        for ind, sample in enumerate(dataset):
            # Submit the sample for processing
            futures.append(
                executor.submit(embed, sample['data'], sample['file_name'], n_missing_numbers)
            )

            # Stop once exactly n_samples samples have been submitted. `ind` is
            # zero-based, so `ind + 1` is the number submitted so far.
            if n_samples is not None and ind + 1 >= n_samples:
                break

        # Collect results in submission order
        for future in futures:
            res = future.result()
            embeddings[res['file_name']] = res['embedding']
    return embeddings



In [7]:
# Number of zeros to append at the end:
# target embedding length (1024) minus 4 timesteps * 3 modalities * 9 * 9 pixels
n_missing_numbers = 1024-4*3*9*9

# Set to False to embed the samples one by one in the main thread (e.g. for debugging)
run_parallel = True

if run_parallel:
    # Embed data using a thread pool
    embeddings = mean_embedding_parallel(dataset_e2s, n_missing_numbers=n_missing_numbers, n_workers=8)
else:
    # Sequential fallback: reuse `embed` so both paths produce identical embeddings
    embeddings = {}
    for data_file_name in dataset_e2s:
        res = embed(data_file_name['data'], data_file_name['file_name'], n_missing_numbers)
        embeddings[res['file_name']] = res['embedding']

# Create submission file
submission_file = create_submission_from_dict(embeddings)

In [8]:
# Sanity check: number of rows in the submission table
print('Number of embeddings:', len(submission_file))

Number of embeddings: 5537


In [9]:
# Preview the first rows of the submission table (columns: id, embedding)
submission_file.head()

Unnamed: 0,id,embedding
0,0002c11ab0bad1ae6efff695891f5713b95101063eaca5...,"[-0.758180558681488,-1.004097580909729,-1.1865..."
1,0002c8e787ba871d725f57833996ef31a4d60f370180a9...,"[-0.7967846393585205,-0.8520763516426086,-0.73..."
2,0017ed83b4548be10f6e12e84e55b0a57ee6f67eae30ea...,"[-0.5252641439437866,-0.9159708023071289,-0.91..."
3,0026f8260255069e3f4b29862368abe752dd7659ab2c0e...,"[-0.9750652313232422,-1.2674709558486938,-1.08..."
4,0031471ca1c1a720c95103d8ca90d2142abd95e27d82f6...,"[-1.1343963146209717,-1.3163825273513794,-0.82..."


In [10]:
# Write submission to csv; only runs when write_result_to_file is set to True in the configuration cell.
if write_result_to_file:
    # index=False keeps the DataFrame row index out of the csv
    submission_file.to_csv(path_to_output_file, index=False)