# Script for baseline "mean" 

Example Clouds (regression/classification)

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from scipy.ndimage import zoom
from torchvision import transforms
from torch.utils.data import DataLoader

from challenge_dataset import E2SChallengeDataset, collate_fn
from ssl4eos12_dataset import S2L1C_MEAN, S2L1C_STD, S2L2A_MEAN, S2L2A_STD, S1GRD_MEAN, S1GRD_STD



# Configurations

In [None]:
# Input/output locations (placeholder paths -- adapt to your environment).
path_to_data = '/path/to/challenge/data/'
path_to_output_file = 'path/to/output/file.csv'

# Modalities to load. The normalization statistics below are combined in the same order.
# NOTE(review): assumes the *_MEAN / *_STD constants are sequences, so `+` concatenates them.
modalities = ['s2l2a', 's2l1c', 's1']
mean_data = S2L2A_MEAN + S2L1C_MEAN + S1GRD_MEAN
std_data = S2L2A_STD + S2L1C_STD + S1GRD_STD

# Both E2SChallengeDataset and SSL4EOS12Dataset output torch tensors,
# so there is no need for a ToTensor transform.
transform_steps = [
    # Add additional transformations here
    transforms.Normalize(mean=mean_data, std=std_data),
]
data_transform = transforms.Compose(transform_steps)

# Load data

In [None]:
# Build the challenge dataset with all modalities concatenated.
# The dataloader output is {'data': concatenated_data, 'file_name': file_name}.
# The data is laid out as [n_samples, n_seasons, n_channels, height, width],
# i.e. [1, 4, 27, 264, 264] for the concatenated data.
dataset_options = dict(
    modalities=modalities,
    dataset_name='bands',
    transform=data_transform,
    concat=True,
    output_file_name=True,
)
dataset_e2s = E2SChallengeDataset(path_to_data, **dataset_options)

# Report how many samples the dataset holds
dataset_length = len(dataset_e2s)
print(f"Length of train dataset: {dataset_length}")

# Report the shape of the first sample
first_sample = dataset_e2s[0]
print(first_sample['data'].shape)

Length of train dataset: 1659
torch.Size([1, 4, 27, 264, 264])


# Dataloader

In [None]:
# Each challenge task zarr file contains a single sample, hence batch_size=1.
# collate_fn concatenates the data along the sample dimension instead of stacking it.
train_loader = DataLoader(
    dataset_e2s,
    batch_size=1,
    shuffle=True,
    collate_fn=collate_fn,
)

# Peek at the first batch only: list its file names and the data shape
for batch in train_loader:
    for position, name in enumerate(batch['file_name']):
        print(f'File name {position}:', name[:10] + '...')
    print(batch['data'].shape)
    break

File name 0: 277a989511...
File name 1: bcb433a384...
torch.Size([2, 4, 27, 264, 264])


# Create submission file

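In this section, we define the helper functions that turn a dictionary of embeddings (sample ID → embedding vector) into a submission table.
The embeddings themselves are computed further below; the resulting table is then written to a submission file.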

We use the E2SChallengeDataset since we can easily get the sample ID (file name) from the dataloader.

In [None]:
def str_format_np_array(arr):
    """Serialize a 1-D sequence of numbers as a bracketed string.

    Every element is converted with ``str`` and the results are joined by
    commas without spaces, e.g. ``[1, 2.5]`` becomes ``'[1,2.5]'``.
    """
    body = ','.join(map(str, arr))
    return f'[{body}]'

def create_submission_from_dict(emb_dict):
    """Build the submission table from a mapping of sample IDs to embeddings.

    Expects a dictionary of the form
    {hash-id0: embedding0, hash-id1: embedding1, ...} and returns a
    DataFrame with the string columns 'id' and 'embedding', one row per
    entry, where each embedding is serialized by ``str_format_np_array``.
    """
    rows = []
    for sample_id, embedding in emb_dict.items():
        rows.append([sample_id, str_format_np_array(embedding)])

    return pd.DataFrame(data=rows, columns=['id', 'embedding'], dtype=str)
        

# Compress by bilinear transform and modality averaging

In this section, we create a submission file by processing each sample as follows:
1. Subsample each channel to 9x9 pixels using bilinear interpolation.
2. Average the channels of each modality (S2 L2A, S2 L1C, S1) along the channel dimension.
3. Flatten the result into a 972-element vector (4 seasons × 3 modalities × 9 × 9 pixels).
4. Append 52 zeros to the end so that the embedding is 1024 elements long.

We use the dataloader based on the E2SChallengeDataset since we can easily get the sample ID (file name) from the dataloader.

In [None]:
# Compress downstream task data by bilinear downsampling and per-modality averaging.

embedding_size = 1024  # required length of every submitted embedding
target_size = 9        # each channel is resampled to target_size x target_size pixels
# Channel ranges inside the concatenated tensor, in the order of `modalities`:
# S2-L2A, S2-L1C and S1 respectively.
modality_slices = (slice(0, 12), slice(12, 25), slice(25, None))
# Number of zeros appended to reach embedding_size (4 seasons x 3 modalities x 9 x 9 values are real).
n_missing_numbers = embedding_size - 4 * len(modality_slices) * target_size ** 2


def _embed_sample(sample):
    """Turn one sample into a zero-padded 1-D embedding of length ``embedding_size``.

    Parameters
    ----------
    sample : array-like of shape [n_seasons, n_channels, height, width]
        A single sample with all modalities concatenated along the channel axis.

    Returns
    -------
    numpy.ndarray
        The flattened per-season, per-modality 9x9 means followed by
        ``n_missing_numbers`` zeros.

    Raises
    ------
    ValueError
        If the sample is not 4-D or the result does not have ``embedding_size`` elements.
    """
    sample = np.asarray(sample)
    if sample.ndim != 4:
        raise ValueError(
            f'Expected a sample of shape [n_seasons, n_channels, height, width], got {sample.shape}'
        )

    # zoom needs exactly one factor per axis; only the two spatial axes are resized.
    height, width = sample.shape[-2:]
    rescaled = zoom(sample, (1, 1, target_size / height, target_size / width), order=1)

    # Mean over the channels of each modality, keeping one channel per modality.
    per_modality = np.concatenate(
        [np.mean(rescaled[:, channels, :, :], axis=1, keepdims=True) for channels in modality_slices],
        axis=1,
    )

    # Append the missing values so that every embedding has the same length.
    embedding = np.concatenate((per_modality.flatten(), np.zeros(n_missing_numbers)))
    if embedding.size != embedding_size:
        raise ValueError(
            f'Embedding has {embedding.size} elements instead of {embedding_size} '
            f'(sample shape {sample.shape})'
        )
    return embedding


embeddings = {}

# Each batch is a dict {'data': ..., 'file_name': ...}; iterate over the batches directly.
for batch in train_loader:
    # NOTE(review): assumes batch['data'] is a CPU tensor/array that numpy can view -- confirm.
    batch_data = np.asarray(batch['data'])
    file_names = batch['file_name']

    # Accept a bare file name / single un-batched sample as a batch of one.
    if isinstance(file_names, str):
        file_names = [file_names]
    if batch_data.ndim == 4:
        batch_data = batch_data[np.newaxis]

    if len(file_names) != len(batch_data):
        raise ValueError(
            f'Got {len(file_names)} file names for {len(batch_data)} samples in one batch'
        )

    # The batch holds one entry per sample along the first axis; embed each one on its own
    # and key it by its (hashable) file name.
    for file_name, sample in zip(file_names, batch_data):
        embeddings[file_name] = _embed_sample(sample)

submission_file = create_submission_from_dict(embeddings)


In [None]:
# Preview the first five rows of the submission table
submission_file.head(n=5)

In [None]:
# Write submission (disabled by default; set the flag to True to export the CSV)
write_submission = False
if write_submission:
    submission_file.to_csv(path_to_output_file, index=False)