# Data Preprocessor

Outputs:
- copies the `.npz` data files into `train`, `val`, and `test` directories
- a JSON file (`mean_std.json`) with the mean and std of each channel, computed on the training split: R, G, B, SWIR2, NIR, SWIR1

In [1]:
import os
import numpy as np
import random
import shutil
import json

# Parameters
data_dir = "./data_infrared/"
save_dir = "./data_infrared_split/"
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# The test split takes whatever train and val leave over, so test_ratio is
# only honoured if the three ratios describe a full partition. Check that
# explicitly (with a float tolerance) rather than silently ignoring it.
if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError("train_ratio, val_ratio, and test_ratio must sum to 1")

# Only .npz files are data samples. Filter BEFORE computing the split sizes,
# otherwise stray files or sub-directories are counted and the realised
# ratios drift from the configured ones.
# Sorting removes the dependence on os.listdir's arbitrary ordering, so that
# seeding `random` beforehand yields a reproducible split.
file_list = sorted(name for name in os.listdir(data_dir) if name.endswith(".npz"))

# Split sizes: train and val are rounded down, test gets the remainder, so the
# three sizes always add up to len(file_list).
train_size = int(train_ratio * len(file_list))
val_size = int(val_ratio * len(file_list))
test_size = len(file_list) - train_size - val_size

# Shuffle in place, then slice into three contiguous, non-overlapping parts
random.shuffle(file_list)
train_file_list = file_list[:train_size]
val_file_list = file_list[train_size:train_size+val_size]
test_file_list = file_list[train_size+val_size:]

def copy_split(split:str, file_list:list, src_dir=None, dst_dir=None):
    '''
    Copy the .npz files of one split into that split's own directory.

    Arguments:
        split: train, val, or test string
        file_list: list of file names in the source directory; names that do
            not end with ".npz" are skipped
        src_dir: directory the files are copied from; defaults to the
            module-level data_dir
        dst_dir: directory under which the split directory is created;
            defaults to the module-level save_dir

    Raises:
        AssertionError: if split is not train, val, or test
    '''
    # Raised explicitly (instead of with an `assert` statement) so the check
    # is not stripped when Python runs with -O; the exception type and the
    # message are the same as before.
    if split not in ("train", "val", "test"):
        raise AssertionError("split must be train, val, or test")

    if src_dir is None:
        src_dir = data_dir
    if dst_dir is None:
        dst_dir = save_dir

    # exist_ok avoids the check-then-create race and makes re-runs harmless
    split_dir = os.path.join(dst_dir, split)
    os.makedirs(split_dir, exist_ok=True)

    # Copy the files to the split directory
    for file in file_list:
        if not file.endswith(".npz"):
            continue

        from_path = os.path.join(src_dir, file)
        to_path = os.path.join(split_dir, file)
        shutil.copy(from_path, to_path)

# Copy each split's files into its own directory, in train/val/test order
for _split_name, _split_files in (
    ("train", train_file_list),
    ("val", val_file_list),
    ("test", test_file_list),
):
    copy_split(_split_name, _split_files)

In [2]:
# Compute the mean and standard deviation of the training data
all_pixels = []
for file in train_file_list:
    if not file.endswith(".npz"):
        continue

    # Load the data. np.load on an .npz returns an archive object that keeps
    # the file open; the context manager closes it once the array is read.
    file_path = os.path.join(data_dir, file)
    with np.load(file_path) as data:
        # image shape: (H, W, C)
        image = data['image']

    # pixels shape: (H*W, C)
    pixels = image.reshape(-1, image.shape[2])
    all_pixels.append(pixels)

# np.concatenate rejects an empty list with a generic message; fail with the
# same exception type but say what actually went wrong.
if not all_pixels:
    raise ValueError("no .npz files in the training split; cannot compute mean and std")

# Concatenate all the pixels to be (total_pixels, C)
all_pixels = np.concatenate(all_pixels, axis=0)

# Compute the mean and standard deviation of each channel (R, G, B, SWIR2, NIR, SWIR1)
# shape: (C,)
mean = np.mean(all_pixels, axis=0)
std = np.std(all_pixels, axis=0)
print(mean)
print(std)

# Save the mean and std to a file
d = {
    "mean": mean.tolist(),
    "std": std.tolist()
}
# Make sure the output directory exists even if this cell is run on its own,
# and close the file deterministically so the JSON is fully flushed to disk.
os.makedirs(save_dir, exist_ok=True)
with open(os.path.join(save_dir, "mean_std.json"), "w") as f:
    json.dump(d, f)

[0.08886919 0.07239396 0.04874872 0.14343218 0.20228706 0.2078192 ]
[0.07472857 0.06264118 0.05680478 0.09022863 0.09870495 0.10954753]
