In [1]:
import torch
from torchvision.transforms import v2
from torch.utils.data import DataLoader
from datasets.imageio import get_X_paths, get_y_paths
from datasets.dataset import CloudCoverDataset
from datasets.transforms.minmax_normalize import MinMaxNormalize

In [2]:
import numpy as np
from typing import Union


class AveragedWelfordIncrementalVariance(object):
    """
    Welford's algorithm computes the sample variance incrementally.
    """

    def __init__(self, ddof: int = 0):
        self.ddof = ddof
        self.n = 0
        self.mean = 0.0
        self.M2 = 0.0

    def accumulate(self, x_i: Union[np.ndarray, int, float]):
        self.n += 1
        self.delta = x_i - self.mean
        self.mean += self.delta / self.n
        self.M2 += self.delta * (x_i - self.mean)

    @property
    def variance(self):
        return np.mean(self.M2 / (self.n - self.ddof))

    @property
    def std(self):
        return np.sqrt(self.variance)

# Data Loading

Each x can have up to 4 different feature types, one for R, G, B, Infrared.  
Each feature is an image.  

In [3]:
train_X_paths = get_X_paths()
train_X_paths[:3]

[[WindowsPath('../../data/final/public/train_features/adwp/B02.tif'),
  WindowsPath('../../data/final/public/train_features/adwp/B03.tif'),
  WindowsPath('../../data/final/public/train_features/adwp/B04.tif'),
  WindowsPath('../../data/final/public/train_features/adwp/B08.tif')],
 [WindowsPath('../../data/final/public/train_features/adwu/B02.tif'),
  WindowsPath('../../data/final/public/train_features/adwu/B03.tif'),
  WindowsPath('../../data/final/public/train_features/adwu/B04.tif'),
  WindowsPath('../../data/final/public/train_features/adwu/B08.tif')],
 [WindowsPath('../../data/final/public/train_features/adwz/B02.tif'),
  WindowsPath('../../data/final/public/train_features/adwz/B03.tif'),
  WindowsPath('../../data/final/public/train_features/adwz/B04.tif'),
  WindowsPath('../../data/final/public/train_features/adwz/B08.tif')]]

In [4]:
train_y_paths = get_y_paths()
train_y_paths[:3]

[WindowsPath('../../data/final/public/train_labels/adwp.tif'),
 WindowsPath('../../data/final/public/train_labels/adwu.tif'),
 WindowsPath('../../data/final/public/train_labels/adwz.tif')]

## Create Dataset

In [5]:
train_ds = CloudCoverDataset(
    X_paths=train_X_paths, 
    y_paths=train_y_paths, 
    transforms=MinMaxNormalize(0, 1)
)

In [7]:
batch_size = 64
loader = DataLoader(train_ds, batch_size=batch_size, shuffle=False)
channel_index = 3

# Initialize accumulators
sums = 0.0
sums_sq = 0.0
n = 0

for batch, _ in loader:
    # Extract the fourth channel
    channel_data = batch[:, channel_index]

    # Update accumulators
    sums += channel_data.sum()
    sums_sq += (channel_data ** 2).sum()
    n += channel_data.numel()
    print(sums)

# Compute mean and std
mean = sums / n
variance = (sums_sq / n) - (mean ** 2)
std = torch.sqrt(variance)

print(f"Mean: {mean.item()}, Standard Deviation: {std.item()}")

tensor(6353140.5000)
tensor(12082211.)
tensor(17690556.)
tensor(23615220.)
tensor(29651418.)
tensor(35528368.)
tensor(41349632.)
tensor(46677252.)
tensor(50907164.)
tensor(55189452.)
tensor(60748044.)
tensor(68020024.)
tensor(74845544.)
tensor(80989592.)
tensor(86756648.)
tensor(92750912.)
tensor(97908736.)
tensor(1.0397e+08)
tensor(1.1017e+08)
tensor(1.1667e+08)
tensor(1.2351e+08)
tensor(1.2996e+08)
tensor(1.3720e+08)
tensor(1.4536e+08)
tensor(1.5348e+08)
tensor(1.5827e+08)
tensor(1.6219e+08)
tensor(1.6683e+08)
tensor(1.7173e+08)
tensor(1.7658e+08)
tensor(1.8485e+08)
tensor(1.9354e+08)
tensor(2.0166e+08)
tensor(2.0615e+08)
tensor(2.1078e+08)
tensor(2.1653e+08)
tensor(2.2240e+08)
tensor(2.2849e+08)
tensor(2.3531e+08)
tensor(2.4236e+08)
tensor(2.4964e+08)
tensor(2.5593e+08)
tensor(2.6123e+08)
tensor(2.6532e+08)
tensor(2.6875e+08)
tensor(2.7300e+08)
tensor(2.7795e+08)
tensor(2.8409e+08)
tensor(2.9098e+08)
tensor(2.9667e+08)
tensor(3.0145e+08)
tensor(3.0678e+08)
tensor(3.1419e+08)
tensor(