In [None]:
import os
from collections import Counter
from copy import deepcopy
from pathlib import Path

import cv2
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# mean_list = []
# stdev_list = []
# fish_dsnames = list(Counter(df_train["parent (dsname)"]).keys())

# for fish_dsname in fish_dsnames:
    
#     df_tmp = df_train[df_train["parent (dsname)"] == fish_dsname]
#     img_batch = None
    
#     for img_path in df_tmp["path"]:
        
#         img: np.ndarray = cv2.imread(img_path)
#         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
#         img = img[None, :]
        
#         if img_batch is None:
#             img_batch = deepcopy(img)
#         else:
#             img_batch = np.append(img_batch, img, axis=0)
        
#     mean_list.append(np.mean(img_batch, axis=(0, 1, 2)))
#     stdev_list.append(np.std(img_batch, axis=(0, 1, 2)))

In [None]:
# def convert_list2array(target_list):
#     """
#     """
#     tmp_array = None
#     for value in target_list:
#         value = value[None, :]
#         if tmp_array is None:
#             tmp_array = deepcopy(value)
#         else:
#             tmp_array = np.append(tmp_array, value, axis=0)

#     return tmp_array

## YuDe, WenWei

In [None]:
def online_var(df, channel):
    """
    Compute the population standard deviation of one colour channel over every image in `df`.

    Args:
        df: table with a "path" column; each entry is passed to `cv2.imread`.
        channel: integer index into the image after BGR -> RGB conversion
            (0 = R, 1 = G, 2 = B).

    Returns:
        Standard deviation over all pixels of the selected channel, or NaN when
        `df` lists no images.

    Raises:
        OSError: if `cv2.imread` cannot read one of the listed images.
    """
    pixel_count = 0
    x_sum = 0     # running sum of pixel values, kept as an exact Python int
    x_sq_sum = 0  # running sum of squared pixel values, kept as an exact Python int

    for path in df["path"]:
        bgr_img = cv2.imread(path)
        if bgr_img is None:
            # cv2.imread reports failure by returning None instead of raising
            raise OSError(f"cv2.imread could not read image: {path}")

        # Widen before squaring: squaring the 8-bit array directly wraps around.
        new_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)[:, :, channel].astype(np.int64)
        pixel_count += new_img.size
        x_sum += int(new_img.sum())
        x_sq_sum += int(np.square(new_img).sum())

    if pixel_count == 0:
        return float("nan")

    # Var = E[x^2] - E[x]^2, evaluated on exact integers until the final division
    variance = (pixel_count * x_sq_sum - x_sum ** 2) / pixel_count ** 2
    print(x_sum, x_sq_sum, variance)
    return np.sqrt(variance)

## Welford's online algorithm [(Reference)]

[(Reference)]: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm

In [None]:
# Welford single-sample step: fold one new observation into the running aggregate.
#   count - number of samples folded in so far
#   mean  - running mean of those samples
#   M2    - running sum of squared distances from the mean
def update(existing_aggregate, new_value) -> tuple[int, float, float]:
    """
    Return the aggregate `(count, mean, M2)` obtained after including `new_value`.
    """
    n_seen, running_mean, sq_dist_sum = existing_aggregate
    n_seen += 1
    gap_before = new_value - running_mean  # distance to the mean before the update
    running_mean += gap_before / n_seen
    gap_after = new_value - running_mean   # distance to the mean after the update
    sq_dist_sum += gap_before * gap_after
    return (n_seen, running_mean, sq_dist_sum)

# Retrieve the mean, variance and sample variance from an aggregate
def finalize(existing_aggregate) -> tuple[float, float, float]:
    """
    Turn a Welford aggregate `(count, mean, M2)` into `(mean, variance, sample_variance)`.

    `variance` divides M2 by `count`; `sample_variance` divides it by `count - 1`.
    With fewer than two samples every element of the returned tuple is NaN, so the
    result can always be unpacked into three values.
    """
    (count, mean, M2) = existing_aggregate
    if count < 2:
        nan = float("nan")
        return (nan, nan, nan)
    return (mean, M2 / count, M2 / (count - 1))

In [None]:
def _merge_pixel_batch(existing_aggregate, values):
    """Fold a whole array of samples into a Welford aggregate `(count, mean, M2)` in one step."""
    count_a, mean_a, M2_a = existing_aggregate
    count_b = values.size
    if count_b == 0:
        return existing_aggregate

    mean_b = float(values.mean())
    M2_b = float(np.square(values - mean_b).sum())

    # Pairwise combination of two aggregates (Chan et al.): gives the same result as
    # feeding the samples one by one, up to floating-point rounding, but uses array
    # arithmetic instead of one Python-level step per pixel.
    count = count_a + count_b
    delta = mean_b - mean_a
    mean = mean_a + delta * count_b / count
    M2 = M2_a + M2_b + delta * delta * count_a * count_b / count
    return (count, mean, M2)


def welford_online_algo(df:pd.DataFrame, channel:str) -> tuple[int, float, float, float, float]:
    """
    Compute running statistics of one colour channel over every image listed in `df`.

    Args:
        df: table with a "path" column; each entry is passed to `cv2.imread`.
        channel: "R", "G" or "B" (case-insensitive).

    Returns:
        `(count, mean, variance, sample_variance, stdev)` where `count` is the number
        of pixels seen and `stdev` is the square root of `variance`. When fewer than
        two pixels were seen, every statistic except `count` is NaN.

    Raises:
        ValueError: if `channel` is not one of the accepted letters.
        OSError: if `cv2.imread` cannot read one of the listed images.
    """
    channel_str2int = {"R": 0, "G": 1, "B": 2}
    # Normalise the case before validating, so "r" / "g" / "b" are accepted as well.
    if not isinstance(channel, str) or channel.upper() not in channel_str2int:
        raise ValueError("Channel accept 'R', 'G', 'B' only")
    channel_i = channel_str2int[channel.upper()]

    existing_aggregate = (0, 0.0, 0.0) # (count, mean, M2)

    for path in tqdm(df["path"], desc=f"channel_{channel} "):
        bgr_img = cv2.imread(path)
        if bgr_img is None:
            # cv2.imread reports failure by returning None instead of raising
            raise OSError(f"cv2.imread could not read image: {path}")
        new_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)[:, :, channel_i]
        existing_aggregate = _merge_pixel_batch(existing_aggregate, new_img.astype(np.float64))

    count, mean, M2 = existing_aggregate
    if count < 2:
        # not enough samples for a variance; report NaN rather than failing
        mean = variance = sample_variance = float("nan")
    else:
        mean, variance, sample_variance = finalize(existing_aggregate)
    stdev = float(np.sqrt(variance))

    print(f"Channel: {channel}")
    print(f"Total mean: {mean}")
    print(f"Total standard deviation: {stdev}")

    return (count, mean, variance, sample_variance, stdev)

## Postprocess Functions (generating a `single row` in the dataframe)

In [None]:
def gen_dataset_identity_string(dataset_xlsx_path: str):
    """
    Pull the identifying fields of a dataset out of its xlsx path.

    The path is split on `os.sep`. Component 6 supplies the instance tag (its last
    "_"-separated token), component 7 the palmskin alias, and the file stem
    (split on "_") supplies tokens 2 and 3.

    Returns:
        `(instance tag, palmskin alias, stem token 2, stem token 3)`
    """
    components: list[str] = dataset_xlsx_path.split(os.sep)
    instance_dir, alias_dir = components[6], components[7] # e.g. '..._i[num]', 'RGB_direct_max_zproj'
    stem, _ext = os.path.splitext(components[-1])
    stem_tokens: list[str] = stem.split("_") # e.g. ['DS', 'SURF3C', 'CRPS256', 'SF14', 'INT30', 'DRP45']

    return (instance_dir.split("_")[-1], alias_dir, stem_tokens[2], stem_tokens[3])

In [None]:
def gen_single_statistic_df(dataset_xlsx_path:str, dataset:str, channel:str,
                            count:int, mean:float, variance:float, sample_variance:float, stdev:float):
    """
    Build the one-row DataFrame describing a single (dataset, channel) statistic.

    The first four columns come from `gen_dataset_identity_string(dataset_xlsx_path)`;
    the remaining columns are the arguments, stored unchanged.

    Returns:
        A DataFrame with one row (index 0) and the columns identity, palmskin_alias,
        crop_size, shift_region, dataset, channel, count, mean, variance,
        sample_variance, stdev.
    """
    identity_fields = gen_dataset_identity_string(dataset_xlsx_path)
    palmskin_cnt_num, palmskin_alias, crop_size, shift_region = identity_fields

    # insertion order of the keys fixes the column order of the resulting frame
    row: dict = {
        "identity": palmskin_cnt_num,
        "palmskin_alias": palmskin_alias,
        "crop_size": crop_size,
        "shift_region": shift_region,
        "dataset": dataset,
        "channel": channel,
        "count": count,
        "mean": mean,
        "variance": variance,
        "sample_variance": sample_variance,
        "stdev": stdev,
    }

    return pd.DataFrame(row, index=[0]) # scalars need an explicit index to form one row

## Main Process

In [None]:
# Dataset index sheet to analyse. gen_dataset_identity_string() reads fixed path
# components (6 and 7) of this value, so a path with a different directory depth
# needs those indices adjusted.
dataset_xlsx_path: str = r"/home/rime97410000/ZebraFish_DB/{Dataset}_Cropped_v2/SEED_2022/{20231030_del_day7_8}_Academia_Sinica_i631/RGB_direct_max_zproj/KMeansORIG_RND2022/DS_SURF3C_CRPS256_SF14_INT30_DRP45.xlsx"

In [None]:
# Load the dataset index sheet; later cells read its "path", "dataset" and "state" columns.
df = pd.read_excel(dataset_xlsx_path)

In [None]:
# List the column names, then render the table itself as the cell output.
print(df.columns)
df # display dataframe

In [None]:
# Smoke test: show the identity tuple parsed from the configured path.
gen_dataset_identity_string(dataset_xlsx_path) # fn_test

In [None]:
# Smoke test with placeholder statistics (not measured values) to preview the row layout.
gen_single_statistic_df(dataset_xlsx_path, "train", "R", 20, 1.0, 2.0, 1.9, 1.5) # fn_test

In [None]:
# Split the index sheet into the four subsets whose statistics are computed later.
df_dict: dict[str, pd.DataFrame] = {}
df_dict["train_all"]      = df[(df["dataset"] == "train")]
df_dict["train_preserve"] = df[(df["dataset"] == "train") & (df["state"] == "preserve")]
df_dict["test_all"]       = df[(df["dataset"] == "test")]
df_dict["test_preserve"]  = df[(df["dataset"] == "test") & (df["state"] == "preserve")]

# A plain loop prints the subset sizes without echoing a list of None as the cell
# output; the loop names are chosen so the global `df` is left untouched.
for subset_name, subset_df in df_dict.items():
    print(f"{subset_name}: {len(subset_df)}")

In [None]:
# Results accumulate across runs in this CSV; reuse previously saved rows when present.
csv_path = Path("./dataset_statistic_result.csv")

if csv_path.exists():
    # 'utf_8_sig' matches the encoding the file is written with and also reads BOM-less UTF-8
    statistic_df = pd.read_csv(csv_path, encoding='utf_8_sig')
else:
    statistic_df = None

statistic_df # display dataframe

In [None]:
# Compute per-channel statistics for every subset and append them to `statistic_df`.
# The loop variables deliberately avoid the name `df`, so the full index sheet is not
# replaced by the last subset (which would break a re-run of the subset-splitting cell).
new_rows: list[pd.DataFrame] = []

for subset_name, subset_df in df_dict.items():

    for channel in ["R", "G", "B"]:

        tmp_tuple = welford_online_algo(subset_df, channel) # (count, mean, variance, sample_variance, stdev)
        new_rows.append(gen_single_statistic_df(dataset_xlsx_path, subset_name, channel, *tmp_tuple))

# Concatenate once at the end instead of re-copying the growing frame on every iteration.
if new_rows:
    previous_rows = [] if statistic_df is None else [statistic_df]
    statistic_df = pd.concat(previous_rows + new_rows, ignore_index=True)

In [None]:
# Inspect the accumulated statistics before they are written to disk.
statistic_df # display dataframe

In [None]:
# Write back to the same location the results were loaded from (`csv_path`), so the
# read and write sides cannot drift apart; the BOM-prefixed encoding is unchanged.
statistic_df.to_csv(csv_path, encoding='utf_8_sig', index=False)