In [2]:
from pathlib import Path

# Path prefix to replace, and the prefix that takes its place
old_prefix = "/work/mech-ai-scratch/bgekim/project/imputation/IA_dataset/30m/Patches/S2"
new_prefix = "/scratch/bepk/bkim2/patches/30m/S2"

# Input / output list files
input_file = "/scratch/bepk/bkim2/MultiMAE_RGB/MultiMAE/valid_list/nova/30m/pair_S2.txt"
output_file = "/scratch/bepk/bkim2/MultiMAE_RGB/MultiMAE/valid_list/delta/30m/pair_S2.txt"

# open(..., "w") does not create missing parent directories, so make sure the
# destination folder exists before writing.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Read the list line by line, substitute the prefix, and write the result.
# Blank lines are dropped; every written line ends with a single newline.
n_unchanged = 0
with open(input_file, "r") as fin, open(output_file, "w") as fout:
    for line in fin:
        line = line.strip()
        if line:
            # Path substitution
            new_line = line.replace(old_prefix, new_prefix)
            if new_line == line:
                # The old prefix was not found: the line is copied as-is.
                n_unchanged += 1
            fout.write(new_line + "\n")

# Surface lines that were not rewritten so a mistyped prefix does not go unnoticed.
if n_unchanged:
    print(f"⚠️ {n_unchanged} line(s) did not contain the old prefix and were copied unchanged")


In [2]:
#!/usr/bin/env python3
"""
S1 vs MODIS·S2 경로 비교
- 세 txt 파일은 한 줄에 하나의 full‑path가 들어 있다고 가정
- 결과:
    1) 각 리스트 총 개수 출력
    2) MODIS·S2에는 없고 S1에만 있는 키 목록 출력
"""

from pathlib import Path

# -------------------------------------------------------------
# Text files to compare; each is expected to hold one full path per line.
MODIS_TXT = Path("valid_MODIS.txt")   # edit these paths as needed
S2_TXT    = Path("valid_S2.txt")
S1_TXT    = Path("valid_S1.txt")
# -------------------------------------------------------------

def read_lines(f: Path):
    """Return the non-blank lines of text file `f`, each stripped of surrounding whitespace."""
    stripped = (raw.strip() for raw in f.read_text().splitlines())
    return [entry for entry in stripped if entry]

def make_key(full_path: str) -> str:
    """
    Reduce a full patch path to a modality-independent comparison key.

    '/.../patches/MODIS/patch_1008_1008/2018-07-22_IA.tif'
        → 'patch_1008_1008/2018-07-22_IA.tif'

    The key is the last two path components (patch folder / file name), so it
    does not depend on how many directories sit between the dataset root and
    the patch folder (e.g. '.../patches/30m/S2/patch_x/file.tif') nor on how
    the parent directories are spelled.  A path with fewer than two components
    is returned unchanged.
    """
    # Drop empty segments produced by a leading, trailing or doubled '/'.
    parts = [seg for seg in full_path.split("/") if seg]
    if len(parts) < 2:
        return full_path
    return "/".join(parts[-2:])

def load_keys(txt_path: Path):
    """Build the set of comparison keys for every path listed in `txt_path`."""
    return set(map(make_key, read_lines(txt_path)))

# ---------- Run ----------
# One key set per modality list, loaded in the order MODIS, S2, S1.
modis_keys, s2_keys, s1_keys = (
    load_keys(list_file) for list_file in (MODIS_TXT, S2_TXT, S1_TXT)
)

print("----- 리스트 개수 -----")
print(f"MODIS: {len(modis_keys):>6}")
print(f"S2   : {len(s2_keys):>6}")
print(f"S1   : {len(s1_keys):>6}\n")

# Reference set: every key that appears in MODIS or in S2.
reference = modis_keys.union(s2_keys)
# Keys present in S1 but in neither reference list, sorted for stable output.
extra_in_s1 = sorted(s1_keys.difference(reference))

print(f"S1 전용(튀는) 항목: {len(extra_in_s1)}개")
# Preview the first 20 entries only.
for k in extra_in_s1[:20]:
    print("  ", k)

# Persist the complete list, one key per line.
Path("extra_in_S1.txt").write_text("\n".join(extra_in_s1) + "\n")
print("\n✅  extra_in_S1.txt 파일로 저장 완료")


----- 리스트 개수 -----
MODIS:  60519
S2   :  61128
S1   :  60838

S1 전용(튀는) 항목: 949개
   patch_0_896/2016-05-01_IA.tif
   patch_0_896/2017-08-20_IA.tif
   patch_0_896/2017-09-10_IA.tif
   patch_0_896/2018-04-22_IA.tif
   patch_0_896/2018-05-20_IA.tif
   patch_0_896/2018-07-08_IA.tif
   patch_0_896/2018-07-22_IA.tif
   patch_0_896/2018-08-12_IA.tif
   patch_0_896/2018-09-16_IA.tif
   patch_0_896/2018-10-14_IA.tif
   patch_0_896/2018-10-21_IA.tif
   patch_0_896/2018-10-28_IA.tif
   patch_0_896/2019-04-14_IA.tif
   patch_0_896/2019-05-05_IA.tif
   patch_0_896/2019-05-12_IA.tif
   patch_0_896/2019-07-14_IA.tif
   patch_0_896/2019-07-21_IA.tif
   patch_0_896/2019-07-28_IA.tif
   patch_0_896/2019-08-04_IA.tif
   patch_0_896/2019-08-25_IA.tif

✅  extra_in_S1.txt 파일로 저장 완료


## Normalization statistics (per-band mean / std)

In [1]:
#!/usr/bin/env python
"""
Compute per-band mean/std for Sentinel-1 or Sentinel-2 patch lists (parallel).
"""

import numpy as np
import rasterio
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor
import tqdm

# ── User settings ─────────────────────────────────────────────
# Text files listing the Sentinel-1 / Sentinel-2 patch rasters to process.
S1_LIST = Path("/scratch/bepk/bkim2/MultiMAE_RGB/MultiMAE/valid_list/delta/30m/pair_S1.txt")
S2_LIST = Path("/scratch/bepk/bkim2/MultiMAE_RGB/MultiMAE/valid_list/delta/30m/pair_S2.txt")
NODATA_VAL = -9999  # pixel value treated as missing; default `nodata` of compute_mean_std
SAVE_DIR = Path("./stats")  # directory the "<tag>_stats.npz" files are written to
SAVE_DIR.mkdir(parents=True, exist_ok=True)
NUM_WORKERS = 16   # number of parallel worker processes (tune to the HPC node's core count)
# ─────────────────────────────────────────────────────────────

def process_file(fp, nodata, divide_factor):
    """Compute per-band partial sums for a single raster file.

    Parameters
    ----------
    fp : path-like
        Raster file to open with ``rasterio.open``.
    nodata : number
        Pixel value to exclude; matching pixels are replaced by NaN.
    divide_factor : number or None
        If not None, every pixel is divided by this value before summing.

    Returns
    -------
    tuple or None
        ``(sum, sum_of_squares, valid_count)``, one entry per band, computed
        over the last two axes while ignoring NaN pixels.  ``None`` is
        returned when the file cannot be opened or read; a ``RuntimeWarning``
        naming the file is emitted in that case so the skip is not silent.
    """
    import warnings

    try:
        with rasterio.open(fp) as src:
            # float64 keeps the sum and sum-of-squares accurate; float32
            # accumulation loses digits as the number of pixels grows.
            img = src.read().astype("float64")  # (C,H,W)
    except Exception as exc:
        # Best-effort: a broken file is skipped, but the reason is reported.
        warnings.warn(f"Skipping unreadable raster {fp}: {exc}", RuntimeWarning)
        return None

    img[img == nodata] = np.nan
    if divide_factor is not None:
        img = img / divide_factor

    sum_ = np.nansum(img, axis=(1, 2))
    sq_ = np.nansum(img**2, axis=(1, 2))
    cnt = np.sum(~np.isnan(img), axis=(1, 2))
    return sum_, sq_, cnt


def compute_mean_std(list_path, nodata=NODATA_VAL, divide_factor=None, tag=""):
    """Compute per-band mean and standard deviation over all rasters in a list.

    Parameters
    ----------
    list_path : path-like
        Text file with one raster path per line; blank lines are ignored.
    nodata : number
        Pixel value excluded from the statistics (forwarded to ``process_file``).
    divide_factor : number or None
        Optional divisor applied to every pixel (forwarded to ``process_file``).
    tag : str
        Label used in the progress bar, the printed summary and the output
        file name ``SAVE_DIR / f"{tag}_stats.npz"``.

    Returns
    -------
    (mean, std) : tuple of numpy arrays, one value per band.
        ``std`` is the population standard deviation, sqrt(E[x²] − E[x]²);
        bands with no valid pixel get mean 0 and std 0.

    Raises
    ------
    FileNotFoundError
        If the list contains no paths.

    Files for which ``process_file`` returns ``None``, or whose band count
    differs from that of the first listed raster, are skipped; the number of
    skipped files is printed and ``files=`` reports the files actually used.
    """
    with open(list_path) as f:
        paths = [line.strip() for line in f if line.strip()]
    if not paths:
        raise FileNotFoundError(f"❌ No files found in {list_path}")

    # The first listed raster defines the expected number of bands.
    with rasterio.open(paths[0]) as tmp:
        C = tmp.count
    sum_, sq_, cnt = np.zeros(C), np.zeros(C), np.zeros(C)
    n_used = 0
    n_skipped = 0

    # Parallel processing: one task per file, results folded in submission order.
    with ProcessPoolExecutor(max_workers=NUM_WORKERS) as ex:
        futures = [ex.submit(process_file, fp, nodata, divide_factor) for fp in paths]
        for fut in tqdm.tqdm(futures, desc=f"Accumulating {tag}"):
            res = fut.result()
            if res is None:  # file could not be read
                n_skipped += 1
                continue
            s, q, c = res
            if np.shape(s) != sum_.shape:
                # A different band count would either raise or, for a
                # single-band file, silently broadcast into every band.
                n_skipped += 1
                continue
            sum_ += s
            sq_ += q
            cnt += c
            n_used += 1

    # Bands without any valid pixel keep 0 instead of producing NaN.
    mean = np.divide(sum_, cnt, out=np.zeros_like(sum_), where=(cnt > 0))
    var = np.divide(sq_, cnt, out=np.zeros_like(sq_), where=(cnt > 0)) - mean**2
    var[var < 0] = 0  # clamp tiny negatives caused by floating-point rounding
    std = np.sqrt(var)

    out_path = SAVE_DIR / f"{tag}_stats.npz"
    np.savez(out_path, mean=mean.astype("float32"), std=std.astype("float32"))

    if n_skipped:
        print(f"⚠️ {tag}: skipped {n_skipped} file(s) (unreadable or unexpected band count)")
    print(f"✅ Saved {tag} μ,σ → {out_path} (bands={C}, files={n_used})")
    print("mean:", mean)
    print("std :", std)
    return mean, std


if __name__ == "__main__":
    # (list file, divisor, tag) per sensor; S1 is used as-is, S2 is divided by 10000.
    for list_path, scale, tag in ((S1_LIST, None, "s1"), (S2_LIST, 10000.0, "s2")):
        compute_mean_std(list_path, nodata=NODATA_VAL, divide_factor=scale, tag=tag)


Accumulating s1: 100%|██████████| 118059/118059 [07:06<00:00, 276.67it/s]


✅ Saved s1 μ,σ → stats/s1_stats.npz (bands=2, files=118059)
mean: [0.12419162 0.02826689]
std : [0.41080412 0.04929494]


Accumulating s2: 100%|██████████| 118059/118059 [10:49<00:00, 181.81it/s]


✅ Saved s2 μ,σ → stats/s2_stats.npz (bands=12, files=118059)
mean: [0.056615   0.06887008 0.09391985 0.10163148 0.14381698 0.25419976
 0.30873655 0.32210684 0.33362151 0.34459104 0.27966201 0.19123411]
std : [0.06765407 0.06848592 0.06620852 0.08064312 0.0791716  0.08433618
 0.1150472  0.11199426 0.11933007 0.13678636 0.09880431 0.10527262]
