In [10]:
import os
import json
from pathlib import Path
import pandas as pd
import datetime
import re

ROOT = Path('../data')
PROCESSED_S2 = ROOT / 'processed/sentinel2'
PROCESSED_S1 = ROOT / 'processed/sentinel1'
RAW_S2 = ROOT / 'raw/sentinel2'
RAW_S1 = ROOT / 'raw/sentinel1'
DEM_MASKS = ROOT / 'masks/dem_srtmgl1'
NDVI_MASKS = ROOT / 'masks/ndvi'
NDWI_MASKS = ROOT / 'masks/ndwi'
BUILDING_MASKS = ROOT / 'masks/building'
ROAD_MASKS = ROOT / 'masks/road'

# -- Helper for extracting patch_id --
def extract_patch_id(filename, prefix):
    # For NDVI/NDWI: patch_{patch_id}_{datetime}_ndvi.tiff
    # For DEM: dem_patch_{patch_id}.tiff
    if prefix == "ndvi" or prefix == "ndwi":
        m = re.match(r"patch_([0-9]{5}_[a-f0-9]+)_\d+_\d+_\d+_\d+_" + prefix + r"\.tiff$", filename)
        if m: return m.group(1)
    elif prefix == "dem":
        m = re.match(r"dem_patch_([0-9]{5}_[a-f0-9]+)\.tiff$", filename)
        if m: return m.group(1)
    elif prefix == "slope":
        m = re.match(r"dem_slope_patch_([0-9]{5}_[a-f0-9]+)\.tiff$", filename)
        if m: return m.group(1)
    return None

# --- Scan mask folders and build sets/counts ---
ndvi_patch_counts = {}
for f in NDVI_MASKS.glob("*.tiff"):
    patch_id = extract_patch_id(f.name, "ndvi")
    if patch_id:
        ndvi_patch_counts[patch_id] = ndvi_patch_counts.get(patch_id, 0) + 1

ndwi_patch_counts = {}
for f in NDWI_MASKS.glob("*.tiff"):
    patch_id = extract_patch_id(f.name, "ndwi")
    if patch_id:
        ndwi_patch_counts[patch_id] = ndwi_patch_counts.get(patch_id, 0) + 1

dem_patch_set = set()
for f in DEM_MASKS.glob("dem_patch_*.tiff"):
    patch_id = extract_patch_id(f.name, "dem")
    if patch_id:
        dem_patch_set.add(patch_id)

slope_patch_set = set()
for f in DEM_MASKS.glob("dem_slope_patch_*.tiff"):
    patch_id = extract_patch_id(f.name, "slope")
    if patch_id:
        slope_patch_set.add(patch_id)

# -- Debug output --
print("NDVI patch_ids found:", ndvi_patch_counts)
print("NDWI patch_ids found:", ndwi_patch_counts)
print("DEM patch_ids found:", dem_patch_set)
print("Slope patch_ids found:", slope_patch_set)

def collect_s2_entries(processed_dir):
    entries = []
    for json_file in processed_dir.glob("*.json"):
        try:
            meta = json.loads(json_file.read_text())
            patch_id = meta["patch_id"]
            timestamp = meta["timestamp"]
            file_processed = meta.get("file_processed")
            file_raw = meta.get("file_raw")
            entries.append({
                "patch_id": patch_id,
                "timestamp": timestamp,
                "file_processed": file_processed,
                "file_raw": file_raw
            })
        except Exception as e:
            print(f"Error reading {json_file}: {e}")
    return entries

def collect_s1_entries(processed_dir):
    entries = []
    for json_file in processed_dir.glob("*.json"):
        try:
            meta = json.loads(json_file.read_text())
            patch_id = meta["patch_id"]
            timestamp = meta["timestamp"]
            file_processed = meta.get("file_processed")
            file_raw = meta.get("file_raw")
            entries.append({
                "patch_id": patch_id,
                "timestamp": timestamp,
                "file_processed": file_processed,
                "file_raw": file_raw
            })
        except Exception as e:
            print(f"Error reading {json_file}: {e}")
    return entries

s2_entries = collect_s2_entries(PROCESSED_S2)
s1_entries = collect_s1_entries(PROCESSED_S1)

all_keys = set()
for entry in s2_entries + s1_entries:
    all_keys.add((entry["patch_id"], entry["timestamp"]))

report = []
for i, (patch_id, timestamp) in enumerate(sorted(all_keys)):
    core_patch_id = patch_id.replace("patch_", "")

    s2_entry = next((e for e in s2_entries if e["patch_id"] == patch_id and e["timestamp"] == timestamp), None)
    s2_proc_file = s2_entry["file_processed"] if s2_entry else None
    s2_raw_file = s2_entry["file_raw"] if s2_entry else None
    s1_entry = next((e for e in s1_entries if e["patch_id"] == patch_id and e["timestamp"] == timestamp), None)
    s1_proc_file = s1_entry["file_processed"] if s1_entry else None
    s1_raw_file = s1_entry["file_raw"] if s1_entry else None

    s2_proc_path = PROCESSED_S2 / s2_proc_file if s2_proc_file else None
    s2_raw_path = RAW_S2 / s2_raw_file if s2_raw_file else None
    s1_proc_path = PROCESSED_S1 / s1_proc_file if s1_proc_file else None
    s1_raw_path = RAW_S1 / s1_raw_file if s1_raw_file else None

    ndvi_match = 1 if core_patch_id in ndvi_patch_counts else 0
    ndwi_match = 1 if core_patch_id in ndwi_patch_counts else 0
    dem_match = 1 if core_patch_id in dem_patch_set else 0
    slope_match = 1 if core_patch_id in slope_patch_set else 0

    if i < 10:
        print(f"[DEBUG] {patch_id} | {core_patch_id}: ndvi={ndvi_match}, ndwi={ndwi_match}, dem={dem_match}, slope={slope_match}")

    row = {
        "patch_id": patch_id,
        "timestamp": timestamp,
        "raw_s2": int(s2_raw_path.exists()) if s2_raw_path else 0,
        "proc_s2": int(s2_proc_path.exists()) if s2_proc_path else 0,
        "ndvi": ndvi_match,
        "ndwi": ndwi_match,
        "building": 0,  # fill in later
        "road": 0,      # fill in later
        "raw_s1": int(s1_raw_path.exists()) if s1_raw_path else 0,
        "proc_s1": int(s1_proc_path.exists()) if s1_proc_path else 0,
        "dem": dem_match,
        "slope": slope_match
    }
    report.append(row)
    
df = pd.DataFrame(report)
print(df.head())
df.to_csv(f"tile_file_status-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}.csv", index=False)

patch_summary = []
all_patch_ids = sorted(set(ndvi_patch_counts.keys()) | set(ndwi_patch_counts.keys()) | dem_patch_set | slope_patch_set)
for patch_id in all_patch_ids:
    patch_summary.append({
        "patch_id": patch_id,
        "ndvi_count": ndvi_patch_counts.get(patch_id, 0),
        "ndwi_count": ndwi_patch_counts.get(patch_id, 0),
        "dem": 1 if patch_id in dem_patch_set else 0,
        "slope": 1 if patch_id in slope_patch_set else 0
    })
patch_summary_df = pd.DataFrame(patch_summary)
print(patch_summary_df)
patch_summary_df.to_csv(f"patch_status-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}.csv", index=False)

NDVI patch_ids found: {'00032_5925311e': 134, '00051_a4d8ed7c': 184, '00041_6f274c8a': 190, '00060_4ced72c0': 165, '00042_e6bff2f8': 184, '00033_4e36ed13': 173, '00050_b75fc6d6': 187, '00034_1558c231': 237, '00053_7c2edad5': 186, '00035_2dffd3d8': 243, '00057_9a3cec2c': 165, '00044_4359be37': 85, '00059_6ea9e5b3': 163, '00043_691c8352': 184, '00058_4d51287e': 163, '00052_4b0beee9': 184}
NDWI patch_ids found: {'00050_b75fc6d6': 187, '00034_1558c231': 237, '00042_e6bff2f8': 184, '00035_2dffd3d8': 243, '00051_a4d8ed7c': 184, '00052_4b0beee9': 184, '00044_4359be37': 85, '00032_5925311e': 134, '00059_6ea9e5b3': 163, '00043_691c8352': 184, '00041_6f274c8a': 190, '00053_7c2edad5': 186, '00057_9a3cec2c': 165, '00033_4e36ed13': 173, '00060_4ced72c0': 165, '00058_4d51287e': 163}
DEM patch_ids found: {'00041_6f274c8a', '00044_4359be37', '00035_2dffd3d8', '00042_e6bff2f8', '00050_b75fc6d6', '00052_4b0beee9', '00057_9a3cec2c', '00060_4ced72c0', '00059_6ea9e5b3', '00051_a4d8ed7c', '00053_7c2edad5', 

In [7]:
df
# df[df["ndvi_tile"] == 0]

Unnamed: 0,patch_id,timestamp,raw_s2,proc_s2,ndvi,ndwi,building,road,raw_s1,proc_s1,dem,slope
0,patch_00032_5925311e,2015-02-28T21:52:46+00:00,0,0,1,1,0,0,1,1,1,1
1,patch_00032_5925311e,2015-03-05T22:00:41+00:00,0,0,1,1,0,0,1,1,1,1
2,patch_00032_5925311e,2015-03-24T21:52:46+00:00,0,0,1,1,0,0,1,1,1,1
3,patch_00032_5925311e,2015-03-29T22:00:41+00:00,0,0,1,1,0,0,1,1,1,1
4,patch_00032_5925311e,2015-04-17T21:52:47+00:00,0,0,1,1,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
16204,patch_00060_4ced72c0,2025-03-02T02:40:37.503000+00:00,1,1,1,1,0,0,0,0,1,1
16205,patch_00060_4ced72c0,2025-04-03T02:40:24.469000+00:00,1,1,1,1,0,0,0,0,1,1
16206,patch_00060_4ced72c0,2025-04-23T02:40:27.062000+00:00,1,1,1,1,0,0,0,0,1,1
16207,patch_00060_4ced72c0,2025-05-01T02:40:34.822000+00:00,1,1,1,1,0,0,0,0,1,1


In [8]:
df.groupby('patch_id').sum()

Unnamed: 0_level_0,timestamp,raw_s2,proc_s2,ndvi,ndwi,building,road,raw_s1,proc_s1,dem,slope
patch_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
patch_00032_5925311e,2015-02-28T21:52:46+00:002015-03-05T22:00:41+0...,134,134,1030,1030,0,0,896,896,1030,1030
patch_00033_4e36ed13,2015-02-28T21:52:46+00:002015-03-05T22:00:41+0...,173,173,1079,1079,0,0,906,906,1079,1079
patch_00034_1558c231,2015-02-28T21:52:46+00:002015-03-05T22:00:41+0...,237,237,1143,1143,0,0,906,906,1143,1143
patch_00035_2dffd3d8,2015-02-28T21:52:46+00:002015-03-05T22:00:41+0...,243,243,1149,1149,0,0,906,906,1149,1149
patch_00041_6f274c8a,2015-02-28T21:52:46+00:002015-03-05T22:00:41+0...,191,191,1097,1097,0,0,906,906,1097,1097
patch_00042_e6bff2f8,2015-02-28T21:52:46+00:002015-03-05T22:00:41+0...,185,185,1091,1091,0,0,906,906,1091,1091
patch_00043_691c8352,2015-02-28T21:52:46+00:002015-03-05T22:00:41+0...,185,185,1091,1091,0,0,906,906,1091,1091
patch_00044_4359be37,2015-02-28T21:52:46+00:002015-03-05T22:00:41+0...,85,85,991,991,0,0,906,906,991,991
patch_00050_b75fc6d6,2015-02-28T21:52:46+00:002015-03-05T22:00:41+0...,188,188,1094,1094,0,0,906,906,1094,1094
patch_00051_a4d8ed7c,2015-02-28T21:52:46+00:002015-03-05T22:00:41+0...,185,185,1091,1091,0,0,906,906,1091,1091
