In [2]:
import os
import numpy as np
import cv2
from tqdm import tqdm
import csv
from scipy.stats import entropy
import warnings
import rasterio

# Silence rasterio's NotGeoreferencedWarning so it does not flood the output.
warnings.filterwarnings(
    action="ignore",
    category=rasterio.errors.NotGeoreferencedWarning,
)

# Ids ("<scene>_<x>_<y>") of tiles that chip_all() skips because of
# non-opaque alpha pixels; printed in its final summary.
transparency_tiles = []

# === CONFIG ===
IN_DIR = 'dataset-medium'   # input root; chip_all() reads images/, elevations/, labels/ below it
OUT_DIR = 'chipped_data'    # output root for the chips and train_metadata.csv
TILE_SIZE = 512             # side length, in pixels, of each square tile
STRIDE = 256                # step, in pixels, between consecutive tile origins
IGNORE_COLOR = (255, 0, 255)             # RGB label colour; tiles containing it are skipped
BACKGROUND_COLOR_RGB = (255, 255, 255)   # RGB label colour that clutter pixels are rewritten to

# --- Class definitions---
# The position of a name in CLASS_NAMES is its class index.
CLASS_NAMES = ['Building', 'Vegetation', 'Water', 'Background', 'Car', 'Road']
NUM_CLASSES = len(CLASS_NAMES)

# RGB label colour -> class index, listed in class-index order.
COLOR_TO_CLASS = dict(zip(
    [
        (230, 25, 75),    # Building
        (60, 180, 75),    # Vegetation
        (0, 130, 200),    # Water
        (255, 255, 255),  # Background
        (245, 130, 48),   # Car
        (128, 128, 128),  # Road
    ],
    range(NUM_CLASSES),
))
# -----------------------------------------

# === UTILS ===
def standardise_elevation(elev, raster_nodata=-32767):
    """Return a z-scored float32 version of an elevation raster.

    Args:
        elev: NumPy array of elevation values.
        raster_nodata: Sentinel value marking pixels that carry no data.

    Returns:
        A float32 array with the same shape as ``elev``. Valid pixels hold
        ``(value - mean) / std``, where mean and std are computed over the
        valid pixels only. Pixels equal to ``raster_nodata`` and non-finite
        pixels (NaN/inf) are set to 0. The result is all zeros when there
        are no valid pixels or when the valid pixels have zero spread.
    """
    # Non-finite values must be excluded as well as the nodata sentinel:
    # a single NaN would otherwise turn mean/std into NaN and silently
    # zero out the whole raster.
    valid_mask = (elev != raster_nodata) & np.isfinite(elev)
    standardised = np.zeros_like(elev, dtype=np.float32)
    if not valid_mask.any():
        return standardised

    valid_elev = elev[valid_mask]
    std = valid_elev.std()
    if std > 0:
        standardised[valid_mask] = (valid_elev - valid_elev.mean()) / std
    return standardised

# === CHIP FUNCTION ===
def chip_all():
    """Cut every labelled scene under IN_DIR into tiles and write them to OUT_DIR.

    For each ``*-ortho.tif`` in ``IN_DIR/images`` that has a matching
    ``IN_DIR/labels/<base>-label.png``, the RGB raster, the elevation raster
    and the label image are walked with a TILE_SIZE x TILE_SIZE window
    stepped by STRIDE. A window is skipped when it contains:

    * a label pixel equal to IGNORE_COLOR,
    * a NaN elevation value,
    * a pixel with alpha < 255 (only when the ortho has a 4th band), or
    * a label colour that is not a key of COLOR_TO_CLASS.

    Every surviving window is saved as an RGB PNG, a standardised-elevation
    ``.npy`` and a label PNG under ``OUT_DIR/train/{images,elevations,labels}``
    and contributes one row (class distribution, entropy, pixel counts) to
    ``OUT_DIR/train_metadata.csv``.

    Side effects: writes files, prints a summary to stdout, and appends the
    ids of transparency-skipped tiles to the module-level
    ``transparency_tiles`` list. Returns None.
    """
    train_rows = []
    os.makedirs(OUT_DIR, exist_ok=True)
    source_files = [f for f in os.listdir(os.path.join(IN_DIR, 'images')) if f.endswith('-ortho.tif')]

    # Create the three output folders once, rather than once per saved chip.
    train_dirs = {
        folder: os.path.join(OUT_DIR, 'train', folder)
        for folder in ('images', 'elevations', 'labels')
    }
    for train_dir in train_dirs.values():
        os.makedirs(train_dir, exist_ok=True)

    # Loop-invariant: label colour that gets rewritten to Background below.
    clutter_color = np.array([145, 30, 180])

    # Global counters for the final summary
    total_chips_encountered = 0
    passed_chips = 0
    skipped_due_to_ignore = 0
    skipped_due_to_elevation = 0
    skipped_due_to_transparency = 0
    skipped_due_to_unknown_color = 0

    for fname in tqdm(source_files, desc="Chipping Scenes"):
        base = fname.replace('-ortho.tif', '')
        rgb_path = os.path.join(IN_DIR, 'images', f'{base}-ortho.tif')
        elev_path = os.path.join(IN_DIR, 'elevations', f'{base}-elev.tif')
        label_path = os.path.join(IN_DIR, 'labels', f'{base}-label.png')

        # Scenes without a label file are silently left out.
        if not os.path.exists(label_path):
            continue

        # cv2.imread does not raise on an unreadable file; it returns None.
        # Check before cvtColor so the failure is reported with the path
        # instead of as an opaque OpenCV assertion.
        label_bgr = cv2.imread(label_path)
        if label_bgr is None:
            warnings.warn(f"Skipping scene with unreadable label image: {label_path}")
            continue
        label = cv2.cvtColor(label_bgr, cv2.COLOR_BGR2RGB)

        alpha = None
        with rasterio.open(rgb_path) as src:
            # rasterio returns (bands, rows, cols); move bands last for OpenCV.
            rgb = src.read([1, 2, 3]).transpose(1, 2, 0)
            if src.count >= 4:
                alpha = src.read(4)

        with rasterio.open(elev_path) as src:
            elev = src.read(1).astype(np.float32)

        # Convert Clutter to Background
        clutter_mask = np.all(label == clutter_color, axis=2)
        label[clutter_mask] = BACKGROUND_COLOR_RGB

        h, w = rgb.shape[:2]
        # Standardise over the whole scene, then slice per tile.
        elev_std = standardise_elevation(elev)

        for y in range(0, h - TILE_SIZE + 1, STRIDE):
            for x in range(0, w - TILE_SIZE + 1, STRIDE):
                total_chips_encountered += 1
                tile_id = f"{base}_{x}_{y}"

                label_tile_rgb = label[y:y+TILE_SIZE, x:x+TILE_SIZE]
                elev_tile = elev[y:y+TILE_SIZE, x:x+TILE_SIZE]

                # --- All Skip Conditions ---
                if np.any(np.all(label_tile_rgb == IGNORE_COLOR, axis=-1)):
                    skipped_due_to_ignore += 1
                    continue

                # NOTE(review): only NaN counts as missing elevation here; the
                # -32767 sentinel that standardise_elevation masks by default
                # is not checked, so such tiles are kept. Confirm this is
                # intended.
                if np.isnan(elev_tile).any():
                    skipped_due_to_elevation += 1
                    continue

                if alpha is not None:
                    alpha_tile = alpha[y:y+TILE_SIZE, x:x+TILE_SIZE]
                    if np.any(alpha_tile < 255):
                        skipped_due_to_transparency += 1
                        transparency_tiles.append(tile_id)
                        continue

                # --- Unknown Color Check/Conversion ---
                # -1 marks pixels whose colour matched no known class.
                label_ids = np.full((TILE_SIZE, TILE_SIZE), -1, dtype=np.int32)
                for color_rgb, class_idx in COLOR_TO_CLASS.items():
                    mask = np.all(label_tile_rgb == color_rgb, axis=-1)
                    label_ids[mask] = class_idx

                if np.any(label_ids == -1):
                    skipped_due_to_unknown_color += 1
                    continue  # Skip chips with colors not in our final ontology

                # --- If all checks pass, save the chip ---
                passed_chips += 1
                rgb_tile = rgb[y:y+TILE_SIZE, x:x+TILE_SIZE]
                elev_tile_std = elev_std[y:y+TILE_SIZE, x:x+TILE_SIZE]

                # Per-class pixel counts, their fractions of the tile, and the
                # base-2 entropy of that distribution (epsilon avoids log(0)).
                counts = np.bincount(label_ids.flatten(), minlength=NUM_CLASSES).astype(np.int64)
                class_percentages = counts / (TILE_SIZE * TILE_SIZE)
                entropy_val = entropy(class_percentages + 1e-9, base=2)

                # OpenCV writes BGR, so convert back from RGB before saving.
                cv2.imwrite(os.path.join(train_dirs['images'], f'{tile_id}-ortho.png'), cv2.cvtColor(rgb_tile, cv2.COLOR_RGB2BGR))
                np.save(os.path.join(train_dirs['elevations'], f'{tile_id}-elev.npy'), elev_tile_std)
                cv2.imwrite(os.path.join(train_dirs['labels'], f'{tile_id}-label.png'), cv2.cvtColor(label_tile_rgb, cv2.COLOR_RGB2BGR))

                train_rows.append([tile_id, base, x, y] + class_percentages.tolist() + [float(entropy_val)] + counts.tolist())

    # Save metadata files
    with open(os.path.join(OUT_DIR, 'train_metadata.csv'), 'w', newline='') as f:
        writer = csv.writer(f)
        header = ['tile_id', 'source_file', 'x', 'y'] + [f'dist_{i}:{name}' for i, name in enumerate(CLASS_NAMES)] + ['entropy'] + [f'count_{i}:{name}' for i, name in enumerate(CLASS_NAMES)]
        writer.writerow(header)
        writer.writerows(train_rows)

    def _share(count):
        # Fraction of all tiles; 0.0 when no tile was produced at all, so the
        # summary never divides by zero (empty input or scenes < TILE_SIZE).
        return count / total_chips_encountered if total_chips_encountered else 0.0

    # Final summary printout
    total_skipped = total_chips_encountered - passed_chips
    print("\n" + "="*40)
    print("--- CHIPPING COMPLETE: FINAL SUMMARY ---")
    print("="*40)
    print(f"Total Chips Encountered: {total_chips_encountered}")
    print(f"Chips Successfully Saved: {passed_chips} ({_share(passed_chips):.2%})")
    print("-" * 40)
    print(f"Total Chips Skipped: {total_skipped} ({_share(total_skipped):.2%})")
    print(f"  - Skipped due to 'Ignore' pixel: {skipped_due_to_ignore}")
    print(f"  - Skipped due to missing elevation (NaN): {skipped_due_to_elevation}")
    print(f"  - Skipped due to RGB transparency: {skipped_due_to_transparency}")
    print(f"  - Skipped due to unknown label colors: {skipped_due_to_unknown_color}")
    print("="*40)
    print("Transparency Tiles:", transparency_tiles)
    print("="*40)

# === RUN ===
# Guarded so that importing this file does not start the chipping job.
# When executed as a script or in a notebook cell, __name__ is "__main__"
# and the job runs exactly as before.
if __name__ == "__main__":
    chip_all()

Chipping Scenes: 100%|██████████| 55/55 [17:53<00:00, 19.52s/it]


--- CHIPPING COMPLETE: FINAL SUMMARY ---
Total Chips Encountered: 33361
Chips Successfully Saved: 16449 (49.31%)
----------------------------------------
Total Chips Skipped: 16912 (50.69%)
  - Skipped due to 'Ignore' pixel: 16888
  - Skipped due to missing elevation (NaN): 0
  - Skipped due to RGB transparency: 22
  - Skipped due to unknown label colors: 2
Transparency Tiles: ['1476907971_CHADGRISMOPENPIPELINE_3072_4352', '1476907971_CHADGRISMOPENPIPELINE_3328_4352', '107f24d6e9_F1BE1D4184INSPIRE_8448_256', '25f1c24f30_EB81FE6E2BOPENPIPELINE_768_9984', '25f1c24f30_EB81FE6E2BOPENPIPELINE_1024_9984', '25f1c24f30_EB81FE6E2BOPENPIPELINE_768_10240', 'cc4b443c7d_A9CBEF2C97INSPIRE_1024_768', 'b771104de5_7E02A41EBEOPENPIPELINE_1024_768', 'b771104de5_7E02A41EBEOPENPIPELINE_1280_768', 'b771104de5_7E02A41EBEOPENPIPELINE_1024_2048', 'b771104de5_7E02A41EBEOPENPIPELINE_1280_2048', 'b771104de5_7E02A41EBEOPENPIPELINE_256_2560', 'b771104de5_7E02A41EBEOPENPIPELINE_512_2560', 'b771104de5_7E02A41EBEOPEN


