# Thess Geo Analytics — Outputs & Rasters Explorer (PNG Quicklooks)

This notebook assumes you have already run the main pipeline, for example:

```bash
make full
python -m thess_geo_analytics.entrypoints.BuildAggregatedTimestamps
```

It will:

1. Discover and **describe all CSV tables** under `outputs/tables/` in (approximate) order of generation.
2. Randomly pick **up to 5 raster files (`.tif`)** from common data locations and generate **PNG quicklooks on disk**
   instead of rendering big images inline.

PNG quicklooks are written under:

```text
reports/raster_quicklooks/
```

This keeps the notebook light, and you can inspect the PNGs with any image viewer.


In [None]:
from pathlib import Path
import os
import random

import pandas as pd
import matplotlib.pyplot as plt
import rasterio
import numpy as np

from IPython.display import display

# Default figure size used for the quicklook PNGs saved later on.
plt.rcParams["figure.figsize"] = (8, 6)

# Every location is derived from the current working directory
# (assumes the notebook is launched from the repository root — confirm).
PROJECT_ROOT = Path(".").resolve()
TABLES_DIR = PROJECT_ROOT.joinpath("outputs", "tables")
REPORTS_DIR = PROJECT_ROOT.joinpath("reports")
RASTER_QLOOK_DIR = REPORTS_DIR.joinpath("raster_quicklooks")

# Make sure both output folders exist; a no-op when they are already there.
RASTER_QLOOK_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is obvious at a glance.
print(f"Project root: {PROJECT_ROOT}")
print(f"Tables directory: {TABLES_DIR} (exists={TABLES_DIR.exists()})")
print(f"Raster quicklook directory: {RASTER_QLOOK_DIR} (exists={RASTER_QLOOK_DIR.exists()})")


## 1. List & Describe All Tables in `outputs/tables/`

In [None]:
# Fail fast when the pipeline's table output folder is missing.
if not TABLES_DIR.exists():
    raise FileNotFoundError(f"Tables directory does not exist: {TABLES_DIR}")

# Collect the top-level CSVs, then order them oldest-to-newest by modification
# time, which serves as an approximation of the order they were generated in.
csv_files = list(TABLES_DIR.glob("*.csv"))
csv_files.sort(key=lambda p: p.stat().st_mtime)

# Numbered listing with each file's modification timestamp.
print(f"Found {len(csv_files)} CSV tables:")
for i, p in enumerate(csv_files, start=1):
    mtime = pd.to_datetime(p.stat().st_mtime, unit="s")
    print(f"{i:2d}. {p.name} (modified: {mtime})")


In [None]:
def describe_table(path: Path, max_rows: int = 5):
    """
    Print a human-readable summary of one CSV table.

    The summary consists of a banner with the file name and path, the table
    shape and column names, the first ``max_rows`` rows, a numeric
    ``describe()`` (when numeric columns exist) and a ``describe(include="all")``.

    Read failures are reported on stdout and the function returns early;
    an empty table is reported and skipped after the shape/columns section.
    Always returns ``None``.
    """
    banner = "=" * 80
    divider = "-" * 80
    print(banner)
    print(f"TABLE: {path.name}")
    print(f"PATH : {path}")
    print(divider)

    # Best-effort load: an unreadable file must not stop the remaining tables.
    try:
        df = pd.read_csv(path)
    except Exception as e:
        print(f"[ERROR] Failed to read {path.name}: {e}")
        return

    print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print("Columns:")
    print(list(df.columns))

    # Nothing further to show for a table without rows.
    if df.empty:
        print("[INFO] Table is empty.")
        return

    # Preview of the first rows.
    print("\nHead:")
    display(df.head(max_rows))

    # Numeric summary, only when at least one numeric column is present.
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) == 0:
        print("\n[INFO] No numeric columns to describe.")
    else:
        print("\nDescribe (numeric):")
        display(df.describe(include=[np.number]).T)

    # Summary across every column; can be wide, hence transposed.
    print("\nDescribe (all columns, top-level):")
    try:
        full_summary = df.describe(include="all")
        display(full_summary.T)
    except Exception as e:
        print(f"[WARN] Could not compute full describe(include='all'): {e}")
    
# Summarise every discovered CSV, in the order csv_files was sorted (by mtime).
for path in csv_files:
    describe_table(path)


## 2. Random Raster Quicklook (PNG only, up to 5 files)

In [None]:
# Candidate directories where rasters may live
candidate_dirs = [
    PROJECT_ROOT / "DATA_LAKE" / "data_raw" / "aggregated",  # per-timestamp mosaics
    PROJECT_ROOT / "DATA_LAKE" / "cache" / "s2",             # raw tiles
    PROJECT_ROOT / "outputs" / "cogs",                       # processed COGs
    PROJECT_ROOT / "outputs" / "composites",                 # NDVI composites etc.
]

# Upper bound on the number of rasters collected from each candidate directory,
# so that a very large tree cannot stall the notebook.
MAX_TIFS_PER_DIR = 1000

tif_paths = []

for d in candidate_dirs:
    # Skip locations that are absent (or are not directories) in this checkout.
    if not d.is_dir():
        continue
    # Count from 1 so the loop stops after exactly MAX_TIFS_PER_DIR files;
    # a zero-based counter compared with >= would let one extra file through.
    for i, p in enumerate(d.rglob("*.tif"), start=1):
        tif_paths.append(p)
        if i >= MAX_TIFS_PER_DIR:
            break

print(f"Discovered {len(tif_paths)} .tif files across {len(candidate_dirs)} candidate dirs.")

if not tif_paths:
    print("No .tif files found. Ensure you have run downloads & aggregation steps.")


In [None]:
def show_raster_quicklook(path: Path, idx: int):
    """
    Create a downsampled PNG quicklook for the given raster path.

    - Reads band 1
    - Downsamples by striding so the longest side is at most ~512 px
    - Applies nodata mask if present
    - Robust (2–98%) value clipping
    - Saves PNG into RASTER_QLOOK_DIR as ``quicklook_<idx>.png``

    Parameters
    ----------
    path : Path
        Raster file to open with rasterio.
    idx : int
        Sequence number used (zero-padded to two digits) in the PNG file name.

    Returns
    -------
    None
        Any failure while opening, reading or plotting is reported on stdout
        instead of being raised, so one bad raster does not abort a batch.
    """
    print("=" * 80)
    print(f"RASTER: {path}")

    try:
        # Pull everything needed out of the dataset, then release the file
        # handle before the (comparatively slow) plotting work starts.
        with rasterio.open(path) as src:
            arr = src.read(1)  # first band
            crs = src.crs
            nodata = src.nodata
        print(f"Shape: {arr.shape}, dtype={arr.dtype}, CRS={crs}")

        if not (np.issubdtype(arr.dtype, np.integer) or np.issubdtype(arr.dtype, np.floating)):
            print("[WARN] Non-numeric raster dtype; skipping quicklook.")
            return

        # Downsample BEFORE the float conversion: striding the native-dtype
        # array first selects the same pixels but avoids allocating a
        # full-resolution float64 copy of a large tile.
        max_side = 512
        h, w = arr.shape
        # Ceiling division so the longest side really ends up <= max_side
        # (floor division would leave e.g. a 1000 px raster undecimated).
        factor = max(1, -(-max(h, w) // max_side))
        if factor > 1:
            arr = arr[::factor, ::factor]

        # Convert to float and apply nodata mask if present
        data = arr.astype(float)
        if nodata is not None:
            data = np.where(data == nodata, np.nan, data)

        valid = np.isfinite(data)
        if valid.any():
            vmin, vmax = np.percentile(data[valid], [2, 98])
        else:
            # nanmin/nanmax on all-NaN data only produce warnings and NaN
            # limits; leave the colour scaling to matplotlib instead.
            print("[WARN] No valid pixels (all nodata/non-finite); quicklook will be blank.")
            vmin = vmax = None

        # Save PNG instead of showing inline
        out_png = RASTER_QLOOK_DIR / f"quicklook_{idx:02d}.png"
        fig, ax = plt.subplots()
        try:
            im = ax.imshow(data, vmin=vmin, vmax=vmax)
            ax.set_title(str(path))
            ax.axis("off")
            fig.colorbar(im, ax=ax, shrink=0.7, label="value")
            fig.tight_layout()
            fig.savefig(out_png, dpi=150)
        finally:
            # Always release the figure, even when drawing or saving fails,
            # so repeated failures do not accumulate open figures.
            plt.close(fig)

        print(f"[OK] Saved PNG quicklook → {out_png}")
    except Exception as e:
        print(f"[ERROR] Failed to open/plot {path}: {e}")


In [None]:
# Pick up to 5 random rasters and generate PNG quicklooks
N = 5
if tif_paths:
    # The fixed seed only gives a reproducible selection when the population
    # order is stable. tif_paths is in directory-enumeration order, which can
    # differ between filesystems and runs, so sort it before sampling.
    random.seed(42)
    sample_paths = random.sample(sorted(tif_paths), k=min(N, len(tif_paths)))
    print(f"Generating quicklooks for {len(sample_paths)} raster(s):")
    for idx, p in enumerate(sample_paths, start=1):
        show_raster_quicklook(p, idx)
else:
    print("No rasters available for quicklook generation.")
