# Thess Geo Analytics — Outputs & Rasters Explorer

This notebook assumes you have already run the main pipeline, e.g. via:

```bash
make full
```

and optionally the timestamp aggregation step:

```bash
python -m thess_geo_analytics.entrypoints.BuildAggregatedTimestamps
```

It will:

1. Discover and **describe all CSV tables** under `outputs/tables/` in (approximate) order of generation.
2. Randomly pick **up to 5 raster files (`.tif`)** from common data locations and plot them for a quick visual sanity check.


In [None]:
from pathlib import Path
import os
import random

import pandas as pd
import matplotlib.pyplot as plt
import rasterio
import numpy as np

from IPython.display import display

# Use a larger default canvas so inline figures are easier to read.
plt.rcParams.update({"figure.figsize": (8, 6)})

# The kernel's working directory is treated as the project root; every other
# path in this notebook is derived from it.
PROJECT_ROOT = Path.cwd().resolve()
TABLES_DIR = PROJECT_ROOT.joinpath("outputs", "tables")

print(f"Project root: {PROJECT_ROOT}")
print(f"Tables directory: {TABLES_DIR} (exists={TABLES_DIR.exists()})")


## 1. List & Describe All Tables in `outputs/tables/`

In [None]:
# Fail fast: nothing below is meaningful without the tables directory.
if not TABLES_DIR.exists():
    raise FileNotFoundError(f"Tables directory does not exist: {TABLES_DIR}")

# Collect the top-level CSVs, then order them oldest-first by modification
# time, which approximates the order in which they were generated.
csv_files = list(TABLES_DIR.glob("*.csv"))
csv_files.sort(key=lambda table: table.stat().st_mtime)

print(f"Found {len(csv_files)} CSV tables:")
for position, table in enumerate(csv_files, start=1):
    modified = pd.to_datetime(table.stat().st_mtime, unit="s")
    print(f"{position:2d}. {table.name} (modified: {modified})")


In [None]:
def describe_table(path: Path, max_rows: int = 5):
    """Print a summary of one CSV table and display its head and statistics.

    The table is read with ``pd.read_csv``. Its shape and column names are
    printed; for a non-empty table the first ``max_rows`` rows, the summary
    statistics of the numeric columns, and the summary statistics of all
    columns are then displayed.

    Problems are reported on stdout instead of being raised, so that one bad
    table does not stop a loop over many tables.

    Parameters
    ----------
    path : Path
        Location of the CSV file to summarise.
    max_rows : int, default 5
        Number of leading rows shown in the "Head" preview.

    Returns
    -------
    None
    """
    print("=" * 80)
    print(f"TABLE: {path.name}")
    print(f"PATH : {path}")
    print("-" * 80)

    try:
        df = pd.read_csv(path)
    except Exception as e:
        print(f"[ERROR] Failed to read {path.name}: {e}")
        return

    print(f"Shape: {df.shape[0]} rows × {df.shape[1]} columns")
    print("Columns:")
    print(list(df.columns))

    if df.empty:
        print("[INFO] Table is empty.")
        return

    # Show head
    print("\nHead:")
    display(df.head(max_rows))

    # Numeric summary. DataFrame.describe raises ValueError when the include
    # filter selects no columns, so tables without numeric data are skipped.
    print("\nDescribe (numeric):")
    if df.select_dtypes(include=[np.number]).shape[1] == 0:
        print("[INFO] No numeric columns to describe.")
    else:
        display(df.describe(include=[np.number]).T)

    # Summary over every column (numeric and non-numeric alike)
    print("\nDescribe (all columns, top-level):")
    display(df.describe(include="all").T)
    
# Summarise each discovered table, iterating csv_files in its existing
# (modification-time) order.
for path in csv_files:
    describe_table(path)


## 2. Random Raster Quicklook (5 Examples)

In [None]:
# Candidate directories where rasters may live
candidate_dirs = [
    PROJECT_ROOT / "DATA_LAKE" / "data_raw" / "aggregated",  # per-timestamp mosaics
    PROJECT_ROOT / "DATA_LAKE" / "cache" / "s2",             # raw tiles
    PROJECT_ROOT / "outputs" / "cogs",                       # processed COGs
    PROJECT_ROOT / "outputs" / "composites",                 # NDVI composites etc.
]

# Upper bound on the number of files collected from each candidate directory,
# so that a very large tree is not scanned in full.
MAX_TIFS_PER_DIR = 1000

tif_paths = []

for d in candidate_dirs:
    if not d.exists():
        continue
    # Count from 1 so the loop stops after exactly MAX_TIFS_PER_DIR files
    # (counting from 0 with the same comparison collected one file too many).
    for i, p in enumerate(d.rglob("*.tif"), start=1):
        tif_paths.append(p)
        if i >= MAX_TIFS_PER_DIR:
            break

# Directory enumeration order depends on the filesystem; sorting gives the
# seeded random sampling further down a stable input order between runs
# (as long as no directory exceeds the cap).
tif_paths.sort()

print(f"Discovered {len(tif_paths)} .tif files across {len(candidate_dirs)} candidate dirs.")

if not tif_paths:
    print("No .tif files found. Ensure you have run downloads & aggregation steps.")


In [None]:
def show_raster_quicklook(path: Path, max_dim: int = 2048):
    """Print basic metadata for a raster and plot its first band.

    The first band is read (at reduced resolution when the raster is larger
    than ``max_dim`` pixels on either side), nodata pixels are masked, and the
    colour scale is stretched between the 2nd and 98th percentile of the
    remaining finite values. Rasters with a non-numeric dtype or without any
    valid pixel are reported and not plotted.

    Any exception raised while opening, reading or plotting is printed rather
    than propagated, so one unreadable file does not stop a batch of
    quicklooks.

    Parameters
    ----------
    path : Path
        Raster file to open with ``rasterio.open``.
    max_dim : int, default 2048
        Largest height/width, in pixels, that is read into memory. Larger
        rasters are decimated on read by an integer factor. Pass ``0`` to
        always read at full resolution.

    Returns
    -------
    None
    """
    print("=" * 80)
    print(f"RASTER: {path}")

    try:
        with rasterio.open(path) as src:
            height, width = src.height, src.width
            nodata = src.nodata
            crs = src.crs

            # Integer decimation factor (ceil division) that brings the longer
            # side down to at most max_dim pixels.
            step = 1
            if max_dim and max(height, width) > max_dim:
                step = -(-max(height, width) // max_dim)

            if step > 1:
                out_h = max(1, height // step)
                out_w = max(1, width // step)
                # A quicklook does not need every pixel; a decimated read keeps
                # memory use bounded for very large rasters.
                arr = src.read(1, out_shape=(1, out_h, out_w))
            else:
                arr = src.read(1)  # first band, full resolution

        # The shape reported is that of the raster on disk, not of the
        # (possibly decimated) preview array.
        print(f"Shape: {(height, width)}, dtype={arr.dtype}, CRS={crs}")

        is_numeric = np.issubdtype(arr.dtype, np.integer) or np.issubdtype(arr.dtype, np.floating)
        if not is_numeric:
            print("[WARN] Non-numeric raster dtype; skipping imshow.")
            return

        # astype returns a fresh float array, so masking in place is safe.
        data = arr.astype(float)
        if nodata is not None:
            data[data == nodata] = np.nan

        valid = np.isfinite(data)
        if not valid.any():
            # Percentiles/min/max of an all-invalid array are NaN and would
            # give meaningless colour limits, so there is nothing to draw.
            print("[WARN] Raster has no valid (finite, non-nodata) pixels; skipping imshow.")
            return

        # Robust stretch: ignore the darkest and brightest 2% of pixels.
        vmin, vmax = np.percentile(data[valid], [2, 98])

        fig, ax = plt.subplots()
        try:
            image = ax.imshow(data, vmin=vmin, vmax=vmax)
            fig.colorbar(image, ax=ax, label="value")
            ax.set_title(str(path))
            ax.set_axis_off()
            plt.show()
        finally:
            # Release the figure so repeated quicklooks do not accumulate.
            plt.close(fig)
    except Exception as e:
        print(f"[ERROR] Failed to open/plot {path}: {e}")

# Quicklook a small random subset of the discovered rasters (at most N).
N = 5
if tif_paths:
    # Fixed seed so that the same input list yields the same selection.
    random.seed(42)
    sample_size = min(N, len(tif_paths))
    sample_paths = random.sample(tif_paths, k=sample_size)
    print(f"Showing quicklook for {len(sample_paths)} raster(s):")
    for raster_path in sample_paths:
        show_raster_quicklook(raster_path)
