# 06 — Validate NDVI Period Stats (Monthly or Quarterly)
This notebook validates the **new stats output** produced by:
`python -m thess_geo_analytics.entrypoints.BuildNdviPeriodStats`

It reads:
- `outputs/tables/ndvi_period_stats.csv`

and performs:
- schema checks
- duplicate checks
- basic sanity ranges
- quick plots (time series + distributions)

**Period format**: `YYYY-MM` (monthly, `MM` = `01`–`12`) or `YYYY-Qn` (quarterly, `n` = `1`–`4`).

In [None]:
from __future__ import annotations

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from thess_geo_analytics.utils.RepoPaths import RepoPaths

# Accepted period labels: YYYY-MM with MM in 01..12, or YYYY-Qn with n in 1..4.
# The month alternative is spelled out (rather than a generic two-digit match)
# so that impossible months such as "00" or "13" are rejected by the format
# check itself rather than being passed on to date conversion.
PERIOD_RE = re.compile(r"^(\d{4})-(0[1-9]|1[0-2]|Q[1-4])$")

# Location of the stats CSV, obtained from the project's RepoPaths helper.
# NOTE(review): presumably resolves to outputs/tables/ndvi_period_stats.csv as
# stated in the notebook intro — confirm against RepoPaths.table.
stats_path = RepoPaths.table("ndvi_period_stats.csv")
stats_path  # bare last expression so the resolved path is shown as cell output


In [None]:
# Load the period-stats table; if the file is absent, stop immediately with a
# message that tells the reader which command regenerates it.
if stats_path.exists():
    df = pd.read_csv(stats_path)
else:
    raise FileNotFoundError(
        f"Missing stats file: {stats_path}\n"
        "Run: python -m thess_geo_analytics.entrypoints.BuildNdviPeriodStats --aoi-id el522"
    )

df.head()


## 1) Schema validation

In [None]:
# Columns the stats CSV must contain. Additional columns are only reported,
# never treated as an error.
required_cols = set(
    [
        "period",
        "aoi_id",
        "mean_ndvi",
        "median_ndvi",
        "p10_ndvi",
        "p90_ndvi",
        "std_ndvi",
        "valid_pixel_ratio",
        "count_valid_pixels",
        "count_total_pixels",
        "tif_path",
    ]
)

present_cols = set(df.columns)
missing = required_cols.difference(present_cols)
extra = present_cols.difference(required_cols)

# Short summary so the reader can see the schema state at a glance.
summary_lines = (
    ("Rows:", len(df)),
    ("Missing required cols:", sorted(missing)),
    ("Extra cols:", sorted(extra)),
)
for label, value in summary_lines:
    print(label, value)

# Only absent required columns are fatal.
if len(missing) > 0:
    raise ValueError(f"Stats CSV missing required columns: {sorted(missing)}")


## 2) Period format checks

In [None]:
# Flag every row whose period label (compared as a string) fails PERIOD_RE.
period_ok = df["period"].astype(str).str.match(PERIOD_RE)
bad = df.loc[~period_ok]
n_bad = len(bad)
print("Bad period rows:", n_bad)
if n_bad > 0:
    # Show a sample of offending rows before aborting.
    display(bad[["period", "tif_path"]].head(20))
    raise ValueError("Found invalid period strings. Expected YYYY-MM or YYYY-Qn.")


## 3) Duplicate checks

In [None]:
# A given (aoi_id, period) pair must appear at most once in the table.
key_cols = ["aoi_id", "period"]

# keep=False marks every member of a duplicated group, not just the repeats.
dup = df.duplicated(subset=key_cols, keep=False)
ddf = df.loc[dup].sort_values(by=key_cols)
n_dup = len(ddf)
print("Duplicate (aoi_id, period) rows:", n_dup)
if n_dup > 0:
    display(ddf[key_cols + ["tif_path"]])
    raise ValueError("Duplicate periods detected. The pipeline should replace/update, not append duplicates.")


## 4) Basic sanity checks

In [None]:
# Accepted NDVI window: the nominal [-1, 1] range widened by 0.05 on each side.
NDVI_MIN, NDVI_MAX = -1.05, 1.05

# Series.between() evaluates to False for NaN, so negating it flags missing
# values as well as out-of-range ones. Plain `<` / `>` comparisons are False
# for NaN and would let missing statistics pass validation unnoticed.
for col in ["mean_ndvi", "median_ndvi", "p10_ndvi", "p90_ndvi"]:
    bad = df[~df[col].between(NDVI_MIN, NDVI_MAX)]
    print(f"{col}: bad rows =", len(bad))
    if len(bad):
        display(bad[["period", "aoi_id", col, "tif_path"]].head(20))
        raise ValueError(f"{col} missing or out of expected range [-1,1].")

# A standard deviation can never be negative. NaN is deliberately not flagged
# here; only strictly negative values are treated as corrupt.
bad_std = df[df["std_ndvi"] < 0]
print("std_ndvi: bad rows =", len(bad_std))
if len(bad_std):
    display(bad_std[["period", "aoi_id", "std_ndvi", "tif_path"]].head(20))
    raise ValueError("std_ndvi cannot be negative.")

# The ratio must be present and lie in [0, 1] (bounds inclusive).
bad_ratio = df[~df["valid_pixel_ratio"].between(0, 1)]
print("valid_pixel_ratio: bad rows =", len(bad_ratio))
if len(bad_ratio):
    display(bad_ratio[["period", "aoi_id", "valid_pixel_ratio", "tif_path"]].head(20))
    raise ValueError("valid_pixel_ratio missing or out of [0,1].")

# Both counts must be strictly positive. Written as "not (> 0)" so that a NaN
# count is rejected too (NaN <= 0 is False and would otherwise slip through).
counts_ok = (df["count_total_pixels"] > 0) & (df["count_valid_pixels"] > 0)
bad_counts = df[~counts_ok]
print("count_*: bad rows =", len(bad_counts))
if len(bad_counts):
    display(bad_counts[["period", "aoi_id", "count_valid_pixels", "count_total_pixels", "tif_path"]].head(20))
    raise ValueError("Invalid pixel counts.")

# Valid pixels are a subset of all pixels. NaN counts were already rejected
# above, so a plain comparison is sufficient here.
bad_consistency = df[df["count_valid_pixels"] > df["count_total_pixels"]]
print("count_valid_pixels > count_total_pixels rows =", len(bad_consistency))
if len(bad_consistency):
    display(bad_consistency[["period", "aoi_id", "count_valid_pixels", "count_total_pixels", "tif_path"]].head(20))
    raise ValueError("count_valid_pixels cannot exceed count_total_pixels.")


## 5) Sort periods for plotting

In [None]:
def period_to_date(p: str) -> pd.Timestamp:
    """Convert a period label to the timestamp of the period's first day.

    Parameters
    ----------
    p
        Period label, either ``YYYY-MM`` (monthly) or ``YYYY-Qn`` with
        ``n`` in 1..4 (quarterly). Non-string input is converted with ``str``.

    Returns
    -------
    pd.Timestamp
        The first day of the month, or the first day of the quarter's first
        month (Q1 -> January, Q2 -> April, Q3 -> July, Q4 -> October).
        ``pd.NaT`` is returned when the label cannot be converted, including
        labels that look like ``YYYY-MM`` but carry an impossible month
        (e.g. ``2024-13``) or a year that ``pd.Timestamp`` rejects.
    """
    m = re.match(r"^(\d{4})-(\d{2}|Q[1-4])$", str(p))
    if m is None:
        return pd.NaT

    year = int(m.group(1))
    part = m.group(2)
    if part.startswith("Q"):
        # First month of each quarter.
        month = {1: 1, 2: 4, 3: 7, 4: 10}[int(part[1])]
    else:
        month = int(part)

    # "\d{2}" also matches "00" and "13".."99"; report those as unparseable
    # instead of letting the Timestamp constructor raise.
    if not 1 <= month <= 12:
        return pd.NaT

    try:
        return pd.Timestamp(year=year, month=month, day=1)
    except ValueError:
        # Raised for years the Timestamp constructor does not accept.
        return pd.NaT

# Attach a sortable timestamp to every row, derived from its period label.
df["period_dt"] = df["period"].apply(period_to_date)

# Any label that could not be converted shows up as NaT; list the distinct
# offenders and abort.
unparsed = df["period_dt"].isna()
if unparsed.any():
    display(df.loc[unparsed, ["period"]].drop_duplicates().head(50))
    raise ValueError("Failed to parse some period strings to dates.")

# Chronological order within each AOI, with a fresh 0..n-1 index.
sort_keys = ["aoi_id", "period_dt"]
df = df.sort_values(by=sort_keys).reset_index(drop=True)

preview_cols = ["period", "period_dt", "aoi_id", "mean_ndvi", "valid_pixel_ratio"]
df[preview_cols].head(10)


## 6) Plot: NDVI time series

In [None]:
aoi_ids = df["aoi_id"].unique().tolist()
print("AOIs:", aoi_ids)

# A header-only CSV passes every check above (zero bad rows); fail with a
# clear message instead of an IndexError on aoi_ids[0].
if not aoi_ids:
    raise ValueError("Stats table contains no rows; nothing to plot.")

# Only the first AOI is plotted. `aoi_id` and `d` are reused by the plotting
# cells further down.
aoi_id = aoi_ids[0]

d = df[df["aoi_id"] == aoi_id].copy()

# One figure per statistic; both share the same layout and axis labelling.
for col, label in [("mean_ndvi", "Mean NDVI"), ("median_ndvi", "Median NDVI")]:
    plt.figure(figsize=(10, 4))
    plt.plot(d["period_dt"], d[col])
    plt.title(f"{label} over time ({aoi_id})")
    plt.xlabel("Period")
    plt.ylabel(label)
    plt.tight_layout()
    plt.show()


## 7) Plot: valid pixel ratio

In [None]:
# Valid-pixel ratio of the selected AOI over time; the y-axis is pinned to the
# full [0, 1] range so figures are comparable between runs.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(d["period_dt"], d["valid_pixel_ratio"])
ax.set_title(f"Valid pixel ratio over time ({aoi_id})")
ax.set_xlabel("Period")
ax.set_ylabel("Valid ratio")
ax.set_ylim(0, 1)
fig.tight_layout()
plt.show()


## 8) Distributions

In [None]:
# (column, figure title, x-axis label) for each histogram of the selected AOI.
hist_specs = [
    ("mean_ndvi", f"Distribution of mean NDVI ({aoi_id})", "Mean NDVI"),
    ("valid_pixel_ratio", f"Distribution of valid pixel ratio ({aoi_id})", "Valid pixel ratio"),
]

for column, title, axis_label in hist_specs:
    # Missing values are dropped before binning.
    values = d[column].dropna().values
    plt.figure(figsize=(6, 4))
    plt.hist(values, bins=20)
    plt.title(title)
    plt.xlabel(axis_label)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()


## 9) Worst coverage periods

In [None]:
# The twelve periods of the selected AOI with the lowest valid-pixel ratio.
report_cols = ["period", "valid_pixel_ratio", "mean_ndvi", "median_ndvi", "tif_path"]
worst = d.sort_values(by="valid_pixel_ratio").head(12)
worst[report_cols]


## 10) Done

In [None]:
# Final marker: when the notebook is run top to bottom, this line is reached
# only if none of the validation cells above raised.
print("[OK] Validation notebook finished successfully.")