In [None]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

# Path to the client-secrets JSON file used to configure Google Drive API access.
CLIENT_JSON = "client_secret_Final.json"   # client configuration for the Google Drive API

gauth = GoogleAuth()
gauth.LoadClientConfigFile(CLIENT_JSON)

# Interactive command-line authentication.
gauth.CommandLineAuth()

# Save the resulting credentials to token.json.
# NOTE(review): nothing in this cell loads token.json again, so every run goes
# through CommandLineAuth() — confirm whether credential reuse was intended.
gauth.SaveCredentialsFile('token.json')

# `drive` is the global handle that the later cells pass to their helpers.
drive = GoogleDrive(gauth)
print("✅ Google Drive connected")

✅ Google Drive connected


In [None]:
# List the non-trashed folders that sit directly under the Drive root,
# printing each folder's title together with its Drive ID.
root_folder_query = {
    'q': "'root' in parents and mimeType='application/vnd.google-apps.folder' and trashed=false"
}
folder_list = drive.ListFile(root_folder_query).GetList()

for folder in folder_list:
    print("Folder: {}  |  ID: {}".format(folder['title'], folder['id']))


In [None]:

"""
Per-admin AEI-constrained binarization using ONLY the admin shapefile.

For each country probability TIFF:
  - Use unit_code polygons from AEI_2020_2_with_AEI.* (AEI in HECTARES).
  - For each unit_code, pick pixels INSIDE (centroid-in) from highest probability
    downward until its AEI quota is met (quota = floor(AEI_ha*10_000 / 900) pixels).
  - Preserve NaNs from the probability input.
  - Write ONE binary 0/1 TIFF per country to:
      Drive → CountryModelPredicted_Cropland/Probability/Binary/
  - Also write a CSV summary per country with thresholds and counts.
"""

import os, re, math, csv, tempfile, warnings, unicodedata
from collections import defaultdict, Counter

import numpy as np
import rasterio
from rasterio.windows import Window
from rasterio.features import rasterize

import fiona
from shapely.geometry import shape, box
from shapely.ops import transform as shp_transform
from shapely.strtree import STRtree
from pyproj import Transformer

# ------------------------- CONFIG -------------------------
# Drive folder ID used as the search root; overridable via the ROOT_FOLDER_ID env var.
ROOT_FOLDER_ID        = os.environ.get("ROOT_FOLDER_ID", "1hqMIyDYEFKnpS8KLxC4bqHmF_9dHXImG")
PARENT_FOLDER_NAME    = "CountryModelPredicted_Cropland"
PROB_SUBFOLDER_NAME   = "Probability"      # sub-folder holding the probability TIFFs
NATIONAL_AEI_FOLDER   = "National AEI"     # folder where the AEI statistics shapefile is found

# Probability & binning
PIXEL_AREA_M2         = 30.0 * 30.0        # 900 m² per 30 m pixel
TILE                  = 1024               # reduce if memory tight
SCALE                 = 1000               # probabilities are quantized to integer bins 0..SCALE

# Shapefile (AEI in HECTARES)
ADMIN_SHP_BASE        = "AEI_2020_2_with_AEI"  # base name of the AEI statistics shapefile (one unit_code per feature)
ADMIN_CODE_COL        = "unit_code"
ADMIN_AEI_COLS        = ["AEI_2020", "AEI2020", "AEI"]  # candidate names of the AEI attribute (HECTARES)
ADMIN_CNTRY_COLS      = ["name_cntr", "name_cntr1", "name_admin", "ST_NM"]  # candidate names of the name attribute used by the optional name filter

# Deterministic tie-breaking: seed comes from the AEI_RNG_SEED env var (default 0).
# NOTE(review): `_rng` is module-level state, so its sequence continues across
# rasters and across repeated calls within one kernel session.
RNG_SEED              = int(os.environ.get("AEI_RNG_SEED", "0"))
_rng = np.random.default_rng(RNG_SEED)
# ----------------------------------------------------------

# Suppresses every RuntimeWarning for the rest of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ---------------------- Drive helpers ---------------------
def _dq(drive, q):
    return drive.ListFile({
        "q": q,
        "supportsAllDrives": True,
        "includeItemsFromAllDrives": True,
        "maxResults": 1000
    }).GetList()

def list_files(drive, parent_id):
    """Return every non-trashed Drive item whose parent is `parent_id`."""
    query = "'{}' in parents and trashed=false".format(parent_id)
    return _dq(drive, query)

def child_folders(drive, parent_id):
    """Return the non-trashed folders directly under `parent_id`."""
    query = (
        "'{}' in parents and trashed=false".format(parent_id)
        + " and mimeType='application/vnd.google-apps.folder'"
    )
    return _dq(drive, query)

def get_subfolder_exact(drive, parent_id, name):
    """Return the ID of the folder titled exactly `name` under `parent_id`, or None.

    When several folders share the title, the first one returned by Drive wins.
    """
    # Backslashes and single quotes must be escaped inside a Drive query string
    # literal; otherwise a title such as "Cote d'Ivoire" yields a malformed query.
    safe_name = str(name).replace("\\", "\\\\").replace("'", "\\'")
    res = _dq(drive, f"'{parent_id}' in parents and trashed=false and mimeType='application/vnd.google-apps.folder' and title='{safe_name}'")
    return res[0]["id"] if res else None

def get_subfolder_fuzzy(drive, parent_id, desired):
    """Find a child folder of `parent_id` by name, tolerating naming variations.

    An exact title match is preferred. Otherwise titles are compared
    lower-cased with whitespace removed, and the first child whose title
    contains `desired` is returned. If `desired` itself contains "prob", any
    child whose title contains "prob" also qualifies. Returns the folder ID,
    or None when nothing matches.
    """
    exact_id = get_subfolder_exact(drive, parent_id, desired)
    if exact_id:
        return exact_id

    def squash(text):
        return re.sub(r"\s+", "", text.lower())

    wanted = squash(desired)
    wants_prob = "prob" in wanted
    for folder in child_folders(drive, parent_id):
        title_key = squash(folder.get("title", ""))
        if wanted in title_key or (wants_prob and "prob" in title_key):
            return folder["id"]
    return None

def _resolve_field(props_sample: dict, candidates, required=False):
    """Fuzzy, case-insensitive resolver for attribute fields."""
    def canon(s): return re.sub(r"[^a-z0-9]+", "", str(s).lower())
    keys = list(props_sample.keys())
    norm_map = {canon(k): k for k in keys}
    for want in candidates:
        w = canon(want)
        if w in norm_map:
            return norm_map[w]
    want_roots = {canon(want) for want in candidates}
    for k in keys:
        ck = canon(k)
        if any(root in ck for root in want_roots):
            return k
    if required:
        raise RuntimeError(f"Required attribute not found. Looked for: {candidates}. Available: {keys}")
    return None

def download_to_temp(drive_file, suffix):
    """Download `drive_file` into a fresh temporary file and return its path.

    The caller owns the returned file and must delete it. `suffix` becomes the
    file-name suffix (e.g. ".tif"). If the download raises, the temporary file
    is removed before the exception propagates.
    """
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)  # only the path is needed; do not keep a handle open on the file
    try:
        drive_file.GetContentFile(path)
    except Exception:
        # Do not leave an empty or partial file behind on a failed download.
        try:
            os.remove(path)
        except OSError:
            pass
        raise
    return path

def get_or_create_folder(drive, parent_id, name):
    """Return the ID of the folder titled `name` under `parent_id`, creating it if absent."""
    # Escape backslashes and single quotes so that names containing them do not
    # break the Drive query string literal.
    safe_name = str(name).replace("\\", "\\\\").replace("'", "\\'")
    res = _dq(drive, f"'{parent_id}' in parents and trashed=false and mimeType='application/vnd.google-apps.folder' and title='{safe_name}'")
    if res:
        return res[0]["id"]
    # The folder is created with the original, unescaped title.
    nf = drive.CreateFile({"title": name, "parents":[{"id": parent_id}], "mimeType":"application/vnd.google-apps.folder"})
    nf.Upload()
    return nf["id"]

def upload_path(drive, local_path, parent_id, title=None):
    """Upload the file at `local_path` into Drive folder `parent_id` and return the new file's ID.

    The Drive title is `title` when given (and non-empty), otherwise the
    base name of `local_path`.
    """
    remote_title = title if title else os.path.basename(local_path)
    metadata = {"title": remote_title, "parents": [{"id": parent_id}]}
    remote = drive.CreateFile(metadata)
    remote.SetContentFile(local_path)
    remote.Upload()
    return remote["id"]

# ---------------------- Name helpers ----------------------
def _norm(s): return re.sub(r"[^a-z0-9]+", "", str(s).lower())
def _canon(s):
    s = unicodedata.normalize("NFKD", str(s)).encode("ascii","ignore").decode()
    return re.sub(r"\s+","", s.lower())

def _extract_country_from_fname(fname):
    # Albania_RF_probability_percent.tif → Albania
    fn = re.sub(r"\s*\(.*\)\.tif(f)?$", ".tif", fname, flags=re.IGNORECASE)
    m = re.match(r"(.+?)_RF_probability_percent", fn, flags=re.IGNORECASE)
    if m: return m.group(1)
    return re.sub(r"\.tif(f)?$", "", fn, flags=re.IGNORECASE)

# ---------------------- Shapefile helpers -----------------
def _download_shapefile_bundle(drive, folder_id, base):
    """
    Download the `<base>.*` shapefile components found in Drive folder
    `folder_id` into one new temp directory, giving every sidecar the SAME
    basename so GDAL/Fiona can see the attributes.

    Titles are matched case-insensitively against `<base>` plus one of
    .shp/.shx/.dbf/.prj/.cpg. Returns a dict of local paths keyed by
    extension ('.shp', '.dbf', etc). Raises FileNotFoundError when the .shp
    or the .dbf is missing. The temp directory is not removed here.
    """
    exts = [".shp", ".shx", ".dbf", ".prj", ".cpg"]
    ext_by_title = {base.lower() + ext: ext for ext in exts}

    items = {}
    for remote in list_files(drive, folder_id):
        ext = ext_by_title.get(remote.get("title", "").lower())
        if ext is not None:
            items[ext] = remote

    if not (".shp" in items and ".dbf" in items):
        raise FileNotFoundError(f"Missing pieces of {base} shapefile (need at least .shp and .dbf). Found: {sorted(items.keys())}")

    tmpdir = tempfile.mkdtemp(prefix="aei_admin_")
    out = {}
    for ext, remote in items.items():
        target = os.path.join(tmpdir, base + ext)  # shared basename for all parts
        remote.GetContentFile(target)
        out[ext] = target

    print(f"[AEI] Shapefile bundle at {tmpdir} → found {sorted(out.keys())}")
    return out


def _read_admins_for_raster(shp_path, raster_crs, raster_bounds):
    """
    Load the admin polygons that intersect a raster and reproject them to the
    raster CRS (no filtering on the AEI value).

    - Field names are resolved against the layer schema (not the first feature).
    - The encoding named in a sibling .cpg file is tried first, then utf-8,
      then latin1. Each attempt starts from empty lists, so a failed attempt
      can never leave partial or duplicated features behind.
    - Features with null/malformed/empty geometry, or whose unit code / AEI
      cannot be parsed as int / float, are skipped.

    Parameters
    ----------
    shp_path      : local path of the .shp file.
    raster_crs    : CRS of the target raster (handed to pyproj's Transformer).
    raster_bounds : (left, bottom, right, top) of the raster, in `raster_crs`.

    Returns
    -------
    (geoms_ras, attrs, tree_ras, g2i_wkb)
        geoms_ras : polygons reprojected to the raster CRS
        attrs     : one dict per polygon with keys "unit_code", "aei_ha", "country"
        tree_ras  : STRtree built over `geoms_ras`
        g2i_wkb   : mapping geometry WKB -> index in `geoms_ras`
    or ([], [], None, {}) when no polygon intersects the raster.

    Raises
    ------
    RuntimeError if the shapefile could not be read with any attempted encoding.
    """
    shp_dir = os.path.dirname(shp_path)
    base = os.path.splitext(os.path.basename(shp_path))[0]
    cpg_path = os.path.join(shp_dir, base + ".cpg")

    # Determine DBF encoding (best effort; an unreadable .cpg is simply ignored)
    encoding = None
    if os.path.exists(cpg_path):
        try:
            with open(cpg_path, "r", encoding="ascii", errors="ignore") as f:
                enc_line = f.read().strip()
                if enc_line:
                    encoding = enc_line
        except OSError:
            encoding = None

    def _open_fiona(enc):
        return fiona.open(shp_path, encoding=enc) if enc else fiona.open(shp_path)

    def _load_with(enc):
        """One complete read attempt; returns (feats, attrs, shapefile_crs)."""
        feats_local, attrs_local = [], []
        with _open_fiona(enc) as src:
            shp_crs_local = src.crs_wkt or src.crs
            props_schema = (src.schema or {}).get("properties", {})
            field_names = list(props_schema.keys())
            if not field_names:
                raise RuntimeError("No attribute fields in schema (DBF not visible).")

            # Resolve keys against SCHEMA (not a sample feature)
            dummy_props = {k: None for k in field_names}
            code_key  = _resolve_field(dummy_props, [ADMIN_CODE_COL], required=True)
            aei_key   = _resolve_field(dummy_props, ADMIN_AEI_COLS, required=True)  # AEI in hectares
            cntry_key = _resolve_field(dummy_props, ADMIN_CNTRY_COLS, required=False)

            # Transform raster bounds to shapefile CRS for coarse prefilter
            if shp_crs_local:
                rb_to_shp = Transformer.from_crs(raster_crs, shp_crs_local, always_xy=True)
                rb_shp = shp_transform(lambda x, y: rb_to_shp.transform(x, y), box(*raster_bounds))
            else:
                rb_shp = box(*raster_bounds)

            for rec in src:
                gj = rec.get("geometry")
                if gj is None:
                    continue  # null geometry → skip (defensive)
                try:
                    g = shape(gj)
                except Exception:
                    continue  # malformed geometry → skip
                if g.is_empty:
                    continue
                if not g.intersects(rb_shp):
                    continue

                props = rec.get("properties") or {}
                try:
                    uc = int(props[code_key])
                    aei_ha = float(props[aei_key])  # hectares
                except Exception:
                    continue  # unparsable code / AEI → skip this feature

                feats_local.append(g)
                attrs_local.append({
                    "unit_code": uc,
                    "aei_ha": aei_ha,
                    "country": str(props.get(cntry_key, "")).strip() if cntry_key else ""
                })
        return feats_local, attrs_local, shp_crs_local

    feats, attrs, shp_crs_final = [], [], None
    # Try cpg encoding → utf-8 → latin1
    tried = [encoding, "utf-8", "latin1"]
    last_err = None
    loaded = False
    with fiona.Env(SHAPE_RESTORE_SHX='YES'):
        for enc in tried:
            try:
                feats, attrs, shp_crs_final = _load_with(enc)
            except Exception as e:
                last_err = e
                continue
            loaded = True
            break

    # Only a total failure is an error; a successful read that found no
    # intersecting polygons is reported through the empty return value below.
    if not loaded:
        raise RuntimeError(f"Failed to read attributes from shapefile. Tried encodings {tried}. Last error: {last_err}")

    if not feats:
        return [], [], None, {}

    # Reproject to raster CRS for rasterize
    shp_crs = shp_crs_final
    if not shp_crs:
        print("⚠️  Shapefile has no CRS (.prj missing). Assuming raster CRS.")
        shp_crs = raster_crs

    transformer = Transformer.from_crs(shp_crs, raster_crs, always_xy=True)
    geoms_ras = [shp_transform(lambda x, y: transformer.transform(x, y), g) for g in feats]

    tree_ras = STRtree(geoms_ras)
    # IMPORTANT: map by WKB (value identity), not id(...)
    g2i_wkb = {g.wkb: i for i, g in enumerate(geoms_ras)}

    return geoms_ras, attrs, tree_ras, g2i_wkb


# ---- STRtree helper: get candidate indices robustly (Shapely 2 or fallback) ----
def _tree_candidate_indices(tree_ras, tile_poly, geoms_ras, g2i_wkb):
    """
    Return list of indices of geoms that intersect tile_poly.
    Prefer Shapely 2's predicate indices; otherwise map WKBs.
    """
    # Fast path: Shapely 2 can return integer indices with predicate
    try:
        idx = tree_ras.query(tile_poly, predicate="intersects")
        if isinstance(idx, np.ndarray) and np.issubdtype(idx.dtype, np.integer):
            return idx.tolist()
    except TypeError:
        # Older shapely: predicate argument not supported
        pass

    # Fallback: geometry array → map to indices by WKB, then precise intersects
    cand = tree_ras.query(tile_poly)
    if isinstance(cand, np.ndarray):
        cand = cand.tolist()
    out = []
    for g in cand:
        i = g2i_wkb.get(g.wkb, None)
        if i is None:
            # last resort: linear search (rare)
            try:
                i = next(j for j, gg in enumerate(geoms_ras) if gg.equals(g))
            except StopIteration:
                continue
        if geoms_ras[i].intersects(tile_poly):
            out.append(i)
    return out


# ---------------------- Raster helpers -------------------
def _iter_tiles(H, W, tile=TILE):
    """Yield `Window`s of at most `tile` x `tile` pixels covering an H x W raster, row-major."""
    row_starts = range(0, H, tile)
    col_starts = range(0, W, tile)
    for r0 in row_starts:
        tile_h = min(tile, H - r0)  # last row of tiles may be shorter
        for c0 in col_starts:
            tile_w = min(tile, W - c0)  # last column of tiles may be narrower
            yield Window(c0, r0, tile_w, tile_h)

def _read_prob_tile(src, W):
    arr = src.read(1, window=W, out_dtype="float32", masked=True).filled(np.nan)
    finite = np.isfinite(arr)
    if finite.any() and float(np.nanmax(arr[finite])) > 1.5:
        arr[finite] /= 100.0
    np.clip(arr, 0.0, 1.0, out=arr, where=finite)
    return arr

def _tile_bounds(window, transform):
    left, top = transform * (window.col_off, window.row_off)
    right, bottom = transform * (window.col_off + window.width, window.row_off + window.height)
    x0, x1 = sorted([left, right])
    y0, y1 = sorted([bottom, top])
    return (x0, y0, x1, y1)

# ---------------------- Core algorithm -------------------
def _aei_remove_quiet(path):
    """Best-effort removal of a temporary file (a missing or locked file is ignored)."""
    try:
        os.remove(path)
    except OSError:
        pass


def _aei_target_pixels(aei_ha):
    """Convert an AEI value in hectares to a whole-pixel quota (hectares → m² → pixels; may be 0)."""
    if not math.isfinite(aei_ha):
        return 0  # a NaN/inf AEI cannot be turned into a quota
    return int(math.floor((aei_ha * 10_000.0) / PIXEL_AREA_M2))


def _aei_tile_labels(src, w, out_shape, geoms_ras, attrs, tree_ras, g2i_wkb):
    """Rasterize unit_code labels for window `w` (0 = no admin); None if no polygon touches the tile."""
    tile_t = rasterio.windows.transform(w, src.transform)
    tile_poly = box(*_tile_bounds(w, src.transform))
    idxs = _tree_candidate_indices(tree_ras, tile_poly, geoms_ras, g2i_wkb)
    if len(idxs) == 0:
        return None
    shapes = [(geoms_ras[i], attrs[i]["unit_code"]) for i in idxs]
    return rasterize(
        shapes=shapes,
        out_shape=out_shape,
        transform=tile_t,
        fill=0, dtype="int64",
        all_touched=False  # centroid-in
    )


def _aei_admin_histograms(src, geoms_ras, attrs, tree_ras, g2i_wkb):
    """PASS 1: per-admin histogram {unit_code: Counter{probability bin: pixel count}}."""
    hists = defaultdict(Counter)
    for w in _iter_tiles(src.height, src.width, TILE):
        prob = _read_prob_tile(src, w)
        valid = np.isfinite(prob)
        if not valid.any():
            continue
        labels = _aei_tile_labels(src, w, prob.shape, geoms_ras, attrs, tree_ras, g2i_wkb)
        if labels is None:
            continue
        m = valid & (labels != 0)
        if not m.any():
            continue

        uc = labels[m]
        pi = np.rint(prob[m] * SCALE).astype(np.int32)  # quantize to bins 0..SCALE
        for u in np.unique(uc):
            bc = np.bincount(pi[uc == u], minlength=SCALE+1)
            nz = np.nonzero(bc)[0]
            for b, v in zip(nz, bc[nz]):
                hists[int(u)][int(b)] += int(v)
    return hists


def _aei_thresholds(K_map, hists):
    """
    Per-admin threshold bin and number of pixels to take AT that bin.

    Returns (thr_map, need_eq_map). A unit with quota <= 0 or without any
    pixel gets threshold -1 (nothing is selected). Otherwise the threshold is
    the highest bin at which the cumulative count from the top reaches the
    quota (capped at the unit's pixel count); pixels strictly above it are all
    taken and `need_eq_map` says how many more are needed at the bin itself.
    """
    thr_map, need_eq_map = {}, {}
    for u, K in K_map.items():
        hist = hists.get(u, {})
        total = sum(hist.values())
        if K <= 0 or total == 0:
            thr_map[u] = -1
            need_eq_map[u] = 0
            continue
        K = min(K, total)
        above = 0  # pixels in bins strictly above the current one
        for b in range(SCALE, -1, -1):
            cnt = int(hist.get(b, 0))
            if above + cnt >= K:
                thr_map[u] = b
                need_eq_map[u] = K - above
                break
            above += cnt
    return thr_map, need_eq_map


def _aei_write_binary(src, dst, geoms_ras, attrs, tree_ras, g2i_wkb, thr_map, need_eq_left):
    """
    PASS 2: write the 0/1/NaN mask tile by tile.

    NaN input stays NaN; every other pixel defaults to 0 (including tiles
    that no admin polygon touches) and is set to 1 when selected for its
    admin. `need_eq_left` is updated in place as tie pixels are consumed.
    """
    for w in _iter_tiles(src.height, src.width, TILE):
        prob = _read_prob_tile(src, w)
        out_tile = np.full(prob.shape, np.nan, dtype=np.float32)

        valid = np.isfinite(prob)
        labels = None
        if valid.any():
            out_tile[valid] = 0.0  # default: valid-but-not-selected = 0
            labels = _aei_tile_labels(src, w, prob.shape, geoms_ras, attrs, tree_ras, g2i_wkb)

        if labels is not None:
            m_all = valid & (labels != 0)
            if m_all.any():
                p_int = np.zeros(prob.shape, dtype=np.int32)
                p_int[m_all] = np.rint(prob[m_all] * SCALE).astype(np.int32)

                for u in np.unique(labels[m_all]):
                    u = int(u)
                    t = thr_map.get(u, -1)
                    if t < 0:
                        continue  # quota 0 or no pixels -> stays 0
                    u_m = m_all & (labels == u)
                    out_tile[u_m & (p_int > t)] = 1.0
                    need = need_eq_left.get(u, 0)
                    if need > 0:
                        eq_m = u_m & (p_int == t) & (out_tile != 1.0)
                        if eq_m.any():
                            # random choice among the ties, driven by the seeded module RNG
                            idx = np.flatnonzero(eq_m.ravel())
                            _rng.shuffle(idx)
                            take = min(need, idx.size)
                            rr, cc = np.unravel_index(idx[:take], eq_m.shape)
                            out_tile[rr, cc] = 1.0
                            need_eq_left[u] = need - int(take)

        dst.write(out_tile, 1, window=w)


def _aei_binarize_one(drive, src, country, shp_path, binary_id):
    """Binarize one open probability raster; upload the mask and its CSV summary to `binary_id`."""
    H, W = src.height, src.width
    rb = src.bounds
    ras_bounds = (rb.left, rb.bottom, rb.right, rb.top)

    # Read & subset admins, reproject to raster CRS
    geoms_ras, attrs, tree_ras, g2i_wkb = _read_admins_for_raster(shp_path, src.crs, ras_bounds)
    if not geoms_ras:
        print("  ⚠️  No admin polygons intersect this raster; skipping.")
        return

    # Optional filter by name if present (keeps all if nothing would remain)
    want = _canon(country)
    keep = [i for i, a in enumerate(attrs) if (not a["country"]) or _canon(a["country"]) == want]
    if keep and len(keep) < len(attrs):
        geoms_ras = [geoms_ras[i] for i in keep]
        attrs     = [attrs[i] for i in keep]
        tree_ras  = STRtree(geoms_ras)
        g2i_wkb   = {g.wkb: i for i, g in enumerate(geoms_ras)}

    # Targets per admin (AEI in HECTARES → m² → pixels); include zeros
    K_map = {a["unit_code"]: _aei_target_pixels(a["aei_ha"]) for a in attrs}

    hists = _aei_admin_histograms(src, geoms_ras, attrs, tree_ras, g2i_wkb)
    thr_map, need_eq_map = _aei_thresholds(K_map, hists)
    need_eq_left = dict(need_eq_map)

    out_profile = src.profile.copy()
    out_profile.update(
        driver="GTiff",
        height=H, width=W,
        transform=src.transform,
        count=1, dtype="float32", nodata=np.nan,
        compress="LZW", tiled=True, blockxsize=512, blockysize=512,
        BIGTIFF="IF_NEEDED"
    )
    out_name = f"{country}_AEI_binary_0_1.tif"
    with tempfile.NamedTemporaryFile(delete=False, suffix=".tif") as tmp_out:
        out_local = tmp_out.name
    try:
        with rasterio.open(out_local, "w", **out_profile) as dst:
            _aei_write_binary(src, dst, geoms_ras, attrs, tree_ras, g2i_wkb, thr_map, need_eq_left)
        upload_path(drive, out_local, binary_id, title=out_name)
    finally:
        _aei_remove_quiet(out_local)  # also on failure, so no temp raster is left behind

    # CSV summary per admin (includes AEI==0 units)
    csv_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode="w", newline="") as tmpcsv:
            csv_path = tmpcsv.name
            wcsv = csv.writer(tmpcsv)
            wcsv.writerow(["unit_code","aei_ha","target_pixels","thr_bin","scale","selected_pixels"])
            for a in attrs:
                u = a["unit_code"]
                th = int(thr_map.get(u, -1))
                if th < 0:
                    # threshold -1 means nothing was selected for this unit
                    sel = 0
                else:
                    gt = sum(v for b, v in hists.get(u, {}).items() if b > th)
                    ties_taken = need_eq_map.get(u, 0) - need_eq_left.get(u, 0)
                    sel = int(gt + max(0, ties_taken))
                wcsv.writerow([u, a["aei_ha"], _aei_target_pixels(a["aei_ha"]), th, SCALE, sel])
        upload_path(drive, csv_path, binary_id, title=f"{country}_AEI_admin_summary.csv")
    finally:
        if csv_path:
            _aei_remove_quiet(csv_path)

    print(f"   ✓ Wrote Binary/{out_name}")


def aei_binarize_per_admin_from_shapefile(drive):
    """
    Main entry: uses ONLY the admin shapefile with AEI in hectares
    to allocate per-admin pixel quotas and write one binary per country.

    For every .tif/.tiff in the Probability folder, the pixels of each admin
    unit are selected from the highest probability downward until the unit's
    quota is met. The resulting 0/1/NaN raster and a per-admin CSV summary
    are uploaded to the "Binary" sub-folder (created if absent).

    Parameters
    ----------
    drive : authenticated PyDrive2 `GoogleDrive` handle.

    Raises
    ------
    RuntimeError      if a required Drive folder cannot be found.
    FileNotFoundError if the shapefile parts or the probability TIFFs are missing.
    """
    # Locate folders
    cmp_id  = get_subfolder_fuzzy(drive, ROOT_FOLDER_ID, PARENT_FOLDER_NAME)
    if not cmp_id: raise RuntimeError(f"Folder '{PARENT_FOLDER_NAME}' not found under ROOT.")
    prob_id = get_subfolder_fuzzy(drive, cmp_id, PROB_SUBFOLDER_NAME)
    if not prob_id: raise RuntimeError("Probability folder not found (tried fuzzy match).")
    binary_id = get_or_create_folder(drive, prob_id, "Binary")

    aei_folder_id = get_subfolder_fuzzy(drive, ROOT_FOLDER_ID, NATIONAL_AEI_FOLDER)
    if not aei_folder_id: raise RuntimeError("National AEI folder not found at ROOT.")

    # Download admin shapefile bundle
    shp_paths = _download_shapefile_bundle(drive, aei_folder_id, ADMIN_SHP_BASE)
    shp_path  = shp_paths[".shp"]

    # List probability TIFFs
    files = [it for it in list_files(drive, prob_id)
             if isinstance(it, dict)
             and it.get("mimeType") != "application/vnd.google-apps.folder"
             and it.get("title","").lower().endswith((".tif",".tiff"))]
    if not files:
        raise FileNotFoundError("No probability TIFFs in Probability folder.")

    for it in files:
        title   = it.get("title","")
        country = _extract_country_from_fname(title)
        print(f"\n=== {country} (per-admin from shapefile; AEI in hectares) ===")

        rtmp = download_to_temp(it, ".tif")
        try:
            with rasterio.open(rtmp) as src:
                _aei_binarize_one(drive, src, country, shp_path, binary_id)
        finally:
            # runs for skipped and failed rasters too, after the dataset is closed
            _aei_remove_quiet(rtmp)

    print("\n✅ Done (per-admin from shapefile; AEI in hectares; NaNs preserved).")

# ---------------------- CLI ----------------------
# Runs the whole pipeline when this code is executed as the main module.
if __name__ == "__main__":
    # Evaluating the bare name raises NameError when no global `drive` exists;
    # that is converted into a RuntimeError with a clearer message.
    try:
        drive  # noqa: F821
    except NameError:
        raise RuntimeError("PyDrive2 'drive' not found. Authenticate and expose a global `drive` before running.")
    aei_binarize_per_admin_from_shapefile(drive)


[AEI] Shapefile bundle at /tmp/aei_admin_ji6czoq5 → found ['.cpg', '.dbf', '.prj', '.shp', '.shx']

=== North_Dakota (per-admin from shapefile; AEI in hectares) ===
   ✓ Wrote Binary/North_Dakota_AEI_binary_0_1.tif

=== Missouri (per-admin from shapefile; AEI in hectares) ===
   ✓ Wrote Binary/Missouri_AEI_binary_0_1.tif

✅ Done (per-admin from shapefile; AEI in hectares; NaNs preserved).


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Post-processing for AEI per-admin binary masks.

Step 2 after running `aei_binarize_per_admin_from_shapefile`:

  - Take each *_AEI_binary_0_1.tif in
        ROOT / CountryModelPredicted_Cropland / Probability / Binary
  - Apply a window-based majority filter (3*3 windows on the 0/1 binary mask)
    in a streaming / tile-based fashion.
  - Preserve NaNs from the input.
  - Write smoothed 0/1 binary TIFFs to:
        ROOT / CountryModelPredicted_Cropland / Probability / Binary_MAJ

Notes
-----
- This does NOT change the original AEI thresholding step; it only smooths
  the resulting binary maps, so exact per-admin AEI may change slightly.
"""

import os
import re
import tempfile
import warnings

import numpy as np
import rasterio
from rasterio.windows import Window
from scipy.ndimage import uniform_filter   # pip install scipy

# ------------------------- CONFIG -------------------------
# Drive folder ID used as the search root; overridable via the ROOT_FOLDER_ID env var.
ROOT_FOLDER_ID      = os.environ.get("ROOT_FOLDER_ID", "1hqMIyDYEFKnpS8KLxC4bqHmF_9dHXImG")
PARENT_FOLDER_NAME  = "CountryModelPredicted_Cropland"
PROB_SUBFOLDER_NAME = "Probability"   # sub-folder that contains the Binary / Binary_MAJ folders
BINARY_FOLDER_NAME  = "Binary"        # input binaries from the AEI-calibrated post-processing step
SMOOTH_FOLDER_NAME  = "Binary_MAJ"    # output smoothed binaries

TILE                = 1024            # default tile size of _iter_tiles
KERNEL_PX           = 3              # majority window (odd, in pixels, e.g. 3x3 at 30 m res)
MAJ_THR             = 0.50            # majority threshold (>= 50% neighbors == 1)

# Suppresses every RuntimeWarning for the rest of the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ---------------------- Drive helpers ---------------------
def _dq(drive, q):
    return drive.ListFile({
        "q": q,
        "supportsAllDrives": True,
        "includeItemsFromAllDrives": True,
        "maxResults": 1000
    }).GetList()

def list_files(drive, parent_id):
    """Return every non-trashed Drive item whose parent is `parent_id`."""
    query = "'{}' in parents and trashed=false".format(parent_id)
    return _dq(drive, query)

def child_folders(drive, parent_id):
    """Return the non-trashed folders directly under `parent_id`."""
    query = (
        "'{}' in parents and trashed=false".format(parent_id)
        + " and mimeType='application/vnd.google-apps.folder'"
    )
    return _dq(drive, query)

def get_subfolder_exact(drive, parent_id, name):
    """Return the ID of the folder titled exactly `name` under `parent_id`, or None.

    When several folders share the title, the first one returned by Drive wins.
    """
    # Backslashes and single quotes must be escaped inside a Drive query string
    # literal; otherwise a title containing them yields a malformed query.
    safe_name = str(name).replace("\\", "\\\\").replace("'", "\\'")
    res = _dq(drive, f"'{parent_id}' in parents and trashed=false and mimeType='application/vnd.google-apps.folder' and title='{safe_name}'")
    return res[0]["id"] if res else None

def get_subfolder_fuzzy(drive, parent_id, desired):
    """Find a child folder of `parent_id` by name, tolerating naming variations.

    An exact title match is preferred. Otherwise titles are compared
    lower-cased with whitespace removed, and the first child whose title
    contains `desired` is returned. If `desired` itself contains "prob", any
    child whose title contains "prob" also qualifies. Returns the folder ID,
    or None when nothing matches.
    """
    exact_id = get_subfolder_exact(drive, parent_id, desired)
    if exact_id:
        return exact_id

    def squash(text):
        return re.sub(r"\s+", "", text.lower())

    wanted = squash(desired)
    wants_prob = "prob" in wanted
    for folder in child_folders(drive, parent_id):
        title_key = squash(folder.get("title", ""))
        if wanted in title_key or (wants_prob and "prob" in title_key):
            return folder["id"]
    return None

def get_or_create_folder(drive, parent_id, name):
    """Return the ID of the folder titled `name` under `parent_id`, creating it if absent."""
    # Escape backslashes and single quotes so that names containing them do not
    # break the Drive query string literal.
    safe_name = str(name).replace("\\", "\\\\").replace("'", "\\'")
    res = _dq(drive, f"'{parent_id}' in parents and trashed=false and mimeType='application/vnd.google-apps.folder' and title='{safe_name}'")
    if res:
        return res[0]["id"]
    # The folder is created with the original, unescaped title.
    nf = drive.CreateFile({"title": name, "parents":[{"id": parent_id}], "mimeType":"application/vnd.google-apps.folder"})
    nf.Upload()
    return nf["id"]

def download_to_temp(drive_file, suffix):
    """Download `drive_file` into a fresh temporary file and return its path.

    The caller owns the returned file and must delete it. `suffix` becomes the
    file-name suffix (e.g. ".tif"). If the download raises, the temporary file
    is removed before the exception propagates.
    """
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)  # only the path is needed; do not keep a handle open on the file
    try:
        drive_file.GetContentFile(path)
    except Exception:
        # Do not leave an empty or partial file behind on a failed download.
        try:
            os.remove(path)
        except OSError:
            pass
        raise
    return path

def upload_path(drive, local_path, parent_id, title=None):
    """Upload the file at `local_path` into Drive folder `parent_id` and return the new file's ID.

    The Drive title is `title` when given (and non-empty), otherwise the
    base name of `local_path`.
    """
    remote_title = title if title else os.path.basename(local_path)
    metadata = {"title": remote_title, "parents": [{"id": parent_id}]}
    remote = drive.CreateFile(metadata)
    remote.SetContentFile(local_path)
    remote.Upload()
    return remote["id"]

# ---------------------- Raster helpers -------------------
def _iter_tiles(H, W, tile=TILE):
    """Yield `Window`s of at most `tile` x `tile` pixels covering an H x W raster, row-major."""
    row_starts = range(0, H, tile)
    col_starts = range(0, W, tile)
    for r0 in row_starts:
        tile_h = min(tile, H - r0)  # last row of tiles may be shorter
        for c0 in col_starts:
            tile_w = min(tile, W - c0)  # last column of tiles may be narrower
            yield Window(c0, r0, tile_w, tile_h)

# ---------------------- Majority smoothing ---------------

def _smooth_binary_stream(src_path, dst_path, kernel_px=KERNEL_PX, maj_thr=MAJ_THR):
    """
    Read a 0/1/NaN binary raster block by block and write a smoothed
    0/1/NaN raster using a window-based majority filter.

    A valid pixel becomes 1 when, inside its kernel_px x kernel_px
    neighbourhood (centre included), the share of 1s among the VALID pixels
    is >= maj_thr; otherwise it becomes 0. Invalid (masked / non-finite)
    pixels stay NaN and are never counted as neighbours.

    Parameters
    ----------
    src_path  : input raster; band 1 is read, any value > 0.5 counts as 1.
    dst_path  : output GeoTIFF (float32, nodata = NaN, LZW, 512x512 blocks).
    kernel_px : window size in pixels; expected to be odd so the window is centred.
    maj_thr   : fraction of valid neighbours that must be 1.
    """
    r = kernel_px // 2  # halo width needed around each block
    win_area = float(kernel_px * kernel_px)

    with rasterio.open(src_path) as src:
        profile = src.profile.copy()
        # keep same geo-referencing; ensure float32 + NaN nodata
        profile.update(
            dtype="float32",
            count=1,
            nodata=np.nan,
            compress="LZW",
            tiled=True,
            blockxsize=512,
            blockysize=512,
            BIGTIFF="IF_NEEDED"
        )

        with rasterio.open(dst_path, "w", **profile) as dst:
            for _, w in src.block_windows(1):
                nan_block = np.full((w.height, w.width), np.nan, dtype=np.float32)

                # expand window by halo for neighborhood support (clamped to the raster)
                r0 = max(0, w.row_off - r)
                c0 = max(0, w.col_off - r)
                r1 = min(src.height, w.row_off + w.height + r)
                c1 = min(src.width,  w.col_off + w.width  + r)
                if (c1 - c0) <= 0 or (r1 - r0) <= 0:
                    # nothing there
                    dst.write(nan_block, 1, window=w)
                    continue
                win_pad = Window(c0, r0, c1 - c0, r1 - r0)

                a = src.read(1, window=win_pad, masked=True)
                data = a.data
                # valid where not masked AND finite
                valid = (~a.mask) & np.isfinite(data)
                if not valid.any():
                    dst.write(nan_block, 1, window=w)
                    continue

                # current binary: anything > 0.5 treated as 1
                base = valid & (data > 0.5)

                # Neighbourhood counts via uniform_filter (mean * window area).
                # Rounding restores exact integer counts, so exact ties such as
                # 2 of 4 valid neighbours are not decided by floating-point noise.
                sum_ones = np.rint(
                    uniform_filter(base.astype(np.float64), size=kernel_px, mode="constant", cval=0.0) * win_area
                )
                cnt_valid = np.rint(
                    uniform_filter(valid.astype(np.float64), size=kernel_px, mode="constant", cval=0.0) * win_area
                )
                has_nb = cnt_valid > 0
                frac = np.divide(sum_ones, cnt_valid, out=np.zeros_like(sum_ones), where=has_nb)

                # majority decision (only where we have valid neighbors)
                smoothed = has_nb & (frac >= maj_thr)

                # crop back to original block window
                row_start = w.row_off - r0
                col_start = w.col_off - c0
                row_end = row_start + w.height
                col_end = col_start + w.width

                valid_core  = valid[row_start:row_end, col_start:col_end]
                smooth_core = smoothed[row_start:row_end, col_start:col_end]

                out_block = nan_block
                out_block[valid_core & smooth_core]  = 1.0
                out_block[valid_core & ~smooth_core] = 0.0

                dst.write(out_block, 1, window=w)

# ---------------------- Main driver ----------------------

def smooth_aei_binary_maps(drive):
    """
    Entry point: smooth every binary TIFF in the Binary folder and upload the results.

    Folder resolution: ROOT_FOLDER_ID → PARENT_FOLDER_NAME → PROB_SUBFOLDER_NAME →
    BINARY_FOLDER_NAME (input). Outputs go to SMOOTH_FOLDER_NAME, obtained via
    get_or_create_folder under the Probability folder.

    For each input `<base>.tif[f]` the output is titled
    `<base>_MAJ_k{KERNEL_PX}_p{percent}.tif`, where percent is MAJ_THR expressed
    as a whole-number percentage.

    Args:
        drive: authenticated PyDrive2 GoogleDrive handle passed through to the
            Drive helper functions.

    Raises:
        RuntimeError: if the parent, Probability or Binary folder cannot be found.
        FileNotFoundError: if the Binary folder holds no .tif/.tiff files.
    """
    # Locate folders
    cmp_id = get_subfolder_fuzzy(drive, ROOT_FOLDER_ID, PARENT_FOLDER_NAME)
    if not cmp_id:
        raise RuntimeError(f"Folder '{PARENT_FOLDER_NAME}' not found under ROOT.")

    prob_id = get_subfolder_fuzzy(drive, cmp_id, PROB_SUBFOLDER_NAME)
    if not prob_id:
        raise RuntimeError("Probability folder not found (tried fuzzy match).")

    binary_id = get_subfolder_fuzzy(drive, prob_id, BINARY_FOLDER_NAME)
    if not binary_id:
        raise RuntimeError("Binary folder (with AEI binaries) not found under Probability.")

    smooth_id = get_or_create_folder(drive, prob_id, SMOOTH_FOLDER_NAME)

    # List binary TIFFs (skip sub-folders and non-TIFF items)
    files = [
        it for it in list_files(drive, binary_id)
        if isinstance(it, dict)
        and it.get("mimeType") != "application/vnd.google-apps.folder"
        and it.get("title", "").lower().endswith((".tif", ".tiff"))
    ]
    if not files:
        raise FileNotFoundError("No binary TIFFs in Binary folder.")

    print(f"Found {len(files)} binary rasters in Binary/")

    # round() before int(): plain int() truncates float error, e.g.
    # 0.29 * 100 == 28.999999999999996 would otherwise be labelled "p28".
    thr_pct = int(round(MAJ_THR * 100))

    for it in files:
        title = it.get("title", "")
        base  = re.sub(r"\.tif(f)?$", "", title, flags=re.IGNORECASE)
        out_name = f"{base}_MAJ_k{KERNEL_PX}_p{thr_pct}.tif"

        print(f"\n=== Smoothing {title} → {out_name} ===")
        tmp_in = None
        out_local = None
        try:
            tmp_in = download_to_temp(it, ".tif")
            # Only the path is needed; the handle is closed on leaving the `with`
            # so the smoothing step can reopen the file for writing.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".tif") as tmp_out:
                out_local = tmp_out.name

            _smooth_binary_stream(tmp_in, out_local, kernel_px=KERNEL_PX, maj_thr=MAJ_THR)
            upload_path(drive, out_local, smooth_id, title=out_name)
        finally:
            # Always remove the local temp rasters, even when smoothing or upload
            # fails, so a failed run does not leave large files behind.
            for tmp_path in (tmp_in, out_local):
                if not tmp_path:
                    continue
                try:
                    os.remove(tmp_path)
                except OSError:
                    # best-effort cleanup: file already gone or still locked
                    pass

        print(f"   ✓ Wrote {SMOOTH_FOLDER_NAME}/{out_name}")

    print("\n✅ Done smoothing all AEI binary maps (majority filter).")

# ---------------------- CLI ----------------------
if __name__ == "__main__":
    # The notebook must already expose an authenticated PyDrive2 handle named
    # `drive`; probing the bare name turns a missing handle into a clear error.
    try:
        drive  # noqa: F821
    except NameError:
        raise RuntimeError("PyDrive2 'drive' not found. Authenticate and expose a global `drive` before running.")
    else:
        smooth_aei_binary_maps(drive)


Found 48 binary rasters in Binary/

=== Smoothing Missouri_AEI_binary_0_1.tif → Missouri_AEI_binary_0_1_MAJ_k9_p50.tif ===
   ✓ Wrote Binary_MAJ/Missouri_AEI_binary_0_1_MAJ_k9_p50.tif

=== Smoothing North_Dakota_AEI_binary_0_1.tif → North_Dakota_AEI_binary_0_1_MAJ_k9_p50.tif ===
   ✓ Wrote Binary_MAJ/North_Dakota_AEI_binary_0_1_MAJ_k9_p50.tif

=== Smoothing Alabama_AEI_binary_0_1.tif → Alabama_AEI_binary_0_1_MAJ_k9_p50.tif ===
   ✓ Wrote Binary_MAJ/Alabama_AEI_binary_0_1_MAJ_k9_p50.tif

=== Smoothing Copy of Wisconsin_AEI_binary_0_1.tif → Copy of Wisconsin_AEI_binary_0_1_MAJ_k9_p50.tif ===
   ✓ Wrote Binary_MAJ/Copy of Wisconsin_AEI_binary_0_1_MAJ_k9_p50.tif

=== Smoothing Copy of Utah_AEI_binary_0_1.tif → Copy of Utah_AEI_binary_0_1_MAJ_k9_p50.tif ===
   ✓ Wrote Binary_MAJ/Copy of Utah_AEI_binary_0_1_MAJ_k9_p50.tif

=== Smoothing Copy of Tennessee_AEI_binary_0_1.tif → Copy of Tennessee_AEI_binary_0_1_MAJ_k9_p50.tif ===
   ✓ Wrote Binary_MAJ/Copy of Tennessee_AEI_binary_0_1_MAJ_k9_p50.

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Merge all binary GeoTIFFs found in Drive → CountryModelPredicted_Cropland/Probability/Binary_MAJ/
into a single binary mosaic (union = max where the installed rasterio supports it), preserving NaNs.

Output: US_binary.tif, uploaded to the same Binary_MAJ folder.
"""

import os, re, tempfile, warnings
import numpy as np
import rasterio
from rasterio.merge import merge as rio_merge
from rasterio.enums import Resampling

# ------------------------- CONFIG -------------------------
# Drive folder ID used as the search root; the ROOT_FOLDER_ID environment
# variable, when set, overrides the default ID below.
ROOT_FOLDER_ID      = os.environ.get("ROOT_FOLDER_ID", "1hqMIyDYEFKnpS8KLxC4bqHmF_9dHXImG")
PARENT_FOLDER_NAME  = "CountryModelPredicted_Cropland"  # folder looked up under ROOT_FOLDER_ID
PROB_SUBFOLDER_NAME = "Probability"   # folder looked up under PARENT_FOLDER_NAME
BINARY_SUBFOLDER    = "Binary_MAJ"  # folder whose TIFFs are merged; the mosaic is uploaded here too
OUT_NAME            = "US_binary.tif"  # title of the uploaded mosaic; titles containing it are skipped as inputs
# ----------------------------------------------------------

# Ignore all RuntimeWarning-category warnings from this point on.
# NOTE(review): presumably aimed at NaN-related numpy warnings (e.g. from nanmax) — confirm.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# ---------------------- Drive helpers ---------------------
def _dq(drive, q):
    """Run Drive query string `q` through `drive.ListFile(...)` and return `GetList()`'s result.

    The request always asks for shared-drive items as well
    (supportsAllDrives / includeItemsFromAllDrives) with maxResults set to 1000.
    """
    request_params = {
        "q": q,
        "supportsAllDrives": True,
        "includeItemsFromAllDrives": True,
        "maxResults": 1000,
    }
    listing = drive.ListFile(request_params)
    return listing.GetList()

def list_files(drive, parent_id):
    """Query Drive for every non-trashed item whose parent is `parent_id`."""
    query = f"'{parent_id}' in parents and trashed=false"
    return _dq(drive, query)

def child_folders(drive, parent_id):
    """Query Drive for the non-trashed folders directly under `parent_id`."""
    query = (
        f"'{parent_id}' in parents and trashed=false"
        " and mimeType='application/vnd.google-apps.folder'"
    )
    return _dq(drive, query)

def get_subfolder_exact(drive, parent_id, name):
    """Return the ID of the first non-trashed folder under `parent_id` titled exactly `name`.

    Args:
        drive: PyDrive2 GoogleDrive handle.
        parent_id: Drive ID of the folder to search in.
        name: folder title to match.

    Returns:
        The folder ID, or None when the query returns nothing.
    """
    # The title is embedded in a single-quoted query literal, so backslashes and
    # apostrophes must be backslash-escaped or a name like "Farmer's data"
    # produces a malformed query.
    safe_name = name.replace("\\", "\\\\").replace("'", "\\'")
    res = _dq(drive, f"'{parent_id}' in parents and trashed=false and mimeType='application/vnd.google-apps.folder' and title='{safe_name}'")
    return res[0]["id"] if res else None

def get_subfolder_fuzzy(drive, parent_id, desired):
    """Find a sub-folder of `parent_id` by name, tolerating case and whitespace differences.

    An exact title match is tried first. Otherwise both names are lower-cased
    and stripped of whitespace, and the first child folder whose key contains
    the desired key is returned. As a special case, when the desired key contains
    "prob", any folder whose key contains "prob" also matches.

    Returns the folder ID, or None if nothing matches.
    """
    exact_id = get_subfolder_exact(drive, parent_id, desired)
    if exact_id:
        return exact_id

    wanted = re.sub(r"\s+", "", desired.lower())
    wants_prob = "prob" in wanted

    def _is_match(folder):
        title_key = re.sub(r"\s+", "", folder.get("title", "").lower())
        return wanted in title_key or (wants_prob and "prob" in title_key)

    return next(
        (folder["id"] for folder in child_folders(drive, parent_id) if _is_match(folder)),
        None,
    )

def get_or_create_folder(drive, parent_id, name):
    """Return the ID of the folder titled `name` under `parent_id`, creating it if absent.

    Args:
        drive: PyDrive2 GoogleDrive handle.
        parent_id: Drive ID of the parent folder.
        name: title of the folder to find or create.

    Returns:
        The ID of the first matching folder, or of the newly uploaded folder.
    """
    # Backslash-escape backslashes and apostrophes: the title sits inside a
    # single-quoted query literal and would otherwise break the query.
    safe_name = name.replace("\\", "\\\\").replace("'", "\\'")
    res = _dq(drive, f"'{parent_id}' in parents and trashed=false and mimeType='application/vnd.google-apps.folder' and title='{safe_name}'")
    if res:
        return res[0]["id"]
    # The metadata title is not part of a query, so it uses the raw name.
    nf = drive.CreateFile({"title": name, "parents": [{"id": parent_id}], "mimeType": "application/vnd.google-apps.folder"})
    nf.Upload()
    return nf["id"]

def download_to_temp(drive_file, suffix):
    """Download a Drive file into a new local temp file and return its path.

    Args:
        drive_file: object exposing `GetContentFile(path)` (a PyDrive2 file).
        suffix: filename suffix for the temp file, e.g. ".tif".

    Returns:
        Path of the temp file. The caller is responsible for deleting it.

    Raises:
        Whatever `GetContentFile` raises; the temp file is removed first so a
        failed download does not leave an orphan on disk.
    """
    # Reserve a unique path and close the handle explicitly, rather than relying
    # on garbage collection, before the download writes to that path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        p = tmp.name
    try:
        drive_file.GetContentFile(p)
    except BaseException:
        try:
            os.remove(p)
        except OSError:
            pass
        raise
    return p

def upload_path(drive, local_path, parent_id, title=None):
    """Upload `local_path` into the Drive folder `parent_id` and return the new file's ID.

    The Drive title is `title` when given (truthy), otherwise the basename of
    `local_path`.
    """
    metadata = {
        "title": title if title else os.path.basename(local_path),
        "parents": [{"id": parent_id}],
    }
    remote = drive.CreateFile(metadata)
    remote.SetContentFile(local_path)
    remote.Upload()
    return remote["id"]

# ---------------------- Merge helpers ---------------------
def _try_merge_with_max(srcs):
    """Preferred merge path: call rio_merge with method="max" and NaN nodata.

    Returns whatever rio_merge returns (the caller unpacks it as
    `(mosaic, out_transform)`). Requires a rasterio whose merge accepts the
    `method` keyword.
    """
    merge_opts = {
        "nodata": np.nan,
        "dtype": "float32",
        "precision": 7,
        "resampling": Resampling.nearest,
        "method": "max",
    }
    return rio_merge(srcs, **merge_opts)

def _try_merge_basic(srcs):
    """Fallback merge for rasterio without `method`: merge with defaults, then nanmax over axis 0.

    Returns `(mosaic, out_transform)` with `mosaic` as float32 and a leading
    axis of length 1.

    NOTE(review): the nanmax runs over axis 0 of the array rio_merge returns.
    If that axis indexes bands rather than input sources, this step does not
    reproduce method="max" where inputs overlap — confirm against the installed
    rasterio.
    """
    merged, out_transform = rio_merge(
        srcs,
        nodata=np.nan,
        dtype="float32",
        precision=7,
        resampling=Resampling.nearest,
    )
    # NaN-ignoring maximum along the leading axis, kept as a length-1 axis
    collapsed = np.nanmax(merged, axis=0, keepdims=True)
    return collapsed.astype("float32"), out_transform

def _try_merge_basic_sentinel(srcs, sentinel=-9999.0):
    """Last-resort merge: use a numeric nodata `sentinel` instead of NaN, then convert back.

    After merging, cells equal to `sentinel` become NaN and the array is reduced
    with nanmax over axis 0. Returns `(mosaic, out_transform)` with `mosaic` as
    float32 and a leading axis of length 1.

    NOTE(review): as with the basic fallback, the axis-0 reduction only emulates
    a union across sources if axis 0 indexes sources — confirm.
    """
    merged, out_transform = rio_merge(
        srcs,
        nodata=sentinel,
        dtype="float32",
        precision=7,
        resampling=Resampling.nearest,
    )
    merged = merged.astype("float32", copy=False)
    # swap the sentinel for NaN in place
    np.putmask(merged, merged == sentinel, np.nan)
    collapsed = np.nanmax(merged, axis=0, keepdims=True)
    return collapsed.astype("float32"), out_transform

# ---------------------- Main merge ------------------------
def merge_all_binary_rasters(drive):
    """Merge every TIFF in the BINARY_SUBFOLDER Drive folder into one mosaic and upload it.

    Steps:
      1. Resolve ROOT_FOLDER_ID → PARENT_FOLDER_NAME → PROB_SUBFOLDER_NAME →
         BINARY_SUBFOLDER (the last one is created if missing).
      2. Download every .tif/.tiff in that folder whose title does not contain
         OUT_NAME (so an earlier mosaic is not merged into itself).
      3. Merge with `_try_merge_with_max`; on any error fall back to
         `_try_merge_basic`, then `_try_merge_basic_sentinel`. Each fallback is
         reported on stdout so it is visible which strategy produced the mosaic.
      4. Write band 0 of the mosaic as a tiled, LZW-compressed float32 GeoTIFF
         with NaN nodata and upload it as OUT_NAME into the same folder.

    Args:
        drive: authenticated PyDrive2 GoogleDrive handle.

    Raises:
        RuntimeError: if the parent or Probability folder cannot be found.
        FileNotFoundError: if there are no input TIFFs.
        Exception: whatever the last merge strategy raises if all three fail.
    """
    # Locate folders
    cmp_id = get_subfolder_fuzzy(drive, ROOT_FOLDER_ID, PARENT_FOLDER_NAME)
    if not cmp_id:
        raise RuntimeError(f"Folder '{PARENT_FOLDER_NAME}' not found under ROOT.")
    prob_id = get_subfolder_fuzzy(drive, cmp_id, PROB_SUBFOLDER_NAME)
    if not prob_id:
        raise RuntimeError("Probability folder not found (tried fuzzy match).")
    binary_id = get_subfolder_fuzzy(drive, prob_id, BINARY_SUBFOLDER)
    if not binary_id:
        binary_id = get_or_create_folder(drive, prob_id, BINARY_SUBFOLDER)

    # Find all binary GeoTIFFs (skip the mosaic itself if re-running)
    out_key = OUT_NAME.lower()
    tifs = [it for it in list_files(drive, binary_id)
            if isinstance(it, dict)
            and it.get("mimeType") != "application/vnd.google-apps.folder"
            and it.get("title", "").lower().endswith((".tif", ".tiff"))
            and out_key not in it.get("title", "").lower()]
    if not tifs:
        raise FileNotFoundError(f"No binary .tif files found in {BINARY_SUBFOLDER}/.")

    local_paths, srcs = [], []
    out_local = None
    try:
        # Download & open datasets
        for it in tifs:
            p = download_to_temp(it, ".tif")
            local_paths.append(p)
            srcs.append(rasterio.open(p))

        # Preferred strategy first; any failure moves on to the next one.
        # (TypeError from an unsupported keyword is just one such failure.)
        try:
            mosaic, out_transform = _try_merge_with_max(srcs)
        except Exception as max_err:
            print(f"   ! merge(method='max') failed ({type(max_err).__name__}: {max_err}); trying basic merge")
            try:
                mosaic, out_transform = _try_merge_basic(srcs)
            except Exception as basic_err:
                print(f"   ! basic merge failed ({type(basic_err).__name__}: {basic_err}); trying sentinel-nodata merge")
                mosaic, out_transform = _try_merge_basic_sentinel(srcs)

        # Build output profile from first raster (CRS etc.), overriding the
        # geometry and storage options for the mosaic.
        ref = srcs[0]
        out_profile = ref.profile.copy()
        out_profile.update(
            driver="GTiff",
            height=mosaic.shape[1],
            width=mosaic.shape[2],
            transform=out_transform,
            count=1,
            dtype="float32",
            nodata=np.nan,
            compress="LZW",
            tiled=True,
            blockxsize=512,
            blockysize=512,
            BIGTIFF="IF_NEEDED"
        )

        # Write to temp, then upload. Only the path is needed from the temp
        # handle; rasterio reopens it for writing.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".tif") as tmp_out:
            out_local = tmp_out.name

        with rasterio.open(out_local, "w", **out_profile) as dst:
            dst.write(mosaic[0], 1)

        upload_path(drive, out_local, binary_id, title=OUT_NAME)

        print(f"✅ Merged {len(srcs)} rasters → {BINARY_SUBFOLDER}/{OUT_NAME}")

    finally:
        # Cleanup runs on success and failure: close datasets before deleting
        # their files, then drop the mosaic temp file if it was created.
        for s in srcs:
            try:
                s.close()
            except Exception:
                pass
        for p in local_paths + ([out_local] if out_local else []):
            try:
                os.remove(p)
            except OSError:
                # best-effort: already removed or still locked
                pass

# ---------------------- CLI ----------------------
if __name__ == "__main__":
    # An authenticated PyDrive2 handle named `drive` must already exist in the
    # notebook namespace; probing the bare name gives a clear error if not.
    try:
        drive  # noqa: F821
    except NameError:
        raise RuntimeError("PyDrive2 'drive' not found. Authenticate and expose a global `drive` before running.")
    else:
        merge_all_binary_rasters(drive)




✅ Merged 48 rasters → Binary/US_binary.tif
