In [None]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

# Local path to the OAuth client-secrets JSON file.
CLIENT_JSON = "client_secret_Final.json"   # Google Drive API client configuration
gauth = GoogleAuth()
gauth.LoadClientConfigFile(CLIENT_JSON)

# Command-line auth flow (presumably prompts for a verification code in the
# console -- confirm against the PyDrive2 documentation).
gauth.CommandLineAuth()

# Save the obtained credentials to 'token.json'.
gauth.SaveCredentialsFile('token.json')

# Drive client used by the mosaicking code later in the notebook.
drive = GoogleDrive(gauth)
print("‚úÖ Google Drive connected")

In [None]:
"""
Mosaic the GeoTIFF tiles of each country so that every variable ends up as a single raster file per country.
"""
import os
import re
import shutil
import tempfile
import unicodedata
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import rasterio
from rasterio.io import MemoryFile
from rasterio.shutil import copy as rio_copy
from rasterio.warp import reproject, Resampling as WarpResampling
from tqdm.auto import tqdm

# -------------------------------------------------------------------
# Google Drive setup
# -------------------------------------------------------------------
# NOTE: 'drive' must be an authenticated PyDrive2 client in your session.
ROOT_FOLDER_ID = "18pQKnMMnLramhHRZSNwUJrLqG5DXNMmS"  # folder id taken from Google Drive

# Variable names recognised in tile file names (one mosaic is built per variable).
VARIABLES = [
    "NDVI_mean", "NDVI_max", "NDVI_min",
    "NDWI_mean", "NDWI_max", "NDWI_min",
    "GI_mean", "GI_max", "GI_min",
    "elevation", "slope",
    "ET", "PET",

]

# Regions to process. Each entry is matched against the leading '_'-separated
# tokens of a tile file name (see parse_title).
PROVINCES = [
    "Philippines",
    # ...
]

# Optional synonyms: keys are matched case-insensitively by normalize_country
# and replaced with the mapped value.
COUNTRY_SYNONYMS = {
    "Philippines": "philippines",
}

# -------------------------------------------------------------------
# Helpers
# -------------------------------------------------------------------
# Lower-cased variable name -> canonical spelling from VARIABLES.
VARIABLE_TOKENS = {v.lower(): v for v in VARIABLES}  # canonicalize


def _squash_spaces(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()


def normalize_country(name: str) -> str:
    """
    Turn a token-style name into a display name.

    Underscores become spaces, whitespace runs are collapsed, and the result
    is swapped for its COUNTRY_SYNONYMS value when a key matches
    case-insensitively. Falsy input (None, "") yields "".
    """
    if not name:
        return ""

    cleaned = _squash_spaces(name.strip().replace("_", " "))

    # Key the synonym table by lower-cased name for a case-insensitive lookup.
    synonyms_by_lower = {key.lower(): value for key, value in COUNTRY_SYNONYMS.items()}
    return synonyms_by_lower.get(cleaned.lower(), cleaned)


def _safe_basename(s: str) -> str:
    """
    Build an ASCII-only slug suitable for a file name.

    The text is NFKD-decomposed, non-ASCII characters are dropped, and each
    run of characters outside A-Z, a-z, 0-9 becomes a single underscore with
    none at either end. Returns "untitled" when nothing usable remains.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    ascii_text = decomposed.encode("ascii", "ignore").decode("ascii")
    # Splitting on the separator runs and re-joining the non-empty pieces is
    # the same as substituting "_" and stripping it from both ends.
    pieces = [part for part in re.split(r"[^A-Za-z0-9]+", ascii_text) if part]
    if not pieces:
        return "untitled"
    return "_".join(pieces)


def _drive_q_escape(s: str) -> str:
    """Backslash-escape backslashes and single quotes for use inside a quoted Drive query value."""
    # One pass over the string: each character is escaped at most once, which
    # matches escaping backslashes first and quotes second.
    escapes = {ord("\\"): "\\\\", ord("'"): "\\'"}
    return s.translate(escapes)


# -------------------------------------------------------------------
# Drive helpers
# -------------------------------------------------------------------
def get_or_create_folder(drive, parent_id, name):
    """
    Return the id of the folder titled *name* directly under *parent_id*.

    The first non-trashed folder returned by the query is reused; when there
    is none, a new folder is created and its id is returned.
    """
    folder_mime = 'application/vnd.google-apps.folder'
    query = (
        f"'{parent_id}' in parents and trashed=false and "
        f"mimeType='{folder_mime}' and title='{_drive_q_escape(name)}'"
    )
    matches = drive.ListFile({'q': query}).GetList()
    if not matches:
        # Nothing found: create the folder under the requested parent.
        folder = drive.CreateFile({
            'title': name,
            'parents': [{'id': parent_id}],
            'mimeType': folder_mime,
        })
        folder.Upload()
        return folder['id']
    return matches[0]['id']


def list_all_tifs_recursive(drive, root_id):
    """
    Collect every .tif/.tiff entry found under *root_id*, descending into
    sub-folders.

    Returns the Drive listing entries themselves (not just their ids).
    """
    folder_mime = 'application/vnd.google-apps.folder'
    found = []
    pending = [root_id]  # folders still to be listed (LIFO)
    while pending:
        current = pending.pop()
        listing = drive.ListFile({'q': f"'{current}' in parents and trashed=false"}).GetList()
        for entry in listing:
            if entry.get('mimeType', '') == folder_mime:
                pending.append(entry['id'])
                continue
            name = entry.get('title', '')
            if not name:
                continue
            # Extension check is case-insensitive.
            if name.lower().endswith(('.tif', '.tiff')):
                found.append(entry)
    return found


def download_many(drive, files, dst_dir, max_workers=8):
    """
    Download Drive files concurrently into *dst_dir*.

    Parameters
    ----------
    drive : unused; kept so existing callers keep working.
    files : iterable of Drive file objects exposing ``f['title']`` and
        ``f.GetContentFile(path)``.
    dst_dir : existing directory that receives the downloads.
    max_workers : size of the download thread pool.

    Returns
    -------
    list of local paths, in the same order as *files*. A stable order keeps
    downstream choices that depend on "the first dataset" reproducible.

    Notes
    -----
    Drive allows several files with the same title, so local names are made
    unique before any download starts. A name that is already unique is kept
    as-is; later duplicates get a ``__<n>`` suffix before the extension. Path
    separators in a title are replaced with "_" so a file cannot land outside
    *dst_dir*. Any download error is re-raised.
    """
    files = list(files)
    if not files:
        return []

    # Plan the target paths single-threaded so two workers never write to the
    # same file. Names are compared case-insensitively to stay safe on
    # case-insensitive file systems.
    taken = set()
    targets = []
    for idx, f in enumerate(files):
        name = f['title'] or f"file_{idx}"
        name = name.replace(os.sep, "_")
        if os.altsep:
            name = name.replace(os.altsep, "_")
        stem, ext = os.path.splitext(name)
        candidate, n = name, 1
        while candidate.lower() in taken:
            candidate = f"{stem}__{n}{ext}"
            n += 1
        taken.add(candidate.lower())
        targets.append(os.path.join(dst_dir, candidate))

    def _dl(f, local):
        f.GetContentFile(local)
        return local

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = [ex.submit(_dl, f, local) for f, local in zip(files, targets)]
        for fut in tqdm(as_completed(futs), total=len(futs), desc="Downloading"):
            fut.result()  # propagate any download failure
    return targets


def upload_tif(drive, local_path, parent_id, title):
    """Upload *local_path* as a new Drive file named *title* under *parent_id* and return its id."""
    metadata = {'title': title, 'parents': [{'id': parent_id}]}
    remote = drive.CreateFile(metadata)
    remote.SetContentFile(local_path)
    remote.Upload()
    return remote['id']


# -------------------------------------------------------------------
# Parsing
# -------------------------------------------------------------------
def _filename_core(title: str) -> str:
    """
    Reduce a Drive title to its logical base name.

    The directory part and the extension are removed, then everything from
    the first '-' onwards is cut off (tile ID, date, etc.), e.g.

    'New_York_GI_max-0000000000-0000000000.tif' -> 'New_York_GI_max'
    """
    stem, _ext = os.path.splitext(os.path.basename(title))
    # partition() returns the whole string as the head when there is no '-'.
    head, _sep, _tail = stem.partition('-')
    return head


def _split_tokens(path_title: str):
    """
    Return the non-empty '_'-separated tokens of the title's core name.

    'New_York_GI_max-0000000000-0000000000.tif'
      -> core 'New_York_GI_max'
      -> ['New', 'York', 'GI', 'max']
    """
    pieces = _filename_core(path_title).split('_')
    # filter(None, ...) drops the empty strings left by doubled/edge underscores.
    return list(filter(None, pieces))


def parse_title(title: str):
    """
    Split a tile title into (province, canonical_variable).

    The province is the PROVINCES entry with the most '_'-tokens whose tokens
    match the start of the title (case-insensitive). The variable is the
    longest 1- to 3-token sequence found in VARIABLE_TOKENS that starts within
    the four tokens following the province. Returns (None, None) when the
    title is not a .tif/.tiff name or either part cannot be identified.
    """
    if not title.lower().endswith(('.tif', '.tiff')):
        return None, None

    tokens = [tok.lower() for tok in _split_tokens(title)]
    if not tokens:
        return None, None
    total = len(tokens)

    # --- province: longest matching token prefix (first one wins on ties) ---
    matched_province = None
    prefix_len = 0
    for candidate in PROVINCES:
        parts = [part.lower() for part in candidate.split('_')]
        size = len(parts)
        if size > total or size <= prefix_len:
            continue
        if tokens[:size] == parts:
            matched_province, prefix_len = candidate, size

    if not matched_province:
        return None, None

    # --- variable: widest token window that is a known variable name ---
    matched_variable = None
    widest = 0
    for start in range(prefix_len, min(prefix_len + 4, total)):
        for width in range(1, 4):
            stop = start + width
            if stop > total:
                break
            key = '_'.join(tokens[start:stop])
            # Strictly wider only, so the earliest match of a given width is kept.
            if key in VARIABLE_TOKENS and width > widest:
                matched_variable, widest = VARIABLE_TOKENS[key], width

    if widest == 0:
        return None, None
    return matched_province, matched_variable


# -------------------------------------------------------------------
# Size helpers / dtype
# -------------------------------------------------------------------
def _bytes_per_pixel(dtype):
    """
    Return the size in bytes of one sample of *dtype*.

    *dtype* may be a dtype name such as "float32" or anything else
    ``np.dtype`` accepts. The size is taken from NumPy, so 8-byte and wider
    types (int64, uint64, complex128, ...) are reported correctly rather
    than being treated as 4 bytes. Names NumPy does not understand, None,
    and zero-sized types fall back to 4.
    """
    if dtype is None:
        # np.dtype(None) would silently mean float64; keep the 4-byte fallback.
        return 4
    try:
        size = np.dtype(dtype).itemsize
    except (TypeError, ValueError):
        return 4
    return size or 4


def _should_bigtiff(width, height, count, dtype):
    """Return True when the raw raster size plus a 5% margin reaches 4 GiB."""
    four_gib = 4 * 1024 ** 3
    raw_bytes = width * height * count * _bytes_per_pixel(dtype)
    return raw_bytes * 1.05 >= four_gib


# -------------------------------------------------------------------
# Grid/reference helpers
# -------------------------------------------------------------------
def _pick_reference(datasets):
    """
    Return the first dataset that has the most common pixel size.

    Pixel size is (transform.a, -transform.e). When several sizes are equally
    frequent, the one with the largest pixel area wins.
    """
    def pixel_size(ds):
        return (ds.transform.a, -ds.transform.e)

    tally = Counter(pixel_size(ds) for ds in datasets)

    def rank(size):
        # Frequency first, pixel area as the tie-breaker.
        return (tally[size], size[0] * size[1])

    winner = max(tally, key=rank)
    for candidate in datasets:
        if pixel_size(candidate) == winner:
            return candidate
    return datasets[0]


# -------------------------------------------------------------------
# COG writer
# -------------------------------------------------------------------
def write_cog_from_array(mosaic, profile, cog_path):
    """
    Write *mosaic* to *cog_path* through the COG driver.

    The array is first written to an in-memory tiled GTiff built from
    *profile* (512-pixel blocks, LZW and predictor 3 unless the profile
    overrides them), and that dataset is then copied to *cog_path*.
    BIGTIFF is forced to "YES" when _should_bigtiff says the raster is large,
    otherwise "IF_SAFER" is used.
    """
    is_large = _should_bigtiff(
        width=profile["width"],
        height=profile["height"],
        count=profile["count"],
        dtype=profile["dtype"],
    )
    bigtiff_mode = "YES" if is_large else "IF_SAFER"

    # Profile of the intermediate in-memory GeoTIFF.
    staging_profile = profile.copy()
    staging_profile.update({
        "driver": "GTiff",
        "tiled": True,
        "blockxsize": profile.get("blockxsize", 512),
        "blockysize": profile.get("blockysize", 512),
        "compress": profile.get("compress", "LZW"),
        "predictor": profile.get("predictor", 3),
        "interleave": "band",
        "BIGTIFF": bigtiff_mode,
    })

    # Creation options handed to the COG driver.
    # NOTE(review): confirm that the COG driver recognises OVERVIEW_LEVELS and
    # DST_NODATA; they are passed through unchanged here.
    cog_options = {
        "driver": "COG",
        "COMPRESS": "LZW",
        "PREDICTOR": staging_profile["predictor"],
        "BLOCKSIZE": 512,
        "OVERVIEW_LEVELS": "2,4,8,16",
        "OVERVIEW_RESAMPLING": "AVERAGE",
        "NUM_THREADS": "ALL_CPUS",
        "BIGTIFF": bigtiff_mode,
        "RESAMPLING": "NEAREST",
        "DST_NODATA": profile.get("nodata", None),
    }

    with MemoryFile() as memfile:
        with memfile.open(**staging_profile) as staging:
            staging.write(mosaic)
        with memfile.open() as staged:
            rio_copy(staged, cog_path, **cog_options)


# -------------------------------------------------------------------
# Mosaic core
# -------------------------------------------------------------------
def _reducer_for_variable(var: str):
    """
    Choose the element-wise reducer used where tiles overlap.

    Variables whose name ends in "_min" (case-insensitive) use np.fmin;
    everything else (max, mean, masks, ...) uses np.fmax.
    """
    wants_minimum = var.lower().endswith("_min")
    return np.fmin if wants_minimum else np.fmax


def mosaic_files_to_array(datasets, reducer, default_nodata=-9999.0):
    """
    Mosaic single-band rasters to float32 using NaN as working nodata.
    Handles mixed CRS and pixel sizes (aligned to a common grid).

    Parameters
    ----------
    datasets : sequence of open rasterio datasets; band 1 of each is used.
    reducer : element-wise binary function combining the running mosaic with
        the next tile (np.fmin / np.fmax ignore NaN).
    default_nodata : value written where no tile contributed data.

    Returns
    -------
    (mosaic, transform, crs, nodata) where *mosaic* has shape
    (1, height, width) and dtype float32, and *nodata* is
    ``float(default_nodata)``.

    Raises
    ------
    RuntimeError
        If no input dataset has a CRS.

    Notes
    -----
    The target CRS is the CRS of the reference dataset (the one supplying the
    pixel size), so the resolution and the grid are always in the same units.
    Only when the reference has no CRS does the first CRS found among the
    inputs serve as the target. Datasets without a CRS are assumed to already
    be in the target CRS.
    """
    first_crs = next((ds.crs for ds in datasets if ds.crs is not None), None)
    if first_crs is None:
        raise RuntimeError(
            "None of the input tiles has a CRS defined; cannot reproject to a common grid."
        )

    ref = _pick_reference(datasets)
    ref_transform = ref.transform
    ref_res = (ref_transform.a, -ref_transform.e)
    # Keep the pixel size and the target CRS from the same dataset; mixing the
    # reference's resolution with another dataset's CRS would mix units
    # (e.g. metres applied to a grid in degrees).
    common_crs = ref.crs if ref.crs is not None else first_crs

    # Union bounds in common CRS
    from rasterio.warp import transform_bounds
    minx = miny = float("inf")
    maxx = maxy = float("-inf")
    for ds in datasets:
        b = ds.bounds
        if ds.crs is not None and ds.crs != common_crs:
            b = transform_bounds(ds.crs, common_crs, *b, densify_pts=21)
        minx, miny = min(minx, b[0]), min(miny, b[1])
        maxx, maxy = max(maxx, b[2]), max(maxy, b[3])

    # Destination grid at the reference resolution, anchored at the
    # upper-left corner of the union bounds.
    from rasterio.transform import from_origin
    dst_transform = from_origin(minx, maxy, ref_res[0], ref_res[1])
    dst_w = int(np.ceil((maxx - minx) / ref_res[0]))
    dst_h = int(np.ceil((maxy - miny) / ref_res[1]))

    acc = np.full((dst_h, dst_w), np.nan, dtype=np.float32)

    for ds in datasets:
        # Warp each tile onto the full destination grid; untouched cells stay NaN.
        temp = np.full((dst_h, dst_w), np.nan, dtype=np.float32)
        src_nodata = ds.nodata
        reproject(
            source=rasterio.band(ds, 1),
            destination=temp,
            src_transform=ds.transform,
            src_crs=(ds.crs if ds.crs is not None else common_crs),
            dst_transform=dst_transform,
            dst_crs=common_crs,
            src_nodata=src_nodata,
            dst_nodata=np.nan,
            resampling=WarpResampling.nearest,
        )
        if np.isnan(acc).all():
            # Nothing accumulated yet: take the tile as-is.
            acc = temp
        else:
            acc = reducer(acc, temp)  # NaN-aware fmin/fmax

    # Swap the NaN working nodata for the output nodata and add the band axis.
    mosaic = np.where(np.isnan(acc), default_nodata, acc).astype(np.float32)[None, ...]
    return mosaic, dst_transform, common_crs, float(default_nodata)


# -------------------------------------------------------------------
# Country + variable mosaicking and upload
# -------------------------------------------------------------------
def _item_in_folder(item, folder_id):
    """Return True when a Drive listing entry lists *folder_id* among its parents."""
    return any(p.get('id') == folder_id for p in (item.get('parents') or []))


def mosaic_country_variable_to_drive(drive, parent_id, country, variable, by_country_id):
    """
    Find all tiles for (country, variable), mosaic, write COG, upload.

    Parameters
    ----------
    drive : authenticated PyDrive2 client.
    parent_id : id of the Drive folder searched recursively for tiles.
    country : province/country token exactly as listed in PROVINCES.
    variable : variable name; compared case-insensitively.
    by_country_id : id of the Drive folder that holds one output folder per
        country.

    Notes
    -----
    The country's output folder is resolved (created if missing) before the
    tile search, so files already stored in it can be skipped. Earlier
    mosaics are named ``<country>_<variable>.tif``, which parses like an
    input tile, and the output folder may sit under *parent_id*. Without the
    skip, a re-run would mix the previous result back into the new mosaic.
    This relies on listing entries carrying a 'parents' field; entries
    without it are kept.

    Prints a message and returns None when no tiles are found.

    Raises
    ------
    RuntimeError
        If a selected tile has more than one band.
    """
    display_country = normalize_country(country)
    country_folder_id = get_or_create_folder(drive, by_country_id, display_country)

    tifs = list_all_tifs_recursive(drive, parent_id)

    selected = []
    c_norm = country  # already province token form
    v_norm = variable.lower()

    for it in tifs:
        if _item_in_folder(it, country_folder_id):
            # Previously uploaded mosaic, not a source tile.
            continue
        title = it.get('title', '')
        ctry, var = parse_title(title)
        if ctry is None:
            continue
        if ctry == c_norm and var.lower() == v_norm:
            selected.append(it)

    if not selected:
        print(f"‚ö†Ô∏è No files for {country} / {variable}")
        return

    print(f"\n‚ñ∂Ô∏è {country} / {variable} | {len(selected)} file(s)")

    tmpdir = tempfile.mkdtemp(
        prefix=f"{_safe_basename(country)}_{_safe_basename(variable)}_"
    )
    local_paths, datasets = [], []
    try:
        # Download tiles
        local_paths = download_many(drive, selected, tmpdir, max_workers=8)

        # Open datasets (require single band)
        for p in local_paths:
            ds = rasterio.open(p)
            # Register before validating so the `finally` block closes it
            # even when the band check below fails.
            datasets.append(ds)
            if ds.count != 1:
                raise RuntimeError(
                    f"Only single-band rasters supported. "
                    f"{os.path.basename(p)} has {ds.count} bands."
                )

        reducer = _reducer_for_variable(variable)

        # Mosaic to float32
        mosaic, out_transform, out_crs, out_nodata = mosaic_files_to_array(
            datasets, reducer=reducer
        )

        profile = {
            "dtype": "float32",
            "height": mosaic.shape[1],
            "width": mosaic.shape[2],
            "count": 1,
            "transform": out_transform,
            "crs": out_crs,
            "nodata": out_nodata,
            "blockxsize": 512,
            "blockysize": 512,
            "compress": "LZW",
            "predictor": 3,
        }

        # Output filename (the output folder was resolved above)
        safe_country = _safe_basename(display_country)
        safe_variable = _safe_basename(variable)
        out_name = f"{safe_country}_{safe_variable}.tif"
        cog_local = os.path.join(tmpdir, out_name)

        print("   ‚Ä¢ writing COG ‚Ä¶")
        write_cog_from_array(mosaic, profile, cog_local)

        print(f"   ‚Ä¢ uploading to Drive as {out_name} ‚Ä¶")
        out_id = upload_tif(drive, cog_local, country_folder_id, out_name)
        print(f"‚úÖ Uploaded: {out_name} (file id: {out_id})")

    finally:
        # Best-effort cleanup: a failing close must not mask the real error
        # or prevent the temporary directory from being removed.
        for ds in datasets:
            try:
                ds.close()
            except Exception:
                pass
        shutil.rmtree(tmpdir, ignore_errors=True)


# -------------------------------------------------------------------
# Batch runner
# -------------------------------------------------------------------
def run_all_provinces_variables(drive):
    """
    Build and upload one mosaic for every (province, variable) pair.

    Output goes under a "By Country" folder inside ROOT_FOLDER_ID. A failure
    for one pair is printed and does not stop the remaining pairs.
    """
    by_country_id = get_or_create_folder(drive, ROOT_FOLDER_ID, "By Country")

    # First pass: make sure each province has its output folder.
    for province in PROVINCES:
        get_or_create_folder(drive, by_country_id, normalize_country(province))

    # Second pass: provinces in the outer order, variables in the inner order.
    jobs = [(c, v) for c in PROVINCES for v in VARIABLES]
    for c, v in jobs:
        try:
            mosaic_country_variable_to_drive(drive, ROOT_FOLDER_ID, c, v, by_country_id)
        except Exception as e:
            print(f"‚ùå {c} / {v}: {e}")


# -------------------------------------------------------------------
# Execute
# -------------------------------------------------------------------
# Run the full batch when executed as a script (or as a notebook cell).
# 'drive' is the module-level PyDrive2 client and must already be authenticated.
if __name__ == "__main__":
    run_all_provinces_variables(drive)
    print("\nüéâ Done.")
