In [2]:
import os
import yaml
from pathlib import Path
import geopandas as gpd

import matplotlib.pyplot as plt
from matplotlib.patches import Patch

import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.mask import mask
from rasterio.plot import show

import requests
import numpy as np

from dotenv import load_dotenv
from osgeo import gdal
import ee

import xarray as xr
import rioxarray

import rioxarray as rxr
from dask.distributed import Client, LocalCluster
import pandas as pd

In [5]:
# Get project root (adjust based on your folder depth)
current_dir = Path(os.getcwd())
project_root = current_dir.parent.parent  # Navigate up from "Scripts/Phase1_Data_Preprocessing"

with open(project_root / "config.yml", "r") as f:
    config = yaml.safe_load(f)


raw_data_dir = project_root / config["paths"]["raw_data"]

# Processed data paths
processed_data_dir = project_root / config["paths"]["processed_data"]
soil_processed_dir = processed_data_dir / "GIS/Soil"
output_dir = processed_data_dir / "GIS/Study_Area_Boundary"
output_path = output_dir / "Tadla_plain_common.shp"
tadla_common_path = processed_data_dir / config["paths"]["tadla_boundary_processed"]
soil_processed_path = processed_data_dir / config["paths"]["soil_processed"]
dem_processed_path = processed_data_dir / config["paths"]["dem_processed"]
slope_path = processed_data_dir / "GIS/Topography/tadla_slope.tif"
aspect_path = processed_data_dir / "GIS/Topography/tadla_aspect.tif"
chirps_processed_path = processed_data_dir / config["paths"]["chirps_processed"]
era5_processed_path = processed_data_dir / config["paths"]["era5_processed"]
wv0010_processed_path = processed_data_dir / config["paths"]["wv0010_processed"]
topography_processed_dir = processed_data_dir / "GIS/Topography"

land_use_processed_dir = processed_data_dir / config["paths"]["land_use_processed"]

weather_processed_dir = processed_data_dir / "Weather"
chirps_output_dir = Path(config["paths"]["chirps_dir"])


In [6]:
def validate_spatial_alignment(reference_path):
    """Check CRS, resolution, and transform across all datasets."""
    ref = rasterio.open(reference_path)
    layers = [
        Path(soil_processed_dir / "tadla_clay_10m.tif"),
        Path(dem_processed_path),
        Path(land_use_processed_dir / "Sentinel2_Tadla_NDVI_2023.tif"),
        Path(weather_processed_dir / "CHIRPS_Annual/CHIRPS_2023_reproj.tif"),
        Path(weather_processed_dir / "ERA5_Annual/ERA5_2023_reproj.tif")
    ]
    
    print("=== Spatial Alignment Check ===")
    for layer in layers:
        with rasterio.open(layer) as src:
            if src.crs != ref.crs:
                print(f"❌ CRS mismatch: {layer.name} (EPSG:{src.crs.to_epsg()})")
            if src.res != ref.res:
                print(f"❌ Resolution mismatch: {layer.name} ({src.res}m)")
            if src.transform != ref.transform:
                print(f"❌ Transform mismatch: {layer.name}")
    
    print("✅ Spatial alignment validated (CRS: EPSG:26191, Res: 10m)")


In [7]:
ndvi_ref =  raw_data_dir / config["paths"]["ndvi_raw"]
validate_spatial_alignment(ndvi_ref)

=== Spatial Alignment Check ===
✅ Spatial alignment validated (CRS: EPSG:26191, Res: 10m)


In [9]:
def validate_temporal_bands(years=range(2017, 2024)):
    """Verify 12 bands (months) exist in annual CHIRPS/ERA5 files."""
    print("\n=== Temporal Band Check ===")
    for year in years:
        chirps_path = weather_processed_dir / f"CHIRPS_Annual/CHIRPS_{year}_reproj.tif"
        era5_path = weather_processed_dir / f"ERA5_Annual/ERA5_{year}_reproj.tif"
        
        for path in [chirps_path, era5_path]:
            if not path.exists():
                print(f"❌ Missing: {path.name}")
                continue
            with rasterio.open(path) as src:
                if src.count != 12:
                    print(f"❌ {path.name}: {src.count} bands (expected 12)")

    print("✅ Temporal bands validated")


In [10]:
validate_temporal_bands()


=== Temporal Band Check ===
✅ Temporal bands validated


In [12]:
def validate_nodata():
    """Ensure NoData = -9999 and valid data ranges."""
    print("\n=== NoData & Value Ranges ===")
    layers = {
        "Soil_Clay": (Path(soil_processed_dir / "tadla_clay_10m.tif"), (0, 100)),  # %
        "NDVI": (Path(land_use_processed_dir / "Sentinel2_Tadla_NDVI_2023.tif"), (-1, 1)),
        "CHIRPS": (Path(weather_processed_dir / "CHIRPS_Annual/CHIRPS_2023_reproj.tif"), (0, 500)),  # mm/month
        "DEM": (Path(dem_processed_path), (0, 2000))  # meters
    }
    
    for name, (path, expected_range) in layers.items():
        with rasterio.open(path) as src:
            data = src.read(1)
            valid_data = data[data != src.nodata]
            
            # Check NoData
            if src.nodata != -9999:
                print(f"❌ {name}: NoData = {src.nodata} (expected -9999)")
            
            # Check value ranges
            min_val, max_val = np.nanmin(valid_data), np.nanmax(valid_data)
            if min_val < expected_range[0] or max_val > expected_range[1]:
                print(f"⚠️ {name}: Values ({min_val:.2f}-{max_val:.2f}) outside expected {expected_range}")

    print("✅ NoData & ranges validated")


In [13]:
validate_nodata()


=== NoData & Value Ranges ===
❌ NDVI: NoData = None (expected -9999)
⚠️ DEM: Values (0.00-3242.00) outside expected (0, 2000)
✅ NoData & ranges validated
