In [None]:
import os
import yaml
from pathlib import Path
import geopandas as gpd

import matplotlib.pyplot as plt
from matplotlib.patches import Patch

import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.mask import mask
from rasterio.plot import show

import requests
import numpy as np

from dotenv import load_dotenv
from osgeo import gdal
import ee

import xarray as xr
import rioxarray

import rioxarray as rxr
from dask.distributed import Client, LocalCluster
import pandas as pd

In [None]:
# --- Project root and configuration -----------------------------------------
# The root is derived from the current working directory, so the notebook must
# be started from "Scripts/Phase1_Data_Preprocessing" (two levels below the
# root); adjust the number of `.parent` hops if the folder depth differs.
current_dir = Path.cwd()
project_root = current_dir.parent.parent

with open(project_root / "config.yml", "r") as f:
    config = yaml.safe_load(f)

# --- Raw data ----------------------------------------------------------------
raw_data_dir = project_root / config["paths"]["raw_data"]
era5_raw_path = raw_data_dir / config["paths"]["era5_raw"]

# --- Processed data: directories ---------------------------------------------
# Every directory is defined exactly once; the older duplicate names are kept
# as aliases so cells that still use them keep working.
processed_data_dir = project_root / config["paths"]["processed_data"]
processed_dir = processed_data_dir  # alias of the same directory

soil_processed_dir = processed_data_dir / "GIS/Soil"  # clay, sand, silt, ocd, wv0010
soil_dir = soil_processed_dir  # alias

topography_processed_dir = processed_data_dir / "GIS/Topography"

output_dir = processed_data_dir / "GIS/Study_Area_Boundary"
boundaries_dir = output_dir  # alias

weather_processed_dir = processed_data_dir / "Weather"
# Rainfall: CHIRPS from 2017 to 2023, 1 file per year with 12 bands
rainfall_dir = weather_processed_dir / "CHIRPS_Annual"
# Evapotranspiration: ERA5 from 2017 to 2023, 1 file per year with 12 bands
evapotranspiration_dir = weather_processed_dir / "ERA5_Annual"

# NDVI: 2017 to 2023, 1 file per year with 12 bands
ndvi_dir = processed_data_dir / "GIS/Land_Use"
land_use_processed_dir = processed_data_dir / config["paths"]["land_use_processed"]

# --- Processed data: files ---------------------------------------------------
output_path = output_dir / "Tadla_plain_common.shp"
tadla_common_path = processed_data_dir / config["paths"]["tadla_boundary_processed"]
soil_processed_path = processed_data_dir / config["paths"]["soil_processed"]
dem_processed_path = processed_data_dir / config["paths"]["dem_processed"]
chirps_processed_path = processed_data_dir / config["paths"]["chirps_processed"]
era5_processed_path = processed_data_dir / config["paths"]["era5_processed"]
wv0010_processed_path = processed_data_dir / config["paths"]["wv0010_processed"]

dem_path = topography_processed_dir / "tadla_dem_10m.tif"
slope_path = topography_processed_dir / "tadla_slope.tif"
aspect_path = topography_processed_dir / "tadla_aspect.tif"

# Taken verbatim from the config: unlike the paths above it is NOT joined to
# project_root, so a relative value resolves against the working directory.
chirps_output_dir = Path(config["paths"]["chirps_dir"])


### 1. Spatial Alignment Validation

In [None]:
def validate_spatial_alignment(reference_path):
    """Compare CRS, resolution and transform of key layers with a reference raster.

    Parameters
    ----------
    reference_path : path-like
        Raster whose CRS, pixel size and affine transform define the target grid.

    Returns
    -------
    None
        Results are printed. One line is emitted per mismatch; the success
        line is printed only when every layer matches the reference on all
        three properties, and it reports the reference's actual CRS/resolution.
    """
    layers = [
        Path(soil_processed_dir / "tadla_clay_10m.tif"),
        Path(dem_processed_path),
        Path(land_use_processed_dir / "Sentinel2_Tadla_NDVI_2023.tif"),
        Path(weather_processed_dir / "CHIRPS_Annual/CHIRPS_2023_reproj.tif"),
        Path(weather_processed_dir / "ERA5_Annual/ERA5_2023_reproj.tif")
    ]

    print("=== Spatial Alignment Check ===")
    mismatches = 0

    # The context manager guarantees the reference dataset is closed again.
    with rasterio.open(reference_path) as ref:
        for layer in layers:
            with rasterio.open(layer) as src:
                if src.crs != ref.crs:
                    # A layer without any CRS must be reported, not crash the check.
                    epsg = src.crs.to_epsg() if src.crs else None
                    print(f"❌ CRS mismatch: {layer.name} (EPSG:{epsg})")
                    mismatches += 1
                if src.res != ref.res:
                    print(f"❌ Resolution mismatch: {layer.name} ({src.res}m)")
                    mismatches += 1
                if src.transform != ref.transform:
                    print(f"❌ Transform mismatch: {layer.name}")
                    mismatches += 1

        if mismatches == 0:
            print(f"✅ Spatial alignment validated (CRS: {ref.crs}, Res: {ref.res})")
        else:
            print(f"❌ Spatial alignment failed: {mismatches} mismatch(es) found")


In [None]:
# The raw NDVI raster named in the config serves as the reference grid.
ndvi_ref = raw_data_dir.joinpath(config["paths"]["ndvi_raw"])
validate_spatial_alignment(reference_path=ndvi_ref)

### 2. Temporal Consistency Check

In [None]:
def validate_temporal_bands(years=range(2017, 2024)):
    """Verify that each annual CHIRPS/ERA5 file exists and holds 12 bands.

    Parameters
    ----------
    years : iterable of int, optional
        Years to check; defaults to 2017 through 2023.

    Returns
    -------
    None
        Results are printed. The success line appears only when no file is
        missing and every file has exactly 12 bands.
    """
    print("\n=== Temporal Band Check ===")
    problems = 0

    for year in years:
        chirps_path = weather_processed_dir / f"CHIRPS_Annual/CHIRPS_{year}_reproj.tif"
        era5_path = weather_processed_dir / f"ERA5_Annual/ERA5_{year}_reproj.tif"

        for path in [chirps_path, era5_path]:
            if not path.exists():
                print(f"❌ Missing: {path.name}")
                problems += 1
                continue
            with rasterio.open(path) as src:
                if src.count != 12:
                    print(f"❌ {path.name}: {src.count} bands (expected 12)")
                    problems += 1

    if problems == 0:
        print("✅ Temporal bands validated")
    else:
        print(f"❌ Temporal band check failed: {problems} issue(s) found")


In [None]:
# Run the band-count check with the function's default year range (2017-2023).
validate_temporal_bands()

### 3. NoData Consistency Check

In [None]:
def fix_ndvi_nodata(ndvi_path):
    """Tag an NDVI raster with NoData=-9999 and replace NaNs in every band.

    The file is modified in place. All bands are processed (not only band 1),
    so multi-band annual files end up consistent with the NoData tag.

    Parameters
    ----------
    ndvi_path : pathlib.Path
        Raster to update; must be writable.
    """
    with rasterio.open(ndvi_path, 'r+') as src:  # read/write mode
        # Update metadata once for the whole dataset
        src.nodata = -9999

        # Work band by band so only one band is held in memory at a time
        for band in src.indexes:
            data = src.read(band)
            src.write(np.nan_to_num(data, nan=-9999), band)
    print(f"Updated {ndvi_path.name}: NoData = -9999")

# Apply the NoData fix to every yearly NDVI raster (2017–2023) found on disk;
# years without a file are silently skipped.
for year in range(2017, 2024):
    ndvi_path = land_use_processed_dir / f"Sentinel2_Tadla_NDVI_{year}.tif"
    if not ndvi_path.exists():
        continue
    fix_ndvi_nodata(ndvi_path)

In [None]:
def validate_nodata():
    """Check that key layers use NoData = -9999 and hold plausible values.

    Only band 1 of each layer is inspected. Results are printed; the success
    line appears only when no NoData or range problem was reported.
    """
    print("\n=== NoData & Value Ranges ===")
    layers = {
        "Soil_Clay": (Path(soil_processed_dir / "tadla_clay_10m.tif"), (0, 100)),  # %
        "NDVI": (Path(land_use_processed_dir / "Sentinel2_Tadla_NDVI_2023.tif"), (-1, 1)),
        "CHIRPS": (Path(weather_processed_dir / "CHIRPS_Annual/CHIRPS_2023_reproj.tif"), (0, 500)),
        "ERA5": (Path(weather_processed_dir / "ERA5_Annual/ERA5_2023_reproj.tif"), (0, 20)),  # mm/month
        "DEM": (Path(dem_processed_path), (0, 3500))  # meters
    }

    issues = 0
    for name, (path, expected_range) in layers.items():
        with rasterio.open(path) as src:
            data = src.read(1)
            nodata = src.nodata

        # Check NoData
        if nodata != -9999:
            print(f"❌ {name}: NoData = {nodata} (expected -9999)")
            issues += 1

        # Drop the declared NoData value (if any) and NaNs before taking stats
        valid_data = data.ravel() if nodata is None else data[data != nodata]
        if np.issubdtype(valid_data.dtype, np.floating):
            valid_data = valid_data[~np.isnan(valid_data)]

        # A band without valid pixels has no min/max; report it instead of crashing
        if valid_data.size == 0:
            print(f"⚠️ {name}: No valid pixels found")
            issues += 1
            continue

        # Check value ranges
        min_val, max_val = valid_data.min(), valid_data.max()
        if min_val < expected_range[0] or max_val > expected_range[1]:
            print(f"⚠️ {name}: Values ({min_val:.2f}-{max_val:.2f}) outside expected {expected_range}")
            issues += 1

    if issues == 0:
        print("✅ NoData & ranges validated")
    else:
        print(f"❌ NoData & range check found {issues} issue(s)")


In [None]:
# Run the NoData / value-range check on band 1 of the 2023 layers and the DEM.
validate_nodata()

In [None]:
# Reminder of the DEM expected range (max = 3500 m) used inside validate_nodata().
# This only binds a notebook-level `layers` name; it does not change the
# function, which builds its own local `layers` dict.
layers = dict(DEM=(Path(dem_processed_path), (0, 3500)))

In [None]:
# Summarise the DEM elevation distribution over valid pixels only.
with rasterio.open(dem_processed_path) as src:
    dem = src.read(1)
    # Prefer the NoData value declared in the file; fall back to -9999
    dem_nodata = src.nodata if src.nodata is not None else -9999

# Exclude NoData (and NaN) from ALL statistics, so Min/Max are real elevations
# rather than the NoData sentinel.
dem_valid = dem[dem != dem_nodata]
if np.issubdtype(dem_valid.dtype, np.floating):
    dem_valid = dem_valid[~np.isnan(dem_valid)]

print("Elevation distribution (meters):")
if dem_valid.size == 0:
    print("- No valid pixels found")
else:
    print(f"- Min: {dem_valid.min()}")
    print(f"- 95th percentile: {np.percentile(dem_valid, 95)}")
    print(f"- Max: {dem_valid.max()}")

### 4. Boundary Overlap Check

In [None]:
def validate_boundary_overlap():
    """Check that key rasters cover the Tadla boundary and are mostly valid.

    For each raster, two checks run on band 1:
      * the centroid of the first boundary feature lies inside the raster bounds
        (the boundary is reprojected to the raster CRS when both CRSs are known);
      * at least 95% of the pixels differ from the raster's NoData value.

    Results are printed; the success line appears only when no problem was
    reported.
    """
    print("\n=== Boundary Overlap Check ===")
    tadla = gpd.read_file(tadla_common_path)

    layers = [
        Path(soil_processed_dir / "tadla_clay_10m.tif"),
        Path(land_use_processed_dir / "Sentinel2_Tadla_NDVI_2023.tif")
    ]

    issues = 0
    for path in layers:
        with rasterio.open(path) as src:
            bounds = src.bounds
            nodata = src.nodata
            raster_crs = src.crs
            data = src.read(1)

        # Compare coordinates in the raster's CRS so the bounds test is meaningful
        boundary = tadla
        if tadla.crs is not None and raster_crs is not None:
            boundary = tadla.to_crs(raster_crs)

        # Positional access: works whatever the GeoDataFrame index labels are
        sample_point = boundary.geometry.centroid.iloc[0]
        x, y = sample_point.x, sample_point.y

        # Both axes must be inside; the original `not a and b` parsed as
        # `(not a) and b` and missed most out-of-bounds cases.
        inside = (bounds.left <= x <= bounds.right) and (bounds.bottom <= y <= bounds.top)
        if not inside:
            print(f"❌ {path.name}: Does not contain Tadla centroid")
            issues += 1

        # Check if >95% of pixels are valid
        if nodata is None:
            valid_count = data.size
        elif np.isnan(nodata):
            valid_count = np.count_nonzero(~np.isnan(data))
        else:
            valid_count = np.count_nonzero(data != nodata)
        valid_pct = valid_count / data.size * 100
        if valid_pct < 95:
            print(f"⚠️ {path.name}: Only {valid_pct:.1f}% valid pixels")
            issues += 1

    if issues == 0:
        print("✅ Boundary overlap validated")
    else:
        print(f"❌ Boundary overlap check found {issues} issue(s)")

In [None]:
# Run the centroid-in-bounds and valid-pixel checks for the clay and 2023 NDVI rasters.
validate_boundary_overlap()

### 1. Verify ERA5 Data Integrity

##### Check Raw ERA5 Values (Before Processing)

In [None]:
# `rxr` (rioxarray) is already imported in the notebook's first cell.

# Load band index 11 (the 12th band) of the 2017 ERA5 raster.
# NOTE(review): the previous comment said "December 2023", but the file opened
# here is the 2017 one. The same hard-coded path is also the output target of
# the sign-flip cell further down, so on a fresh run this reads whatever that
# cell wrote last time — confirm this is the intended file.
era5_raw = rxr.open_rasterio("D:/ERA5/ERA5_2017_reproj.tif").isel(band=11)

print("=== ERA5 Metadata ===")
print(f"CRS: {era5_raw.rio.crs}")  # Should be EPSG:26191
print(f"NoData: {era5_raw.rio.nodata}")  # Should be -9999
print(f"Shape: {era5_raw.rio.shape}")  # Expected: (height, width) matching your study area

# Exclude NoData pixels from the statistics; otherwise the sentinel value
# would be reported as the minimum and hide the real data range.
era5_nodata = era5_raw.rio.nodata
era5_valid = era5_raw if era5_nodata is None else era5_raw.where(era5_raw != era5_nodata)
print(f"Min: {era5_valid.min().item()}, Max: {era5_valid.max().item()}")  # Expected: ~0–300 mm/month


# Check time metadata (if available)
if "time" in era5_raw.coords:
    print("Time coordinates:")
    print(era5_raw.time.values)  # Should show monthly/daily timestamps
else:
    print("No time coordinate found. Bands may represent arbitrary time steps.")

### 2. Reprocess ERA5 Data

#### Step 1: Convert Units

In [None]:
# Open the complete multi-band ERA5 raster (all bands, no masking)
era5 = rxr.open_rasterio("D:/ERA5/ERA5_2017_reproj.tif")

# Dataset-level properties, printed as one aligned "label value" line each
print("=== Global Metadata ===")
for label, value in (
    ("CRS:             ", era5.rio.crs),
    ("Resolution:      ", era5.rio.resolution()),
    ("NoData value:    ", era5.rio.nodata),
    ("Shape (b, h, w): ", era5.shape),
    ("Band count:      ", era5.rio.count),
):
    print(f"{label}{value}")

In [None]:
# Overall value range across every band (NoData pixels are not excluded here)
era5_min, era5_max = era5.min().item(), era5.max().item()
print(era5_min, era5_max)

: 

In [None]:
# `rxr` (rioxarray) is already imported in the notebook's first cell.

# Open with Dask chunking (one band at a time, and 1024×1024 pixel tiles).
# masked=True turns the file's NoData pixels into NaN.
era5 = rxr.open_rasterio(
    weather_processed_dir / "ERA5_Annual/ERA5_2017_reproj.tif",
    masked=True,
    chunks={'band': 1, 'x': 1024, 'y': 1024}
)

# Lazy flip of sign (NaN pixels stay NaN)
era5_corrected = era5 * -1

# Because the data is masked (NaN = NoData), the NoData value has to go into
# the *encoding* (encoded=True): only then are NaN pixels written out as -9999
# and the output tagged with NoData = -9999. Without it the NaNs are written
# as-is under a -9999 tag.
era5_corrected.rio.write_nodata(-9999, encoded=True, inplace=True)

# Export using windowed writes (never loads full array into memory).
# NOTE(review): the destination is a hard-coded absolute path that earlier
# cells read from; if it ever pointed at the same file as the input above,
# re-running this cell would flip the sign again — confirm the two differ.
era5_corrected.rio.to_raster(
    "D:/ERA5/ERA5_2017_reproj.tif",
    driver="GTiff",
    tiled=True,
    compress="DEFLATE",
    windowed=True
)


#### Step 2: Convert Units and Flip Signs

### 3. ERA5 Boundary Overlap Check

In [None]:
# geopandas (`gpd`) is already imported in the notebook's first cell;
# geometry_mask is not, so it is imported here.
from rasterio.mask import geometry_mask

# Load Tadla boundary and bring it into the CRS of the ERA5 band loaded earlier
tadla = gpd.read_file(tadla_common_path).to_crs(era5_raw.rio.crs)

# Rasterise the boundary onto the ERA5 grid: True = pixel inside the boundary
# (invert=True). The result is named `tadla_mask` rather than `mask` so that it
# does not overwrite the `mask` function imported from rasterio.mask.
tadla_mask = geometry_mask(
    geometries=tadla.geometry,
    transform=era5_raw.rio.transform(),
    invert=True,
    out_shape=era5_raw.rio.shape
)

# Check overlap: at least one ERA5 pixel must fall inside the boundary
if not np.any(tadla_mask):
    print("❌ ERA5 data does NOT overlap with Tadla boundary!")
else:
    print("✅ ERA5 overlaps with Tadla boundary")

# Plot the mask
fig, ax = plt.subplots()
ax.imshow(tadla_mask, cmap="gray")
ax.set_title("Tadla Boundary Mask on ERA5 Grid")
plt.show()

In [None]:
import dask
from dask.distributed import Client

# Worker memory-management thresholds, applied globally before the client starts
# (presumably fractions of each worker's memory limit — see the dask docs).
_worker_memory_cfg = {
    "distributed.worker.memory.target": 0.6,
    "distributed.worker.memory.spill":  0.7,
    "distributed.worker.memory.pause":  0.8,
}
dask.config.set(_worker_memory_cfg)

# Start a local client with the requested memory limit
client = Client(memory_limit="15GB")