# Step 1: Defining the Study Area

In [None]:
import os
import yaml
from pathlib import Path
import geopandas as gpd

import matplotlib.pyplot as plt
from matplotlib.patches import Patch

import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.mask import mask

import requests
import numpy as np

from dotenv import load_dotenv
from osgeo import gdal
import ee

import xarray as xr
import rioxarray

In [None]:


# -----------------------------------------------------------------------------
# Load config.yml
# -----------------------------------------------------------------------------

# The notebook is expected to run from "Scripts/Phase1_Data_Preprocessing",
# so the project root is two levels above the working directory.
current_dir = Path(os.getcwd())
project_root = current_dir.parent.parent

# Explicit encoding so the config is parsed identically on every platform.
with open(project_root / "config.yml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)


# -----------------------------------------------------------------------------
# Construct paths
# -----------------------------------------------------------------------------

# Raw data paths (all relative to the configured raw-data root)
raw_data_dir = project_root / config["paths"]["raw_data"]
soil_raw_dir = raw_data_dir / "GIS/Soil"
morocco_path = raw_data_dir / config["paths"]["morocco_path"]
tadla_plain_path = raw_data_dir / config["paths"]["tadla_plain_raw"]
tadla_plain_boundary_path = raw_data_dir / config["paths"]["tadla_plain_boundary_raw"]
soil_raw_path = raw_data_dir / config["paths"]["soil_raw"]
dem_raw_path = raw_data_dir / config["paths"]["dem_raw"]
chirps_raw_path = raw_data_dir / config["paths"]["chirps_raw"]
era5_raw_path = raw_data_dir / config["paths"]["era5_raw"]
wv0010_raw_path = raw_data_dir / config["paths"]["wv0010_raw"]
ndvi_path = raw_data_dir / config["paths"]["ndvi_raw"]


# Processed data paths (all relative to the configured processed-data root)
processed_data_dir = project_root / config["paths"]["processed_data"]
soil_processed_dir = processed_data_dir / "GIS/Soil"
output_dir = processed_data_dir / "GIS/Study_Area_Boundary"
output_path = output_dir / "Tadla_plain_common.shp"
tadla_common_path = processed_data_dir / config["paths"]["tadla_boundary_processed"]
soil_processed_path = processed_data_dir / config["paths"]["soil_processed"]
dem_processed_path = processed_data_dir / config["paths"]["dem_processed"]
slope_path = processed_data_dir / "GIS/Topography/tadla_slope.tif"
aspect_path = processed_data_dir / "GIS/Topography/tadla_aspect.tif"
chirps_processed_path = processed_data_dir / config["paths"]["chirps_processed"]
era5_processed_path = processed_data_dir / config["paths"]["era5_processed"]
wv0010_processed_path = processed_data_dir / config["paths"]["wv0010_processed"]


# Harmonized data paths
# NOTE(review): unlike the paths above, these two are used exactly as written
# in config.yml (not joined to project_root) — confirm that is intended.
harmonized_dir = Path(config["paths"]["harmonized_data"])
weather_processed_dir = processed_data_dir / "Weather"
chirps_output_dir = Path(config["paths"]["chirps_dir"])

output_path_dataset = harmonized_dir / "tadla_spatiotemporal_dataset.nc"


# Ensure every directory this notebook writes into exists, so no later cell
# fails with "No such file or directory" on a fresh checkout.
for _required_dir in (
    harmonized_dir,
    output_dir,
    soil_processed_dir,
    soil_processed_path.parent,
    wv0010_processed_path.parent,
    dem_processed_path.parent,
    slope_path.parent,
    chirps_processed_path.parent,
    era5_processed_path.parent,
):
    _required_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Load Morocco boundary into a GeoDataFrame
morocco = gpd.read_file(morocco_path)

# Check the first few rows to see province names
# (last expression of the cell, so the notebook displays the table)
morocco.head()

In [None]:
# Show the CRS the Morocco layer was loaded with
print(morocco.crs)

In [None]:
# Reprojected copy of the Morocco layer in EPSG:26191 (Merchich)
morocco_merchiche = morocco.to_crs(epsg=26191)

In [None]:
# Quick visual check of the reprojected Morocco layer
morocco_merchiche.plot()

In [None]:
# Load Tadla Plain shapefile
tadla_plain_polygon = gpd.read_file(tadla_plain_path)

# Check the data
print(tadla_plain_polygon)  # Prints the GeoDataFrame (attributes and geometry)


In [None]:
# Quick visual check of the raw Tadla Plain layer
tadla_plain_polygon.plot()  # Plot the geometry

In [None]:
# Area of each feature in the layer's *current* CRS units; the "m²" label is
# only correct if that CRS is metric.
# NOTE(review): this interpolates the whole Series, not a single number; the
# following cell reprojects to EPSG:26191 before measuring.
print(f"Study area size: {tadla_plain_polygon.geometry.area} m²")

In [None]:
# Measure the study area after reprojecting to Merchich (EPSG:26191).
tadla_merchiche = tadla_plain_polygon.to_crs(epsg=26191)

# Per-feature areas in the projected CRS, plus the same values in hectares
# (1 ha = 10,000 m²).
area_m2 = tadla_merchiche.geometry.area
area_ha = area_m2 / 10000

# Report the feature labelled 0 in both units.
print(f"Study area size: {area_m2[0]:.2f} m²")
print(f"Study area size: {area_ha[0]:.2f} hectares")


In [None]:
# Quick visual check of the reprojected Tadla layer
tadla_merchiche.plot()

In [None]:
# Load the cleaned boundary shapefile
Tadla_plain_boundary = gpd.read_file(tadla_plain_boundary_path)
# Check the current CRS (the next cell reprojects if it is not EPSG:26191)
print(Tadla_plain_boundary.crs)

In [None]:
# Convert to Merchich CRS if needed, so the boundary shares a CRS with
# tadla_merchiche before the two layers are overlaid.
if Tadla_plain_boundary.crs != "EPSG:26191":
    Tadla_plain_boundary = Tadla_plain_boundary.to_crs(epsg=26191)


In [None]:
# Quick visual check of the boundary layer
Tadla_plain_boundary.plot()

In [None]:
# Assume these are already loaded and in the same CRS (EPSG:26191)
# tadla_merchiche: layer read from tadla_plain_path, reprojected to Merchich
# Tadla_plain_boundary: layer read from tadla_plain_boundary_path
# (presumably the digitized Tadla plain, which may be slightly off — confirm)

# Compute the common (intersecting) area between the two layers
tadla_plain = gpd.overlay(Tadla_plain_boundary, tadla_merchiche, how='intersection')

# Save the resulting common area shapefile for further analysis
tadla_plain.to_file(output_path)

In [None]:


# Draw the three layers on one axes so the overlap is visible.
fig, ax = plt.subplots(figsize=(8, 8))
tadla_merchiche.plot(ax=ax, linewidth=2, edgecolor="red", facecolor="none")
Tadla_plain_boundary.plot(ax=ax, alpha=0.5, edgecolor="blue", facecolor="blue")
tadla_plain.plot(ax=ax, alpha=0.5, edgecolor="black", facecolor="green")

# GeoDataFrame.plot does not register legend entries here, so the legend is
# built by hand from a label -> colour mapping.
legend_labels = {
    "Full Admin Boundary": "red",
    "Digitized Tadla Plain": "blue",
    "Common Area": "green"
}
patches = []
for label, color in legend_labels.items():
    patches.append(Patch(color=color, label=label))
ax.legend(handles=patches)

ax.set_title("Common Area between Tadla Plain and Full Admin Boundary")
plt.show()

In [None]:
# Same overlay as before, with the national outline added for context.
fig, ax = plt.subplots(figsize=(8, 8))
tadla_merchiche.plot(ax=ax, linewidth=2, edgecolor="red", facecolor="none")
Tadla_plain_boundary.plot(ax=ax, alpha=0.5, edgecolor="blue", facecolor="blue")
tadla_plain.plot(ax=ax, alpha=0.5, edgecolor="black", facecolor="green")
morocco_merchiche.plot(ax=ax, linewidth=1, edgecolor="brown", facecolor="none")

# Hand-built legend: one coloured patch per layer.
legend_labels = {
    "Full Admin Boundary": "red",
    "Digitized Tadla Plain": "blue",
    "Common Area": "green",
    "Morocco": "brown"
}
patches = []
for label, color in legend_labels.items():
    patches.append(Patch(color=color, label=label))
ax.legend(handles=patches)

ax.set_title("Common Area between Tadla Plain and Full Admin Boundary of Morocco")
plt.show()

In [None]:
# Total area of the common region, measured in EPSG:26191.
tadla_plain = tadla_plain.to_crs(epsg=26191)  # Ensure projection
area_plain_m2 = tadla_plain.geometry.area.sum()

# Total area of the full layer, measured in the same CRS.
tadla_merchiche = tadla_merchiche.to_crs(epsg=26191)
area_full_m2 = tadla_merchiche.geometry.area.sum()

print(f"Tadla Plain area: {area_plain_m2:.2f} m²")
print(f"Full Admin Boundary area: {area_full_m2:.2f} m²")


In [None]:


def reproject_raster(input_path, output_path, target_crs, resampling=None):
    """Reproject every band of a raster file into ``target_crs``.

    The output grid (transform, width, height) is derived with
    ``calculate_default_transform`` from the source extent; all other
    metadata is copied from the source.

    Args:
        input_path: Path of the raster to read.
        output_path: Path of the raster to write (overwritten if present).
        target_crs: Destination CRS, in any form rasterio accepts.
        resampling: Optional ``rasterio.warp.Resampling`` method. Defaults to
            nearest neighbour, which is what ``reproject`` uses when no
            method is given.
    """
    if resampling is None:
        resampling = Resampling.nearest

    with rasterio.open(input_path) as src:
        transform, width, height = calculate_default_transform(
            src.crs, target_crs, src.width, src.height, *src.bounds
        )
        metadata = src.meta.copy()
        metadata.update({
            "crs": target_crs,
            "transform": transform,
            "width": width,
            "height": height
        })

        with rasterio.open(output_path, "w", **metadata) as dest:
            # The copied metadata keeps the source band count, so every band
            # must be warped; otherwise bands 2..N stay empty in the output.
            for band_index in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, band_index),
                    destination=rasterio.band(dest, band_index),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=transform,
                    dst_crs=target_crs,
                    resampling=resampling,
                )

# Step 2: Downloading Soil Data (SoilGrids)

In [None]:

# Read the processed Tadla boundary and put it in Merchich (EPSG:26191).
tadla = gpd.read_file(tadla_common_path).to_crs("EPSG:26191")

# total_bounds is ordered (minx, miny, maxx, maxy) in the layer's CRS.
minx, miny, maxx, maxy = tadla.total_bounds

print(f"X: {minx}, {maxx}")  # Easting bounds
print(f"Y: {miny}, {maxy}")  # Northing bounds

### 1. Defining Parameters

In [None]:
# Bounding box of Tadla Plain in EPSG:26191 (from your URL)
# NOTE: these hard-coded values replace the minx/miny/maxx/maxy computed from
# the boundary in the previous cell.
minx, maxx = 339200, 459750  # X (Easting)
miny, maxy = 164400, 241200  # Y (Northing)

# Soil layers and their COVERAGEIDs (adjust if needed)
# Keys are also used as the map-file name and output file name when downloading.
layers = {
    "clay": "clay_0-5cm_mean",
    "silt": "silt_0-5cm_mean",
    "sand": "sand_0-5cm_mean",
    "ocd": "ocd_0-5cm_mean",    # Organic carbon density
    "wv0010": "wv0010_0-5cm_mean"     # Volumetric water content at 10 kPa suction
}

### 2. Python Script to Download All Layers

In [None]:

# Download one GeoTIFF per soil property from the ISRIC WCS endpoint.
os.makedirs(soil_raw_dir, exist_ok=True)

for param, coverage_id in layers.items():
    url = (
        f"https://maps.isric.org/mapserv?map=/map/{param}.map&"
        f"SERVICE=WCS&"
        f"VERSION=2.0.1&"
        f"REQUEST=GetCoverage&"
        f"COVERAGEID={coverage_id}&"
        f"FORMAT=GEOTIFF_INT16&"  # Or GEOTIFF_FLOAT32 for raw values
        f"SUBSET=X({minx},{maxx})&"
        f"SUBSET=Y({miny},{maxy})&"
        f"SUBSETTINGCRS=http://www.opengis.net/def/crs/EPSG/0/26191&"
        f"OUTPUTCRS=http://www.opengis.net/def/crs/EPSG/0/26191"
    )
    print(url)

    # A timeout keeps a stalled connection from hanging the notebook, and a
    # network error on one layer must not abort the remaining downloads.
    try:
        response = requests.get(url, timeout=120)
    except requests.RequestException as exc:
        print(f"Failed to download {param}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to download {param}: HTTP {response.status_code}")
        continue

    # Guard against saving a service error document under a .tif name.
    content_type = response.headers.get("Content-Type", "").lower()
    if "xml" in content_type or "html" in content_type:
        print(f"Failed to download {param}: server returned {content_type}")
        continue

    # Use a dedicated name so the notebook-level `output_path` (the study-area
    # shapefile path) is not overwritten by this loop.
    soil_tif_path = os.path.join(soil_raw_dir, f"tadla_{param}.tif")
    with open(soil_tif_path, "wb") as f:
        f.write(response.content)
    print(f"Downloaded {param} to {soil_tif_path}")


### 3. Post-Processing

1. Unit Conversion:

    SoilGrids stores integer values as actual value × 10. 
    
    For example:
        A pixel value of 150 = 15% clay.

In [None]:
# Process soil data: convert the raw integer clay layer to percent (value / 10)
# and save it as a single-band float32 raster.
soil_processed_path.parent.mkdir(parents=True, exist_ok=True)

with rasterio.open(soil_raw_path) as src:
    raw_clay = src.read(1)
    clay = raw_clay.astype(np.float32) / 10  # Convert to %

    # Nodata cells must keep the sentinel value declared in the profile,
    # otherwise they would be scaled into bogus "valid" percentages.
    if src.nodata is not None:
        clay[raw_clay == src.nodata] = src.nodata

    profile = src.profile.copy()
    profile.update(dtype=rasterio.float32, count=1)

    with rasterio.open(soil_processed_path, "w", **profile) as dst:
        # Write the converted array (previously the unconverted source
        # values were written and the conversion was thrown away).
        dst.write(clay, 1)


In [None]:
# Rescale the raw water-content layer and resample it onto a 10 m grid that
# shares the source raster's top-left corner and CRS.
with rasterio.open(wv0010_raw_path) as src:
    data = src.read(1) / 10  # Convert to %
    profile = src.profile.copy()

    # Number of 10 m cells needed to span the source extent (truncated).
    x_res, y_res = src.res[0], src.res[1]
    new_width = int(src.width * (x_res / 10))
    new_height = int(src.height * (abs(y_res) / 10))

    # North-up 10 m grid anchored at the source's upper-left corner.
    target_transform = rasterio.Affine(10, 0, src.bounds.left, 0, -10, src.bounds.top)

    # Output buffer; reproject fills it in place.
    resampled_data = np.empty((new_height, new_width), dtype=np.float32)

    # Same CRS on both sides: this is a pure resampling (bilinear).
    reproject(
        source=data,
        destination=resampled_data,
        src_crs=src.crs,
        src_transform=src.transform,
        dst_crs=src.crs,
        dst_transform=target_transform,
        resampling=Resampling.bilinear
    )

# Describe the new grid in the output metadata.
profile.update(
    transform=target_transform,
    width=new_width,
    height=new_height,
    dtype="float32",
)

# Save resampled data
with rasterio.open(wv0010_processed_path, "w", **profile) as dst:
    dst.write(resampled_data, 1)

print(f"Resampled water content saved to: {wv0010_processed_path}")

In [None]:
# Sanity-check the resampled water-content raster.
with rasterio.open(wv0010_processed_path) as src:
    print(src.res)  # Should output (10.0, 10.0)
    band_values = src.read(1)  # read once, reuse for both statistics
    print(band_values.min(), band_values.max())  # e.g., 0.0–38.9%

2. Validate CRS Alignment

    Confirm all downloaded rasters are in EPSG:26191

In [None]:

# Print the CRS recorded in the raw soil raster's metadata
with rasterio.open(soil_raw_path) as src:
    print(src.crs)  # Should print "EPSG:26191"

# Step 3: DEM Data

1. Download DEM Data

    We’ll use the ALOS World 3D 30 m DSM (AW3D30, `JAXA/ALOS/AW3D30/V3_2`) from Google Earth Engine (GEE); the export below resamples it to a 12.5 m grid.

In [None]:

# Load Tadla boundary (ensure this path is correct)
tadla_shp_path = tadla_common_path
tadla = gpd.read_file(tadla_shp_path)

# Check current CRS
print(f"Current CRS: {tadla.crs}")  # Should be EPSG:26191 (Merchich)

# Reproject to WGS84 (EPSG:4326)
tadla_wgs84 = tadla.to_crs("EPSG:4326")

# Save the WGS84 copy next to the original instead of overwriting it: other
# cells read `tadla_common_path` expecting EPSG:26191, and repeated in-place
# round trips between CRSs would also accumulate coordinate drift.
tadla_wgs84_path = tadla_shp_path.with_name(
    f"{tadla_shp_path.stem}_wgs84{tadla_shp_path.suffix}"
)
tadla_wgs84.to_file(tadla_wgs84_path)

In [None]:


# Pull settings from the .env file into the process environment.
load_dotenv()

# The Google Cloud project used for Earth Engine must be configured.
project_id = os.getenv('GCP_PROJECT')
if not project_id:
    raise ValueError("The environment variable GCP_PROJECT is not set.")
print("Using project ID:", project_id)

# Authenticate, then bind the session to that project.
import ee
ee.Authenticate()
ee.Initialize(project=project_id)

In [None]:
# Test authentication: fetch one image's "title" property from the server
print(ee.Image("NASA/NASADEM_HGT/001").get("title").getInfo())

In [None]:
# Hard-coded rectangle in EPSG:4326, used as clip/export region by the
# Earth Engine cells that follow.
bbox = ee.Geometry.Rectangle(
    [-7.5, 32.0, -5.5, 32.8],  # minx, miny, maxx, maxy
    proj="EPSG:4326"
)

In [None]:
# Load ALOS DEM ImageCollection and select the 'DSM' band
dem_collection = ee.ImageCollection("JAXA/ALOS/AW3D30/V3_2").select('DSM')

# Mosaic the collection into a single image (combines all tiles over Tadla)
# and keep only the part inside the bounding box
dem = dem_collection.mosaic().clip(bbox)


In [None]:
# Export to Google Drive (runs asynchronously as an Earth Engine batch task)
# NOTE(review): scale=12.5 is finer than the "AW3D30" collection name
# suggests — confirm the native resolution before relying on 12.5 m detail.
task = ee.batch.Export.image.toDrive(
    image=dem,
    description='Tadla_DEM',
    folder='Tadla_Project',
    scale=12.5,
    region=bbox,
    crs="EPSG:26191",  # Merchich CRS
    fileFormat='GeoTIFF',
    maxPixels=1e13
)
task.start()

# Monitor task progress
print(f"Task ID: {task.id}")
print("Check progress at: https://code.earthengine.google.com/tasks")

2. Preprocess DEM
    
    Once downloaded, move the DEM to Data/Raw/GIS/Topography/ and preprocess it:

In [None]:


# Load boundary and ensure it's in the same CRS as the DEM (EPSG:26191)
tadla = gpd.read_file(tadla_common_path)
if tadla.crs != "EPSG:26191":
    tadla = tadla.to_crs("EPSG:26191")

# Load DEM and check its CRS
with rasterio.open(dem_raw_path) as src:
    dem_crs = src.crs
    print(f"DEM CRS: {dem_crs}")  # Should be EPSG:26191

    # Reproject boundary if the DEM turns out to be in a different CRS
    if tadla.crs != dem_crs:
        tadla = tadla.to_crs(dem_crs)

    # Validate that the DEM covers the whole boundary
    dem_bounds = src.bounds
    tadla_bounds = tadla.total_bounds
    print(f"DEM Bounds: {dem_bounds}")
    print(f"Tadla Bounds: {tadla_bounds}")

    # total_bounds is (minx, miny, maxx, maxy). The test requires the boundary
    # extent to lie strictly inside the DEM extent, so the message says
    # "not fully covered" rather than "do not overlap".
    boundary_inside_dem = (
        tadla_bounds[0] > dem_bounds.left
        and tadla_bounds[2] < dem_bounds.right
        and tadla_bounds[1] > dem_bounds.bottom
        and tadla_bounds[3] < dem_bounds.top
    )
    if not boundary_inside_dem:
        raise ValueError(
            "Boundary is not fully covered by the DEM. Check their geographic extents!"
        )

    # Clip DEM to the boundary geometry and describe the cropped grid
    tadla_dem, transform = mask(src, tadla.geometry, crop=True)
    meta = src.meta.copy()
    meta.update({
        "height": tadla_dem.shape[1],
        "width": tadla_dem.shape[2],
        "transform": transform,
        "crs": dem_crs
    })

# Save clipped DEM (make sure the target directory exists first)
dem_processed_path.parent.mkdir(parents=True, exist_ok=True)
with rasterio.open(dem_processed_path, "w", **meta) as dest:
    dest.write(tadla_dem)
print(f"Clipped DEM saved to: {dem_processed_path}")

In [None]:
# Confirm both input files are present on disk
print(f"DEM exists: {dem_raw_path.exists()}")
print(f"Boundary exists: {tadla_common_path.exists()}")

### 3. Derive Slope and Aspect

1. Calculating Slope and Aspect Using GDAL

In [None]:
# Enable GDAL exceptions so failures raise instead of returning None
gdal.UseExceptions()

# Ensure output directories exist
os.makedirs(slope_path.parent, exist_ok=True)

# Calculate slope (in degrees) from the clipped DEM
slope = gdal.DEMProcessing(
    destName=str(slope_path),
    srcDS=str(dem_processed_path),
    processing="slope",
    format="GTiff",
    slopeFormat="degree"
)
# GDAL only guarantees the file is written once the dataset is flushed or
# closed; flush now so later cells read complete data while `slope` stays usable.
slope.FlushCache()

# Calculate aspect from the same DEM
aspect = gdal.DEMProcessing(
    destName=str(aspect_path),
    srcDS=str(dem_processed_path),
    processing="aspect",
    format="GTiff"
)
aspect.FlushCache()

print(f"Slope saved to: {slope_path}")
print(f"Aspect saved to: {aspect_path}")

# Step 4: Weather Data

1. Download CHIRPS Rainfall Data

In [None]:
# Start (or refresh) the Earth Engine session.
ee.Authenticate()
ee.Initialize(project=project_id)

# Region of interest: a lon/lat rectangle around the Tadla Plain.
# (Rebinds the name `tadla`, which earlier cells used for a GeoDataFrame.)
tadla = ee.Geometry.Rectangle([-7.5, 32.0, -5.5, 32.8])

# Daily CHIRPS rainfall restricted to the region and study period.
chirps = ee.ImageCollection("UCSB-CHG/CHIRPS/DAILY")
chirps_tadla = chirps.filterBounds(tadla).filterDate('2010-01-01', '2023-12-31')

# Export the per-pixel mean of the daily images to Google Drive.
chirps_export_options = dict(
    image=chirps_tadla.mean(),
    description='CHIRPS_Tadla',
    folder='Tadla_Project',
    scale=5000,
    region=tadla,
    crs="EPSG:26191",
)
task = ee.batch.Export.image.toDrive(**chirps_export_options)
task.start()

2. Preprocess CHIRPS Rainfall Data

In [None]:
# Make sure the boundary file on disk is stored in EPSG:26191 (Merchich):
# read it in whatever CRS it currently has, reproject, and write it back.
boundary_path = tadla_common_path
tadla = gpd.read_file(boundary_path)
tadla_merc = tadla.to_crs("EPSG:26191")
tadla_merc.to_file(boundary_path)  # Overwrite or save to a new file

# Re-read from disk to confirm what was actually stored.
tadla = gpd.read_file(tadla_common_path)
print(f"Boundary CRS: {tadla.crs}")  # Should be EPSG:26191 (Merchich)

In [None]:
# Load Tadla boundary
tadla = gpd.read_file(tadla_common_path)

# Load CHIRPS data and clip it to the boundary
with rasterio.open(chirps_raw_path) as src:
    # mask() compares raw coordinates, so the boundary must be expressed in
    # the raster's CRS before clipping.
    if src.crs is not None and tadla.crs != src.crs:
        tadla = tadla.to_crs(src.crs)

    chirps_data, transform = mask(src, tadla.geometry, crop=True)
    meta = src.meta.copy()
    # Label the output with the raster's real CRS; fall back to the expected
    # export CRS only when the file carries none.
    chirps_crs = src.crs or "EPSG:26191"

# Update metadata to describe the cropped grid
meta.update({
    "height": chirps_data.shape[1],
    "width": chirps_data.shape[2],
    "transform": transform,
    "crs": chirps_crs
})

# Save clipped rainfall data (make sure the target directory exists first)
chirps_processed_path.parent.mkdir(parents=True, exist_ok=True)
with rasterio.open(chirps_processed_path, "w", **meta) as dest:
    dest.write(chirps_data)

print(f"Clipped CHIRPS data saved to: {chirps_processed_path}")

3. Download ERA5 Temperature/ET Data

In [None]:

# Authenticate and initialize Earth Engine
ee.Authenticate()
ee.Initialize(project=project_id)


# Define the bounding box for the Tadla Plain (in WGS84)
bbox = ee.Geometry.Rectangle(
   [-7.5, 32.0, -5.5, 32.8],  # minx, miny, maxx, maxy
    proj="EPSG:4326"
)

# Load the ERA5 DAILY ImageCollection for a chosen period and filter by location
era5_daily = ee.ImageCollection("ECMWF/ERA5/DAILY") \
    .filterDate("2010-01-01", "2023-12-31") \
    .filterBounds(bbox)

# Get the first image to inspect available bands
first_img = ee.Image(era5_daily.first())
band_names = first_img.bandNames().getInfo()
print("Available bands in ERA5 DAILY dataset:", band_names)

# Choose the appropriate band.
# Prefer an evaporation band when the dataset offers one.
if "evaporation" in band_names:
    selected_band = "evaporation"
elif "total_evaporation" in band_names:
    selected_band = "total_evaporation"
else:
    # If neither exists, default to the first available band (or update with the correct one).
    # NOTE(review): in that case the export below is still named
    # "ERA5_Evaporation_Export" although it holds a different variable.
    selected_band = band_names[0]

print("Selected band:", selected_band)

# Reduce the daily time series to one image with the per-pixel temporal mean,
# then clip to the area. mosaic() is a spatial composite that keeps only the
# most recent image wherever images overlap, so on a daily stack it would
# return a single day instead of a summary of the period.
era5_selected = era5_daily.select(selected_band).mean().clip(bbox)

# Export the resulting image to your Google Drive
task = ee.batch.Export.image.toDrive(
    image=era5_selected,
    description='ERA5_Evaporation_Export',
    folder='ERA5_Exports',  # Your Google Drive folder name
    scale=1000,             # Adjust scale (resolution) as needed
    region=bbox,
    crs="EPSG:26191",        # Exporting in Merchich (EPSG:26191); change if needed
    fileFormat='GeoTIFF',
    maxPixels=1e13
)
task.start()

print("Export task started with ID:", task.id)
print("Monitor the task at: https://code.earthengine.google.com/tasks")


In [None]:
# Boundary polygon(s) used as the clipping mask.
tadla = gpd.read_file(tadla_common_path)

# Crop the ERA5 raster to the boundary and record the cropped grid.
with rasterio.open(era5_raw_path) as src:
    era5_data, transform = mask(src, tadla.geometry, crop=True)
    # mask() returns (bands, rows, cols); the last two give the new grid size.
    clipped_rows, clipped_cols = era5_data.shape[1], era5_data.shape[2]
    meta = src.meta.copy()
    meta.update(
        height=clipped_rows,
        width=clipped_cols,
        transform=transform,
        crs=src.crs,  # Ensure this matches the boundary CRS (EPSG:26191)
    )

# Write the cropped raster.
with rasterio.open(era5_processed_path, "w", **meta) as dest:
    dest.write(era5_data)

print(f"Clipped ERA5 data saved to: {era5_processed_path}")

# Step 5: Land Use/Crop Maps (Sentinel-2)

1. Authenticate & Initialize Earth Engine

In [None]:
# Authenticate (this will open a browser window for authentication if needed)
ee.Authenticate()

# Initialize with your project settings (make sure you have set your GCP_PROJECT in your environment variables)
# `project_id` is read from the environment in the earlier authentication cell.
ee.Initialize(project=project_id)

print("Earth Engine has been initialized successfully!")


In [None]:
tadla = gpd.read_file(tadla_common_path)

# Keep the working copy in Merchich (EPSG:26191), as the other cells expect
if tadla.crs != "EPSG:26191":
    tadla = tadla.to_crs("EPSG:26191")

# Earth Engine reads bare coordinate lists as lon/lat degrees, so the ring
# handed to it must come from a WGS84 (EPSG:4326) copy, not from the
# projected layer whose coordinates are metres.
tadla_wgs84 = tadla.to_crs("EPSG:4326")

# Convert to GEE geometry: exterior ring of the first feature as [lon, lat]
# pairs (any Z value is dropped).
# NOTE(review): assumes the first feature is a single Polygon — a
# MultiPolygon has no `.exterior`.
tadla_geom = ee.Geometry.Polygon(
    [[x, y] for x, y, *_ in tadla_wgs84.geometry[0].exterior.coords]
)

In [None]:
# Hard-coded lon/lat rectangle (WGS84) used as the Tadla region; this rebinds
# `tadla_geom` and does not read the boundary shapefile.
tadla_geom = ee.Geometry.Polygon(
    [[-7.5, 32.0], [-5.5, 32.0], [-5.5, 32.8], [-7.5, 32.8]], 
    proj="EPSG:4326", 
    geodesic=False
)

# Reproject to EPSG:26191 (Merchich)
# (rebinds `tadla_merc`, which an earlier cell used for a GeoDataFrame)
tadla_merc = tadla_geom.transform('EPSG:26191', 1)  # 1-meter error margin

In [None]:
def get_annual_composite(year):
    """Build a median-composite NDVI image for one year's April–September window.

    Sentinel-2 surface-reflectance scenes intersecting ``tadla_merc`` with
    less than 10 % cloudy pixels are reduced to a per-pixel median, NDVI is
    computed from the red and NIR bands, and the result is returned as a
    single-band image named ``NDVI`` reprojected to EPSG:26191 at 10 m.

    Args:
        year: Calendar year inserted into the start and end dates.

    Returns:
        The NDVI ``ee.Image``.
    """
    season_start = f'{year}-04-01'
    season_end = f'{year}-09-30'

    def _pick_bands(img):
        # Keep Red (B4), NIR (B8) and the scene classification layer under
        # friendlier names, forcing the two reflectance bands to float.
        renamed = img.select(['B4', 'B8', 'SCL'], ['red', 'nir', 'scl'])
        return renamed.cast({'red': 'float', 'nir': 'float'})

    # Low-cloud scenes over the study area within the window
    scenes = (
        ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")
        .filterBounds(tadla_merc)
        .filterDate(season_start, season_end)
        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 10))
    )

    # Per-pixel median across all retained scenes
    composite = scenes.map(_pick_bands).median()

    # NDVI = (NIR - Red) / (NIR + Red)
    band_lookup = {
        'nir': composite.select('nir'),
        'red': composite.select('red'),
    }
    ndvi = composite.expression('(nir - red) / (nir + red)', band_lookup)
    ndvi = ndvi.rename('NDVI')

    return ndvi.reproject(crs='EPSG:26191', scale=10)

In [None]:
def export_ndvi(year):
    """Start a Google Drive export of the NDVI composite for ``year``.

    The image comes from ``get_annual_composite`` and is exported as a
    GeoTIFF over ``tadla_merc`` at 10 m in EPSG:26191. The task runs
    asynchronously; only its ID is printed here.
    """
    export_options = {
        "image": get_annual_composite(year),
        "description": f'Sentinel2_Tadla_NDVI_{year}',
        "folder": 'Tadla_Project',
        "scale": 10,
        "region": tadla_merc,
        "crs": 'EPSG:26191',
        "maxPixels": 1e13,
        "fileFormat": 'GeoTIFF',
    }
    task = ee.batch.Export.image.toDrive(**export_options)
    task.start()
    print(f"Exported {year}: Task ID {task.id}")

# Run for all years (2017–2023); range() excludes its upper bound, hence 2024.
# Each call only starts an export task — it does not wait for completion.
for year in range(2017, 2024):
    export_ndvi(year)

# Step 6 – Data Harmonization

#### 1. Resample Coarse Data (Soil/DEM) to 10m Resolution

    Goal: Resample low-resolution datasets (e.g., SoilGrids at 250m) to match NDVI’s 10m grid.
    Why: To align all datasets spatially for ML training.

In [None]:
# Capture the reference grid (transform, CRS, size) from the NDVI raster;
# every other layer is resampled onto this grid further down.
with rasterio.open(ndvi_path) as ndvi_ref:
    ndvi_transform, ndvi_crs = ndvi_ref.transform, ndvi_ref.crs
    ndvi_width, ndvi_height = ndvi_ref.width, ndvi_ref.height

print(f"Reference CRS: {ndvi_crs}")
# Element 0 of the affine transform is the pixel width.
print(f"Reference resolution: {ndvi_transform[0]}m")

In [None]:
from rasterio.warp import reproject, Resampling
import numpy as np

# Output file for the clay layer on the NDVI grid
soil_clay_10m = soil_processed_dir / "tadla_clay_10m.tif"

# Warp band 1 of the processed clay raster onto the NDVI grid.
with rasterio.open(soil_processed_path) as src:
    # Destination buffer shaped like the NDVI raster; filled in place.
    dst_data = np.zeros((ndvi_height, ndvi_width), dtype=np.float32)

    reproject(
        source=rasterio.band(src, 1),
        destination=dst_data,
        src_crs=src.crs,
        src_transform=src.transform,
        dst_crs=ndvi_crs,
        dst_transform=ndvi_transform,
        resampling=Resampling.bilinear  # Use "nearest" for categorical data
    )

    # Single-band float32 GeoTIFF on the NDVI grid, keeping the source nodata.
    clay_10m_profile = {
        "driver": "GTiff",
        "height": ndvi_height,
        "width": ndvi_width,
        "count": 1,
        "dtype": np.float32,
        "crs": ndvi_crs,
        "transform": ndvi_transform,
        "nodata": src.nodata,
    }
    with rasterio.open(soil_clay_10m, "w", **clay_10m_profile) as dst:
        dst.write(dst_data, 1)

In [None]:
# Verify the resampled clay raster's grid
with rasterio.open(soil_clay_10m) as clay_resampled:
    print(f"Resampled clay resolution: {clay_resampled.res}")  # Should be (10.0, 10.0)
    print(f"CRS: {clay_resampled.crs}")  # Should match NDVI (EPSG:26191)

In [None]:
def resample_soil_layer(raw_path, processed_path, ndvi_transform, ndvi_crs, ndvi_height, ndvi_width,
                        resampling=None):
    """Warp band 1 of a raster onto the NDVI reference grid and save it.

    Args:
        raw_path: Raster to read (``str`` or ``Path``).
        processed_path: Output GeoTIFF path (``str`` or ``Path``); parent
            directories are created if missing.
        ndvi_transform: Affine transform of the reference grid.
        ndvi_crs: CRS of the reference grid.
        ndvi_height: Number of rows in the reference grid.
        ndvi_width: Number of columns in the reference grid.
        resampling: Optional ``rasterio.warp.Resampling`` method. Defaults to
            bilinear, suited to continuous variables; pass
            ``Resampling.nearest`` for categorical layers.
    """
    if resampling is None:
        resampling = Resampling.bilinear

    raw_path = Path(raw_path)
    processed_path = Path(processed_path)
    processed_path.parent.mkdir(parents=True, exist_ok=True)

    with rasterio.open(raw_path) as src:
        # Destination buffer shaped like the reference grid; filled in place.
        dst_data = np.zeros((ndvi_height, ndvi_width), dtype=np.float32)
        reproject(
            source=rasterio.band(src, 1),
            destination=dst_data,
            src_transform=src.transform,
            dst_transform=ndvi_transform,
            src_crs=src.crs,
            dst_crs=ndvi_crs,
            resampling=resampling
        )
        # Single-band float32 GeoTIFF on the reference grid, keeping the
        # source's nodata value.
        with rasterio.open(
            processed_path,
            "w",
            driver="GTiff",
            height=ndvi_height,
            width=ndvi_width,
            count=1,
            dtype=np.float32,
            crs=ndvi_crs,
            transform=ndvi_transform,
            nodata=src.nodata
        ) as dst:
            dst.write(dst_data, 1)
    print(f"Resampled {raw_path.name} → {processed_path}")

# Remaining soil layers: parameter name -> pre-processed file name
soil_params = {
    "silt": "tadla_silt_processed.tif",
    "sand": "tadla_sand_processed.tif",
    "ocd": "tadla_ocd_processed.tif",  # Organic carbon density
    "wv0010": "tadla_wv0010_processed.tif"   # Volumetric water content at 10 kPa suction
}

# Put each layer on the NDVI grid, writing tadla_<param>_10m.tif beside it.
ndvi_grid = (ndvi_transform, ndvi_crs, ndvi_height, ndvi_width)
for param in soil_params:
    filename = soil_params[param]
    pre_processed_path = soil_processed_dir / filename
    processed_path_10m = soil_processed_dir / f"tadla_{param}_10m.tif"
    resample_soil_layer(pre_processed_path, processed_path_10m, *ndvi_grid)

In [None]:
# Verify resolution and CRS of each resampled soil layer
for param in ["silt", "sand", "ocd", "wv0010"]:
    with rasterio.open(soil_processed_dir / f"tadla_{param}_10m.tif") as src:
        print(f"{param} resolution: {src.res}, CRS: {src.crs}")

Resample DEM (12.5m → 10m)

In [None]:
# Input and output locations, taken from the config.
# NOTE(review): the output is the same configured "dem_processed" file that
# the DEM clipping step writes — confirm overwriting it here is intended.
dem_raw = raw_data_dir / config["paths"]["dem_raw"]
dem_processed = processed_data_dir / config["paths"]["dem_processed"]

# Warp band 1 of the raw DEM onto the NDVI grid.
with rasterio.open(dem_raw) as src:
    # Destination buffer shaped like the NDVI raster; filled in place.
    dst_data = np.zeros((ndvi_height, ndvi_width), dtype=np.float32)
    reproject(
        source=rasterio.band(src, 1),
        destination=dst_data,
        src_crs=src.crs,
        src_transform=src.transform,
        dst_crs=ndvi_crs,
        dst_transform=ndvi_transform,
        resampling=Resampling.bilinear  # Use cubic for elevation
    )

    # Single-band float32 GeoTIFF on the NDVI grid, keeping the source nodata.
    dem_10m_profile = {
        "driver": "GTiff",
        "height": ndvi_height,
        "width": ndvi_width,
        "count": 1,
        "dtype": np.float32,
        "crs": ndvi_crs,
        "transform": ndvi_transform,
        "nodata": src.nodata,
    }
    with rasterio.open(dem_processed, "w", **dem_10m_profile) as dst:
        dst.write(dst_data, 1)

In [None]:
# Verify the resampled DEM's grid
with rasterio.open(dem_processed) as src:
    print(f"DEM resolution: {src.res}, CRS: {src.crs}")  # Should be (10.0, 10.0), EPSG:26191

#### 2. Align All Rasters to NDVI Grid
    
    Goal: Ensure all datasets (soil, DEM, weather) are spatially aligned with the NDVI grid.
    Why: Even minor misalignments will break ML models.

1. Align Weather Data (CHIRPS Rainfall and ERA5 Evaporation)