# SOILGRIDS checks
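We've been having quite a lot of problems with this data set. This notebook loops over all the SOILGRIDS subset maps (GeoTIFFs) of every basin and compares their raster sizes, no-data pixel counts and spatial extents, to ensure we have the same grid for every part of the data set. Basins with mismatching raster sizes are flagged for regridding, files that are likely missing data are listed in a CSV file, and if the extents within a basin do not match, all files of that basin are rewritten with the extent that best covers the basin shapefile.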

In [19]:
from collections import defaultdict
import geopandas as gpd
import glob
import numpy as np
import os
import pandas as pd
import pyproj
import rasterio
from rasterio.enums import Resampling
from rasterio.transform import from_bounds
from shapely.geometry import box
from shapely.ops import transform
import shutil
import sys
from pathlib import Path

sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### File locations

In [2]:
# Data location: root folder of the data set that this notebook reads and checks.
# NOTE(review): absolute, cluster-specific path; adjust before running elsewhere.
cs_main_folder = Path("/scratch/gwf/gwf_cmt/wknoben/camels-spat-upload")

In [3]:
# Destination location: root folder that receives the outputs of this notebook
# (the missing-tiles CSV and the GeoTIFFs with corrected extents).
# NOTE(review): absolute, cluster-specific path; adjust before running elsewhere.
cs_update_folder = Path("/scratch/gwf/gwf_cmt/wknoben/camels-spat-upload-updates")

In [4]:
# Specify the folder structure.
# Basin folders are looked up as: <cs_main_folder>/<part1>/<one of parts2>/<part3>/<basin id>
att_path_part1 = "geospatial"
att_path_parts2 = ["headwater", "macro-scale", "meso-scale"]
att_path_part3 = "soilgrids"

### Functions

In [5]:
def get_raster_size_and_nodata_counts(file_paths):
    '''Collects raster size, no-data pixel count and extent for a list of GeoTIFFs.

    Only the first band of each file is read.

    Parameters:
        file_paths: iterable of paths that rasterio can open

    Returns a 7-tuple:
        file_paths:     the input, returned unchanged
        file_sizes:     {file_path: (height, width)}
        size_counts:    {(height, width): number of files with this size}
        file_nodata:    {file_path: number of no-data pixels in band 1}
        no_data_counts: {number of no-data pixels: number of files with this count}
        file_extents:   {file_path: (left, bottom, right, top)}, rounded to 6 decimals
        extent_counts:  {extent: number of files with this extent}
    '''
    size_counts = defaultdict(int)
    no_data_counts = defaultdict(int)
    extent_counts = defaultdict(int)

    file_sizes = {}
    file_nodata = {}
    file_extents = {}

    for file_path in file_paths:
        with rasterio.open(file_path) as src:
            size = (src.height, src.width)
            data = src.read(1)       # read first band
            nodata_val = src.nodata  # get the NoData value (may be None or NaN)
            bounds = src.bounds

        # Raster size
        size_counts[size] += 1
        file_sizes[file_path] = size

        # No-data counts
        if nodata_val is None:
            # No no-data value declared for this file: nothing can be counted
            nodata_count = 0
        elif isinstance(nodata_val, (float, np.floating)) and np.isnan(nodata_val):
            # NaN never compares equal to itself, so `data == nodata_val` would always give 0
            nodata_count = int(np.isnan(data).sum())
        else:
            nodata_count = int(np.sum(data == nodata_val))
        no_data_counts[nodata_count] += 1
        file_nodata[file_path] = nodata_count

        # Bounds; rounding to avoid float precision issues, plain floats keep the keys readable
        extent = tuple(float(v) for v in np.round(bounds, 6))
        extent_counts[extent] += 1
        file_extents[file_path] = extent

    return (
        file_paths,
        file_sizes,
        size_counts,
        file_nodata,
        no_data_counts,
        file_extents,
        extent_counts
    )

In [6]:
def check_if_regrid_needed(file_paths,file_sizes,size_counts,disp_individual=True):
    '''Checks if all raster sizes are identical and flags the basin if not.

    Parameters:
        file_paths:      unused; kept so the call signature mirrors the other checks
        file_sizes:      {file_path: (height, width)}
        size_counts:     {(height, width): number of files with this size}
        disp_individual: if True, also print the size of every individual file
                         when a mismatch is found

    Returns:
        True if more (or fewer) than one distinct raster size was found, else False.
    '''
    if len(size_counts) == 1:
        print("All files have the same raster size:", next(iter(size_counts)))
        return False

    print("Files have different raster sizes:")
    for size, count in size_counts.items():
        print(f"Size {size} occurs {count} times")

    if disp_individual:
        print("\nIndividual file sizes:")
        for file_path, size in file_sizes.items():
            # str(): Path objects do not accept an alignment format spec such as '<150'
            print(f"File {str(file_path):<150} : {size}")

    return True

In [7]:
def check_if_extent_update_needed(file_paths, file_extents, extent_counts):
    '''Checks if all raster extents are identical and flags the basin if not.

    Parameters:
        file_paths:    unused; kept so the call signature mirrors the other checks
        file_extents:  unused; kept so the call signature mirrors the other checks
        extent_counts: {(xmin, ymin, xmax, ymax): number of files with this extent}

    Returns:
        True if more (or fewer) than one distinct extent was found, else False.
    '''
    if len(extent_counts) == 1:
        print("All files have the same extent:", next(iter(extent_counts)))
        return False

    print("Files have different extents:")
    for extent, count in extent_counts.items():
        # These are extents, not raster sizes: label them as such
        print(f"Extent {extent} occurs {count} times")

    return True

In [8]:
def check_if_data_likely_missing(file_paths, file_nodata, nodata_counts,
                                 low_count_thresh=0.10, distance_thresh=100):
    '''Checks if we're likely missing data, based on the no-data pixel counts per file.

    A no-data count is treated as an outlier when it is both rare (occurs in less than
    `low_count_thresh` of the files) and distant (differs by more than `distance_thresh`
    pixels from the median of the distinct counts).

    Parameters:
        file_paths:       unused; kept so the call signature mirrors the other checks
        file_nodata:      {file_path: number of no-data pixels}
        nodata_counts:    {number of no-data pixels: number of files with this count}
        low_count_thresh: fraction of files below which a count is "rare"
                          (10 % by default; better flag too many than too few)
        distance_thresh:  distance from the median, in pixels, above which a count is
                          "distant" (again, better flag too many than too few)

    Returns:
        (incomplete_files, missing_data): a bool telling if any outlier was found, and the
        list of file paths whose no-data count is an outlier.
    '''
    incomplete_files = False
    missing_data = []

    if len(nodata_counts) == 1:
        print("All files have identical numbers of missing data", next(iter(nodata_counts)))
        return incomplete_files, missing_data

    # More than one count: whether that matters depends on the numbers, because we can get
    # slight differences due to regridding.
    # Use the function argument here, not a same-purpose variable from the notebook namespace.
    df = pd.DataFrame(list(nodata_counts.items()), columns=["value","count"])

    # missing data should occur relatively rarely, i.e. low counts (number of files)
    total = df["count"].sum()
    df["is_rare"] = df["count"] / total < low_count_thresh

    # missing data should have a relatively large number of missing values
    # NOTE(review): this is the median of the DISTINCT counts, not weighted by the number of
    # files. With e.g. one common count and two rare but similar high counts, the median lands
    # on a rare count and neither rare count is flagged. Consider a file-weighted median.
    median_val = df["value"].median()
    df["abs_dist_from_median"] = np.abs(df["value"] - median_val)
    df["is_distant"] = df["abs_dist_from_median"] > distance_thresh

    # combine both
    df["is_outlier"] = df["is_rare"] & df["is_distant"]
    incomplete_files = bool(df["is_outlier"].any())

    # report back
    if incomplete_files:
        print("Outliers identified for missing data counts:")
    else:
        print("No outliers identified for missing data counts:")
    print(" No data pixels | number of files")
    for k,v in nodata_counts.items():
        print(f"  {k:>13} | {v}")

    if incomplete_files:
        outlier_set = set(df.loc[df["is_outlier"], "value"])  # counts identified as outlier
        print(" Likely incomplete files:")
        for file_path, count in file_nodata.items():
            if count in outlier_set:
                print(f"  - {file_path}")
                missing_data.append(file_path)

    return incomplete_files, missing_data

In [24]:
def compare_geotiff_pixel_data(list1, list2, tolerance=1e-6):
    """
    Compares raster pixel data (first band) between two lists of GeoTIFFs by filename match.

    Parameters:
    - list1, list2: lists of Path objects (or strings) pointing to GeoTIFF files
    - tolerance: threshold for floating point differences

    Returns:
    - flagged: True if any file pair needs a closer look (differing pixels, a shape
      mismatch, or no counterpart in list2)
    - mismatches: list of (filename, description) tuples for the flagged files
    """
    mismatches = []
    file_dict2 = {Path(p).name: p for p in list2}

    flagged = False # set the initial "needs check" flag to False for all the files we have for this basin
    for file1 in list1:
        name = Path(file1).name
        file2 = file_dict2.get(name)

        if file2 is None:
            # A file we cannot compare is not a verified file: flag it
            print(f"No matching file found for {name}")
            mismatches.append((name, "no matching file"))
            flagged = True
            continue

        with rasterio.open(file1) as src1, rasterio.open(file2) as src2:
            data1 = src1.read(1)
            data2 = src2.read(1)

        if data1.shape != data2.shape:
            print(f"Shape mismatch in {name}: {data1.shape} vs {data2.shape}")
            mismatches.append((name, "shape mismatch"))
            flagged = True
            continue

        # Compare in float64: subtracting integer rasters wraps around silently
        # (e.g. int16: 0 - (-32768) -> -32768, whose abs() is still negative).
        values1 = data1.astype(np.float64)
        values2 = data2.astype(np.float64)
        with np.errstate(invalid="ignore"):
            diff_mask = np.abs(values1 - values2) > tolerance
        # NaN on one side only is a difference too; `nan > tolerance` alone would hide it
        diff_mask |= np.isnan(values1) != np.isnan(values2)
        num_diff = int(np.sum(diff_mask))

        if num_diff > 0:
            print(f"{name}: {num_diff} differing pixels (>{tolerance})")
            mismatches.append((name, f"{num_diff} differing pixels"))
            flagged = True

    return flagged,mismatches

In [20]:
def apply_extent_to_geotiffs(file_paths, target_extent, output_dir=None):
    """
    Updates the extent of each GeoTIFF in `file_paths` to match `target_extent`.

    The pixel values and the raster width/height are kept as they are; only the
    geotransform is replaced by one derived from `target_extent`.

    Parameters:
        file_paths: list of paths (Path objects or strings) to GeoTIFFs
        target_extent: tuple (xmin, ymin, xmax, ymax)
        output_dir: optional Path where to save modified files (created if needed).
                    If None, overwrites in-place.
    """
    xmin, ymin, xmax, ymax = target_extent

    if output_dir:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    for file_path in file_paths:
        file_path = Path(file_path)

        # Read everything we need and close the source before writing, so that the
        # in-place mode never opens a file for writing while it is still open for reading
        with rasterio.open(file_path) as src:
            data = src.read()
            meta = src.meta.copy()
            width, height = src.width, src.height

        # Calculate new transform based on target extent and original width/height
        # (named `new_transform` so it does not shadow the imported shapely `transform`)
        new_transform = from_bounds(xmin, ymin, xmax, ymax, width, height)
        meta.update({
            "transform": new_transform,
            "height": height,
            "width": width
        })

        # Determine output path
        if output_dir:
            output_path = output_dir / file_path.name
        else:
            output_path = file_path

        # Save new file with updated transform
        with rasterio.open(output_path, "w", **meta) as dst:
            dst.write(data)

In [21]:
def find_best_extent_covering_shapefile(extent_counts, shapefile_path, target_crs=None):
    """
    Selects the extent that covers the largest fraction of the shapefile's area.

    extent_counts: dict with extents as (xmin, ymin, xmax, ymax); only the keys are used
    shapefile_path: path to the shapefile to compare against
    target_crs: CRS to reproject both the shapefile and extents to (e.g. 'EPSG:4326' or projected CRS for area)

    Returns the best extent (one of the keys of `extent_counts`), or None if no extent
    overlaps the shapefile at all. On ties the first extent encountered wins.

    NOTE(review): the extents are assumed to be expressed in the shapefile's own CRS;
    they are only reprojected from that CRS when `target_crs` is given - confirm.
    """
    # Read and reproject shapefile
    gdf = gpd.read_file(shapefile_path)
    shape_crs = gdf.crs

    if target_crs:
        gdf = gdf.to_crs(target_crs)

    # Calculate total area of shapefile
    # (`unary_union` is deprecated in newer geopandas in favour of `union_all()`)
    shapefile_geom = gdf.union_all() if hasattr(gdf, "union_all") else gdf.unary_union
    shapefile_area = shapefile_geom.area

    # Build the reprojection once; it does not depend on the individual extent
    project = None
    if target_crs and target_crs != shape_crs:
        project = pyproj.Transformer.from_crs(shape_crs, target_crs, always_xy=True).transform

    best_fraction = 0
    best_extent = None

    for extent in extent_counts:
        xmin, ymin, xmax, ymax = extent
        extent_geom = box(xmin, ymin, xmax, ymax)

        # Reproject extent geometry if CRS differs
        if project is not None:
            extent_geom = transform(project, extent_geom)

        # Compute intersection and fractional overlap
        overlap_area = shapefile_geom.intersection(extent_geom).area
        fraction_covered = overlap_area / shapefile_area if shapefile_area > 0 else 0

        # Strict '>' keeps the first extent when several cover the same fraction
        if fraction_covered > best_fraction:
            best_fraction = fraction_covered
            best_extent = extent

    print(f"Best:  {best_extent}: {best_fraction:.2%} overlap")
    return best_extent

In [22]:
def check_geotiff_overlap_with_shapefile(geotiff_paths, shapefile_path, overlap_threshold=0.99):
    """
    Checks how much each GeoTIFF overlaps with a shapefile (e.g., a basin), assuming all are in the same CRS.

    Parameters:
    - geotiff_paths: list of paths (Path objects or strings) to GeoTIFF files
    - shapefile_path: path to the shapefile to compare against
    - overlap_threshold: minimum acceptable fractional overlap (0–1)

    Prints warnings for any GeoTIFF with insufficient coverage.

    Returns:
    - True if at least one GeoTIFF covers less than `overlap_threshold` of the basin, else False
    """
    # Read shapefile and combine geometries
    # (`unary_union` is deprecated in newer geopandas in favour of `union_all()`)
    gdf = gpd.read_file(shapefile_path)
    basin_geom = gdf.union_all() if hasattr(gdf, "union_all") else gdf.unary_union
    basin_area = basin_geom.area

    flagged = False # flag to see if we need to check this later

    for path in geotiff_paths:
        with rasterio.open(path) as src:
            raster_bounds = box(*src.bounds)

        # Compute intersection and coverage fraction
        overlap_area = basin_geom.intersection(raster_bounds).area
        fraction_covered = overlap_area / basin_area if basin_area > 0 else 0

        if fraction_covered < overlap_threshold:
            print(f"Warning: {Path(path).name} covers < {overlap_threshold:.2%} ({fraction_covered:.2%}) of basin")
            flagged = True

    return flagged

### Check that we actually have all files we expect

In [10]:
# Variables expected with every statistic, and variables expected with the mean only
variables_all_stats = ['bdod','cfvo','clay','sand','silt','soc']
variables_mean_only = ['conductivity','porosity']
depths = ['0-5cm','5-15cm','15-30cm','30-60cm','60-100cm','100-200cm']
stats = ['mean','Q0.5','Q0.05','Q0.95','uncertainty']

# Per depth layer: one file per (variable, statistic) pair, plus one per mean-only variable
total_files_expected = len(depths) * (
    len(variables_all_stats) * len(stats) + len(variables_mean_only)
)
print(f"Expecting {total_files_expected} files per basin")

Expecting 192 files per basin


In [15]:
# Walk over every scale folder and make sure each basin has the expected number of GeoTIFFs
for att_path_part2 in att_path_parts2:

    # 1. Locate the basin folders for this scale
    att_middle = "/".join((att_path_part1, att_path_part2, att_path_part3))
    basin_paths = [entry for entry in (cs_main_folder / att_middle).iterdir() if entry.is_dir()]

    # 2. Visit every basin folder
    for basin_path in basin_paths:

        # 2.1. The basin ID is the folder name, e.g. USA_08164300
        basin_id = basin_path.name

        # 2.2. Collect the GeoTIFFs of this basin
        basin_files = [tif for tif in basin_path.glob('*.tif')]

        # 2.3. Stop at the first basin that does not have the expected number of files
        n_found = len(basin_files)
        assert n_found == total_files_expected, f'Files missing for basin {basin_id}'

# Only reached if no basin failed the check above
print('All expected files found.')

All expected files found.


We have all the expected files. Now check if they have the right size.

### Check that all files for a given basin have the same grid size, number of missing values and extent
It's easier to do these checks together, because that way we only have to open each file once.

In [11]:
# Collect the basins (or files) that fail one of the three checks
regridding_needed, missing_files, extent_fix_needed = [], [], []

for att_path_part2 in att_path_parts2:

    # 1. Locate the basin folders for this scale
    att_middle = "/".join((att_path_part1, att_path_part2, att_path_part3))
    basin_paths = [entry for entry in (cs_main_folder / att_middle).iterdir() if entry.is_dir()]

    # 2. Visit every basin folder
    for basin_path in basin_paths:

        # 2.1. The basin ID is the folder name, e.g. USA_08164300
        basin_id = basin_path.name
        print(f"\nChecking basin {basin_id}")

        # 2.2. Collect the GeoTIFFs of this basin
        basin_files = list(basin_path.glob('*.tif'))

        # 2.3. Open every file once and gather size, no-data and extent info
        (fp, fp_sizes, sizes_summary,
         fp_no_data, no_data_summary,
         fp_extents, extent_summary) = get_raster_size_and_nodata_counts(basin_files)

        # 2.4. Raster sizes differ within the basin -> remember the basin for regridding
        needs_regrid = check_if_regrid_needed(fp, fp_sizes, sizes_summary)
        if needs_regrid:
            regridding_needed.append(basin_path)

        # 2.5. Some files look incomplete -> remember those files (one list per basin)
        likely_missing_flag, likely_missing_files = check_if_data_likely_missing(fp, fp_no_data, no_data_summary)
        if likely_missing_flag:
            missing_files.append(likely_missing_files)

        # 2.6. Extents differ within the basin -> remember the basin for the extent fix
        needs_extent_fix = check_if_extent_update_needed(fp, fp_extents, extent_summary)
        if needs_extent_fix:
            extent_fix_needed.append(basin_path)


Checking basin CAN_08GB013
All files have the same raster size: (60, 107)
No outliers identified for missing data counts:
 No data pixels | number of files
           2930 | 132
           2935 | 60
Files have different extents:
Size (-123.552808, 49.773736, -123.309256, 49.910307) occurs 132 times
Size (-123.552225, 49.773188, -123.308671, 49.909761) occurs 60 times

Checking basin CAN_02HB022
All files have the same raster size: (83, 52)
No outliers identified for missing data counts:
 No data pixels | number of files
           2931 | 132
           2922 | 60
Files have different extents:
Size (-80.100374, 43.377647, -79.982013, 43.56657) occurs 132 times
Size (-80.099344, 43.376992, -79.980981, 43.565919) occurs 60 times

Checking basin USA_06746095
All files have the same raster size: (16, 23)
No outliers identified for missing data counts:
 No data pixels | number of files
            106 | 132
            115 | 60
All files have the same extent: (-105.910073, 40.514202, -105.85

In [12]:
# `regridding_needed` holds one entry per basin folder, so report basins rather than files
print(f"Regridding needed for {len(regridding_needed)} basins.")

Regridding needed for 0 files.


In [13]:
# Count the basin folders the check loop visited instead of hard-coding the total
n_basins_total = sum(
    1
    for scale in att_path_parts2
    for entry in (cs_main_folder / att_path_part1 / scale / att_path_part3).iterdir()
    if entry.is_dir()
)
extent_fix_share = len(extent_fix_needed) / n_basins_total if n_basins_total else float("nan")

# `extent_fix_needed` holds one entry per basin folder, so report basins rather than files
print(f"Extent updates needed for {len(extent_fix_needed)} basins ({extent_fix_share:.2f} of total basins).")

Extent updates needed for 485 files (0.34 of total basins).


### Missing values summary

In [91]:
# `missing_files` holds one list of flagged files per basin; merge them into one flat list
file_list = []
for flagged_in_basin in missing_files:
    file_list.extend(flagged_in_basin)

In [120]:
# Number of individual files flagged as likely incomplete by the automated check
len(file_list)

244

In [121]:
# Add the handful of cases where the automated routine seems to have missed something.
# For each basin we list the no-data counts that were identified by hand as belonging
# to incomplete files.
basins = ['USA_13337000', 'USA_06464500', 'USA_08190000', 'CAN_04FA001', 'USA_06453600']
scales = ['macro-scale',  'macro-scale',  'macro-scale',  'macro-scale', 'macro-scale']
outlier_sets = [set([101316,98950]), 
                set([72757,72746]), 
                set([24326,24289]), 
                set([262402]), 
                set([149216,149227])]

extra_files = []
for basin, scale, outlier_set in zip(basins, scales, outlier_sets):
    basin_path = cs_main_folder / "geospatial" / scale / "soilgrids" / basin
    basin_files = list(basin_path.glob('*.tif'))
    # The helper returns seven values (the extent info comes last); unpack all of them
    fp, fp_sizes, sizes_summary, fp_no_data, no_data_summary, fp_extents, extent_summary = \
            get_raster_size_and_nodata_counts(basin_files)
    print(f"{basin}: {no_data_summary}")
    files = [k for k, v in fp_no_data.items() if v in outlier_set]
    extra_files.extend(files)

# Only add files that are not listed yet: this keeps the cell safe to re-run and avoids
# duplicate rows for files that the automated check had already caught
already_listed = set(file_list)
for extra_file in extra_files:
    if extra_file not in already_listed:
        file_list.append(extra_file)
        already_listed.add(extra_file)
len(file_list)

USA_13337000: defaultdict(<class 'int'>, {98932: 129, 98924: 60, 101316: 1, 98950: 2})
USA_06464500: defaultdict(<class 'int'>, {44918: 190, 72757: 1, 72746: 1})
USA_08190000: defaultdict(<class 'int'>, {24121: 190, 24326: 1, 24289: 1})
CAN_04FA001: defaultdict(<class 'int'>, {262122: 131, 263540: 60, 262402: 1})
USA_06453600: defaultdict(<class 'int'>, {120097: 190, 149216: 1, 149227: 1})


254

In [123]:
# Split every flagged file name into its components, so the result is easy to
# store/upload to github. File names are parsed as <basin part 1>_<basin part 2>_<variable>_<depth>_<statistic>.<ext>
basins, variables, depths, statistics = [], [], [], []
for file in file_list:
    # Keep only the final path component and drop the extension
    stem = os.path.splitext(str(file).rpartition("/")[2])[0]
    parts = stem.split("_")

    basins.append("_".join((parts[0], parts[1])))  # basin ID spans the first two parts
    variables.append(parts[2])
    depths.append(parts[3])
    statistics.append(parts[4])

In [130]:
# Create a CSV file for saving: one row per flagged file, indexed and sorted by basin
df = (
    pd.DataFrame(data={"basin": basins,
                       "variable": variables,
                       "depth": depths,
                       "statistic": statistics})
    .set_index("basin")
    .sort_index()
)

# The destination folder is not created anywhere before this cell, so make sure it exists
cs_update_folder.mkdir(parents=True, exist_ok=True)
df.to_csv(cs_update_folder / "soilgrids_missing_tiles.csv")

### Extent fixes

In [25]:
# Rewrite the GeoTIFFs of every basin flagged in `extent_fix_needed` with one common
# extent, then re-run the consistency checks on the rewritten files
check_overlap = []
check_values = []
basins_to_fix = set(extent_fix_needed)  # set: constant-time membership test inside the loop
for att_path_part2 in att_path_parts2:

    # 1. Find the basin folders
    att_middle = f"{att_path_part1}/{att_path_part2}/{att_path_part3}"
    basin_paths = [f for f in (cs_main_folder / att_middle).iterdir() if f.is_dir()]

    # 2. Loop over the basin folders
    for basin_path in basin_paths:

        # 2.0. Check if this basin was tagged for fixes
        if basin_path not in basins_to_fix:
            continue

        # 2.1. Extract the basin ID
        basin = basin_path.name # just the final part, e.g. USA_08164300
        print(f"\nBasin {basin}")

        # 2.2. Find the files
        basin_files = list(basin_path.glob('*.tif'))

        # 2.3. Compare raster sizes for all files we have for this basin
        fp, fp_sizes, sizes_summary, fp_no_data, no_data_summary, fp_extents, extent_summary = \
            get_raster_size_and_nodata_counts(basin_files)
        assert len(sizes_summary) == 1, f"Basin {basin}: geotiff dimensions are not consistent before fix"

        # 2.4. Select the most appropriate raster extent based on the shapefile
        shp_path = cs_main_folder / 'shapefiles' / att_path_part2 / 'shapes-lumped' / basin / f"{basin}_lumped.shp"
        best_extent = find_best_extent_covering_shapefile(extent_summary, shp_path)
        # None means no candidate extent overlaps the basin; fail with a clear message here
        # rather than with an unpacking error inside apply_extent_to_geotiffs
        assert best_extent is not None, f"Basin {basin}: no raster extent overlaps the basin shapefile"

        # 2.5. Apply this extent to all files, to ensure consistency
        des_dir = cs_update_folder / att_path_part1 / att_path_part2 / att_path_part3 / basin
        des_dir.mkdir(exist_ok=True, parents=True)
        apply_extent_to_geotiffs(fp, best_extent, output_dir=des_dir)

        # 2.6. Run the checks again
        new_files = list(des_dir.glob('*.tif'))
        fp, fp_sizes, sizes_summary, fp_no_data, no_data_summary, fp_extents, extent_summary = \
            get_raster_size_and_nodata_counts(new_files)
        assert len(sizes_summary) == 1, f"Basin {basin}: geotiff dimensions are not consistent after fix"
        assert len(extent_summary) == 1, f"Basin {basin}: geotiff extents are not consistent after fix"

        # 2.7. Add another check to make sure we still overlap the basin
        if check_geotiff_overlap_with_shapefile(new_files, shp_path):
            check_overlap.append(basin)

        # 2.8. Add another check to make sure values are still the same
        val_flag, _ = compare_geotiff_pixel_data(basin_files,new_files)
        if val_flag:
            check_values.append(basin)


Basin CAN_08GB013
Best:  (-123.552808, 49.773736, -123.309256, 49.910307): 100.00% overlap

Basin CAN_02HB022
Best:  (-80.100374, 43.377647, -79.982013, 43.56657): 100.00% overlap

Basin USA_11481200
Best:  (-124.089411, 40.955088, -123.893657, 41.08028): 100.00% overlap

Basin USA_10343500
Best:  (-120.317729, 39.402702, -120.233509, 39.46416): 99.99% overlap

Basin CAN_02ZA002
Best:  (-58.783872, 47.973271, -58.654129, 48.114394): 100.00% overlap

Basin CAN_08HB025
Best:  (-125.325394, 49.652548, -125.068182, 49.757254): 99.99% overlap

Basin CAN_08ME023
Best:  (-123.732627, 50.76843, -123.445827, 50.886792): 100.00% overlap

Basin USA_02430615
Best:  (-88.366545, 34.369966, -88.300535, 34.442805): 99.98% overlap

Basin CAN_02ZM016
Best:  (-53.136649, 47.290414, -53.079744, 47.356424): 100.00% overlap

Basin CAN_08NM240
Best:  (-119.425454, 49.650271, -119.389034, 49.679862): 100.00% overlap

Basin CAN_10ND002
Best:  (-133.81794, 68.674969, -133.487889, 68.754637): 99.89% overlap

B

In [26]:
# Did we flag anything for further checking?
# `check_overlap` and `check_values` hold the IDs of basins that failed the post-fix checks.
# The "99%" in the message matches the default `overlap_threshold=0.99` of
# check_geotiff_overlap_with_shapefile; update the text if another threshold is passed.
print(f"Overlap for {len(check_overlap)} basins below 99%")
print(f"Differences in before and after data value found in {len(check_values)} basins")

Overlap for 0 basins below 99%
Differences in before and after data value found in 0 basins
