# SOILGRIDS checks
We've been having quite a lot of problems with this data set. This script loops over all the subset maps and compares their sizes, to ensure we have the same grid for every part of the data set. If there's a mismatch within a basin, the deviating files are regridded to the most common grid size in that basin.

In [1]:
from collections import defaultdict
import glob
import numpy as np
import os
import pandas as pd
import rasterio
from rasterio.enums import Resampling
import shutil
import sys
from pathlib import Path
sys.path.append(str(Path().absolute().parent))
import python_cs_functions as cs

### Config handling

In [2]:
# Location of the config file, relative to this notebook
config_file = '../0_config/config.txt'

# Root data folder; the basin and metadata folders below are joined onto it
data_path = cs.read_from_config(config_file, 'data_path')

# CAMELS-spat metadata: folder, metadata file name, and unusable-stations file name.
# NOTE(review): the metadata folder is read from the same 'cs_basin_path' key
# as the basin folder below - confirm this is intended.
cs_meta_path = cs.read_from_config(config_file, 'cs_basin_path')
cs_meta_name, cs_unusable_name = (
    cs.read_from_config(config_file, key)
    for key in ('cs_meta_name', 'cs_unusable_name')
)

# Basin folder, resolved against the root data folder
cs_basin_folder = cs.read_from_config(config_file, 'cs_basin_path')
basins_path = Path(data_path).joinpath(cs_basin_folder)

### Data loading

In [3]:
# CAMELS-spat metadata file. Station IDs are concatenated into basin folder
# names further down, so enforce reading them as strings to keep leading 0's
# (same treatment as the unusable-stations file below).
cs_meta_path = Path(data_path) / cs_meta_path
cs_meta = pd.read_csv(cs_meta_path / cs_meta_name, dtype={'Station_id': object})

# Open list of unusable stations; Enforce reading IDs as string to keep leading 0's
cs_unusable = pd.read_csv(cs_meta_path / cs_unusable_name, dtype={'Station_id': object})

### Check that we actually have all files we expect

In [4]:
# Sub-folders inside each basin's soilgrids 'raw' folder that are searched
# for GeoTIFF (*.tif) files
sub_folders = [
    'bdod',
    'cfvo',
    'clay',
    'sand',
    'silt',
    'soc',
]

In [5]:
# Check if we have all the files we expect: every basin must contain exactly
# 180 GeoTIFFs in total across its soilgrids sub-folders
for ix, row in cs_meta.iterrows():
    basin_id = row['Country'] + '_' + row['Station_id']
    soil_folder = basins_path / 'basin_data' / basin_id / 'geospatial' / 'soilgrids' / 'raw'

    # One flat list with the GeoTIFFs of all sub-folders
    tif_files = [
        tif_file
        for sub_folder in sub_folders
        for tif_file in glob.glob(str(soil_folder / sub_folder / '*.tif'))
    ]
    assert len(tif_files) == 180, f'Files missing for basin {basin_id}'

print('All expected files found.')

All expected files found.


We have all the expected files. Now check if they have the right size.

### Check that all files for a given basin have the same grid size and regrid if not

#### Functions we need for this

In [6]:
def get_raster_size(file_paths):
    '''Tallies the raster dimensions of a list of GeoTIFFs.

    Returns a tuple of three items:
      - the input list of paths, unchanged
      - a dict mapping each path to its (height, width)
      - a dict counting how often each (height, width) occurs
    '''
    file_sizes = {}
    size_counts = defaultdict(int)

    for tif_path in file_paths:
        # Only the dimensions are needed, no pixel data is read
        with rasterio.open(tif_path) as dataset:
            shape = (dataset.height, dataset.width)
        file_sizes[tif_path] = shape
        size_counts[shape] += 1

    return file_paths, file_sizes, size_counts

In [7]:
def check_if_regrid_needed(file_paths,file_sizes,size_counts,disp_individual=True):
    '''Reports whether the outcomes of get_raster_size() show more than one raster size.

    Prints a summary of the sizes found. If the sizes differ and
    disp_individual is True, the size of every individual file is printed too.

    Returns False if size_counts contains exactly one size, True otherwise.
    '''
    needs_regrid = len(size_counts) != 1

    # Single size: nothing to do
    if not needs_regrid:
        print("All files have the same raster size:", next(iter(size_counts)))
        return needs_regrid

    # Mismatch: show how often each size occurs
    print("Files have different raster sizes:")
    for raster_size, n_files in size_counts.items():
        print(f"Size {raster_size} occurs {n_files} times")

    # Optionally list every file with its size
    if disp_individual:
        print("\nIndividual file sizes:")
        for tif_path, raster_size in file_sizes.items():
            print(f"File {tif_path:<150} : {raster_size}")

    return needs_regrid

In [8]:
def get_most_common_profile(sizes,most_common):
    '''Returns the GeoTIFF profile of the first file whose size (H,W) equals the most common size.

    sizes maps file paths to (height, width); most_common is the (height, width)
    to look for. Returns None if no file has that size.
    '''
    matching_paths = [path for path, size in sizes.items() if size == most_common]
    if not matching_paths:
        return None

    with rasterio.open(matching_paths[0], 'r') as template:
        return template.profile

In [9]:
def get_most_common_nodata_mask(sizes,most_common):
    '''Returns the data of the first file whose size (H,W) equals the most common size, as a masked array.

    Cells equal to the file's nodata value are masked. sizes maps file paths
    to (height, width); most_common is the (height, width) to look for.
    Returns None if no file has that size.
    '''
    matching_paths = [path for path, size in sizes.items() if size == most_common]
    if not matching_paths:
        return None

    with rasterio.open(matching_paths[0], 'r') as template:
        values = template.read()
        return np.ma.masked_equal(values, template.nodata)

In [10]:
def extract_edges(arr):
    '''Returns the border rows and columns of a 3D array whose first dimension has size 1.

    The result is the tuple (top, bottom, left, right): the first row, last
    row, first column and last column, each still carrying the leading
    size-1 dimension.

    Raises ValueError if the first dimension does not have size 1.
    '''
    leading_size = arr.shape[0]
    if leading_size != 1:
        raise ValueError("The first dimension of the array assumed to have size 1.")

    # Rows are indexed on the second dimension, columns on the third
    first_row, last_row = arr[:, 0, :], arr[:, -1, :]
    first_col, last_col = arr[:, :, 0], arr[:, :, -1]

    return first_row, last_row, first_col, last_col

In [11]:
def update_geotiff_nodata_value(input_file):
    '''Sets the nodata value of an existing GeoTIFF file to the most common value found along the data's edges.

    The file is opened in update mode ('r+'); its nodata value is rewritten in
    place when the most common edge value differs from the current nodata value.

    Returns the tuple (likely_nodata, current_nodata): the most common edge
    value, and the nodata value the file had before any update.

    Raises ValueError (from extract_edges()) if the first dimension of the
    data read from the file does not have size 1.
    '''
    with rasterio.open(input_file, 'r+') as src:
        # Current values
        data = src.read()
        current_nodata = src.nodata

        # Collect the values on the edges, counting every border cell exactly
        # once: the top and bottom rows are taken in full (corners included),
        # the left and right columns without their first and last cell.
        top, bottom, left, right = extract_edges(data)
        _, n_rows, n_cols = data.shape
        edge_parts = [top.ravel(), left.ravel()[1:-1]]
        if n_rows > 1: # with a single row, bottom is the same row as top
            edge_parts.append(bottom.ravel())
        if n_cols > 1: # with a single column, right is the same column as left
            edge_parts.append(right.ravel()[1:-1])
        edge_values = np.concatenate(edge_parts)

        # The most frequent edge value is the most likely nodata value
        edge_uniques, edge_counts = np.unique(edge_values, return_counts=True)
        likely_nodata = edge_uniques[np.argmax(edge_counts)]

        # Compare and update
        if likely_nodata != current_nodata:
            print(f'Setting nodata to {likely_nodata} instead of {current_nodata} in {input_file}')
            src.nodata = likely_nodata

    return likely_nodata, current_nodata

In [12]:
def regrid_geotiff(input_file, output_file, new_height, new_width, nodata_template, profile_template):
    '''Resamples a GeoTIFF to a new raster size and writes the result as a GeoTIFF.

    input_file       : path of the GeoTIFF to regrid. Its nodata value may be
                       changed in place by update_geotiff_nodata_value().
    output_file      : path the regridded GeoTIFF is written to.
    new_height       : number of rows of the regridded raster.
    new_width        : number of columns of the regridded raster.
    nodata_template  : masked array whose mask marks the cells that must be
                       nodata in the output (assumes it has the same shape as
                       the regridded data - TODO confirm).
    profile_template : GeoTIFF profile; only its 'transform' entry is used.
    '''

    # In some of the files we had to obtain later the nodata value is
    # set correctly to -32768, but the actual nodata cells are set to
    # 0 (or maybe something else). Here we check the edges of the matrix
    # under the assumption that the edges should be mostly nodata values
    # and set the file's nodata value to whatever that is. 
    # This prevents issues with interpolation in the next step, where a
    # basin bordered by 0s will interpolate differently along its 
    # boundary than a basin bordered by nodata values will.    
    new_nodata,old_nodata = update_geotiff_nodata_value(input_file)
    
    # Resample the raster to the desired size
    with rasterio.open(input_file) as src:
        data = src.read(
            out_shape=(src.count, new_height, new_width),
            resampling=Resampling.bilinear
        )
        profile = src.profile

    # Update the profile with the new (correct) transform and nodata values.
    # The benefit of doing this over simply writing the new geotiff with the
    # template profile is that we keep things like datatype intact.
    profile['width'] = new_width
    profile['height'] = new_height
    profile['transform'] = profile_template['transform']
    
    # Apply the nodata mask. This ensures that the interpolated data has
    # the same internal and external nodata gaps as the other files have.
    # If the nodata value was changed above, the output file gets the nodata
    # value that matches its datatype.
    # NOTE(review): for any other datatype the nodata value read from the
    # (updated) input file is kept - confirm this is intended.
    if new_nodata != old_nodata:
        if profile['dtype'] == 'int16': # e.g. clay, sand, silt
            profile['nodata'] = -32768 # Found empirically by checking existing files
        elif profile['dtype'] == 'uint16': # e.g. bdod
            profile['nodata'] = 65535 # Found empirically by checking existing files
        
    data[nodata_template.mask] = profile['nodata'] # This is where we have known nodata

    # It may be that we replaced actual data values with nodata in the first
    # step (update_geotiff_nodata_value()). Here we check if there are any nodata
    # values in the regridded matrix that do not appear in our template nodata
    # mask, and set those cells back to their original value.
    # NOTE(review): the comparison uses profile['nodata'], which after the
    # datatype override above can differ from the value that was set on the
    # input file - confirm this still catches the intended cells.
    if old_nodata is not None:
        data[(data == profile['nodata']) & (~nodata_template.mask)] =  old_nodata
        
    # Write the resampled data to a new GeoTIFF file
    with rasterio.open(output_file, 'w', **profile) as dst:
        dst.write(data)

#### Processing

In [13]:
# Loop over all basins; for each basin compare the raster sizes of all its
# soilgrids GeoTIFFs and regrid the deviating files to the most common size
for ix,row in cs_meta.iterrows():

    # Get the basin ID
    basin_id = row['Country'] + '_' + row['Station_id']

    # Find the folder and files
    soil_folder = basins_path / 'basin_data' / basin_id / 'geospatial' / 'soilgrids' / 'raw'
    tif_files = sorted( # sorting doesn't matter, but cleaner when looking at logs
        tif_file
        for sub_folder in sub_folders
        for tif_file in glob.glob( str( soil_folder / sub_folder / '*.tif' ) )
    )

    # Loop over the files and extract the raster sizes
    paths,sizes,counts = get_raster_size(tif_files)

    # Nothing to do if all files share one size
    if not check_if_regrid_needed(paths,sizes,counts, disp_individual = False):
        print(f' - {basin_id} OK as is.')
        continue

    # Progress
    print(f' - {basin_id} regridding starting.')

    # Create a backup folder. If one already exists (e.g. left over from an
    # interrupted earlier run) keep it: it holds the files as they were before
    # any regridding, and copying onto it would raise FileExistsError.
    backup_folder = soil_folder.parent / 'raw_backup'
    if backup_folder.exists():
        print(f'   {basin_id} backup folder already exists, keeping it: {backup_folder}')
    else:
        shutil.copytree(soil_folder, backup_folder)

    # Get the most common size
    most_common = max(counts, key=counts.get) # (height, width)

    # Get the associated GeoTIFF profile
    template_profile = get_most_common_profile(sizes, most_common)

    # Get the nodata mask, to fix those geotiffs that don't have proper nodata values set
    template_masked_array = get_most_common_nodata_mask(sizes,most_common)

    # Loop over the files and regrid (in place) those that deviate
    for path,size in sizes.items():
        if size != most_common:
            regrid_geotiff(path, path, most_common[0], most_common[1], template_masked_array, template_profile)

    # progress
    print(f'   {basin_id} regridding complete.')

Files have different raster sizes:
Size (814, 1037) occurs 60 times
Size (815, 1038) occurs 120 times
 - CAN_01AD002 regridding starting.
   CAN_01AD002 regridding complete.
All files have the same raster size: (283, 227)
 - CAN_01AD003 OK as is.
All files have the same raster size: (274, 393)
 - CAN_01AE001 OK as is.
Files have different raster sizes:
Size (112, 175) occurs 60 times
Size (112, 176) occurs 120 times
 - CAN_01AF007 regridding starting.
   CAN_01AF007 regridding complete.
Files have different raster sizes:
Size (102, 70) occurs 60 times
Size (102, 69) occurs 120 times
 - CAN_01AF009 regridding starting.
   CAN_01AF009 regridding complete.
Files have different raster sizes:
Size (246, 206) occurs 60 times
Size (247, 207) occurs 120 times
 - CAN_01AJ003 regridding starting.
   CAN_01AJ003 regridding complete.
Files have different raster sizes:
Size (167, 124) occurs 60 times
Size (167, 125) occurs 120 times
 - CAN_01AJ004 regridding starting.
   CAN_01AJ004 regridding comp

### Manual fixes
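The assumption that the edges of a raster consist mostly of "no data" values doesn't hold in 3 cases, where the rasters are only a few cells wide. Here we fix those by restoring the backups and regridding them without the nodata update step.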

In [14]:
# Basins that are restored from backup and regridded again below
rerun_these = [
    'CAN_02PD014',
    'CAN_05EE006',
    'CAN_08OA003',
]

In [18]:
# Restore the backup of each listed basin and regrid its deviating files
# again, this time without updating the nodata value from the raster edges
for basin_id in rerun_these:

    # Main path
    soil_folder = basins_path / 'basin_data' / basin_id / 'geospatial' / 'soilgrids' / 'raw'
    backup_folder = soil_folder.parent / 'raw_backup'

    # Restore the backup. Check that it exists BEFORE removing anything, so we
    # never delete the current files without having something to restore.
    if not backup_folder.is_dir():
        raise FileNotFoundError(f'No backup folder found for basin {basin_id}: {backup_folder}')
    shutil.rmtree(soil_folder) # Removes the improperly processed files
    shutil.copytree(backup_folder, soil_folder) # Restores backup folder as main

    # Find the files
    tif_files = sorted( # sorting doesn't matter, but cleaner when looking at logs
        tif_file
        for sub_folder in sub_folders
        for tif_file in glob.glob( str( soil_folder / sub_folder / '*.tif' ) )
    )

    # Loop over the files and extract the raster sizes
    paths,sizes,counts = get_raster_size(tif_files)

    # Confirm that we do indeed need to process these restored backup files
    if check_if_regrid_needed(paths,sizes,counts, disp_individual = False):

        # Progress
        print(f' - {basin_id} regridding starting.')

        # Get the most common size
        most_common = max(counts, key=counts.get) # (height, width)
        new_height, new_width = most_common

        # Get the associated GeoTIFF profile
        template_profile = get_most_common_profile(sizes, most_common)

        # Get the nodata mask, marking the cells that are nodata in the template file
        template_masked_array = get_most_common_nodata_mask(sizes,most_common)

        # Loop over the files and regrid (in place) those that deviate
        for path,size in sizes.items():
            if size == most_common:
                continue

            # Here we would originally call regrid_geotiff()
            # Now do this manually to skip the step we don't need

            # Resample the raster to the desired size
            with rasterio.open(path) as src:
                data = src.read(
                    out_shape=(src.count, new_height, new_width),
                    resampling=Resampling.bilinear
                )
                profile = src.profile

            # Update the profile with the new size and the template's transform
            profile['width'] = new_width
            profile['height'] = new_height
            profile['transform'] = template_profile['transform']

            # Apply the nodata mask
            data[template_masked_array.mask] = profile['nodata'] # This is where we have known nodata

            # Write the resampled data back to the same GeoTIFF file
            with rasterio.open(path, 'w', **profile) as dst:
                dst.write(data)
        

Files have different raster sizes:
Size (7, 7) occurs 60 times
Size (7, 8) occurs 120 times
 - CAN_02PD014 regridding starting.
Files have different raster sizes:
Size (5, 5) occurs 60 times
Size (6, 4) occurs 120 times
 - CAN_05EE006 regridding starting.
Files have different raster sizes:
Size (5, 6) occurs 60 times
Size (5, 5) occurs 120 times
 - CAN_08OA003 regridding starting.
