# 2. Clip rasters
Clip rasters to a specific extent defined by a shapefile that contains multiple polygons, which are selected with queries.

In [2]:
from importlib_resources import files
from beak.utilities.raster_processing import clip_raster
from multiprocessing import cpu_count

# Resolve the data locations relative to the "beak.data" package resources.
BASE_PATH = files("beak.data")

# Shapefile with the boundary polygons used for clipping.
SHAPE_PATH = BASE_PATH / "BOUNDARIES" / "tl_2012_us_state_EPSG_4326.shp"

# Input folder: the folder containing the raster files to be clipped.
# It should contain one subfolder per variable type
# (e.g. NUMERICAL, CATEGORICAL, etc.).
BASE_FOLDER = BASE_PATH / "LAWLEY22-EXPORT" / "EPSG_4326_RES_0_015"
input_folder = BASE_FOLDER / "COMPLETE_DATASET"

### U.S. Continental (lower 48 states)

In [1]:
# Output folders: the folders where the clipped raster files will be saved.
# There must be exactly one output folder per query, in the same order as
# `queries`. The folder structure of the input folder will be copied.
output_folders = [
    BASE_FOLDER / "CLIPPED_USC",
]

# The shapefile used to clip the raster files.
shapefile = SHAPE_PATH

# Queries: the queries used to select the clip polygons from the shapefile.
# The lower 48 states are selected by excluding every other entry by its
# NAME attribute. The names must match the attribute values exactly,
# otherwise the corresponding polygon is silently kept in the clip extent.
excluded_names = [
    "Alaska",
    "Hawaii",
    "Puerto Rico",
    "United States Virgin Islands",
    "American Samoa",
    "Guam",
    "Commonwealth of the Northern Mariana Islands",
]
queries = [
    " & ".join(f"NAME!='{name}'" for name in excluded_names),
]

extensions = [".tif", ".tiff"]       # The extensions of the raster files to be clipped.
include_source = False               # Whether to include files in the root of the input folder.

# Fail early with a clear message instead of an IndexError halfway through.
if len(output_folders) != len(queries):
    raise ValueError("Each query needs exactly one output folder.")

# Clip the raster files, one pass per (query, output folder) pair.
for i, (query, output_folder) in enumerate(zip(queries, output_folders), start=1):
    print(f"Processing query: {i}/{len(queries)}")

    clip_raster(
        input_folder=input_folder,
        output_folder=output_folder,
        shapefile=shapefile,
        query=query,
        bounds=None,                  # No additional cropping of the extent.
        raster_extensions=extensions,
        include_source=include_source,
        n_workers=cpu_count(),        # Use every available CPU core.
    )


Processing query: 1/1
Starting parallel processing...
Done!


### Alaska
with the extent cropped at longitude -129° (129° W) on the eastern side

In [3]:
# Output folders: the folders where the clipped raster files will be saved.
# There must be exactly one output folder per query, in the same order as
# `queries`. The folder structure of the input folder will be copied.
output_folders = [
    BASE_FOLDER / "CLIPPED_ALASKA",
]

# The shapefile used to clip the raster files.
shapefile = SHAPE_PATH

# Queries: the queries used to select the clip polygons from the shapefile.
queries = [
    "NAME=='Alaska'",
]

extensions = [".tif", ".tiff"]       # The extensions of the raster files to be clipped.
include_source = False               # Whether to include files in the root of the input folder.

# Fail early with a clear message instead of an IndexError halfway through.
if len(output_folders) != len(queries):
    raise ValueError("Each query needs exactly one output folder.")

# Clip the raster files, one pass per (query, output folder) pair.
for i, (query, output_folder) in enumerate(zip(queries, output_folders), start=1):
    print(f"Processing query: {i}/{len(queries)}")

    clip_raster(
        input_folder=input_folder,
        output_folder=output_folder,
        shapefile=shapefile,
        query=query,
        # Only the third bound is set; the remaining sides are left as None.
        # NOTE(review): presumably (min_x, min_y, max_x, max_y), i.e. the
        # eastern edge is limited to longitude -129 - confirm in clip_raster.
        bounds=(None, None, -129, None),
        raster_extensions=extensions,
        include_source=include_source,
        # Match the machine instead of assuming a 48-core host.
        n_workers=cpu_count(),
    )


Processing query: 1/1
Starting parallel processing...
Done!
