In [1]:
## Trials for categorical parallel zonal stats

In [6]:
from functools import partial
from multiprocessing import Pool
from rasterstats import zonal_stats
import geopandas
import rasterio
import time

In [3]:
# The task for one worker
def calculate_n(n, zones, data_array, affine, stats, nodata, categorical=False, category_map=None):
    """Run ``zonal_stats`` for a single zone and return its result dict.

    Parameters
    ----------
    n : index label used to look up the zone via ``zones.at[n, 'geometry']``.
    zones : table-like object holding a ``'geometry'`` column.
    data_array, affine, nodata : forwarded unchanged to ``zonal_stats``.
    stats : statistics to request; only forwarded when ``categorical`` is falsy.
    categorical : when truthy, request categorical results instead of ``stats``.
    category_map : forwarded to ``zonal_stats`` in categorical mode.

    Returns
    -------
    The first (and only) element of the list returned by ``zonal_stats``.
    """
    geometry = zones.at[n, 'geometry']
    if categorical:
        per_zone = zonal_stats(geometry, data_array, affine=affine, nodata=nodata,
                               categorical=True, category_map=category_map)
    else:
        per_zone = zonal_stats(geometry, data_array, affine=affine, stats=stats, nodata=nodata)
    # A single geometry goes in, so exactly one result is expected back.
    return per_zone[0]

In [4]:
def parallel_zonal_stats(raster, vector, zonal, n_cores, stats, categorical=False, category_map=None,
                         prefix='lg', max_zones=None):
    """Compute zonal statistics per polygon with a pool of worker processes.

    Each polygon in ``vector`` is handed to ``calculate_n`` through
    ``Pool.map``; the per-polygon results are attached to the zones table as
    new columns and the table is written to ``zonal``.

    Parameters
    ----------
    raster : path passed to ``rasterio.open``; band 1 is used.
    vector : path passed to ``geopandas.read_file``.
    zonal : output path passed to ``zones.to_file``.
    n_cores : number of worker processes in the pool.
    stats : iterable of statistic names; used only when ``categorical`` is falsy.
    categorical : when truthy, count categories instead of computing ``stats``.
    category_map : mapping whose values name the output categories; required
        when ``categorical`` is truthy.
    prefix : column-name prefix for categorical results (``f'{prefix}_{cat}'``).
    max_zones : optional cap on the number of polygons processed (first rows
        only). Intended for quick trial runs; ``None`` processes every polygon.

    Returns
    -------
    The zones table with the result columns added, or ``None`` if an input
    file could not be read.

    Raises
    ------
    ValueError
        If ``categorical`` is truthy but ``category_map`` is ``None``.

    NOTE(review): ``Pool`` has to pickle ``calculate_n`` for the workers. With
    the 'spawn' start method, functions defined only in a notebook cell may not
    be importable by the workers - confirm on the target platform.
    """
    # Fail before any expensive work instead of crashing on .values() afterwards.
    if categorical and category_map is None:
        raise ValueError("category_map is required when categorical=True")

    # Read data files
    try:
        zones = geopandas.read_file(vector)
    except Exception as e:
        print(f"Error reading vector file: {e}")
        return
    try:
        src = rasterio.open(raster)
    except Exception as e:
        print(f"Error reading raster file: {e}")
        return

    # Get the data array and affine transformation matrix, then release the
    # file handle - everything needed downstream is held in memory.
    with src:
        affine = src.transform
        data_array = src.read(1)
        nodata = src.nodata

    # Optional trial-run limit; copy so the new columns go onto an independent
    # table rather than a slice of the full one.
    if max_zones is not None:
        zones = zones.iloc[:max_zones].copy()
        print('Debug: limited shapefile size')

    num_poly = len(zones)
    print(f'Number of polygons: {num_poly}')

    # Create a pool of workers and run calculate_n once per zone index;
    # map() batches the indices across the workers behind the scenes.
    worker = partial(calculate_n, zones=zones, data_array=data_array, affine=affine,
                     stats=stats, nodata=nodata, categorical=categorical,
                     category_map=category_map)
    with Pool(n_cores) as pool:
        results = pool.map(worker, range(num_poly))

    # Join the results back to the geopandas dataframe
    if not categorical:
        for stat in stats:
            zones[stat] = [d[stat] for d in results]
    else:
        for cat in category_map.values():
            # A category absent from a polygon counts as zero pixels.
            zones[f'{prefix}_{cat}'] = [d.get(cat, 0) for d in results]

    # Write the results to file
    try:
        zones.to_file(zonal)
    except Exception as e:
        print(f"Error writing output file: {e}")
    return zones

In [38]:
def sequential_zonal_stats(raster, vector, zonal, stats, categorical=False, category_map=None, prefix=None):
    """Compute zonal statistics for all polygons in one ``zonal_stats`` call.

    The results are attached to the zones table as new columns and the table
    is written to ``zonal``.

    Parameters
    ----------
    raster : path passed to ``rasterio.open``; band 1 is used.
    vector : path passed to ``geopandas.read_file``.
    zonal : output path passed to ``zones.to_file``.
    stats : iterable of statistic names; used only when ``categorical`` is falsy.
    categorical : when truthy, count categories instead of computing ``stats``.
    category_map : mapping whose values name the output categories; required
        when ``categorical`` is truthy.
    prefix : optional column-name prefix for categorical results. With a
        prefix the column is ``f'{prefix}_{cat}'``; with ``None`` the category
        name alone is used.

    Returns
    -------
    The zones table with the result columns added (returned even if writing
    the output file failed), or ``None`` if an input file could not be read.

    Raises
    ------
    ValueError
        If ``categorical`` is truthy but ``category_map`` is ``None``.
    """
    # Fail before any expensive work instead of crashing on .values() afterwards.
    if categorical and category_map is None:
        raise ValueError("category_map is required when categorical=True")

    # Read data files
    try:
        zones = geopandas.read_file(vector)
    except Exception as e:
        print(f"Error reading vector file: {e}")
        return
    try:
        src = rasterio.open(raster)
    except Exception as e:
        print(f"Error reading raster file: {e}")
        return

    # Get the data array and affine transformation matrix, then release the
    # file handle - everything needed downstream is held in memory.
    with src:
        affine = src.transform
        data_array = src.read(1)
        nodata = src.nodata

    num_poly = len(zones)
    print(f'Number of polygons: {num_poly}')

    # Run the zonal stats, honouring the categorical flag
    start_time = time.perf_counter()
    if categorical:
        results = zonal_stats(zones, data_array, affine=affine, nodata=nodata,
                              categorical=True, category_map=category_map)
    else:
        results = zonal_stats(zones, data_array, affine=affine, stats=stats, nodata=nodata)
    elapsed_time = time.perf_counter() - start_time
    print(f"Elapsed zonal time: {elapsed_time} seconds")

    # Join the results back to the geopandas dataframe
    if categorical:
        for cat in category_map.values():
            column = f'{cat}' if prefix is None else f'{prefix}_{cat}'
            # A category absent from a polygon counts as zero pixels.
            zones[column] = [d.get(cat, 0) for d in results]
    else:
        for stat in stats:
            zones[stat] = [d[stat] for d in results]

    # Write the results to file
    try:
        zones.to_file(zonal)
    except Exception as e:
        print(f"Error writing output file: {e}")
    return zones

In [36]:
# Input raster, zone polygons and output location for this run
raster = "/Users/wmk934/data/NorthAmerica_geospatial/modis_land/annual_mode_2001_2022/2001_2022_mode_MCD12Q1_LC_Type1.tif"
vector = "/Users/wmk934/data/NorthAmerica_geospatial/merit_basins/MERIT_Hydro_modified_North_America_shapes/basins/cat_pfaf_7_8_MERIT_Hydro_v07_Basins_v01_bugfix1_hillslopes_pfaf_7_8_clean_fixed.shp"
zonal = "/Users/wmk934/data/perceptual_models/data/zonal_stats/merit_hydro_basins_MODIS_IGBP_counts.shp"

n_cores = 2  # Number of cores to use
stats = ['mean', 'max', 'min', 'std']  # Statistics to calculate - Not used here

# LGRIP30 class codes, kept for reference (they go with the column prefix 'lg')
lgrip30_categories = dict(enumerate(('water', 'non-crop', 'irrigated', 'rainfed')))

# Active settings: the raster class codes are used as their own labels.
# Class names for reference:
#    1 evergreen needleleaf forest    10 grasslands
#    2 evergreen broadleaf forest     11 permanent wetlands
#    3 deciduous needleleaf forest    12 croplands
#    4 deciduous broadleaf forest     13 urban and built-up
#    5 mixed forests                  14 cropland/natural vegetation mosaic
#    6 closed shrublands              15 snow and ice
#    7 open shrublands                16 barren or sparsely vegetated
#    8 woody savannas                 17 water bodies
#    9 savannas                      255 unclassified
categorical = True
categories = {code: str(code) for code in (*range(1, 18), 255)}
prefix = 'igbp'

In [39]:
# Wall-clock timing of the whole sequential run (file reads, zonal stats, file write)
start_time = time.time()
zones2 = sequential_zonal_stats(
    raster,
    vector,
    zonal,
    stats,
    categorical=categorical,
    category_map=categories,
    prefix=prefix,
)
elapsed_time = time.time() - start_time
print(f"Elapsed total time: {elapsed_time} seconds")

Number of polygons: 541832
Elapsed zonal time: 442.4088342189789 seconds
Elapsed total time: 667.3052051067352 seconds


In [None]:
# Timing notes: number of polygons - elapsed time
# 20     -    0.09079694747924805 seconds
# 2000   -    5.26400089263916 seconds
# 541832 - 1203.5353882312775 seconds