In [None]:
import os
import glob
import subprocess
import pandas as pd
import geopandas as gpd
import dask
from dask.distributed import Client, progress, LocalCluster
from pyFIRS.wrappers import lastools
from pyFIRS.utils import fname

Launch a parallel computing cluster. 

In [None]:
# start a local dask cluster; the scheduler listens on port 7001 and the
# diagnostics dashboard is requested on port 7002
cluster=LocalCluster(scheduler_port=7001, diagnostics_port=7002)
c = Client(cluster)
# NOTE(review): c.ncores() appears to return one entry per worker, so despite its
# name `num_cores` holds the number of workers, not the total number of cores
num_cores = len(c.ncores()) # identify how many workers we have

At this point, you should also be able to view an interactive dashboard on port 7002. If you're executing this on a remote server, you'll need to set up port forwarding so you can view the dashboard in your local machine's browser. Once you've done that, or if you're processing on your own machine, you can view the dashboard at [http://localhost:7002/status](http://localhost:7002/status).

In [None]:
# wrapper object used below to call LAStools commands (e.g. las.lasboundary);
# presumably the argument is the directory holding the LAStools executables -- confirm
las = lastools.useLAStools('/storage/lidar/LAStools/bin')

In [None]:
# root directory that currently holds the imported lidar data
WORKDIR = os.path.abspath('/storage/lidar/klamath_2010/')

# data handling directories, both living directly under WORKDIR
INTERIM, PROCESSED = (os.path.join(WORKDIR, subdir) for subdir in ('interim', 'processed'))

# EPSG code of the coordinate reference system we'll be working with
TARGET_EPSG = 6339 # utm 10N, NAD83_2011

In [None]:
# push our working directories and wrapper classes to the workers on the cluster as well
# broadcast=True requests a copy of each object on every worker; the trailing
# semicolon keeps the notebook from echoing the returned futures
c.scatter([INTERIM, PROCESSED, las, TARGET_EPSG, num_cores], broadcast=True);

In [None]:
# one entry per processed point-cloud tile found in PROCESSED/points
tiles_to_merge = list(
    map(fname, glob.glob(os.path.join(PROCESSED, 'points', '*.laz')))
)

print('Found {:,d} tiles to merge derivative products from.'.format(len(tiles_to_merge)))

# Merge tiled derivative outputs together
Merge all the tiled GeoTiffs and Shapefiles into single overview files.

We'll produce a single shapefile showing the layout of the non-buffered tiles. This is a single process that takes a few seconds to run, so there is no need to distribute it using `dask`.

In [None]:
@dask.delayed
def tile_boundaries(*args, **kwargs):
    """Creates a shapefile with the bounding boxes of all processed tiles.

    Runs `lasboundary` over every .laz file in PROCESSED/points and writes the
    result to PROCESSED/vectors/tiles.shp. Nothing is done if that shapefile
    already exists.

    Parameters
    -----------
    *args, **kwargs
        ignored; accepted so that upstream task results can be passed in to
        express dependencies in the computation graph

    Returns
    --------
    None
    """
    odir = os.path.join(PROCESSED, 'vectors')
    outfile = os.path.join(odir, 'tiles.shp')

    if not os.path.exists(outfile):
        # make sure the output directory exists before the tool writes into it
        os.makedirs(odir, exist_ok=True)

        las.lasboundary(i=os.path.join(PROCESSED, 'points', '*.laz'),
                        use_bb=True, # use bounding box of tiles
                        overview=True,
                        labels=True,
                        cores=num_cores, # use parallel processing
                        oshp=True,
                        o=outfile)
    return

In [None]:
@dask.delayed
def make_footprint(*args, **kwargs):
    """Dissolves the tile boundaries into a shapefile of acquisition footprints.

    Reads PROCESSED/vectors/tiles.shp, buffers every tile slightly so that
    neighboring tiles overlap, unions them into contiguous footprints and
    writes the dissolved result to PROCESSED/vectors/footprint.shp. Nothing is
    recomputed if the footprint shapefile already exists.

    Parameters
    -----------
    *args, **kwargs
        ignored; accepted so that upstream task results can be passed in to
        express dependencies in the computation graph

    Returns
    --------
    string
        always 'footprint', whether or not the file had to be created
    """
    vectors_dir = os.path.join(PROCESSED, 'vectors')
    OUTFILE = os.path.join(vectors_dir, 'footprint.shp')

    if not os.path.exists(OUTFILE):
        gdf = gpd.read_file(os.path.join(vectors_dir, 'tiles.shp'))
        gdf['mil_points'] = gdf['num_points'] / 1000000.
        buffered = gdf.drop(['file_name', 'point_size', 'point_type', 'num_points'], axis=1)
        # buffer by 0.01 CRS units (1cm, assuming the CRS is in meters)
        buffered.geometry = gdf.buffer(0.01)

        merged = buffered.unary_union
        # a multi-part result exposes its members through `.geoms`; a single
        # polygon has no such attribute and is used as the only footprint
        parts = list(getattr(merged, 'geoms', [merged]))
        union = gpd.GeoDataFrame(geometry=parts, crs=buffered.crs)

        union['footprint_id'] = union.index + 1

        # tag every tile with the id of the footprint it falls in
        buffered = gpd.tools.sjoin(buffered, union, how='left').drop('index_right', axis=1)

        # how each attribute is summarized when tiles are dissolved
        aggfuncs = {'mil_points': 'sum',
                    'version': 'first',
                    'min_x': 'min',
                    'min_y': 'min',
                    'min_z': 'min',
                    'max_x': 'max',
                    'max_y': 'max',
                    'max_z': 'max'}

        dissolved = buffered.dissolve(by='footprint_id', aggfunc=aggfuncs)
        dissolved.to_file(OUTFILE)

    return 'footprint'

In [None]:
def merge_chunks(infiles, outfile):
    """Merges a list of rasters, one chunk at a time.

    The input files are merged in groups of 500 into intermediate rasters in
    PROCESSED/rasters/chunks, which are then merged into the final output.
    Nothing is done if the output file already exists.

    Parameters
    -----------
    infiles : list
        list containing paths to input files
    outfile : string, path to file
        the output file that will be created by merging all the input files

    Raises
    -------
    ValueError
        if `infiles` is empty
    subprocess.CalledProcessError
        if any `rio merge` call exits with a non-zero status
    """
    if os.path.exists(outfile):
        return

    if not infiles:
        raise ValueError('No input files to merge into {}'.format(outfile))

    chunk_size = 500
    creation_options = ['--co', 'compress=LZW',
                        '--co', 'tiled=true', '--co', 'blockxsize=256', '--co', 'blockysize=256',
                        '--co', 'bigtiff=YES']

    outname = os.path.basename(outfile).split('.')[0]
    OUTDIR = os.path.join(PROCESSED, 'rasters', 'chunks')
    os.makedirs(OUTDIR, exist_ok=True)

    # remember exactly which chunks were produced for this output instead of
    # globbing for them, which could pick up stale or unrelated chunk files
    chunked_outfiles = []

    # break the list of input files into chunks of 500
    for i in range(0, len(infiles), chunk_size):
        chunk_infiles = infiles[i:i+chunk_size]
        chunk_outfile = os.path.join(OUTDIR, 'chunk{}_{}.tif'.format(i, outname))

        # a chunk left over from an earlier (possibly interrupted) run may be
        # incomplete, so it is rebuilt rather than reused
        if os.path.exists(chunk_outfile):
            os.remove(chunk_outfile)

        subprocess.run(['rio', 'merge', *chunk_infiles, chunk_outfile, *creation_options],
                       stderr=subprocess.PIPE, stdout=subprocess.PIPE,
                       check=True) # fail loudly rather than merging partial chunks
        chunked_outfiles.append(chunk_outfile)

    subprocess.run(['rio', 'merge', *chunked_outfiles, outfile, *creation_options],
                   stderr=subprocess.PIPE, stdout=subprocess.PIPE,
                   check=True)
    return

Merge the bare earth tiles into a single GeoTiff.

In [None]:
@dask.delayed
def merge_dem(*args, **kwargs):
    """Merges the tiles in PROCESSED/rasters/DEM_tiles into PROCESSED/rasters/dem.tif.

    Does nothing if the output already exists. Fewer than 500 tiles are merged
    with a single `rio merge` call; otherwise the work is handed to
    `merge_chunks`.

    Parameters
    -----------
    *args, **kwargs
        ignored; accepted so that upstream task results can be passed in

    Returns
    --------
    subprocess.CompletedProcess or None
        the finished `rio merge` process when a single direct merge was run,
        None in every other case
    """
    raster_dir = os.path.join(PROCESSED, 'rasters')
    infiles = glob.glob(os.path.join(raster_dir, 'DEM_tiles', '*.tif'))
    OUTFILE = os.path.join(raster_dir, 'dem.tif')

    if os.path.exists(OUTFILE):
        return None

    if len(infiles) >= 500:
        merge_chunks(infiles, OUTFILE)
        return None

    command = ['rio', 'merge'] + list(infiles) + [
        OUTFILE,
        '--co', 'compress=LZW',
        '--co', 'tiled=true',
        '--co', 'blockxsize=256',
        '--co', 'blockysize=256',
        '--co', 'bigtiff=YES',
    ]
    return subprocess.run(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

Now merge the hillshade tiles into a single raster formatted as GeoTiff.

In [None]:
@dask.delayed
def merge_hillshade(*args, **kwargs):
    """Merges the tiles in PROCESSED/rasters/hillshade_tiles into PROCESSED/rasters/hillshade.tif.

    Does nothing if the output already exists. Fewer than 500 tiles are merged
    with a single `rio merge` call; otherwise the work is handed to
    `merge_chunks`.

    Parameters
    -----------
    *args, **kwargs
        ignored; accepted so that upstream task results can be passed in

    Returns
    --------
    subprocess.CompletedProcess or None
        the finished `rio merge` process when a single direct merge was run,
        None in every other case
    """
    raster_dir = os.path.join(PROCESSED, 'rasters')
    infiles = glob.glob(os.path.join(raster_dir, 'hillshade_tiles', '*.tif'))
    OUTFILE = os.path.join(raster_dir, 'hillshade.tif')

    if os.path.exists(OUTFILE):
        return None

    if len(infiles) >= 500:
        merge_chunks(infiles, OUTFILE)
        return None

    command = ['rio', 'merge'] + list(infiles) + [
        OUTFILE,
        '--co', 'compress=LZW',
        '--co', 'tiled=true',
        '--co', 'blockxsize=256',
        '--co', 'blockysize=256',
        '--co', 'bigtiff=YES',
    ]
    return subprocess.run(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

Merge the trimmed canopy height model tiles into a single raster.

In [None]:
@dask.delayed
def merge_chm(*args, **kwargs):
    """Merges the tiles in PROCESSED/rasters/chm_tiles into PROCESSED/rasters/chm.tif.

    Does nothing if the output already exists. Fewer than 500 tiles are merged
    with a single `rio merge` call; otherwise the work is handed to
    `merge_chunks`.

    Parameters
    -----------
    *args, **kwargs
        ignored; accepted so that upstream task results can be passed in

    Returns
    --------
    None
    """
    raster_dir = os.path.join(PROCESSED, 'rasters')
    infiles = glob.glob(os.path.join(raster_dir, 'chm_tiles', '*.tif'))
    OUTFILE = os.path.join(raster_dir, 'chm.tif')

    if not os.path.exists(OUTFILE):
        if len(infiles) >= 500:
            merge_chunks(infiles, OUTFILE)
        else:
            command = ['rio', 'merge'] + list(infiles) + [
                OUTFILE,
                '--co', 'compress=LZW',
                '--co', 'tiled=true',
                '--co', 'blockxsize=256',
                '--co', 'blockysize=256',
                '--co', 'bigtiff=YES',
            ]
            subprocess.run(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

    return None

Merge the cleaned tiles of building footprints together into a single shapefile. We'll use `geopandas` to concatenate all the polygons together into a single geodataframe and then write out to a new shapefile.

In [None]:
@dask.delayed
def merge_bldgs(*args, **kwargs):
    """Merges the tiled building footprints into a single shapefile.

    Reads every shapefile in PROCESSED/vectors/building_tiles, concatenates
    them and writes the result to PROCESSED/vectors/buildings.shp. Nothing is
    done if the output already exists or if there are no building tiles.

    Parameters
    -----------
    *args, **kwargs
        ignored; accepted so that upstream task results can be passed in

    Returns
    --------
    None
    """
    OUTFILE = os.path.join(PROCESSED, 'vectors', 'buildings.shp')

    if os.path.exists(OUTFILE):
        return

    # sorted so the merged output does not depend on directory listing order
    building_tiles = sorted(glob.glob(os.path.join(PROCESSED, 'vectors', 'building_tiles', '*.shp')))
    # create a list of geodataframes containing the tiles of building footprints
    gdflist = [gpd.read_file(tile) for tile in building_tiles]

    if not gdflist:
        # pd.concat raises ValueError on an empty list; with no tiles there is
        # simply nothing to merge
        return

    # merge them all together
    merged = gpd.GeoDataFrame(pd.concat(gdflist, ignore_index=True))
    # using pandas' concat caused us to lose projection information, so let's add that back in
    merged.crs = gdflist[0].crs
    # and write the merged data to a new shapefile
    merged.to_file(OUTFILE)

    return

In [None]:
# every tile is expected to have a '*_strat0_intensity-median.tif' raster, so that
# raster type is used to discover which tiles have gridmetrics outputs
all_grid_tiles_paths = glob.glob(os.path.join(INTERIM, 'gridmetrics', 'rasters', '*_strat0_intensity-median.tif'))
# drop the raster-type suffix to leave the tile identifier
# (presumably fname() returns the file name without directory/extension -- confirm)
all_grid_tiles = [fname(tile).split('_strat0_intensity-median')[0] for 
                  tile in all_grid_tiles_paths]
# use the first tile found as a template; raises IndexError if no tiles were found
example_tile = os.path.basename(all_grid_tiles_paths[0]).split('_strat0_intensity-median.tif')[0]
# for each raster of the example tile, keep what follows the tile name, then strip the
# single separator character in front ([1:]) and the '.tif' extension at the end ([:-4])
grid_rasters = [os.path.basename(file).split(example_tile)[-1][1:-4] for file in 
                glob.glob(os.path.join(INTERIM, 'gridmetrics', 'rasters', example_tile + '*.tif'))
               ]
                
print('{:d} different types of rasters from gridmetrics to process for each tile:\r\n'.format(len(grid_rasters)))
for i, raster in enumerate(grid_rasters):
    print('{}.  {}'.format(i+1, raster))

In [None]:
# the '*_potential_volume.tif' raster is used to discover which tiles have
# gridsurface outputs
all_gridsurf_tiles_paths = glob.glob(os.path.join(INTERIM, 'gridsurface', '*_potential_volume.tif'))
# drop the raster-type suffix to leave the tile identifier; the suffix must be the
# one used in the glob pattern above, otherwise nothing is stripped
all_gridsurf_tiles = [fname(tile).split('_potential_volume')[0] for 
                  tile in all_gridsurf_tiles_paths]
# use the first tile found as a template; raises IndexError if no tiles were found
example_tile = os.path.basename(all_gridsurf_tiles_paths[0]).split('_potential_volume.tif')[0]
# for each raster of the example tile, keep what follows the tile name, then strip the
# single separator character in front ([1:]) and the '.tif' extension at the end ([:-4])
gridsurf_rasters = [os.path.basename(file).split(example_tile)[-1][1:-4] for file in 
                glob.glob(os.path.join(INTERIM, 'gridsurface', example_tile + '*.tif'))
               ]
                
print('{:d} different types of rasters from gridsurface to process for each tile:\r\n'.format(len(gridsurf_rasters)))
for i, raster in enumerate(gridsurf_rasters):
    print('{}.  {}'.format(i+1, raster))

In [None]:
# @dask.delayed
def merge_gridmetric(metric):
    """Merges all gridmetrics tiles of one raster type into a single GeoTiff.

    Collects every '*{metric}*.tif' file in INTERIM/gridmetrics/rasters and
    merges them into PROCESSED/rasters/{metric}.tif, unless that file already
    exists. Fewer than 500 tiles are merged with a single `rio merge` call;
    otherwise the work is handed to `merge_chunks`.

    Parameters
    -----------
    metric : string
        name of the raster type to merge

    Returns
    --------
    metric : string
        the name that was passed in
    """
    tile_pattern = os.path.join(INTERIM, 'gridmetrics', 'rasters', '*{}*.tif'.format(metric))
    infiles = glob.glob(tile_pattern)
    OUTFILE = os.path.join(PROCESSED, 'rasters', '{}.tif'.format(metric))

    if not os.path.exists(OUTFILE):
        if len(infiles) >= 500:
            merge_chunks(infiles, OUTFILE)
        else:
            # unlike the other merges, no explicit block size is requested here
            command = ['rio', 'merge'] + list(infiles) + [
                OUTFILE,
                '--co', 'compress=LZW',
                '--co', 'tiled=true',
                '--co', 'bigtiff=YES',
            ]
            subprocess.run(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

    return metric

In [None]:
# @dask.delayed
def merge_gridsurface(metric):
    """Merges all gridsurface tiles of one raster type into a single GeoTiff.

    Collects every '*{metric}*.tif' file in INTERIM/gridsurface and merges
    them into PROCESSED/rasters/{metric}.tif, unless that file already exists.
    Fewer than 500 tiles are merged with a single `rio merge` call; otherwise
    the work is handed to `merge_chunks`.

    Parameters
    -----------
    metric : string
        name of the raster type to merge

    Returns
    --------
    metric : string
        the name that was passed in
    """
    # search the directory the gridsurface raster types are discovered in
    # (INTERIM/gridsurface) so the names found there match files here.
    # NOTE(review): this used to glob INTERIM/rasters/gridsurface -- confirm
    # which location the gridsurface tiles are actually written to
    infiles = glob.glob(os.path.join(INTERIM, 'gridsurface', '*{}*.tif'.format(metric)))
    OUTFILE = os.path.join(PROCESSED, 'rasters', '{}.tif'.format(metric))

    if os.path.exists(OUTFILE):
        pass

    elif len(infiles) < 500:
        # no explicit block size is requested for these rasters
        subprocess.run(['rio', 'merge', *infiles, OUTFILE, '--co', 'compress=LZW',
                        '--co', 'tiled=true',
                        '--co', 'bigtiff=YES'],
                       stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    else:
        merge_chunks(infiles, OUTFILE)

    return metric

A single final task that depends upon the completion of the merged rasters and vectors.

In [None]:
@dask.delayed
def merge_done(*args, **kwargs):
    """Does nothing; exists as a single end point for the merge tasks.

    Parameters
    -----------
    *args, **kwargs
        ignored; the results of the tasks this one depends on

    Returns
    --------
    None
    """
    return None

In [None]:
# building the computation receipe
merge_dsk = {}
merge_dsk['tile_boundaries'] = (tile_boundaries,)
merge_dsk['footprint'] = (make_footprint, 'tile_boundaries')
merge_dsk['merge_bldgs'] = (merge_bldgs,)
merge_dsk['merge_hill'] = (merge_hillshade,)
merge_dsk['merge_dem'] = (merge_dem,)
merge_dsk['merge_chm'] = (merge_chm,)
# for raster in grid_rasters:
#     merge_dsk['merge_gridmetric-{}'.format(raster)] = (merge_gridmetric, raster)
    
merge_dsk['merge_done']=(merge_done, ['tile_boundaries', 'merge_bldgs', 'footprint']) #+
#                                      ['merge_hill', 'merge_dem', 'merge_chm'] + 
#                                      ['merge_gridmetric-{}'.format(raster) for raster in grid_rasters])

In [None]:
# NOTE(review): the task functions are wrapped in dask.delayed, so evaluating the
# recipe presumably yields a lazy Delayed object here rather than running the merges;
# the actual work is started by c.compute() further down -- confirm
merge_graph = c.get(merge_dsk, 'merge_done') # build the computation graph
# draw the task graph so the dependencies can be inspected before running
merge_graph.visualize()

In [None]:
# submit the graph to the cluster; the call returns immediately with a future
# while the work continues on the workers
merge_results = c.compute(merge_graph) # this might take a while...

In [None]:
# display a progress bar for the submitted computation
progress(merge_results)

In [None]:
# merge_results.result()

In [None]:
# c.cancel(merge_results)

In [None]:
# merge the gridmetrics rasters one type at a time, reporting each type as it
# finishes (merge_gridmetric hands back the name it was given)
for raster in grid_rasters:
    print(merge_gridmetric(raster))
print('Done.')

In [None]:
# merge the gridsurface rasters one type at a time, reporting each type as it
# finishes (merge_gridsurface hands back the name it was given)
for raster in gridsurf_rasters:
    print(merge_gridsurface(raster))
print('Done.')

In [None]:
# c.close()
# cluster.close()