In [None]:
import os
import glob
import subprocess

import pandas as pd
import geopandas as gpd
import dask
from dask.distributed import Client, progress, LocalCluster

from pyFIRS.wrappers import lastools
from pyFIRS.utils import fname

In [None]:
# where the imported lidar data is currently stored
# (the interim/ and processed/ directories used below live under this path)
WORKDIR = os.path.abspath('/storage/lidar/siskiyou_2017/')

# the coordinate reference system we'll be working with
# (keep exactly one of the two assignments below uncommented)
TARGET_EPSG = 6339  # utm 10N, NAD83_2011
# TARGET_EPSG = 6340  # utm 11N, NAD83_2011

Launch a parallel computing cluster. 

In [None]:
# start a local dask cluster: scheduler on port 7001, dashboard on port 7002
cluster = LocalCluster(scheduler_port=7001, dashboard_address=7002)
c = Client(cluster)
# NOTE(review): len() of c.ncores() counts its entries -- presumably one per
# worker -- rather than summing cores, despite the variable name; confirm
# which of the two is intended, since the value is later passed as `cores=`
num_cores = len(c.ncores())  # identify how many workers we have

At this point, you should also be able to view an interactive dashboard on port 7002. If you're executing this on a remote server, you'll need to set up port forwarding so you can view the dashboard in your local machine's browser. Once you've done that, or if you're processing on your own machine, you can view the dashboard at [http://localhost:7002/status](http://localhost:7002/status).

In [None]:
# handle to the LAStools wrappers (used below as `las.lasboundary`); the
# argument is presumably the directory holding the LAStools executables
las = lastools.useLAStools('/storage/lidar/LAStools/bin')

In [None]:
# define data handling directories (both sit directly under WORKDIR)
INTERIM, PROCESSED = (os.path.join(WORKDIR, subdir)
                      for subdir in ('interim', 'processed'))

In [None]:
# push stuff to the workers on the cluster
# (broadcast=True requests a copy on every worker; the returned futures are
# not kept, and the trailing semicolon suppresses the cell output)
c.scatter([INTERIM, PROCESSED, las, TARGET_EPSG, num_cores], broadcast=True);

In [None]:
# apply fname() to every .laz tile found in the processed points directory
tiles_to_merge = list(map(
    fname, glob.glob(os.path.join(PROCESSED, 'points', '*.laz'))))

print('Found {:,d} tiles to merge derivative products from.'.format(
    len(tiles_to_merge)))

# Merge tiled derivative outputs together
Merge all the tiled GeoTiffs and Shapefiles into single overview files.

We'll produce a single shapefile showing the layout of the non-buffered tiles. This is a single process that takes only a few seconds to run, so there's no need to distribute it using `dask`.

In [None]:
@dask.delayed
def tile_boundaries(*args, **kwargs):
    """Writes the tile layout shapefile unless it already exists

    Runs `las.lasboundary` over every `.laz` file in the processed `points`
    directory and writes the result to `vectors/tiles.shp`. Nothing is done
    when that shapefile is already on disk.

    Parameters
    ----------
    *args, **kwargs
      accepted but ignored

    Returns
    --------
    label : str
      always 'tile_boundaries'
    """
    shp_path = os.path.join(PROCESSED, 'vectors', 'tiles.shp')

    if not os.path.exists(shp_path):
        las.lasboundary(i=os.path.join(PROCESSED, 'points', '*.laz'),
                        use_bb=True,  # use bounding box of tiles
                        overview=True,
                        labels=True,
                        cores=num_cores,  # use parallel processing
                        oshp=True,
                        o=shp_path)

    return 'tile_boundaries'

In [None]:
@dask.delayed
def make_footprint(*args, **kwargs):
    """Dissolves the tile layout into a footprint shapefile

    Reads `vectors/tiles.shp`, buffers each tile slightly so neighbouring
    tiles overlap, unions them, and writes the dissolved result (with summed
    point counts and overall coordinate extremes) to `vectors/footprint.shp`.
    Nothing is done when the footprint shapefile is already on disk.

    Parameters
    ----------
    *args, **kwargs
      accepted but ignored; passing the output of an upstream task here is
      how the dependency on the tile boundaries is expressed

    Returns
    --------
    label : str
      always 'footprint', whether or not the file had to be created
    """
    TILES = os.path.join(PROCESSED, 'vectors', 'tiles.shp')
    OUTFILE = os.path.join(PROCESSED, 'vectors', 'footprint.shp')

    if not os.path.exists(OUTFILE):
        gdf = gpd.read_file(TILES)
        # express point counts in millions of points
        gdf['mil_points'] = gdf['num_points'] / 1000000.
        buffered = gdf.drop(['file_name',
                             'point_size',
                             'point_type',
                             'num_points'],
                            axis=1)

        # buffer by 1cm (assumes the CRS units are meters)
        buffered.geometry = gdf.buffer(0.01)
        union = gpd.GeoDataFrame(geometry=[buffered.unary_union],
                                 crs=buffered.crs)
        union['footprint_id'] = union.index + 1

        # tag every tile with the id of the footprint it falls in
        buffered = gpd.tools.sjoin(buffered,
                                   union,
                                   how='left').drop('index_right', axis=1)

        aggfuncs = {'mil_points': 'sum',
                    'version': 'first',
                    'min_x': 'min',
                    'min_y': 'min',
                    'min_z': 'min',
                    'max_x': 'max',
                    'max_y': 'max',
                    'max_z': 'max'}

        dissolved = buffered.dissolve(by='footprint_id', aggfunc=aggfuncs)
        dissolved.to_file(OUTFILE)

    # returned on both paths so the task yields the same label whether the
    # file was just written or already existed, like the other merge tasks
    return 'footprint'

Merge the bare earth tiles into a single raster mosaic (a GDAL virtual raster, or VRT, that references the tiled GeoTiffs).

In [None]:
# @dask.delayed
def make_vrt(infiles, vrtfile):
    """Builds a GDAL virtual raster (VRT) that mosaics the input files

    Parameters
    ----------
    infiles : list
      list of paths to input files to mosaic
    vrtfile : str, path to file
      path to VRT file that will be created

    Returns
    --------
    proc : CompletedProcess
      the result of executing gdalbuildvrt using subprocess, with stdout and
      stderr captured; a non-zero exit status is not raised as an exception
    """
    command = ['gdalbuildvrt', vrtfile]
    command.extend(infiles)
    return subprocess.run(command,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)

In [None]:
@dask.delayed
def merge_dem(*args, **kwargs):
    """Builds `dem.vrt` from the tiles in `dem_tiles` unless it exists

    Parameters
    ----------
    *args, **kwargs
      accepted but ignored

    Returns
    --------
    label : str
      always 'dem'
    """
    tiles = glob.glob(
        os.path.join(PROCESSED, 'rasters', 'dem_tiles', '*.tif'))
    vrt_path = os.path.join(PROCESSED, 'rasters', 'dem.vrt')

    # an existing VRT is left untouched
    if not os.path.exists(vrt_path):
        make_vrt(tiles, vrt_path)

    return 'dem'

In [None]:
@dask.delayed
def merge_intensity(*args, **kwargs):
    """Builds `intensity.vrt` from the tiles in `intensity_tiles` unless it exists

    Parameters
    ----------
    *args, **kwargs
      accepted but ignored

    Returns
    --------
    label : str
      always 'intensity'
    """
    tiles = glob.glob(
        os.path.join(PROCESSED, 'rasters', 'intensity_tiles', '*.tif'))
    vrt_path = os.path.join(PROCESSED, 'rasters', 'intensity.vrt')

    # an existing VRT is left untouched
    if not os.path.exists(vrt_path):
        make_vrt(tiles, vrt_path)

    return 'intensity'

Now merge the hillshade tiles into a single raster mosaic (again a VRT that references the tiled GeoTiffs).

In [None]:
@dask.delayed
def merge_hillshade(*args, **kwargs):
    """Builds `hillshade.vrt` from the tiles in `hillshade_tiles` unless it exists

    Parameters
    ----------
    *args, **kwargs
      accepted but ignored

    Returns
    --------
    label : str
      always 'hillshade'
    """
    tiles = glob.glob(
        os.path.join(PROCESSED, 'rasters', 'hillshade_tiles', '*.tif'))
    vrt_path = os.path.join(PROCESSED, 'rasters', 'hillshade.vrt')

    # an existing VRT is left untouched
    if not os.path.exists(vrt_path):
        make_vrt(tiles, vrt_path)

    return 'hillshade'

Merge the trimmed canopy height model tiles into a single raster.

In [None]:
@dask.delayed
def merge_chm(*args, **kwargs):
    """Builds `chm.vrt` from the tiles in `chm_tiles` unless it exists

    Parameters
    ----------
    *args, **kwargs
      accepted but ignored

    Returns
    --------
    label : str
      always 'chm'
    """
    tiles = glob.glob(
        os.path.join(PROCESSED, 'rasters', 'chm_tiles', '*.tif'))
    vrt_path = os.path.join(PROCESSED, 'rasters', 'chm.vrt')

    # an existing VRT is left untouched
    if not os.path.exists(vrt_path):
        make_vrt(tiles, vrt_path)

    return 'chm'

Merge the cleaned tiles of building footprints together into a single shapefile. We'll use `geopandas` to concatenate all the polygons together into a single geodataframe and then write out to a new shapefile.

In [None]:
@dask.delayed
def merge_bldgs(*args, **kwargs):
    """Concatenates the tiled building shapefiles into `buildings.shp`

    Reads every shapefile in `vectors/building_tiles`, stacks the features
    into one geodataframe, and writes it to `vectors/buildings.shp`. Nothing
    is done when that shapefile is already on disk.

    Parameters
    ----------
    *args, **kwargs
      accepted but ignored

    Returns
    --------
    label : str
      always 'bldgs'
    """
    infiles = glob.glob(
        os.path.join(PROCESSED, 'vectors', 'building_tiles', '*.shp'))
    OUTFILE = os.path.join(PROCESSED, 'vectors', 'buildings.shp')

    if not os.path.exists(OUTFILE):
        # list of geodataframes with tiles of building footprints
        gdflist = [gpd.read_file(tile) for tile in infiles]
        # merge them all together; wrap the result in a GeoDataFrame (as
        # merge_crowns does) so that `.to_file` is available even if
        # pd.concat hands back a plain DataFrame
        merged = gpd.GeoDataFrame(pd.concat(gdflist, ignore_index=True))
        # add projection information back in
        merged.crs = gdflist[0].crs
        # and write the merged data to a new shapefile
        merged.to_file(OUTFILE)

    return 'bldgs'

In [None]:
@dask.delayed
def merge_highpoints(*args, **kwargs):
    """Concatenates the tiled `*HighPoints.shp` files into `high_points.shp`

    Reads every `*HighPoints.shp` file in the interim `chm_tiles/treesegs`
    directory, stacks the features into one geodataframe, and writes it to
    `vectors/high_points.shp`. Nothing is done when that shapefile is already
    on disk.

    Parameters
    ----------
    *args, **kwargs
      accepted but ignored

    Returns
    --------
    label : str
      always 'highpoints'
    """
    infiles = glob.glob(
        os.path.join(INTERIM, 'chm_tiles', 'treesegs', '*HighPoints.shp'))
    OUTFILE = os.path.join(PROCESSED, 'vectors', 'high_points.shp')

    if not os.path.exists(OUTFILE):
        # list of geodataframes, one per tiled HighPoints shapefile
        gdflist = [gpd.read_file(tile) for tile in infiles]
        # merge them all together; wrap the result in a GeoDataFrame (as
        # merge_crowns does) so that `.to_file` is available even if
        # pd.concat hands back a plain DataFrame
        merged = gpd.GeoDataFrame(pd.concat(gdflist, ignore_index=True))
        # add projection information back in
        merged.crs = gdflist[0].crs
        # and write the merged data to a new shapefile
        merged.to_file(OUTFILE)

    return 'highpoints'

In [None]:
@dask.delayed
def merge_crowns(*args, **kwargs):
    """Concatenates the tiled `*Polygons.shp` files into `tree_crowns.shp`

    Reads every `*Polygons.shp` file in the interim `chm_tiles/treesegs`
    directory, stacks the features into one geodataframe, and writes it to
    `vectors/tree_crowns.shp`. Nothing is done when that shapefile is already
    on disk.

    Parameters
    ----------
    *args, **kwargs
      accepted but ignored

    Returns
    --------
    label : str
      always 'crowns'
    """
    tile_paths = glob.glob(
        os.path.join(INTERIM, 'chm_tiles', 'treesegs', '*Polygons.shp'))
    shp_path = os.path.join(PROCESSED, 'vectors', 'tree_crowns.shp')

    if not os.path.exists(shp_path):
        # one geodataframe per tiled Polygons shapefile
        frames = [gpd.read_file(path) for path in tile_paths]
        # stack the tiles and make sure the result is a GeoDataFrame
        stacked = pd.concat(frames, ignore_index=True)
        merged = gpd.GeoDataFrame(stacked)
        # carry over the projection of the first tile
        merged.crs = frames[0].crs
        # write everything out as one shapefile
        merged.to_file(shp_path)

    return 'crowns'

In [None]:
# one path per tile: the '_strat0_intensity-median' raster is used as the
# marker file for enumerating tiles that have gridmetrics outputs
all_grid_tiles_paths = glob.glob(
    os.path.join(PROCESSED, 'rasters',
                 'gridmetrics_tiles', '*_strat0_intensity-median.tif'))

# tile names, i.e. whatever precedes the marker suffix
all_grid_tiles = [fname(tile).split('_strat0_intensity-median')[0] for
                  tile in all_grid_tiles_paths]

# use the first tile found as the template for discovering raster types
# (raises IndexError if no gridmetrics tiles were found)
example_tile = os.path.basename(
    all_grid_tiles_paths[0]).split('_strat0_intensity-median.tif')[0]

# for every raster belonging to the example tile, keep what follows the tile
# name, dropping one leading separator character and the '.tif' extension
grid_rasters = [os.path.basename(file).split(example_tile)[-1][1:-4] for
                file in glob.glob(
                    os.path.join(PROCESSED, 'rasters',
                                 'gridmetrics_tiles', example_tile + '*.tif'))]

print('{:d} different types of rasters from gridmetrics '
      'to process for each tile:\r\n'.format(len(grid_rasters)))

# numbered listing of the raster types that were found
for i, raster in enumerate(grid_rasters):
    print('{}.  {}'.format(i+1, raster))

In [None]:
# one path per tile: the '_potential_volume' raster is used as the marker
# file for enumerating tiles that have gridsurface outputs
all_gridsurf_tiles_paths = glob.glob(
    os.path.join(PROCESSED, 'rasters',
                 'gridsurface_tiles', '*_potential_volume.tif'))

# tile names, i.e. whatever precedes the marker suffix; the suffix has to be
# the one globbed for above, otherwise nothing is stripped from the name
all_gridsurf_tiles = [fname(tile).split('_potential_volume')[0] for
                      tile in all_gridsurf_tiles_paths]

# use the first tile found as the template for discovering raster types
# (raises IndexError if no gridsurface tiles were found)
example_tile = os.path.basename(
    all_gridsurf_tiles_paths[0]).split('_potential_volume.tif')[0]

# for every raster belonging to the example tile, keep what follows the tile
# name, dropping one leading separator character and the '.tif' extension
gridsurf_rasters = [os.path.basename(file).split(example_tile)[-1][1:-4] for
                    file in glob.glob(
                        os.path.join(PROCESSED, 'rasters',
                                     'gridsurface_tiles',
                                     example_tile + '*.tif'))]

# we don't want these redundant rasters
gridsurf_rasters = [x for x in gridsurf_rasters if x not
                    in ['mean_height', 'max_height']]

print('{:d} different types of rasters from gridsurface '
      'to process for each tile:\r\n'.format(len(gridsurf_rasters)))
# numbered listing of the raster types that were kept
for i, raster in enumerate(gridsurf_rasters):
    print('{}.  {}'.format(i+1, raster))

In [None]:
@dask.delayed
def merge_gridmetric(metric):
    """Builds `<metric>.vrt` from the matching gridmetrics tiles unless it exists

    Parameters
    ----------
    metric : str
      raster type; every file in `gridmetrics_tiles` whose name ends with
      `<metric>.tif` is included in the mosaic

    Returns
    --------
    metric : str
      the metric that was passed in
    """
    pattern = '*{}.tif'.format(metric)
    tiles = glob.glob(
        os.path.join(PROCESSED, 'rasters', 'gridmetrics_tiles', pattern))
    vrt_path = os.path.join(PROCESSED, 'rasters', '{}.vrt'.format(metric))

    # an existing VRT is left untouched
    if not os.path.exists(vrt_path):
        make_vrt(tiles, vrt_path)

    return metric

In [None]:
@dask.delayed
def merge_gridsurface(metric):
    """Builds `<metric>.vrt` from the matching gridsurface tiles unless it exists

    The VRT is written inside the `gridsurface_tiles` directory itself.

    Parameters
    ----------
    metric : str
      raster type; every file in `gridsurface_tiles` whose name ends with
      `<metric>.tif` is included in the mosaic

    Returns
    --------
    metric : str
      the metric that was passed in
    """
    tile_dir = os.path.join(PROCESSED, 'rasters', 'gridsurface_tiles')
    tiles = glob.glob(os.path.join(tile_dir, '*{}.tif'.format(metric)))
    vrt_path = os.path.join(tile_dir, '{}.vrt'.format(metric))

    # an existing VRT is left untouched
    if not os.path.exists(vrt_path):
        make_vrt(tiles, vrt_path)

    return metric

Finally, define a single end state that depends upon the completion of all the merged rasters and vectors.

In [None]:
@dask.delayed
def merge_done(*args, **kwargs):
    """Does nothing; exists only so other tasks can be passed in as inputs

    Parameters
    ----------
    *args, **kwargs
      accepted but ignored

    Returns
    --------
    None
    """
    return None

In [None]:
# building the computation recipe: a dict mapping each task name to a tuple
# of (function, *arguments), where a string argument naming another key
# refers to that task
# NOTE(review): merge_highpoints and merge_crowns are defined in this
# notebook but are not added to this graph -- confirm that is intentional
merge_dsk = {}
merge_dsk['tile_boundaries'] = (tile_boundaries,)
merge_dsk['footprint'] = (make_footprint, 'tile_boundaries')
merge_dsk['merge_bldgs'] = (merge_bldgs,)
merge_dsk['merge_hill'] = (merge_hillshade,)
merge_dsk['merge_dem'] = (merge_dem,)
merge_dsk['merge_intensity'] = (merge_intensity,)
merge_dsk['merge_chm'] = (merge_chm,)
# one merge task per raster type discovered in the gridmetrics tiles
for raster in grid_rasters:
    merge_dsk['merge_gridmetric-{}'.format(raster)] = (merge_gridmetric,
                                                       raster)
# one merge task per raster type kept from the gridsurface tiles
for raster in gridsurf_rasters:
    merge_dsk['merge_gridsurface-{}'.format(raster)] = (merge_gridsurface,
                                                        raster)

# final task: takes every task defined above as an input
merge_dsk['merge_done'] = (merge_done,
                           ['tile_boundaries', 'footprint']+
                           ['merge_bldgs'] +
                           ['merge_hill', 'merge_dem'] +
                           ['merge_chm', 'merge_intensity'] +
                           ['merge_gridmetric-{}'.format(raster) for
                            raster in grid_rasters] +
                           ['merge_gridsurface-{}'.format(raster) for
                            raster in gridsurf_rasters])

In [None]:
# NOTE(review): every function in merge_dsk is wrapped with @dask.delayed, so
# running the dict graph here presumably yields a Delayed object (it is
# visualized on the next line and persisted in the following cell) rather
# than performing the merges themselves -- confirm
merge_graph = c.get(merge_dsk, 'merge_done')  # build the computation graph
merge_graph.visualize(rankdir='LR')

In [None]:
# hand the merge graph to the cluster for execution
merge_results = c.persist(merge_graph) # this might take a while...

In [None]:
# show a progress display for the persisted merge computation
progress(merge_results)

In [None]:
# merge_results.result()

In [None]:
# c.cancel(merge_results)

In [None]:
# c.close()
# cluster.close()