In [1]:
import os, shutil, glob, platform, subprocess
import geopandas as gpd, pandas as pd
import rasterio
from matplotlib import pyplot as plt
from pyFIRS.wrappers import lastools, fusion



In [2]:
from pyFIRS.utils import clean_dir, clean_buffer_polys, clip_tile_from_shp, convert_project

In [3]:
from dask.distributed import Client, progress, LocalCluster
from dask import delayed
import dask

### Setting up parallel computing using `dask.distributed`
`LAStools` offers native multi-core processing as an optional argument (`cores`) supplied to its command-line tools. `FUSION` command line tools do not. To enable parallel processing of `FUSION` commands, we'll use `dask.distributed` to schedule the processing of tiles in asynchronous parallel batches. This approach also offers us the ability to track progress using a progress bar.

You'll first need to launch a parallel computing cluster. 

In [4]:
# start a local dask cluster; the scheduler is given port 7001 and the diagnostics dashboard port 7002
cluster=LocalCluster(scheduler_port=7001, diagnostics_port=7002)

In [5]:
# client connected to the local cluster; used below to submit work and track progress
c = Client(cluster)

At this point, you should also be able to view an interactive dashboard on port 7002. If you're executing this on a remote server, you'll need to set up port forwarding so you can view the dashboard in your local machine's browser. Once you've done that, or if you're processing on your own machine, you can view the dashboard at [http://localhost:7002/status](http://localhost:7002/status).

# Enough already, let's get to work with some lidar data

We'll define where we can find the binary executables for LAStools and FUSION command line tools.

In [6]:
# wrapper objects pointing at the directories that hold the LAStools and FUSION executables
las = lastools.useLAStools('/storage/lidar/LAStools/bin')
fus = fusion.useFUSION('/storage/lidar/FUSION/')

We'll create working directories for raw (imported with modest clean-up from source files), interim, and processed data.

In [7]:
# glob pattern matching the vendor-delivered source tiles
src_tiles = '/storage/lidar/OregonCIG_Lidar/olc_metro_2014/POINTS/HYDRO_FLATTENED_POINTS/*.laz'
# root directory under which every output of this notebook is written
workdir = os.path.abspath('/storage/lidar/olc_metro_2014')

# oregon lambert (ft) coordinate reference system
target_epsg = 2992
# number of entries reported by the client's ncores() mapping
num_cores = len(c.ncores())

# data handling directories, all located directly under the working directory
raw = os.path.join(workdir, 'raw')
interim = os.path.join(workdir, 'interim')
processed = os.path.join(workdir, 'processed')

# push our working directories and wrapper classes to the workers on the cluster as well
c.scatter([raw, interim, processed, las, fus, target_epsg, num_cores], broadcast=True);

In [8]:
# wine_prefix = ['/storage/wine/.wine-{}'.format(x) for x in range(len(c.ncores()))]
# c.scatter(wine_prefix, broadcast=False);

## 0. Get the raw data into our working directory
First, move the tiles over to our working directory.

When we define functions using the `dask.delayed` decorator, the function will have 'lazy' instead of 'eager' execution. We can map the function to a list of inputs and it will not execute for any of them until we ask for results to be computed. When we use the `compute()` method for the client managing the scheduler that sends jobs to the workers, it then starts running the jobs.

In [8]:
@dask.delayed
def import_tiles(infile):
    """Lazily run `las2las` on one source tile, writing the result into `raw`.

    Parameters
    ----------
    infile : str
        Path to a source point cloud tile.

    Returns
    -------
    Whatever the `las.las2las` wrapper returns for the call.
    """
    return las.las2las(
        i=infile,
        odir=raw,
        drop_withheld=True,  # drop any points flagged as withheld by vendor
        drop_class=(7,18),  # drop any points classified as noise by vendor
        clip_to_bounding_box=True,  # make sure corrupted files don't change bbox
        set_version=1.4,  # upgrade to latest LAS file version
        epsg=target_epsg,  # specify the CRS in case it isn't automatically recognized
        set_ogc_wkt=True,  # make sure CRS is recorded in VLR of las file
        olaz=True,
    )

# one lazy task per source tile; nothing runs until the client computes them
infiles = glob.glob(src_tiles)
import_tasks = [import_tiles(path) for path in infiles]
imp_results = c.compute(import_tasks)  # begin computation
progress(imp_results)  # monitor progress

VBox()

Next, create spatial indexes for the input files to allow fast spatial queries (which are used, for example, when retiling and adding buffers).

In [10]:
@dask.delayed
def make_index(infile):
    """Lazily run `lasindex` on a single tile and return the wrapper's result."""
    return las.lasindex(i=infile)

# index every tile that was imported into the raw directory
infiles = glob.glob(os.path.join(raw, '*.laz'))
index_tasks = [make_index(path) for path in infiles]
index_results = c.compute(index_tasks)
progress(index_results)

VBox()

## 1. Retile the data to add buffers for avoiding edge effects during processing.

In practice, executing the `lastile` command on individual tiles in parallel is likely to corrupt your output files. I suspect this is because the dynamic re-tiling of input files means that many output tiles are likely to require inputs from multiple input files, and that parallel processing outside of LAStools may result in collisions writing data from multiple inputs to these output tiles. So, for this case, we'll let `lastile` handle the parallelism under the hood. We won't have a progress bar, but this shouldn't take more than 5-10 minutes per ~100 tiles (with vendor tile size ~1000x1000m with 4-8 pts/m2).

**THERE ARE ARGUMENTS IN THE FOLLOWING COMMAND THAT DEPEND UPON THE UNITS OF THE DATA.**

In [None]:
%%time
# retile every raw tile in one lastile call, letting LAStools parallelise internally
tile_proc = las.lastile(
    i=os.path.join(raw, '*.laz'),
    tile_size=4000,  # in units of lidar data
    buffer=100,  # assumes units are in feet... if using meters, suggest changing to ~25
    flag_as_withheld=True,  # flag buffer points as "withheld", enables handling with other LAStools
    extra_pass=True,  # if outputting to LAZ format, can help avoid memory limits
    full_bb=True,
    olaz=True,
    odir=os.path.join(interim, 'retiled'),
    cores=num_cores,
);

If the original tiles delivered by the vendor included overlapping edges, our retiling may result in duplicated points in the new tiles from overlapping edges of vendor-provided input tiles. In the next step, we will ensure that only one point is retained for each unique set of (X,Y,Z) coordinates in the point cloud data.

In [None]:
@dask.delayed
def remove_dupes(infile):
    """Lazily run `lasduplicate` on one retiled tile.

    Keeps a single point per unique (X, Y, Z) and writes the compressed
    result into the `deduped` folder of the interim directory.

    Parameters
    ----------
    infile : str
        Path to a retiled point cloud tile.

    Returns
    -------
    Whatever the `las.lasduplicate` wrapper returns for the call.
    """
    return las.lasduplicate(i=infile,
                            unique_xyz=True,
                            olaz=True,
                            odir=os.path.join(interim, 'deduped'))

# the original line was missing the closing parenthesis of glob.glob(...),
# which made the whole cell a SyntaxError
infiles = glob.glob(os.path.join(interim, 'retiled', '*.laz'))
dedupe_results = c.compute([remove_dupes(file) for file in infiles])
progress(dedupe_results)

If you want to confirm that the point cloud files you've retiled haven't been corrupted (i.e., they still match valid LAS format specifications), you can use the following function.

In [None]:
@dask.delayed
def validate(infile):
    """Lazily run `lasvalidate` on a single tile and return the wrapper's result."""
    return las.lasvalidate(i=infile)

# validate every de-duplicated tile
infiles = glob.glob(os.path.join(interim, 'deduped', '*.laz'))
validation_tasks = [validate(path) for path in infiles]
val_results = c.compute(validation_tasks)
progress(val_results)

In [None]:
# collect the fourth command argument of every validation run whose stderr mentions 'fail'
corrupted = []
for future in val_results:
    if 'fail' in future.result().stderr.decode():
        corrupted.append(future.result().args[3])
corrupted

## 2. Classify points in the lidar point cloud
First we'll remove points that are isolated as likely noise.

In [9]:
@dask.delayed
def denoise(tile_id):
    """Lazily run `lasnoise` on the de-duplicated tile named `tile_id`.

    The tool is skipped when `<tile_id>.laz` is already present in the
    `denoised` folder. The tile id is returned either way so it can be
    passed on to the next processing step.
    """
    laz_name = tile_id + '.laz'
    out_dir = os.path.join(interim, 'denoised')

    # nothing to do when the output already exists
    if os.path.exists(os.path.join(out_dir, laz_name)):
        return tile_id

    las.lasnoise(i=os.path.join(interim, 'deduped', laz_name),
                 remove_noise=True,
                 odir=out_dir,
                 olaz=True)
    return tile_id

Next, calculate the height above ground for each point for use in classifying them.

In [10]:
@dask.delayed
def hag(tile_id):
    """Lazily run `lasheight` on the denoised tile named `tile_id`.

    The tool is skipped when `<tile_id>.laz` is already present in the
    `lasheight` folder. The tile id is returned either way.
    """
    laz_name = tile_id + '.laz'
    out_dir = os.path.join(interim, 'lasheight')

    # nothing to do when the output already exists
    if os.path.exists(os.path.join(out_dir, laz_name)):
        return tile_id

    las.lasheight(i=os.path.join(interim, 'denoised', laz_name),
                  odir=out_dir,
                  olaz=True)
    return tile_id

Now, we'll classify points as building or high vegetation that meet certain criteria for 'planarity' or 'ruggedness'. 

**THERE ARE ARGUMENTS IN THE FOLLOWING COMMAND THAT DEPEND UPON THE UNITS OF THE DATA.**

If your data are in meters, you should change these parameters, or consider reprojecting the data to a projection that is in feet when you copy the source data into our working directory using `las2las` command at the top of this notebook.

In [11]:
@dask.delayed
def classify(tile_id):
    """Lazily run `lasclassify` on the height-normalized tile named `tile_id`.

    The tool is skipped when `<tile_id>.laz` is already present in the
    `classified` folder. The tile id is returned either way.
    """
    laz_name = tile_id + '.laz'
    out_dir = os.path.join(interim, 'classified')

    # nothing to do when the output already exists
    if os.path.exists(os.path.join(out_dir, laz_name)):
        return tile_id

    las.lasclassify(i=os.path.join(interim, 'lasheight', laz_name),
                    odir=out_dir,
                    olaz=True,
                    step=5,  # if your data are in meters, the LAStools default is 2.0
                    planar=0.5,  # if your data are in meters, the LAStools default is 0.1
                    rugged=1)  # if your data are in meters, the LAStools default is 0.4
    return tile_id

We'll now remove the points in the buffered area of each tile and put the clean tiles in the processed folder.

In [12]:
@dask.delayed
def dropwithheld(tile_id):
    """Lazily strip the buffer from the classified tile named `tile_id`.

    Runs `lastile` with `remove_buffer` and writes the clean tile into the
    `points` folder of the processed directory. The tool is skipped when
    `<tile_id>.laz` already exists there. The tile id is returned either way.
    """
    laz_name = tile_id + '.laz'
    out_dir = os.path.join(processed, 'points')

    # nothing to do when the output already exists
    if os.path.exists(os.path.join(out_dir, laz_name)):
        return tile_id

    las.lastile(i=os.path.join(interim, 'classified', laz_name),
                odir=out_dir,
                olaz=True,
                extra_pass=True,
                full_bb=True,  # keep bbox to full extent of tile, even if points don't use full extent
                remove_buffer=True,  # remove points in tile buffers
                drop_withheld=True,  # remove points flagged as withheld
                set_user_data=0)  # remove height above ground calculated using lasheight
    return tile_id

We'll produce a single shapefile showing the layout of the tiles. This is a single process that takes a few seconds to run, so there's no need to split it across multiple `dask` tasks.

In [13]:
@dask.delayed
def tiles_overview(*args, **kwargs):
    """Lazily build `tiles.shp`, an overview of all processed point tiles.

    Positional and keyword arguments are accepted but ignored. Nothing is
    done when `tiles.shp` already exists in the `vectors` folder. Always
    returns None.
    """
    out_dir = os.path.join(processed, 'vectors')

    # nothing to do when the overview already exists
    if os.path.exists(os.path.join(out_dir, 'tiles.shp')):
        return

    las.lasboundary(i=os.path.join(processed, 'points', '*.laz'),
                    odir=out_dir,
                    o='tiles.shp',
                    oshp=True,
                    use_bb=True,  # use bounding box of tiles
                    overview=True,
                    labels=True,
                    cores=num_cores)  # use parallel processing
    return

## 3. Generate a bare earth Digital Elevation Model
Generate tiles of the bare earth model. This assumes that there are already ground-classified points.

In [14]:
@dask.delayed
def make_dem(tile_id):
    """Lazily rasterize the ground points of one classified tile with `las2dem`.

    Output goes to the `DEM_tiles` folder under the processed rasters
    directory. The tool is skipped when the tile's GeoTiff already exists.

    Parameters
    ----------
    tile_id : str
        Tile name without extension.

    Returns
    -------
    str
        The same `tile_id`, so it can be passed to the next processing step.
    """
    infile = os.path.join(interim, 'classified', tile_id + '.laz')
    odir = os.path.join(processed, 'rasters', 'DEM_tiles')

    # The raster is written as <tile_id>.tif (that is the name add_crs_dem reads),
    # so the guard must look for the .tif; looking for the .laz name never
    # matched and caused every tile to be regenerated on each run.
    if not os.path.exists(os.path.join(odir, tile_id + '.tif')):
        las.las2dem(i=infile,
                    odir=odir,
                    otif=True, # create tiles as GeoTiff rasters
                    keep_class=2, # keep ground-classified returns only
                    step=1, # resolution of output raster, in units of lidar data
                    thin_with_grid=1, # use a 1 x 1 resolution for creating the TIN for the DEM
                    extra_pass=True, # uses two passes over data to execute DEM creation more efficiently
                    use_tile_bb=True) # remove buffers from tiles

    return tile_id

In [15]:
@dask.delayed
def add_crs_dem(tile_id):
    """Lazily tag one DEM tile with the target CRS using `rio edit-info`.

    The command's output is captured and discarded. Returns `tile_id`.
    """
    tif_path = os.path.join(processed, 'rasters', 'DEM_tiles', tile_id + '.tif')
    crs_label = 'EPSG:{}'.format(target_epsg)

    subprocess.run(['rio', 'edit-info', '--crs', crs_label, tif_path],
                   stderr=subprocess.PIPE,
                   stdout=subprocess.PIPE)
    return tile_id

Merge the bare earth tiles into a single GeoTiff.

In [16]:
@dask.delayed
def merge_dem(*args, **kwargs):
    """Lazily merge all DEM tiles into `dem.tif` with `rio merge`.

    Positional and keyword arguments are accepted but ignored. Returns the
    completed process when the merge is run, or None when `dem.tif` already
    exists.
    """
    tile_pattern = os.path.join(processed, 'rasters', 'DEM_tiles', '*.tif')
    merged_path = os.path.join(processed, 'rasters', 'dem.tif')

    # nothing to do when the merged raster already exists
    if os.path.exists(merged_path):
        return

    command = ['rio', 'merge']
    command.extend(glob.glob(tile_pattern))
    command.extend([merged_path, '--co', 'compress=LZW'])
    return subprocess.run(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

To create a hillshade layer, we'll first generate hillshade tiles from the bare earth model.

In [17]:
@dask.delayed
def make_hillshade(tile_id):
    """Lazily create a hillshade raster from the ground points of one tile.

    Uses `las2dem` with `hillshade=True` and writes into the
    `hillshade_tiles` folder under the processed rasters directory. The tool
    is skipped when the tile's GeoTiff already exists.

    Parameters
    ----------
    tile_id : str
        Tile name without extension.

    Returns
    -------
    str
        The same `tile_id`, so it can be passed to the next processing step.
    """
    infile = os.path.join(interim, 'classified', tile_id + '.laz')
    odir = os.path.join(processed, 'rasters', 'hillshade_tiles')

    # The raster is written as <tile_id>.tif (that is the name add_crs_hill
    # reads), so the guard must look for the .tif; looking for the .laz name
    # never matched and caused every tile to be regenerated on each run.
    if not os.path.exists(os.path.join(odir, tile_id + '.tif')):
        las.las2dem(i=infile,
                    odir=odir,
                    otif=True, # create tiles as GeoTiffs
                    hillshade=True,
                    keep_class=2, # keep ground-classified returns only
                    step=1, # resolution of output raster, in units of lidar data
                    thin_with_grid=1, # use a 1 x 1 resolution for creating the TIN for the DEM
                    extra_pass=True, # uses two passes over data to execute DEM creation more efficiently
                    use_tile_bb=True) # remove buffers from tiles

    return tile_id

In [18]:
@dask.delayed
def add_crs_hill(tile_id):
    """Lazily tag one hillshade tile with the target CRS using `rio edit-info`.

    The command's output is captured and discarded. Returns `tile_id`.
    """
    tif_path = os.path.join(processed, 'rasters', 'hillshade_tiles', tile_id + '.tif')
    crs_label = 'EPSG:{}'.format(target_epsg)

    subprocess.run(['rio', 'edit-info', '--crs', crs_label, tif_path],
                   stderr=subprocess.PIPE,
                   stdout=subprocess.PIPE)
    return tile_id

Now merge the hillshade tiles into a single raster formatted as GeoTiff.

In [19]:
@dask.delayed
def merge_hillshade(*args, **kwargs):
    """Lazily merge all hillshade tiles into `hillshade.tif` with `rio merge`.

    Positional and keyword arguments are accepted but ignored. Returns the
    completed process when the merge is run, or None when `hillshade.tif`
    already exists.
    """
    tile_pattern = os.path.join(processed, 'rasters', 'hillshade_tiles', '*.tif')
    merged_path = os.path.join(processed, 'rasters', 'hillshade.tif')

    # nothing to do when the merged raster already exists
    if os.path.exists(merged_path):
        return

    command = ['rio', 'merge']
    command.extend(glob.glob(tile_pattern))
    command.extend([merged_path, '--co', 'compress=LZW'])
    return subprocess.run(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

## 4. Identify building footprints
First start by building shapefiles showing building boundaries in each buffered tile.

In [20]:
@dask.delayed
def bldg_tiles(tile_id):
    """Lazily outline the building-classified points of one buffered tile.

    Runs `lasboundary` on the classified tile and writes into the
    `building_tiles` folder of the interim directory. The tool is skipped
    when the tile's shapefile already exists.

    Parameters
    ----------
    tile_id : str
        Tile name without extension.

    Returns
    -------
    str
        The same `tile_id`, so it can be passed to the next processing step.
    """
    infile = os.path.join(interim, 'classified', tile_id + '.laz')
    odir = os.path.join(interim, 'building_tiles')

    # clean_bldgs reads <tile_id>.shp from this folder, so that is the file the
    # guard has to look for; the previous check for a .laz name never matched.
    # NOTE(review): relies on lasboundary naming its output <tile_id>.shp when
    # no output format is requested -- confirm against the wrapper/tool.
    if not os.path.exists(os.path.join(odir, tile_id + '.shp')):
        las.lasboundary(i=infile,
                        odir=odir,
                        keep_class=6, # use only building-classified points
                        disjoint=True, # compute separate polygons for each building
                        concavity=3) # map concave boundary if edge length >= 3ft

    return tile_id

Generate shapefiles showing the bounding box of each (unbuffered) tile that we'll use to remove buildings that fall in the buffered area.

In [21]:
@dask.delayed
def bbox_shp(tile_id):
    """Lazily write the bounding box of one unbuffered tile as a shapefile.

    Runs `lasboundary` with `use_tile_bb` on the processed tile and writes
    into the `tile_boundaries` folder of the interim directory. The tool is
    skipped when the tile's shapefile already exists.

    Parameters
    ----------
    tile_id : str
        Tile name without extension.

    Returns
    -------
    str
        The same `tile_id`, so it can be passed to the next processing step.
    """
    infile = os.path.join(processed, 'points', tile_id + '.laz')
    odir = os.path.join(interim, 'tile_boundaries')

    # the output is a shapefile (oshp=True) that later steps read as
    # <tile_id>.shp, so the guard checks for that name rather than the .laz
    # name, which never exists in this folder
    if not os.path.exists(os.path.join(odir, tile_id + '.shp')):
        las.lasboundary(i=infile,
                        odir=odir,
                        oshp=True,
                        use_tile_bb=True)

    return tile_id

For each shapefile containing polygons of the building boundaries, we'll use the `clean_buffer_polys` function from `pyFIRS.utils` to remove polygons from a tile if their centroid falls in the buffered area of the tile.

In [22]:
@dask.delayed
def clean_bldgs(tile_id, *args):
    """Lazily filter one tile's building polygons with `clean_buffer_polys`.

    `tile_id` may be given as a list, in which case its first element is
    used. Extra positional arguments are ignored. The work is skipped when
    `<tile_id>.shp` already exists in the processed `building_tiles` folder.
    Returns the (unwrapped) tile id.
    """
    # the task graph hands over a list of upstream results; keep the first
    if type(tile_id) is list:
        tile_id = tile_id[0]

    shp_name = tile_id + '.shp'
    out_dir = os.path.join(processed, 'vectors', 'building_tiles')

    # nothing to do when the cleaned shapefile already exists
    if os.path.exists(os.path.join(out_dir, shp_name)):
        return tile_id

    clean_buffer_polys(os.path.join(interim, 'building_tiles', shp_name),
                       os.path.join(interim, 'tile_boundaries', shp_name),
                       odir=out_dir,
                       simp_tol=3,
                       simp_topol=True)
    return tile_id

Merge the cleaned tiles of building footprints together into a single shapefile. We'll use `geopandas` to concatenate all the polygons together into a single geodataframe and then write out to a new shapefile.

In [23]:
@dask.delayed
def merge_bldgs(*args, **kwargs):
    """Lazily combine all cleaned building tiles into `buildings.shp`.

    Positional and keyword arguments are accepted but ignored. Nothing is
    done when `buildings.shp` already exists. Always returns None.
    """
    merged_path = os.path.join(processed,'vectors','buildings.shp')

    # nothing to do when the merged shapefile already exists
    if os.path.exists(merged_path):
        return

    tile_paths = glob.glob(os.path.join(processed, 'vectors', 'building_tiles', '*.shp'))
    # one geodataframe per tile of building footprints
    frames = []
    for tile_path in tile_paths:
        frames.append(gpd.read_file(tile_path))
    # stack them into a single geodataframe
    merged = gpd.GeoDataFrame(pd.concat(frames, ignore_index=True))
    # restore the projection information from the first tile after the concat
    merged.crs = frames[0].crs
    # write the merged footprints to a new shapefile
    merged.to_file(merged_path)
    return

## 5. Create a Canopy Height Model
We're going to switch to using a FUSION command line tool to generate Canopy Height Models (CHMs).

### Using FUSION's `canopymodel` to generate CHMs
`FUSION` wants to have ground models formatted as .dtm files, for CHM development and for estimating other canopy metrics. Let's generate these ground models first using a 1-meter x-y resolution.

In [24]:
@dask.delayed
def groundDTMs(tile_id):
    """Lazily build a FUSION ground surface (.dtm) for one classified tile.

    Runs `gridsurfacecreate` on the ground-classified points and writes
    `<tile_id>.dtm` into the `dtm_ground_tiles` folder of the interim
    directory. The tool is skipped when that .dtm already exists.

    Parameters
    ----------
    tile_id : str
        Tile name without extension.

    Returns
    -------
    str
        The same `tile_id`, so it can be passed to the next processing step.
    """
    basename = tile_id + '.laz'
    infile = os.path.join(interim, 'classified', basename)
    odir = os.path.join(interim, 'dtm_ground_tiles')
    outname = os.path.basename(infile).split('.')[0] + '.dtm'
    outfile = os.path.join(odir, outname)

    # guard on the surface file that is actually produced; the previous check
    # looked for a .laz name in the output folder, which never exists there
    if not os.path.exists(outfile):
        fus.gridsurfacecreate(surfacefile=outfile,
                              cellsize=3.28084,
                              xyunits='F',
                              zunits='F',
                              coordsys=2, # in State Plane
                              zone=0, # not in UTM
                              horizdatum=2, # NAD83
                              vertdatum=2, # NAVD88
                              datafile=infile,
                              las_class=2, # keep only ground-classified points
                              odir=odir) # will make sure output directory is created if doesn't already exist

    return tile_id

In [25]:
@dask.delayed
def canopymodel(tile_id):
    """Lazily build a FUSION canopy height model for one classified tile.

    Runs `canopymodel` against the tile's ground surface from
    `dtm_ground_tiles` and writes `<tile_id>.dtm` (plus an ascii copy,
    requested with `asc=True`) into the `chm_tiles` folder of the interim
    directory. The tool is skipped when that .dtm already exists.

    Parameters
    ----------
    tile_id : str
        Tile name without extension.

    Returns
    -------
    str
        The same `tile_id`, so it can be passed to the next processing step.
    """
    basename = tile_id + '.laz'
    infile = os.path.join(interim, 'classified', basename)
    odir = os.path.join(interim, 'chm_tiles')
    outname = os.path.basename(infile).split('.')[0] + '.dtm'
    outfile = os.path.join(odir, outname)

    # guard on the surface file that is actually produced; the previous check
    # looked for a .laz name in the output folder, which never exists there
    if not os.path.exists(outfile):
        fus.canopymodel(surfacefile=outfile,
                        cellsize=1,
                        xyunits='F',
                        zunits='F',
                        coordsys=2, # in State Plane
                        zone=0, # not in UTM
                        horizdatum=2, # NAD83
                        vertdatum=2, # NAVD88
                        datafiles=infile,
                        ground=os.path.join(interim, 'dtm_ground_tiles', outname),
                        median=3, # median smoothing in 3x3 kernel
                        las_class=(1,2,5), # keep only ground, unclassified, and high veg points
                        asc=True, # also output in ascii format
                        odir=odir) # will make sure output directory is created if doesn't already exist

    return tile_id

Convert the ascii files that `canopymodel` generated into GeoTiffs, specifying their projection. Then clean up the files `canopymodel` generated that we don't care about.

In [26]:
@dask.delayed
def asc2tif(tile_id):
    """Lazily convert one canopy model tile from ascii grid to GeoTiff.

    Calls `convert_project` on `<tile_id>.asc` in the interim `chm_tiles`
    folder. The conversion is skipped when `<tile_id>.tif` already exists
    there.

    Parameters
    ----------
    tile_id : str
        Tile name without extension.

    Returns
    -------
    str
        The same `tile_id`, so it can be passed to the next processing step.
    """
    chm_dir = os.path.join(interim, 'chm_tiles')
    infile = os.path.join(chm_dir, tile_id + '.asc')

    if not os.path.exists(os.path.join(chm_dir, tile_id + '.tif')):
        # use the notebook-wide target CRS instead of a hard-coded 'EPSG:2286',
        # so the canopy tiles carry the same CRS label as the DEM and hillshade tiles
        convert_project(infile, '.tif', 'EPSG:{}'.format(target_epsg))

    return tile_id

Clip the canopy height model tiles to remove overlapping areas that were from tile buffering to avoid edge effects.

In [27]:
@dask.delayed
def clip(tile_id, *args):
    """Lazily clip one canopy model tile to its tile-boundary shapefile.

    `tile_id` may be given as a list, in which case its first element is
    used. Extra positional arguments are ignored. The work is skipped when
    `<tile_id>.tif` already exists in the processed `chm_tiles` folder.
    Returns the (unwrapped) tile id.
    """
    # the task graph hands over a list of upstream results; keep the first
    if type(tile_id) is list:
        tile_id = tile_id[0]

    tif_name = tile_id + '.tif'
    out_dir = os.path.join(processed, 'rasters', 'chm_tiles')

    # nothing to do when the clipped raster already exists
    if os.path.exists(os.path.join(out_dir, tif_name)):
        return tile_id

    clip_tile_from_shp(os.path.join(interim, 'chm_tiles', tif_name),
                       os.path.join(interim, 'tile_boundaries', tile_id + '.shp'),
                       out_dir)
    return tile_id

Merge the trimmed canopy height model tiles into a single raster.

In [28]:
@dask.delayed
def merge_chm(*args, **kwargs):
    """Lazily merge all clipped canopy model tiles into `chm.tif` with `rio merge`.

    Positional and keyword arguments are accepted but ignored. Nothing is
    done when `chm.tif` already exists. Always returns None.
    """
    tile_pattern = os.path.join(processed, 'rasters', 'chm_tiles', '*.tif')
    merged_path = os.path.join(processed, 'rasters', 'chm.tif')

    # nothing to do when the merged raster already exists
    if os.path.exists(merged_path):
        return

    command = ['rio', 'merge']
    command.extend(glob.glob(tile_pattern))
    command.extend([merged_path, '--co', 'compress=LZW'])
    subprocess.run(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    return

In [29]:
@dask.delayed
def done(*args, **kwargs):
    """Do nothing and return None; accepts and ignores any arguments."""
    return None

In [None]:
# tile names (text before the first '.') of every de-duplicated tile
deduped_paths = glob.glob(os.path.join(interim, 'deduped', '*.laz'))
tile_ids = [os.path.basename(path).split('.')[0] for path in deduped_paths]

In [None]:
# task graph: each key is '<stage>-<tile>' and each value is (function, input key(s))
dsk = {}
for tile in tile_ids:
    # key of the given stage for the tile currently being wired up
    key = lambda stage: '{}-{}'.format(stage, tile)

    # point cloud clean-up and classification chain
    dsk[key('denoise')] = (denoise, tile)
    dsk[key('normalize')] = (hag, key('denoise'))
    dsk[key('classify')] = (classify, key('normalize'))
    dsk[key('drop')] = (dropwithheld, key('classify'))
    dsk[key('bbox')] = (bbox_shp, key('drop'))
    # bare earth model and hillshade
    dsk[key('dem')] = (make_dem, key('classify'))
    dsk[key('prj_dem')] = (add_crs_dem, key('dem'))
    dsk[key('hill')] = (make_hillshade, key('classify'))
    dsk[key('prj_hill')] = (add_crs_hill, key('hill'))
    # building footprints
    dsk[key('bldgs_buff')] = (bldg_tiles, key('classify'))
    dsk[key('bldgs_clean')] = (clean_bldgs, [key('bldgs_buff'), key('bbox')])
    # canopy height model
    dsk[key('ground_dtm')] = (groundDTMs, key('classify'))
    dsk[key('canopy')] = (canopymodel, key('ground_dtm'))
    dsk[key('canopy_tif')] = (asc2tif, key('canopy'))
    dsk[key('canopy_clip')] = (clip, [key('canopy_tif'), key('bbox')])

# steps that wait on a per-tile stage having been wired up for every tile
dsk['tiles_over'] = (tiles_overview, ['drop-{}'.format(tile) for tile in tile_ids])
dsk['merge_bldgs'] = (merge_bldgs, ['bldgs_clean-{}'.format(tile) for tile in tile_ids])
dsk['merge_hill'] = (merge_hillshade, ['prj_hill-{}'.format(tile) for tile in tile_ids])
dsk['merge_dem'] = (merge_dem, ['prj_dem-{}'.format(tile) for tile in tile_ids])
dsk['merge_chm'] = (merge_chm, ['canopy_clip-{}'.format(tile) for tile in tile_ids])
# final key that depends on every merge/overview step
dsk['done'] = (done, ['tiles_over', 'merge_bldgs', 'merge_hill', 'merge_dem', 'merge_chm'])

In [None]:
# ask the client to evaluate the 'done' key of the task graph
# NOTE(review): the graph's callables are dask.delayed-wrapped, so the value returned
# here appears to be a lazy object (it is visualized and persisted below) -- confirm
res_big = c.get(dsk, 'done')

In [None]:
# draw the task graph behind the result
res_big.visualize()

In [None]:
# hand the result to the cluster and monitor progress
res = c.persist(res_big)
progress(res)

In [None]:
%%time
# get rid of the .tfw and .kml files that LAStools generates alongside the DEM tiles
clean_dir(os.path.join(processed, 'rasters', 'DEM_tiles'), ['.tfw', '.kml'])

In [None]:
%%time
# get rid of the .tfw and .kml files that LAStools generates alongside the hillshade tiles
clean_dir(os.path.join(processed, 'rasters', 'hillshade_tiles'), ['.tfw', '.kml'])

In [None]:
%%time
# get rid of the .asc and .dtm files that FUSION generates in the interim canopy model folder
clean_dir(os.path.join(interim, 'chm_tiles'), ['.asc', '.dtm'])

In [None]:
# c.cancel(res)

In [None]:
# shut down the client first, then the local cluster it was connected to
c.close()
cluster.close()