In [1]:
import os, shutil, glob, platform, subprocess
import geopandas as gpd, pandas as pd
import rasterio
from matplotlib import pyplot as plt
from pyFIRS.wrappers import lastools, fusion



In [2]:
from pyFIRS.utils import clean_dir, clean_buffer_polys, clip_tile_from_shp, convert_project

In [3]:
from dask.distributed import Client, progress, LocalCluster
from dask import delayed
import dask

### Setting up parallel computing using `dask.distributed`
`LAStools` offers native multi-core processing as an optional argument (`cores`) supplied to its command-line tools. `FUSION` command line tools do not. To enable parallel processing of `FUSION` commands, we'll use `dask.distributed` to schedule the processing of tiles in asynchronous parallel batches. This approach also offers us the ability to track progress using a progress bar.

You'll first need to launch a parallel computing cluster. 

In [4]:
# start a local dask cluster; the scheduler listens on port 7001 and the diagnostics dashboard on port 7002
cluster=LocalCluster(scheduler_port=7001, diagnostics_port=7002)

In [5]:
# client used throughout the notebook to submit work to the cluster (`c.compute`, `c.scatter`)
c = Client(cluster)

At this point, you should also be able to view an interactive dashboard on port 7002. If you're executing this on a remote server, you'll need to set up port forwarding so you can view the dashboard in your local machine's browser. Once you've done that, or if you're processing on your own machine, you can view the dashboard at [http://localhost:7002/status](http://localhost:7002/status).

# Enough already, let's get to work with some lidar data

We'll define where we can find the binary executables for LAStools and FUSION command line tools.

In [6]:
# wrapper objects pointing at the directories that hold the LAStools and FUSION command line executables
las = lastools.useLAStools('/storage/lidar/LAStools/bin')
fus = fusion.useFUSION('/storage/lidar/FUSION/')

We'll create working directories for raw (imported with modest clean-up from source files), interim, and processed data.

In [7]:
# glob pattern matching the source lidar tiles where they are currently stored
src_tiles = '/storage/lidar/Swinomish_Lidar_2016/source/*.laz'
# root of the working area; every stage directory below lives under it
workdir = os.path.abspath('/storage/lidar/Swinomish_Lidar_2016')

# one directory per data handling stage
raw = os.path.join(workdir, 'raw')
interim = os.path.join(workdir, 'interim')
processed = os.path.join(workdir, 'processed')

# send the working directories and wrapper objects to every worker on the cluster
c.scatter([raw, interim, processed, las, fus], broadcast=True);

In [None]:
# wine_prefix = ['/storage/wine/.wine-{}'.format(x) for x in range(len(c.ncores()))]
# c.scatter(wine_prefix, broadcast=False);

## 0. Get the raw data into our working directory
First, move the tiles over to our working directory.

When we define functions using the `dask.delayed` decorator, the function will have 'lazy' instead of 'eager' execution. That means we can map the function to a list of inputs and it will not execute for any of them until we ask for results to be computed. When we use the `compute()` method of the client, which manages the scheduler that sends jobs to the workers, the computation starts and we get back results that we can pass to `progress()` to monitor with a progress bar.

In [8]:
@dask.delayed
def import_tiles(infile):
    """Lazily copy one source tile into `raw` by calling `las.las2las`.

    Withheld points and points in classes 7 and 18 are dropped on the way in,
    and output is requested as LAZ (`olaz`).

    Args:
        infile: path to a single source tile.

    Returns:
        Whatever `las.las2las` returns; nothing runs until the task is computed.
    """
    return las.las2las(
        i=infile,
        odir=raw,
        drop_withheld=True,  # discard points the vendor flagged as withheld
        drop_class=(7, 18),  # discard points the vendor classified as noise
        olaz=True,
    )


# build one lazy task per source tile, submit them all, then watch the progress bar
infiles = glob.glob(src_tiles)
imp_results = c.compute(list(map(import_tiles, infiles)))
progress(imp_results)

VBox()

Next, create spatial indexes for the input files to allow fast spatial queries (which we'll use for adding buffers).

In [11]:
@dask.delayed
def make_index(infile):
    """Lazily run `las.lasindex` on one tile.

    Args:
        infile: path to the tile to index.

    Returns:
        Whatever `las.lasindex` returns; nothing runs until the task is computed.
    """
    return las.lasindex(i=infile)


# index every tile that was imported into the raw directory
infiles = glob.glob(os.path.join(raw, '*.laz'))
index_results = c.compute(list(map(make_index, infiles)))
progress(index_results)

VBox()

## 1. Retile the data to add buffers for avoiding edge effects during processing.

In practice, executing the `lastile` command on individual tiles in parallel is likely to corrupt your output files. I suspect this is because the dynamic re-tiling of input files means that many output tiles are likely to require inputs from multiple input files, and that parallel processing outside of LAStools may result in collisions writing data from multiple inputs to these output tiles. So, for this case, we'll let `lastile` handle the parallelism under the hood. We won't have a progress bar, but this shouldn't take more than 5-10 minutes.

**THERE ARE ARGUMENTS IN THE FOLLOWING COMMAND THAT DEPEND UPON THE UNITS OF THE DATA.**

The workflow demonstrated here is working in units of US feet on a dataset in Washington State Plane (South). 

In [12]:
%%time
# Retile every raw tile in a single lastile call; parallelism is left to LAStools via `cores`
# rather than dask, for the reasons given in the text above.
las.lastile(i=os.path.join(raw, '*.laz'),
            tile_size=4000, # in units of lidar data
            buffer=100, # assumes units are in feet... if using meters, suggest changing to ~25
            # buffer points are flagged as withheld so a later las2las(drop_withheld=True) can strip them
            flag_as_withheld=True,
            olaz=True,
            odir=os.path.join(interim, 'retiled'),
            # number of entries reported by the client's ncores() (presumably one per worker -- confirm)
            cores=len(c.ncores()));

CPU times: user 1min 9s, sys: 12.3 s, total: 1min 22s
Wall time: 3min 46s




If you want to confirm that the point cloud files you've retiled haven't been corrupted (i.e., they still match valid LAS format specifications), you can use the following function.

In [16]:
@dask.delayed
def validate(infile):
    """Lazily run `las.lasvalidate` on one tile.

    Args:
        infile: path to the tile to validate.

    Returns:
        Whatever `las.lasvalidate` returns; nothing runs until the task is computed.
    """
    return las.lasvalidate(i=infile)


# validate every retiled point cloud
infiles = glob.glob(os.path.join(interim, 'retiled', '*.laz'))
val_results = c.compute(list(map(validate, infiles)))
progress(val_results)

VBox()

In [17]:
# list the tiles whose validation run mentioned 'fail' on stderr
# (args[3] is presumably the input path within the executed command -- confirm against the wrapper)
corrupted = [
    proc.args[3]
    for proc in (future.result() for future in val_results)
    if 'fail' in proc.stderr.decode()
]
corrupted

[]

## 2. Classify points in the lidar point cloud
First we'll remove points that are isolated as likely noise.

In [15]:
@dask.delayed
def denoise(tile_id):
    """Lazily run `las.lasnoise` on one retiled tile, writing to `interim/denoised`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `las.lasnoise` returns; nothing runs until the task is computed.
    """
    source = os.path.join(interim, 'retiled', tile_id + '.laz')
    return las.lasnoise(
        i=source,
        remove_noise=True,
        odir=os.path.join(interim, 'denoised'),
        olaz=True,
    )


# tile ids are the retiled file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'retiled', '*.laz'))]
denoise_results = c.compute(list(map(denoise, tiles)))
progress(denoise_results)

VBox()

Next, calculate the height aboveground for each point for use in classifying them.

In [20]:
@dask.delayed
def hag(tile_id):
    """Lazily run `las.lasheight` on one denoised tile, writing to `interim/lasheight`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `las.lasheight` returns; nothing runs until the task is computed.
    """
    source = os.path.join(interim, 'denoised', tile_id + '.laz')
    return las.lasheight(
        i=source,
        odir=os.path.join(interim, 'lasheight'),
        olaz=True,
    )


# tile ids are the denoised file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'denoised', '*.laz'))]
hag_results = c.compute(list(map(hag, tiles)))
progress(hag_results)

VBox()

Now, we'll classify points as building or high vegetation that meet certain criteria for 'planarity' or 'ruggedness'. 

**THERE ARE ARGUMENTS IN THE FOLLOWING COMMAND THAT DEPEND UPON THE UNITS OF THE DATA.**

If your data are in meters, you should change these parameters, or consider reprojecting the data to a projection that is in feet when you copy the source data into our working directory using `las2las` command at the top of this notebook.

In [21]:
@dask.delayed
def classify(tile_id):
    """Lazily run `las.lasclassify` on one height-normalized tile, writing to `interim/classified`.

    The `step`, `planar` and `rugged` values are chosen for data in feet.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `las.lasclassify` returns; nothing runs until the task is computed.
    """
    source = os.path.join(interim, 'lasheight', tile_id + '.laz')
    return las.lasclassify(
        i=source,
        odir=os.path.join(interim, 'classified'),
        olaz=True,
        step=5,      # for data in meters, the LAStools default is 2.0
        planar=0.5,  # for data in meters, the LAStools default is 0.1
        rugged=1,    # for data in meters, the LAStools default is 0.4
    )


# tile ids are the lasheight file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'lasheight', '*.laz'))]
classify_results = c.compute(list(map(classify, tiles)))
progress(classify_results)

VBox()

We'll now remove the points in the buffered area of each tile and put the clean tiles in the processed folder.

In [22]:
@dask.delayed
def dropwithheld(tile_id):
    """Lazily strip buffer points from one classified tile, writing to `processed/points`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `las.las2las` returns; nothing runs until the task is computed.
    """
    source = os.path.join(interim, 'classified', tile_id + '.laz')
    return las.las2las(
        i=source,
        odir=os.path.join(processed, 'points'),
        olaz=True,
        drop_withheld=True,  # remove points in tile buffers that were flagged as withheld by lastile
        set_user_data=0,     # remove the height above ground calculated by lasheight
    )


# tile ids are the classified file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'classified', '*.laz'))]
drop_results = c.compute(list(map(dropwithheld, tiles)))
progress(drop_results)

VBox()

We'll produce a single shapefile showing the layout of the tiles. This is a single process that takes a few seconds to run, so there's no need to distribute it using `dask`.

In [23]:
%%time
infiles = os.path.join(processed, 'points', '*.laz')
odir = os.path.join(processed, 'vectors')

las.lasboundary(i=infiles,
                odir=odir,
                o='tiles.shp',
                oshp=True,
                use_bb=True, # use bounding box of tiles
                overview=True,
                labels=True,
                cores=32) # use parallel processing

print('Produced a shapefile overview of clean tile boundaries.')

Produced a shapefile overview of clean tile boundaries.
CPU times: user 924 ms, sys: 148 ms, total: 1.07 s
Wall time: 4.72 s


## 3. Generate a bare earth Digital Elevation Model
Generate tiles of the bare earth model. This assumes that the point cloud already contains ground-classified points.

In [24]:
@dask.delayed
def make_dem(tile_id):
    """Lazily rasterize the ground points of one classified tile into `processed/rasters/DEM_tiles`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `las.las2dem` returns; nothing runs until the task is computed.
    """
    source = os.path.join(interim, 'classified', tile_id + '.laz')
    return las.las2dem(
        i=source,
        odir=os.path.join(processed, 'rasters', 'DEM_tiles'),
        otif=True,          # write each tile as a GeoTiff raster
        keep_class=2,       # ground-classified returns only
        step=1,             # output raster resolution, in units of the lidar data
        thin_with_grid=1,   # thin on a 1 x 1 grid before building the TIN for the DEM
        extra_pass=True,    # two passes over the data for more efficient DEM creation
        use_tile_bb=True,   # remove buffers from tiles
    )


# tile ids are the classified file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'classified', '*.laz'))]
dem_results = c.compute(list(map(make_dem, tiles)))
progress(dem_results)

VBox()

In [25]:
@dask.delayed
def add_crs_dem(tile_id):
    """Lazily tag one DEM tile with EPSG:2286 by running `rio edit-info`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        The `subprocess.CompletedProcess` for the command, with stdout and stderr captured.
    """
    tif_path = os.path.join(processed, 'rasters', 'DEM_tiles', tile_id + '.tif')
    command = ['rio', 'edit-info', '--crs', 'EPSG:2286', tif_path]
    return subprocess.run(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)


# tile ids are the DEM GeoTiff file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(processed, 'rasters', 'DEM_tiles', '*.tif'))]
crs_results = c.compute(list(map(add_crs_dem, tiles)))
progress(crs_results)

VBox()

In [26]:
%%time
# get rid of the .tfw and .kml files that LAStools generates
# next to each DEM tile; only the .tif tiles are used by the merge below
clean_dir(os.path.join(processed, 'rasters', 'DEM_tiles'), ['.tfw', '.kml'])

Removed 88 files with extension .tfw.
Removed 88 files with extension .kml.
CPU times: user 20 ms, sys: 4 ms, total: 24 ms
Wall time: 20.1 ms


Merge the bare earth tiles into a single GeoTiff.

In [27]:
%%time
infiles = os.path.join(processed, 'rasters', 'DEM_tiles', '*.tif')
outfile = os.path.join(processed, 'rasters', 'dem.tif')

proc_merge = subprocess.run(['rio', 'merge', *glob.glob(infiles), outfile, '--co', 'compress=LZW'],
                            stderr=subprocess.PIPE, stdout=subprocess.PIPE)

print('Done producing merged DEM GeoTiff.')

Done producing merged DEM GeoTiff.
CPU times: user 40 s, sys: 4.52 s, total: 44.5 s
Wall time: 3min 15s


To create a hillshade layer, we'll first generate hillshade tiles from the bare earth model.

In [28]:
@dask.delayed
def make_hillshade(tile_id):
    """Lazily render a hillshade of one classified tile's ground points into `processed/rasters/hillshade_tiles`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `las.las2dem` returns; nothing runs until the task is computed.
    """
    source = os.path.join(interim, 'classified', tile_id + '.laz')
    return las.las2dem(
        i=source,
        odir=os.path.join(processed, 'rasters', 'hillshade_tiles'),
        otif=True,          # write each tile as a GeoTiff
        hillshade=True,
        keep_class=2,       # ground-classified returns only
        step=1,             # output raster resolution, in units of the lidar data
        thin_with_grid=1,   # thin on a 1 x 1 grid before building the TIN for the DEM
        extra_pass=True,    # two passes over the data for more efficient DEM creation
        use_tile_bb=True,   # remove buffers from tiles
    )


# tile ids are the classified file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'classified', '*.laz'))]
hill_results = c.compute(list(map(make_hillshade, tiles)))
progress(hill_results)

VBox()

In [29]:
@dask.delayed
def add_crs_hill(tile_id):
    """Lazily tag one hillshade tile with EPSG:2286 by running `rio edit-info`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        The `subprocess.CompletedProcess` for the command, with stdout and stderr captured.
    """
    tif_path = os.path.join(processed, 'rasters', 'hillshade_tiles', tile_id + '.tif')
    command = ['rio', 'edit-info', '--crs', 'EPSG:2286', tif_path]
    return subprocess.run(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)


# tile ids are the hillshade GeoTiff file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(processed, 'rasters', 'hillshade_tiles', '*.tif'))]
crs_results = c.compute(list(map(add_crs_hill, tiles)))
progress(crs_results)

VBox()

In [30]:
%%time
# get rid of the .tfw and .kml files that LAStools generates
# next to each hillshade tile; only the .tif tiles are used by the merge below
clean_dir(os.path.join(processed, 'rasters', 'hillshade_tiles'), ['.tfw', '.kml'])

Removed 88 files with extension .tfw.
Removed 88 files with extension .kml.
CPU times: user 8 ms, sys: 4 ms, total: 12 ms
Wall time: 9.55 ms


Now merge the hillshade tiles into a single raster formatted as GeoTiff.

In [31]:
%%time
infiles = os.path.join(processed, 'rasters', 'hillshade_tiles', '*.tif')
outfile = os.path.join(processed, 'rasters', 'hillshade.tif')

proc_merge = subprocess.run(['rio', 'merge', *glob.glob(infiles), outfile, '--co', 'compress=LZW'],
                            stderr=subprocess.PIPE, stdout=subprocess.PIPE)

print('Done producing merged Hillshade GeoTiff.')

Done producing merged Hillshade GeoTiff.
CPU times: user 12.3 s, sys: 1.56 s, total: 13.9 s
Wall time: 59.6 s


## 4. Identify building footprints
First start by building shapefiles showing building boundaries in each buffered tile.

In [32]:
@dask.delayed
def bldg_tiles(tile_id):
    """Lazily outline the building-classified points of one classified tile into `interim/building_tiles`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `las.lasboundary` returns; nothing runs until the task is computed.
    """
    source = os.path.join(interim, 'classified', tile_id + '.laz')
    return las.lasboundary(
        i=source,
        odir=os.path.join(interim, 'building_tiles'),
        keep_class=6,    # building-classified points only
        disjoint=True,   # a separate polygon for each building
        concavity=3,     # map concave boundary if edge length >= 3ft
    )


# tile ids are the classified file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'classified', '*.laz'))]
bldg_results = c.compute(list(map(bldg_tiles, tiles)))
progress(bldg_results)

VBox()

Generate shapefiles showing the bounding box of each (unbuffered) tile that we'll use to remove buildings that fall in the buffered area.

In [33]:
@dask.delayed
def bbox_shp(tile_id):
    """Lazily write the tile bounding box of one clean tile as a shapefile in `interim/tile_boundaries`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `las.lasboundary` returns; nothing runs until the task is computed.
    """
    source = os.path.join(processed, 'points', tile_id + '.laz')
    return las.lasboundary(
        i=source,
        odir=os.path.join(interim, 'tile_boundaries'),
        oshp=True,
        use_tile_bb=True,
    )


# tile ids are the clean point-cloud file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(processed, 'points', '*.laz'))]
bbox_results = c.compute(list(map(bbox_shp, tiles)))
progress(bbox_results)

VBox()

For each shapefile containing polygons of the building boundaries, we'll use the `clean_buffer_polys` function from `pyFIRS.utils` to remove polygons from a tile if their centroid falls in the buffered area of the tile.

In [34]:
@dask.delayed
def clean_bldgs(tile_id):
    """Lazily run `clean_buffer_polys` on one tile's building polygons, writing to `processed/vectors/building_tiles`.

    The building shapefile and the tile-boundary shapefile share the same file name.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `clean_buffer_polys` returns; nothing runs until the task is computed.
    """
    shp_name = tile_id + '.shp'
    buildings = os.path.join(interim, 'building_tiles', shp_name)
    boundary = os.path.join(interim, 'tile_boundaries', shp_name)
    return clean_buffer_polys(
        buildings,
        boundary,
        odir=os.path.join(processed, 'vectors', 'building_tiles'),
        simp_tol=3,
        simp_topol=True,
    )


# tile ids are the building shapefile names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'building_tiles', '*.shp'))]
clean_results = c.compute(list(map(clean_bldgs, tiles)))
progress(clean_results)

VBox()

Merge the cleaned tiles of building footprints together into a single shapefile. We'll use `geopandas` to concatenate all the polygons together into a single geodataframe and then write out to a new shapefile.

In [35]:
%%time
building_tiles = glob.glob(os.path.join(processed, 'vectors', 'building_tiles', '*.shp'))
# create a list of geodataframes containing the tiles of building footprints
gdflist = [gpd.read_file(tile) for tile in building_tiles]
# merge them all together
merged = gpd.GeoDataFrame(pd.concat(gdflist, ignore_index=True))
# using pandas' concat caused us to lose projection information, so let's add that back in
merged.crs = gdflist[0].crs
# and write the merged data to a new shapefile
merged.to_file(os.path.join(processed,'vectors','buildings.shp'))

print('Done merging tiles of building footprints into a single shapefile.')

Done merging tiles of building footprints into a single shapefile.
CPU times: user 1.22 s, sys: 160 ms, total: 1.38 s
Wall time: 1.35 s


## 5. Create a Canopy Height Model
We're going to switch to using a FUSION command line tool to generate Canopy Height Models (CHMs).

### Using FUSION's `canopymodel` to generate CHMs
`FUSION` wants to have ground models formatted as .dtm files, for CHM development and for estimating other canopy metrics. Let's generate these ground models first using a 1-meter x-y resolution.

In [36]:
@dask.delayed
def groundDTMs(tile_id):
    """Lazily build a FUSION .dtm ground surface for one classified tile in `interim/dtm_ground_tiles`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `fus.gridsurfacecreate` returns; nothing runs until the task is computed.
    """
    infile = os.path.join(interim, 'classified', tile_id + '.laz')
    odir = os.path.join(interim, 'dtm_ground_tiles')
    # the output keeps the input's name up to the first '.', with a .dtm extension
    stem = os.path.basename(infile).partition('.')[0]
    outfile = os.path.join(odir, stem + '.dtm')
    return fus.gridsurfacecreate(
        surfacefile=outfile,
        cellsize=3.28084,  # 1 meter expressed in feet
        xyunits='F',
        zunits='F',
        coordsys=2,    # in State Plane
        zone=0,        # not in UTM
        horizdatum=2,  # NAD83
        vertdatum=2,   # NAVD88
        datafile=infile,
        las_class=2,   # ground-classified points only
        odir=odir,     # makes sure the output directory exists
    )


# tile ids are the classified file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'classified', '*.laz'))]
dtm_results = c.compute(list(map(groundDTMs, tiles)))
progress(dtm_results)

VBox()

In [37]:
@dask.delayed
def canopymodel(tile_id):
    """Lazily build a canopy height model for one classified tile in `interim/chm_tiles`.

    The ground surface is read from the same-named .dtm file in `interim/dtm_ground_tiles`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `fus.canopymodel` returns; nothing runs until the task is computed.
    """
    infile = os.path.join(interim, 'classified', tile_id + '.laz')
    odir = os.path.join(interim, 'chm_tiles')
    # the CHM and its ground model share one file name: the input's name up to the first '.', plus .dtm
    dtm_name = os.path.basename(infile).partition('.')[0] + '.dtm'
    outfile = os.path.join(odir, dtm_name)
    ground_model = os.path.join(interim, 'dtm_ground_tiles', dtm_name)
    return fus.canopymodel(
        surfacefile=outfile,
        cellsize=1,
        xyunits='F',
        zunits='F',
        coordsys=2,    # in State Plane
        zone=0,        # not in UTM
        horizdatum=2,  # NAD83
        vertdatum=2,   # NAVD88
        datafiles=infile,
        ground=ground_model,
        median=3,             # median smoothing in 3x3 kernel
        las_class=(1, 2, 5),  # unclassified, ground, and high veg points only
        asc=True,             # also output in ascii format
        odir=odir,            # makes sure the output directory exists
    )


# tile ids are the classified file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'classified', '*.laz'))]
canopy_results = c.compute(list(map(canopymodel, tiles)))
progress(canopy_results)

VBox()

Convert the ascii files that `canopymodel` generated into GeoTiffs, specifying their projection. Then clean up the files `canopymodel` generated that we don't care about.

In [38]:
@dask.delayed
def asc2tif(tile_id):
    """Lazily pass one CHM ascii tile to `convert_project` with '.tif' and 'EPSG:2286'.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `convert_project` returns; nothing runs until the task is computed.
    """
    asc_path = os.path.join(interim, 'chm_tiles', tile_id + '.asc')
    return convert_project(asc_path, '.tif', 'EPSG:2286')


# tile ids are the ascii CHM file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'chm_tiles', '*.asc'))]
proj_results = c.compute(list(map(asc2tif, tiles)))
progress(proj_results)

VBox()

In [39]:
# delete the .asc and .dtm files left in the CHM tile directory; the .tif versions are what get clipped next
clean_dir(os.path.join(interim, 'chm_tiles'), ['.asc', '.dtm'])

Removed 85 files with extension .asc.
Removed 85 files with extension .dtm.


Clip the canopy height model tiles to remove overlapping areas that were from tile buffering to avoid edge effects.

In [40]:
@dask.delayed
def clip(tile_id):
    """Lazily clip one CHM GeoTiff with its tile-boundary shapefile, writing to `processed/rasters/chm_tiles`.

    Args:
        tile_id: tile file name without its extension.

    Returns:
        Whatever `clip_tile_from_shp` returns; nothing runs until the task is computed.
    """
    chm_tif = os.path.join(interim, 'chm_tiles', tile_id + '.tif')
    boundary_shp = os.path.join(interim, 'tile_boundaries', tile_id + '.shp')
    out_dir = os.path.join(processed, 'rasters', 'chm_tiles')
    return clip_tile_from_shp(chm_tif, boundary_shp, out_dir)


# tile ids are the CHM GeoTiff file names up to the first '.'
tiles = [os.path.basename(path).partition('.')[0]
         for path in glob.glob(os.path.join(interim, 'chm_tiles', '*.tif'))]
clip_results = c.compute(list(map(clip, tiles)))
progress(clip_results)

VBox()

Merge the trimmed canopy height model tiles into a single raster.

In [41]:
%%time
infiles = os.path.join(processed, 'rasters', 'chm_tiles', '*.tif')
outfile = os.path.join(processed, 'rasters', 'chm.tif')

proc_merge = subprocess.run(['rio', 'merge', *glob.glob(infiles), outfile, '--co', 'compress=LZW'],
                            stderr=subprocess.PIPE, stdout=subprocess.PIPE)

print('Done producing merged Canopy Height Model GeoTiff.')

Done producing merged Canopy Height Model GeoTiff.
CPU times: user 31.2 s, sys: 3.74 s, total: 35 s
Wall time: 2min 3s
