In [1]:
import os
import glob
import dask
from dask.distributed import Client, progress, LocalCluster
from pyFIRS.wrappers import lastools
from pyFIRS.utils import validation_summary, move_invalid_tiles

### Setting up parallel computing using `dask.distributed`
`LAStools` offers native multi-core processing as an optional argument (`cores`) supplied to its command-line tools. `FUSION` command line tools do not. To enable parallel processing of `FUSION` commands, we'll use `dask.distributed` to schedule the processing of tiles in asynchronous parallel batches. This approach also offers us the ability to track progress using a progress bar.

You'll first need to launch a parallel computing cluster. 

In [2]:
# Launch a local cluster with fixed ports for the scheduler (7001) and the
# diagnostics dashboard (7002), then connect a client to it.
cluster = LocalCluster(
    scheduler_port=7001,
    diagnostics_port=7002,
)
c = Client(cluster)

At this point, you should also be able to view an interactive dashboard on port 7002. If you're executing this on a remote server, you'll need to set up port forwarding so you can view the dashboard in your local machine's browser. Once you've done that, or if you're processing on your own machine, you can view the dashboard at [http://localhost:7002/status](http://localhost:7002/status).

# Enough already, let's get to work with some lidar data
We'll define where we can find the binary executables for LAStools and FUSION command line tools.

In [3]:
# wrapper object for the LAStools command line tools, given the directory that holds their executables
las = lastools.useLAStools('/storage/lidar/LAStools/bin')

In [4]:
# where the raw lidar data is currently stored
src_dir = '/storage/lidar/OregonCIG_Lidar/odf_northwest_2015/WILKERSON/'
# every .laz file found directly inside src_dir
src_tiles = glob.glob(os.path.join(src_dir, '*.laz'))
src_epsg = 2992 # oregon lambert (intl_ft) coordinate reference system

target_epsg = 26910 # utm 10 N

# working directory for this dataset; the raw and interim directory paths are built beneath it
workdir = os.path.abspath('/storage/lidar/odf_northwest_2015/wilkerson')

In [5]:
# data handling directories, both located beneath the working directory
raw = os.path.join(workdir, 'raw')
interim = os.path.join(workdir, 'interim')

# count the entries the client reports from ncores(), i.e. how many workers we have
num_cores = len(c.ncores())

# send our directories, wrapper object and settings to every worker on the cluster
c.scatter(
    [src_dir, raw, interim, las, src_epsg, target_epsg, num_cores],
    broadcast=True,
);

## Get the raw data into our working directory
First, move the tiles over to our working directory.

When we define functions using the `dask.delayed` decorator, the function will have 'lazy' instead of 'eager' execution. We can map the function to a list of inputs and it will not execute for any of them until we ask for results to be computed. The jobs only start running once we ask the client (which manages the scheduler that sends jobs to the workers) for results, for example with its `compute()` or `persist()` method.

In [6]:
@dask.delayed
def import_tile(tile_id):
    """Copy one source tile into the ``raw`` directory via ``las.las2las``.

    Nothing is done when ``<raw>/<tile_id>.laz`` already exists. Otherwise the
    tile ``<src_dir>/<tile_id>.laz`` is filtered, reprojected from ``src_epsg``
    to ``target_epsg`` and written to ``raw`` as a compressed ``.laz`` file.

    Parameters
    ----------
    tile_id : str
        Tile file name without the ``.laz`` extension.

    Returns
    -------
    str
        ``tile_id`` unchanged, so the next step of the recipe can consume it.
    """
    already_imported = os.path.exists(os.path.join(raw, tile_id + '.laz'))

    if not already_imported:
        las.las2las(
            i=os.path.join(src_dir, tile_id + '.laz'),
            # --- filtering ---
            drop_withheld=True,  # drop any points flagged as withheld by vendor
            drop_class=(7,18),  # drop any points classified as noise by vendor
            clip_to_bounding_box=True,  # make sure corrupted files don't change bbox
            # --- source coordinate system ---
            epsg=src_epsg,  # specify the source projection in case it isn't automatically recognized
            feet=True,  # source data in feet
            elevation_feet=True,  # source data in feet
            # --- target coordinate system ---
            target_epsg=target_epsg,  # reproject to utm zone 10 N
            target_meter=True,  # use meters for output
            target_elevation_meter=True,  # use meters for output
            # --- output format ---
            rescale=(0.01,0.01,0.01),
            point_type=1,  # removes color data if present
            dont_remove_empty_files=True,
            odir=raw,
            olaz=True,  # compress .laz file output
        )

    return tile_id

Next, validate that the data match LAS specifications and have not been corrupted.

In [7]:
@dask.delayed
def validate(tile_id):
    """Run ``las.lasvalidate`` on one imported tile unless its report exists.

    The report is written to ``<raw>/<tile_id>.xml``; when that file is already
    present the tile is skipped.

    Parameters
    ----------
    tile_id : str
        Tile file name without extension.

    Returns
    -------
    str
        ``tile_id`` unchanged, so the next step of the recipe can consume it.
    """
    report = os.path.join(raw, tile_id + '.xml')

    if not os.path.exists(report):
        las.lasvalidate(i=os.path.join(raw, tile_id + '.laz'), o=report)

    return tile_id

Next, create spatial indexes for the input files to allow fast spatial queries (which are used, for example, when retiling and adding buffers).

In [8]:
@dask.delayed
def make_index(tile_id):
    """Run ``las.lasindex`` on one tile in ``raw`` unless it is already indexed.

    The presence of ``<raw>/<tile_id>.lax`` is taken as the sign that the tile
    has been indexed before, in which case nothing is done.

    Parameters
    ----------
    tile_id : str
        Tile file name without extension.

    Returns
    -------
    str
        ``tile_id`` unchanged.
    """
    index_file = os.path.join(raw, tile_id + '.lax')
    if os.path.exists(index_file):
        return tile_id

    las.lasindex(i=os.path.join(raw, tile_id + '.laz'))
    return tile_id

## Hand-build the computational graph
Define the recipe for computations.

In [9]:
# Tile IDs are the source file names with only the final extension removed.
# os.path.splitext (rather than cutting at the first '.') keeps any dots that are
# part of the tile name, so that tile_id + '.laz' still names the source file.
tile_ids = [os.path.splitext(os.path.basename(file))[0] for file in src_tiles]

get_data = {} # a dictionary that will be used to define dask's computational graph
for tile in tile_ids:
    # each entry maps a task name to (function, input); an input that is the name
    # of another task chains the steps: import -> validate -> index
    import_key = 'import-{}'.format(tile)
    validate_key = 'validate-{}'.format(tile)
    index_key = 'index-{}'.format(tile)

    get_data[import_key] = (import_tile, tile)
    get_data[validate_key] = (validate, import_key)
    get_data[index_key] = (make_index, validate_key)
    
# A do-nothing task that closes the recipe for computations: it is given the
# result of every indexing step, so it depends upon all previous steps being completed.
@dask.delayed
def done_importing(*args, **kwargs):
    """Accept any arguments and return nothing; used only to collect dependencies."""
    return None

get_data['done_importing'] = (done_importing, list(map('index-{}'.format, tile_ids)))

In [10]:
# Hand the recipe to the client and ask for the final 'done_importing' key.
# NOTE(review): because every step function is wrapped in dask.delayed, this call
# presumably returns a lazy object describing the work rather than running it;
# the persist() call below is what launches execution on the cluster - confirm.
get_data_graph = c.get(get_data, 'done_importing') # builds the computational graph
get_data_results = c.persist(get_data_graph) # starts executing it
progress(get_data_results) # progress bars

VBox()

In [None]:
# c.cancel(get_data_results)

In [14]:
# summarize the lasvalidate reports (.xml files) stored in the raw directory
# NOTE(review): the saved execution counts show this cell was last run after the
# move_invalid_tiles cell below, so the stored output presumably covers only the
# tiles that remained in raw - confirm with a fresh top-to-bottom run.
validation_summary(xml_dir=raw, verbose=True)

LASvalidate Summary
Passed: 197
Failed: 0
ParseErrors: 0

Details




In [13]:
# using the validation reports in raw, relocate files for invalid tiles into raw/invalid
move_invalid_tiles(xml_dir=raw, dest_dir=os.path.join(raw, 'invalid'))

Moved files for 38 invalid tiles to /storage/lidar/odf_northwest_2015/wilkerson/raw/invalid


In [15]:
# c.close()
# cluster.close()

