In [1]:
import os
import glob
import dask
from dask.distributed import Client, progress, LocalCluster
from pyFIRS.wrappers import lastools
from pyFIRS.wrappers import fusion
from pyFIRS.utils import validation_summary, move_invalid_tiles, fname

In [2]:
# data handling directories
# WORKDIR is the root folder for this lidar acquisition; the 'src' and 'raw'
# sub-directory paths used later in the notebook are built relative to it.
WORKDIR = os.path.abspath('/storage/lidar/portland-metro_2014/')
# EPSG code handed to las2las as the reprojection target when tiles are imported
TARGET_EPSG = 6339  # utm 10N, NAD83_2011
# TARGET_EPSG = 6340  # utm 11N, NAD83_2011

In [3]:
# directory holding the source tiles, and the list of tiles found in it
SRC = os.path.join(WORKDIR, 'src')
src_tiles = glob.glob(os.path.join(SRC, '*.laz'))
# src_tiles = glob.glob(os.path.join(SRC, '*.las'))  # use when the source tiles are .las

# directory that will receive the processed source tiles
RAW = os.path.join(WORKDIR, 'raw')

# report how many tiles were discovered and where we looked for them
print('Found {:,d} tiles in source directory:\n {}'.format(len(src_tiles), SRC))

Found 2,602 tiles in source directory:
 /storage/lidar/portland-metro_2014/src


# Enough already, let's get to work with some lidar data
We'll define where we can find the binary executables for LAStools and FUSION command line tools.

In [4]:
# wrapper objects pointing at the directories that hold the LAStools and
# FUSION command line executables
las = lastools.useLAStools('/storage/lidar/LAStools/bin')
fus = fusion.useFUSION('/storage/lidar/FUSION/')

In [5]:
# take a peek at the header info of the first lidar source tile;
# echo=True is what makes the lasinfo report show up below the cell
info_proc = las.lasinfo(i=src_tiles[0], echo=True)


lasinfo (190321) report for '/storage/lidar/portland-metro_2014/src/45122D1414.laz'
reporting all LAS header entries:
  file signature:             'LASF'
  file source ID:             0
  global_encoding:            0
  project ID GUID data 1-4:   00000000-0000-0000-0000-000000000000
  version major.minor:        1.2
  system identifier:          'NOAA OCM'
  generating software:        'datum_shift (9515 2017-07-12)'
  file creation day/year:     126/2015
  header size:                227
  offset to point data:       525
  number var. length records: 3
  point data format:          3
  point data record length:   34
  number of point records:    52316423
  number of points by return: 21652327 17089749 9789058 3785289 0
  scale factor x y z:         0.0000001 0.0000001 0.01
  offset x y z:               -122 45 0
  min x y z:                  -122.0249857 45.3999944 693.78
  max x y z:                  -122.0124856 45.4124946 957.74
variable length header record 1 of 3:
  reserved  

### Setting up parallel computing using `dask.distributed`
`LAStools` offers native multi-core processing as an optional argument (`cores`) supplied to its command-line tools. `FUSION` command line tools do not. To enable parallel processing of `FUSION` commands, we'll use `dask.distributed` to schedule the processing of tiles in asynchronous parallel batches. This approach also offers us the ability to track progress using a progress bar.

You'll first need to launch a parallel computing cluster. 

In [7]:
# start a local cluster with default settings; explicit ports could be requested
# instead, e.g. LocalCluster(scheduler_port=7001, diagnostics_port=7002)
cluster = LocalCluster()
c = Client(cluster)

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


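At this point, you should also be able to view an interactive dashboard. If you launched the cluster with `diagnostics_port=7002`, the dashboard is served on port 7002; with the default settings used above, `dask.distributed` serves it on port 8787, or on a random port if 8787 is already in use (see the message printed when the cluster starts). If you're executing this on a remote server, you'll need to set up port forwarding so you can view the dashboard in your local machine's browser. Once you've done that, or if you're processing on your own machine, you can view the dashboard at [http://localhost:7002/status](http://localhost:7002/status) (substitute the port your cluster is actually using).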

In [8]:
# c.ncores() has one entry per worker, so its length is the number of
# workers in the cluster (not the total number of cores across them)
num_cores = len(c.ncores())

# push our working directories and wrapper classes to the workers on the cluster as well
c.scatter(
    [WORKDIR, SRC, RAW, las, fus, TARGET_EPSG, num_cores],
    broadcast=True,
);

## Get the raw data into our working directory
First, move the tiles over to our working directory.

When we define functions using the `dask.delayed` decorator, the function will have 'lazy' instead of 'eager' execution. We can map the function to a list of inputs and it will not execute for any of them until we ask for results to be computed. When we call the `compute()` or `persist()` method of the client (which manages the scheduler that sends jobs to the workers), the jobs start running; this notebook uses `persist()` below.

In [9]:
@dask.delayed
def import_tile(tile_id):
    """Import one source tile from SRC into RAW with las2las.

    Lazily evaluated (wrapped in ``dask.delayed``). Nothing is done when
    the output tile already exists in RAW, so tiles imported by an
    earlier run are not processed again.

    Parameters
    ----------
    tile_id : str
        Tile name without the '.laz' extension; the input is read from
        ``SRC/<tile_id>.laz`` and the output is expected at
        ``RAW/<tile_id>.laz``.

    Returns
    -------
    str
        ``tile_id``, unchanged.
    """
    source_path = os.path.join(SRC, tile_id + '.laz')
    # source_path = os.path.join(SRC, tile_id + '.las')
    imported_path = os.path.join(RAW, tile_id + '.laz')

    # an existing output means this tile has already been imported
    if not os.path.exists(imported_path):
        # Options that can be swapped in for other source datasets:
        #   epsg=32149 or epsg=2927     specify the source lidar projection
        #                               (washington state plane south)
        #   elevation_surveyfeet=True
        #   survey_feet=True
        #   nad83_2011=True             original data in nad83_2011 datum
        las.las2las(
            i=source_path,
            drop_withheld=True,
            drop_class=(7, 18),  # classified as noise
            longlat=True,  # original data is in geographic coordinates
            nad83_harn=True,  # original data in nad83_harn datum
            target_epsg=TARGET_EPSG,  # reproject
            dont_remove_empty_files=True,
            odir=RAW,
            olaz=True,
        )

    return tile_id

Next, validate that the data match LAS specifications and have not been corrupted.

In [10]:
@dask.delayed
def validate(tile_id):
    """Run lasvalidate on an imported tile and store the report next to it.

    Lazily evaluated (wrapped in ``dask.delayed``). The tool is not run
    when ``RAW/<tile_id>.xml`` already exists.

    Parameters
    ----------
    tile_id : str
        Tile name without extension; the tile is read from
        ``RAW/<tile_id>.laz``.

    Returns
    -------
    str
        ``tile_id``, unchanged.
    """
    tile_path = os.path.join(RAW, tile_id + '.laz')
    report_path = os.path.join(RAW, tile_id + '.xml')

    # an existing report means this tile has already been validated
    if not os.path.exists(report_path):
        las.lasvalidate(i=tile_path, o=report_path)

    return tile_id

Next, create spatial indexes for the input files to allow fast spatial queries (which are used, for example, when retiling and adding buffers).

In [11]:
@dask.delayed
def make_index(tile_id):
    """Run lasindex on an imported tile unless its '.lax' file already exists.

    Lazily evaluated (wrapped in ``dask.delayed``).

    Parameters
    ----------
    tile_id : str
        Tile name without extension; the tile is read from
        ``RAW/<tile_id>.laz``.

    Returns
    -------
    str
        ``tile_id``, unchanged.
    """
    tile_path = os.path.join(RAW, tile_id + '.laz')
    index_path = os.path.join(RAW, tile_id + '.lax')

    # an existing index file means this tile has already been indexed
    if not os.path.exists(index_path):
        las.lasindex(i=tile_path)

    return tile_id

## Hand-build the computational graph
Define the recipe for computations.

In [12]:
tile_ids = [fname(tile) for tile in src_tiles]

# Hand-written dask graph: each key names a step for one tile and maps to a
# (function, argument) pair. Using another key as the argument makes that
# step wait for the earlier one: import -> validate -> index.
get_data = {}
for tile in tile_ids:
    get_data.update({
        'import-{}'.format(tile): (import_tile, tile),
        'validate-{}'.format(tile): (validate, 'import-{}'.format(tile)),
        'index-{}'.format(tile): (make_index, 'validate-{}'.format(tile)),
    })


# this empty function will be added to recipe for computations
# it will be defined to depend upon all previous steps being completed
@dask.delayed
def done_importing(*args, **kwargs):
    """Do nothing; exists only as a final node that depends on every tile."""
    return None


# the final node lists the index step of every tile as its input
get_data['done_importing'] = (
    done_importing,
    list(map('index-{}'.format, tile_ids)),
)

In [13]:
# NOTE(review): every task function in `get_data` is wrapped in dask.delayed, so
# this call appears to hand back a lazy object for 'done_importing' rather than
# running the LAStools commands themselves -- confirm before relying on it
get_data_graph = c.get(get_data, 'done_importing')  # build the computational graph

In [14]:
# hand the graph to the cluster; progress is monitored in the next cell
get_data_results = c.persist(get_data_graph)  # start executing it

In [15]:
# track the tasks started by persist() as they complete
progress(get_data_results)  # progress bars

VBox()

In [15]:
# uncomment to cancel the tasks that were started above
# c.cancel(get_data_results)

In [16]:
# summarize the lasvalidate XML reports that the validate step wrote into RAW
validation_summary(xml_dir=RAW, verbose=False)

LASvalidate Summary
Passed: 2,602
Failed: 0
ParseErrors: 0



In [None]:
# uncomment if the summary above reports failures
# NOTE(review): presumably relocates tiles whose reports failed into RAW/invalid -- confirm
# move_invalid_tiles(xml_dir=RAW, dest_dir=os.path.join(RAW, 'invalid'))

In [17]:
# uncomment to shut down the client and the local cluster when finished
# c.close()
# cluster.close()