In this notebook, we clip 1-hectare and smaller plot-sized excerpts from lidar point cloud tiles provided by NOAA. We also rename these acquisitions (replacing the NOAA ID numbers) for consistency with the other data sources we're combining them with.

In [1]:
import glob
import os
import subprocess

import dask
import geopandas as gpd
import pandas as pd
from dask.distributed import Client, LocalCluster, progress, wait

In [13]:
# shapefile of the BLM, USFS and WA-DNR plots (presumably 1-hectare footprints, going by the file name)
PLOTS = '../data/processed/blm_usfs_wadnr_plots_1ha.shp'
plots = gpd.read_file(PLOTS)
# replace each plot's geometry with its centroid so that the spatial joins
# below match every plot on a single point
plots['geometry'] = plots.centroid
plots.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 5089 entries, 0 to 5088
Data columns (total 10 columns):
comments     496 non-null object
lat          5089 non-null float64
lon          5089 non-null float64
meas_date    3866 non-null object
orig_id      5089 non-null object
plot_id      3866 non-null object
source       5089 non-null object
meas_yr      5089 non-null int64
uuid         5089 non-null object
geometry     5089 non-null object
dtypes: float64(2), int64(1), object(7)
memory usage: 397.7+ KB


In [14]:
# check which coordinate reference system the plots are stored in
plots.crs

{'init': 'epsg:4326'}

In [15]:
# index of the NOAA lidar tiles: one geometry per tile plus the tile's file name and point statistics
TILE_IDX = '../data/raw/lidar/noaa_tiles/noaa_tileindex.shp'
tile_idx = gpd.read_file(TILE_IDX)
tile_idx.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3617 entries, 0 to 3616
Data columns (total 12 columns):
file_name     3617 non-null object
version       3617 non-null object
num_points    3617 non-null int64
point_type    3617 non-null int64
point_size    3617 non-null int64
min_x         3617 non-null float64
max_x         3617 non-null float64
min_y         3617 non-null float64
max_y         3617 non-null float64
min_z         3617 non-null float64
max_z         3617 non-null float64
geometry      3617 non-null object
dtypes: float64(6), int64(3), object(3)
memory usage: 339.2+ KB


In [16]:
# check which coordinate reference system the tile index is stored in
tile_idx.crs

{'init': 'epsg:4326'}

In [17]:
# the text before the first underscore of a tile name is 'NOAA<id>'
# (e.g. 'NOAA6341_45121H71101A.laz'); keep the numeric part as the acquisition id
noaa_prefix = tile_idx.file_name.str.split('_').str[0]
tile_idx['noaa_id'] = noaa_prefix.str.split('NOAA').str[-1].astype(int)
# make sure the tile index is expressed in EPSG:4326
tile_idx = tile_idx.to_crs({'init': 'epsg:4326'})

In [18]:
# lookup table from NOAA acquisition id to the acquisition name used in this project
NOAA_NAMES = '../data/raw/lidar/noaa_tiles/noaa_acq_name_lookup.csv'
noaa_name_lookup = pd.read_csv(NOAA_NAMES, index_col='noaa_id')
noaa_name_lookup.head()

Unnamed: 0_level_0,acq_name
noaa_id,Unnamed: 1_level_1
3,west-coast_1997
4,west-coast_1998
12,willapa_2002
14,pacific-coast-shoreline_2002
81,panther-creek-leaf-on_2007


In [19]:
# assign acquisition names to noaa tiles based on tile_ids
# a single label-based lookup returns one name per tile, in tile order; taking
# .values drops the noaa_id index so the names line up with tile_idx by position
tile_idx['acq_name'] = noaa_name_lookup.loc[tile_idx.noaa_id, 'acq_name'].values

In [20]:
# outlines of UTM zones 10 and 11
UTM_10 = '../data/external/utm_zone10_epsg4326.shp'
UTM_11 = '../data/external/utm_zone11_epsg4326.shp'
utm_10, utm_11 = gpd.read_file(UTM_10), gpd.read_file(UTM_11)
# stack both zones into one table and keep only the outline and the zone label
utm_stacked = pd.concat([utm_10, utm_11], axis=0, ignore_index=True)
utm_zones = utm_stacked[['geometry', 'ZONE']]
utm_zones.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
geometry    2 non-null object
ZONE        2 non-null object
dtypes: object(2)
memory usage: 112.0+ bytes


In [25]:
# tag every plot with the UTM zone it falls in
plot_zones = gpd.sjoin(plots, utm_zones)
plot_utm = plot_zones[['uuid', 'ZONE', 'source', 'geometry']].set_index('uuid')
# EPSG code of the projected CRS used for each plot: 6339 in zone '10', 6340 otherwise
ZONE_EPSG = {'10': 6339}
plot_utm['epsg'] = plot_utm.ZONE.map(lambda zone: ZONE_EPSG.get(zone, 6340))
len(plot_utm)

5089

In [85]:
# define the utm coordinates of the centroid of each plot in that zone
for epsg in (6339, 6340):
    in_zone = plot_utm.epsg == epsg
    # reproject only the plots of this zone, once, and reuse the result for x and y
    zone_centroids = plot_utm.loc[in_zone].to_crs({'init': 'epsg:{}'.format(epsg)}).centroid
    plot_utm.loc[in_zone, 'utm_x'] = zone_centroids.x
    plot_utm.loc[in_zone, 'utm_y'] = zone_centroids.y

In [31]:
# match every plot to the lidar tile(s) it falls in; a plot that is covered by
# several tiles gets one row per tile, so index labels (plot uuids) can repeat
joined = gpd.sjoin(plot_utm, tile_idx[['geometry', 'file_name', 'acq_name']]).drop('index_right', axis=1)

In [32]:
# summary of the plot-to-tile matches
joined.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 3714 entries, 672f4279-a744-44ba-86b9-933cae1639cb to b4d87c56-955b-4184-aac2-5db1ab1fd4ba
Data columns (total 8 columns):
ZONE         3714 non-null object
source       3714 non-null object
geometry     3714 non-null object
epsg         3714 non-null int64
utm_x        3714 non-null float64
utm_y        3714 non-null float64
file_name    3714 non-null object
acq_name     3714 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 261.1+ KB


In [38]:
# preview the matches with the plot uuid as a regular column; the rename only
# has an effect when reset_index() names the former index column 'index'
joined.reset_index().rename({'index':'uuid'}, axis=1).head()

Unnamed: 0,uuid,ZONE,source,geometry,epsg,utm_x,utm_y,file_name,acq_name
0,672f4279-a744-44ba-86b9-933cae1639cb,10,USFS-GIFFORD-PINCHOT,POINT (-122.2449995 45.75772433102937),6339,558716.024317,5067407.0,NOAA6331_20170403_USGS_LPC_WA_Western_South_20...,wa-western_2016
1,c5a52bf7-6d95-47d4-b319-00d3f34a6d42,10,USFS-GIFFORD-PINCHOT,POINT (-121.8728871000001 45.99883556102178),6339,587276.602779,5094536.0,NOAA6341_45121H71101A.laz,wasco_2015
2,dc5667e6-18e6-426b-9ea7-a6c52a74d2db,10,USFS-GIFFORD-PINCHOT,POINT (-121.8637798000001 45.98675064102214),6339,588000.966298,5093203.0,NOAA6341_45121H71106B.laz,wasco_2015
3,154f562f-c3b2-4d5a-a303-f62900fd8a60,10,USFS-GIFFORD-PINCHOT,POINT (-121.9608099999999 45.98510501102219),6339,580488.330145,5092918.0,NOAA6341_45121H81109A.laz,wasco_2015
4,ac47ea0b-14fc-43cb-9b24-610102be8dd5,10,USFS-GIFFORD-PINCHOT,POINT (-121.934112 45.97933816102238),6339,582564.733951,5092304.0,NOAA6341_45121H82206C.laz,wasco_2015


In [33]:
# we'll need to group by forest data source (USFS, BLM, DNR)
# as well as by lidar acquisition name so that each combo can be processed separately
# we'll need to do this for both UTM 10 and 11

In [39]:
GROUP_COLS = ['acq_name', 'source', 'uuid', 'file_name']
# one row per unique (acquisition, source, plot, tile) combination, indexed by plot uuid.
# drop_duplicates() expresses this directly; aggregating with groupby().sum()
# would also sum unrelated columns and has to cope with the geometry column.
matching_tiles = (
    pd.DataFrame(joined.reset_index().rename({'index': 'uuid'}, axis=1)[GROUP_COLS])
    .drop_duplicates()
    .sort_values(GROUP_COLS)  # same row order a groupby on these keys produces
    .set_index('uuid')
)

In [None]:
# keep only the plots that matched at least one lidar tile.
# joined has one row per (plot, tile) pair, so its index repeats plots that
# span several tiles; select on the unique labels so each plot stays a single
# row and plot_utm.loc[uuid] keeps returning one record.
plot_utm = plot_utm.loc[joined.index.unique()]

In [73]:
@dask.delayed
def clip_1ha_plots(uuid):
    """Clip a 1-hectare circle around one plot from every lidar acquisition covering it.

    Reads the module-level ``plot_utm`` table (plot source and UTM centroid)
    and ``matching_tiles`` table (acquisition name and tile file name), both
    indexed by plot uuid. For each acquisition that covers the plot, the
    matching tiles are merged and clipped with LAStools ``las2las`` (run
    through wine) into ``<uuid>_<acq_name>.laz`` in the hectare-clip directory
    of the plot's data source.

    Parameters
    ----------
    uuid : str
        Index label of the plot in ``plot_utm`` and ``matching_tiles``.

    Returns
    -------
    list of subprocess.CompletedProcess
        One entry per acquisition, with stdout and stderr captured. Return
        codes are not checked here.

    Raises
    ------
    ValueError
        If the plot's source contains none of 'BLM', 'USFS' or 'WA-DNR'.
    """
    TILE_DIR = '../data/raw/lidar/noaa_tiles/utm_laz/'
    PLOT_RADIUS_M = 56.418  # radius of 1-hectare plot (185.1 ft)
    # checked in this order; the first keyword found in the source name wins
    OUTPUT_DIRS = (
        ('BLM', '../data/interim/lidar/plot_clips/noaa/blm_plots/hectare_clips/'),
        ('USFS', '../data/interim/lidar/plot_clips/noaa/usfs_plots/hectare_clips/'),
        ('WA-DNR', '../data/interim/lidar/plot_clips/noaa/dnr_plots/hectare_clips/'),
    )

    plot = plot_utm.loc[uuid]
    if isinstance(plot, pd.DataFrame):
        # a repeated index label yields several rows instead of one record; use the first
        plot = plot.iloc[0]
    source = plot['source']

    for keyword, odir in OUTPUT_DIRS:
        if keyword in source:
            ODIR = odir
            break
    else:
        raise ValueError('unrecognized plot source {!r} for plot {}'.format(source, uuid))
    # las2las cannot write into a directory that does not exist yet
    os.makedirs(ODIR, exist_ok=True)

    # grab the coordinates of the plot for clipping
    x, y = plot[['utm_x', 'utm_y']]

    # find all matching tiles (list indexer so a single match is still a DataFrame)
    all_matching_tiles = matching_tiles.loc[[uuid]]
    # loop through each lidar acquisition to clip the plots
    procs = []
    for acq in pd.unique(all_matching_tiles['acq_name']):
        outfile = os.path.join(ODIR, uuid + '_' + acq + '.laz')

        in_acq = all_matching_tiles['acq_name'] == acq
        tiles = [os.path.join(TILE_DIR, t)
                 for t in all_matching_tiles.loc[in_acq, 'file_name'].values]

        proc = subprocess.run(['wine', '/storage/lidar/LAStools/bin/las2las.exe',
                               '-i', *tiles, '-merged',
                               '-o', outfile,
                               '-keep_circle', str(x), str(y), str(PLOT_RADIUS_M)],
                              stderr=subprocess.PIPE,
                              stdout=subprocess.PIPE)
        procs.append(proc)
    return procs

In [74]:
@dask.delayed
def clip_smaller_plots(ha_clip):
    """Clip a hectare-sized excerpt down to the field plot size of its data source.

    The plot uuid is the part of the input file name before the first
    underscore. Its UTM centroid and source are read from the module-level
    ``plot_utm`` table. The result is written under the same file name to the
    sibling ``plot_clips`` directory of the input's directory, using LAStools
    ``las2las`` run through wine.

    Parameters
    ----------
    ha_clip : str
        Path to a hectare clip named ``<uuid>_<acq_name>.laz``.

    Returns
    -------
    subprocess.CompletedProcess
        Result of the ``las2las`` call with stdout and stderr captured. The
        return code is not checked here.

    Raises
    ------
    ValueError
        If the plot's source contains none of 'WA-DNR', 'USFS' or 'BLM'.
    """
    # checked in this order; the first keyword found in the source name wins
    PLOT_RADII_M = (
        ('WA-DNR', 11.35),    # radius of 1/10th acre plot (37.2 ft)
        ('USFS', 17.9454),    # radius of 1/4 acre plot (58.9 ft)
        ('BLM', 12.69),       # radius of 1/8th acre plot (41.6 ft)
    )

    dirname, basename = os.path.split(ha_clip)
    outfile = os.path.abspath(os.path.join(dirname, '..', 'plot_clips', basename))
    # las2las cannot write into a directory that does not exist yet
    os.makedirs(os.path.dirname(outfile), exist_ok=True)

    uuid = basename.split('_')[0]

    plot = plot_utm.loc[uuid]
    if isinstance(plot, pd.DataFrame):
        # a repeated index label yields several rows instead of one record; use the first
        plot = plot.iloc[0]

    # grab the coordinates of the plot for clipping
    x, y = plot[['utm_x', 'utm_y']]

    source = plot['source']
    for keyword, radius in PLOT_RADII_M:
        if keyword in source:
            PLOT_RADIUS_M = radius
            break
    else:
        raise ValueError('unrecognized plot source {!r} for plot {}'.format(source, uuid))

    proc = subprocess.run(['wine', '/storage/lidar/LAStools/bin/las2las.exe',
                           '-i', ha_clip,
                           '-o', outfile,
                           '-keep_circle', str(x), str(y), str(PLOT_RADIUS_M)],
                          stderr=subprocess.PIPE,
                          stdout=subprocess.PIPE)
    return proc

In [43]:
# start a dask cluster on this machine with fixed scheduler and diagnostics ports,
# then connect a client to it
# NOTE(review): newer dask.distributed releases may expect `dashboard_address`
# instead of `diagnostics_port` - confirm against the installed version
cluster=LocalCluster(scheduler_port=7001, diagnostics_port=7002)
c = Client(cluster)

In [72]:
# copy the lookup tables to the cluster
# NOTE(review): the returned futures are not kept, and the delayed functions
# read plot_utm / matching_tiles as module-level names rather than through
# these futures - confirm this step is still needed
c.scatter([plot_utm, joined, matching_tiles])

[<Future: status: finished, type: GeoDataFrame, key: GeoDataFrame-93930e39aabe2cab9ce0185177e747c9>,
 <Future: status: finished, type: GeoDataFrame, key: GeoDataFrame-c3a6361187a28d5d274966292ae97c9f>,
 <Future: status: finished, type: DataFrame, key: DataFrame-b9922fe204a2a9b8925c6d4007d7bcc1>]

In [75]:
# unique plots that matched at least one lidar tile (joined repeats a plot once per tile)
plot_ids = pd.unique(joined.index)
len(plot_ids)

3140

In [76]:
# submit one hectare-clipping task per plot to the cluster and show a progress bar
res = c.persist([clip_1ha_plots(i) for i in plot_ids])
progress(res)

VBox()

In [77]:
# persist()/progress() return immediately, so block until the tasks submitted
# in the previous step have finished before listing their output files;
# otherwise a top-to-bottom run only sees the clips written so far
wait(res)

usfs_ha = glob.glob('../data/interim/lidar/plot_clips/noaa/usfs_plots/hectare_clips/*.laz')
blm_ha = glob.glob('../data/interim/lidar/plot_clips/noaa/blm_plots/hectare_clips/*.laz')
dnr_ha = glob.glob('../data/interim/lidar/plot_clips/noaa/dnr_plots/hectare_clips/*.laz')
ha_clips = usfs_ha + blm_ha + dnr_ha

# submit one plot-sized clipping task per hectare clip and show a progress bar
res = c.persist([clip_smaller_plots(f) for f in ha_clips])
progress(res)

VBox()

In [78]:
# persist()/progress() return immediately, so block until the plot-sized
# clipping tasks have finished before counting their output files
wait(res)

usfs_plot = glob.glob('../data/interim/lidar/plot_clips/noaa/usfs_plots/plot_clips/*.laz')
blm_plot = glob.glob('../data/interim/lidar/plot_clips/noaa/blm_plots/plot_clips/*.laz')
dnr_plot = glob.glob('../data/interim/lidar/plot_clips/noaa/dnr_plots/plot_clips/*.laz')
plot_clips = usfs_plot + blm_plot + dnr_plot
len(plot_clips)

3212

In [81]:
@dask.delayed
def get_boundary(infile, odir):
    """Run LAStools ``lasboundary`` (through wine) on one point cloud file.

    Parameters
    ----------
    infile : str
        Path of the point cloud passed to ``-i``.
    odir : str
        Directory passed to ``-odir``; the output is requested as a shapefile
        (``-oshp``) with ``-labels``.

    Returns
    -------
    subprocess.CompletedProcess
        Result of the call with stdout and stderr captured.
    """
    command = [
        'wine', '/storage/lidar/LAStools/bin/lasboundary.exe',
        '-i', infile,
        '-odir', odir,
        '-oshp',
        '-labels',
    ]
    return subprocess.run(command,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)

In [82]:
# directories holding the hectare- and plot-sized clips of each data source;
# the boundary shapefiles are written next to the clips they describe
ODIRS = ['../data/interim/lidar/plot_clips/noaa/usfs_plots/hectare_clips/',
         '../data/interim/lidar/plot_clips/noaa/usfs_plots/plot_clips/',
         '../data/interim/lidar/plot_clips/noaa/dnr_plots/hectare_clips/',
         '../data/interim/lidar/plot_clips/noaa/dnr_plots/plot_clips/',
         '../data/interim/lidar/plot_clips/noaa/blm_plots/hectare_clips/',
         '../data/interim/lidar/plot_clips/noaa/blm_plots/plot_clips/']

# every directory path ends with '/', so appending '*.laz' lists its point clouds
PLOT_SETS = [glob.glob(clip_dir + '*.laz') for clip_dir in ODIRS]
USFS_HA, USFS_PLOT, DNR_HA, DNR_PLOT, BLM_HA, BLM_PLOT = PLOT_SETS

# one boundary job per clipped file, writing into the file's own directory
jobs_to_do = []
for plot_set, odir in zip(PLOT_SETS, ODIRS):
    jobs_to_do.extend(get_boundary(p, odir=odir) for p in plot_set)

In [83]:
# submit the boundary jobs to the cluster and show a progress bar
res = c.persist(jobs_to_do)
progress(res)

VBox()

In [79]:
# c.cancel(res)

In [86]:
# c.close()
# cluster.close()