# Pre-processing and Filtering

This notebook takes a folder of AHN point cloud tiles (assumed to be 1x1 km) as input and performs a number of pre-processing and filtering steps. The output is a reduced point cloud containing trees and "suspected trees" for further classification using RandLA-Net and/or post-processing.

In [None]:
import set_path

import numpy as np
import laspy
import pathlib
import geopandas as gpd
import shapely.geometry as sg

from tqdm.notebook import tqdm
tqdm.pandas()

from upcp.utils import ahn_utils
from upcp.utils.interpolation import FastGridInterpolator
from upcp.region_growing.label_connected_comp import LabelConnectedComp

import gvl.helper_functions as helpers

In [None]:
import warnings  # temporary, to suppress deprecation warnings from shapely
# NOTE: called without a `category`, this filter hides *every* warning raised
# after this cell runs, not only the shapely ones it is meant for.
warnings.filterwarnings('ignore')

## Settings

In [None]:
# Root folder of the AHN4 dataset; every path below is derived from it.
BASE_FOLDER = pathlib.Path('../datasets/AHN4')

# Input tiles: the AHN subtiles produced by notebook "0. LAS Splitter.ipynb"
input_dir = BASE_FOLDER.joinpath('AMS_subtiles_1000')
# Destination folder for the reduced point clouds
output_dir = BASE_FOLDER.joinpath('AMS_subtiles_1000_reduced')

# DTM (.npz) matching the AHN subtiles, also produced by "0. LAS Splitter.ipynb"
ahn_dtm_folder = BASE_FOLDER.joinpath('npz_subtiles_1000')

# Reference file with known tree locations, if available.
tree_ref_file = '../datasets/validation/joined_trees_bgt_gissib_1_5_amsterdam.gpkg'  # TODO get geovisia dataset

MIN_HAG = 2.5  # Minimum height above ground in meters

# Thresholds for the three labelling passes of the main loop.
tree_filter = dict(grid_size=0.6,
                   min_component_size=50,
                   min_height=3.5)  # TODO
noise_filter = dict(grid_size=0.9,
                    min_component_size=50)  # TODO
other_filter = dict(min_nz_flat=0.85,
                    min_width=1.0,
                    min_height=2.5,
                    max_height=38)  # TODO

# AHN classification
AHN_OTHER, AHN_GROUND, AHN_BUILDING, AHN_WATER, AHN_ARTIFACT = 1, 2, 6, 9, 26

# Our classification
UNKNOWN, TREE, NOISE, OTHER = range(4)

In [None]:
# Load the known tree locations from `tree_ref_file`.
# The CRS argument is kept identical to the one used by the set-up cell
# further down (EPSG:28992), which loads the same file; previously this cell
# asked for EPSG:4326, so the two loads of one file contradicted each other.
# NOTE(review): the trees are later matched against the LAS bounding box, so
# they presumably have to share the tiles' coordinate system — confirm.
tree_gdf = gpd.read_file(tree_ref_file, crs='epsg:28992')

In [None]:
# Make sure the destination folder exists (missing parents are created too,
# and an already existing folder is not an error)
pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)

# Reader for the .npz DTM tiles, with caching switched off
ahn_reader = ahn_utils.NPZReader(ahn_dtm_folder, caching=False)

# Reference tree locations used to label tree clusters in the main loop
tree_gdf = gpd.read_file(tree_ref_file, crs='epsg:28992')

## Main loop

In [None]:
# Collect every AHN4 subtile in the input folder.
# The result is sorted because glob makes no promise about ordering; a fixed
# order keeps runs reproducible, which matters since the main loop stops at
# the first tile that has no classification.
input_files = sorted(pathlib.Path(input_dir).glob('ahn4*.laz'))

In [None]:
# Drop tiles that already have a 'tree*.laz' result in the output folder
# (skip this cell to re-run for all tiles)
existing_files = [*pathlib.Path(output_dir).glob('tree*.laz')]
existing_codes = set(map(helpers.get_tilecode_from_filename,
                         (done.name for done in existing_files)))

# Keep only the inputs whose tile code has no output yet
input_files = [
    candidate
    for candidate in input_files
    if helpers.get_tilecode_from_filename(candidate.name) not in existing_codes
]

In [None]:
# Process every input tile:
#   1. reduce the cloud to AHN_OTHER points at least MIN_HAG above ground,
#   2. compute normals and height above ground (HAG),
#   3. label points as TREE / NOISE / OTHER / UNKNOWN,
#   4. write the reduced, labelled cloud to `output_dir` as 'tree_<tilecode>.laz'.
pbar = tqdm(input_files, unit='file', smoothing=0)

for file in pbar:
    tilecode = helpers.get_tilecode_from_filename(file.name)
    pbar.set_postfix_str(tilecode)

    # Load LAS data
    las = laspy.read(file)

    # Check if las file is valid before doing any further work on it.
    # A tile whose classification is all zeros stops the whole run.
    if np.all(las.classification == 0):
        print(f'No classification provided in the laz file {file.name}. Aborting...')
        break

    points_xyz = np.vstack((las.x, las.y, las.z)).T

    # Get trees inside LAS bounding box
    bbox = sg.box(las.header.min[0], las.header.min[1],
                  las.header.max[0], las.header.max[1], ccw=True)
    trees_in_bbox = tree_gdf[tree_gdf.within(bbox)]
    tree_points = list(trees_in_bbox['geometry'].values)

    ### Reduce the point cloud

    # Use only AHN_OTHER class
    # We dont want to filter on the number_of_returns scalar field. It will remove too much valuable tree points.
    mask = (las.classification == AHN_OTHER)
    ahn_other_xyz = points_xyz[mask]

    # Remove points close to ground. Points for which no ground height is
    # available (NaN) are kept.
    ground_z = ahn_reader.interpolate(tilecode, ahn_other_xyz)
    height_mask = (ahn_other_xyz[:, 2] - ground_z >= MIN_HAG) | np.isnan(ground_z)

    points_xyz = ahn_other_xyz[height_mask]
    points_i = las.intensity[mask][height_mask]
    ground_z = ground_z[height_mask]
    # Index of every retained point in the original LAS file
    points_orig_idx = np.where(mask)[0][height_mask]

    ### Generate two new scalar fields

    # Compute normals
    normals = helpers.calculate_normals(points_xyz)

    # Compute height above ground (NaN wherever ground_z is NaN)
    hag = points_xyz[:, 2] - ground_z

    ### Label the point cloud

    # Init masks
    normals_mask = np.zeros(len(points_xyz), dtype=bool)
    dims_mask = np.zeros(len(points_xyz), dtype=bool)
    hag_mask = np.zeros(len(points_xyz), dtype=bool)
    new_hag = np.copy(hag)  # Because we index and manipulate this array in a for loop

    ## Label "tree" clusters based on ground truth tree points
    lcc = LabelConnectedComp(grid_size=tree_filter['grid_size'],
                             min_component_size=tree_filter['min_component_size'])
    point_components = lcc.get_components(points_xyz)

    tree_mask = helpers.label_tree_like_components(
                                    points_xyz, ground_z, point_components,
                                    tree_points, tree_filter['min_height'])

    # All component labels except -1
    cc_labels = np.unique(point_components)
    cc_labels = set(cc_labels).difference((-1,))

    # Iterate over the clusters and give each one a consistent HAG: the
    # cluster's z values are shifted so that their mean equals the cluster's
    # mean HAG (NaNs ignored).
    for cc in tqdm(cc_labels, smoothing=0, leave=False):
        # select points that belong to the cluster
        cc_mask = (point_components == cc)
        cc_z = points_xyz[cc_mask, 2]
        cc_hag = hag[cc_mask]

        if np.isnan(cc_hag).all():
            # No ground height anywhere in the cluster: fall back to raw z
            cc_offset = 0.
        else:
            cc_offset = np.nanmean(cc_hag) - np.mean(cc_z)
        new_hag[cc_mask] = cc_z + cc_offset

    ## Label "noise" points
    mask_ids = np.where(~tree_mask)[0]  # Only the possible noise points
    lcc = LabelConnectedComp(grid_size=noise_filter['grid_size'],
                             min_component_size=noise_filter['min_component_size'])
    point_components = lcc.get_components(points_xyz[mask_ids])

    # Points that received component label -1 are treated as noise
    noise_mask = np.zeros((len(points_xyz),), dtype=bool)
    noise_mask[mask_ids] = point_components == -1

    ## Label "non-tree" clusters based on normal values and HAG
    mask_ids = np.where(~(tree_mask | noise_mask))[0]
    lcc = LabelConnectedComp(grid_size=tree_filter['grid_size'],
                             min_component_size=tree_filter['min_component_size'])
    point_components = lcc.get_components(points_xyz[mask_ids])

    cc_labels = np.unique(point_components)
    cc_labels = set(cc_labels).difference((-1,))

    # Iterate over the clusters
    for cc in cc_labels:
        # select points that belong to the cluster
        cc_mask = (point_components == cc)
        # Positions of the cluster's points in the reduced cloud
        cc_ids = mask_ids[cc_mask]

        # If most of the points point up, it's not a tree: the mean absolute
        # z-component of the normals exceeds the flatness threshold.
        if np.abs(normals[cc_ids, 2]).mean() > other_filter['min_nz_flat']:
            normals_mask[cc_ids] = True

        # TODO come up with something clever for x/y flatness
        # Do a similar thing with the x normals
        # if normals[:,0][mask_ids[cc_mask]].mean() < 0.03 and normals[:,0][mask_ids[cc_mask]].mean() > -0.03:
        #     normals_mask[mask_ids[cc_mask]] = True

        # Look at shape of cluster, e.g. minimum bounding rectangle + min_width check.
        min_dim, _ = helpers.get_wl_box(points_xyz[cc_ids])
        if min_dim < other_filter['min_width']:
            dims_mask[cc_ids] = True

        # If the highest point of the object is below min_height or above
        # max_height, it's not a tree.
        # NOTE(review): if any new_hag value in the cluster is NaN, max() is
        # NaN and neither comparison holds, so the cluster is never flagged
        # here — confirm that this is intended.
        cc_max_hag = new_hag[cc_ids].max()
        if cc_max_hag > other_filter['max_height'] or cc_max_hag < other_filter['min_height']:
            hag_mask[cc_ids] = True

    ## Set labels
    # Later assignments overwrite earlier ones, so OTHER takes precedence.
    labels = np.full((len(points_xyz),), UNKNOWN, dtype='uint16')
    labels[tree_mask] = TREE
    labels[noise_mask] = NOISE
    labels[normals_mask] = OTHER
    labels[dims_mask] = OTHER
    labels[hag_mask] = OTHER

    ### Save the point cloud

    # New header that reuses the offsets and scales of the input tile
    header = laspy.LasHeader(point_format=3, version="1.2")
    header.offsets = las.header.offsets
    header.scales = las.header.scales

    new_las = laspy.LasData(header)

    new_las.x = points_xyz[:, 0]
    new_las.y = points_xyz[:, 1]
    new_las.z = points_xyz[:, 2]
    new_las.intensity = points_i

    # Extra per-point fields: label, index into the source file, HAG, normals
    new_las.add_extra_dim(laspy.ExtraBytesParams(name="label", type="uint16",
                                                 description="Label"))
    new_las.add_extra_dim(laspy.ExtraBytesParams(name="orig_idx", type="uint32",
                                                 description="Original index"))
    new_las.add_extra_dim(laspy.ExtraBytesParams(name="hag", type="float",
                                                 description="Height above ground"))
    new_las.add_extra_dim(laspy.ExtraBytesParams(name="normal_x", type="float",
                                                 description="normal_x"))
    new_las.add_extra_dim(laspy.ExtraBytesParams(name="normal_y", type="float",
                                                 description="normal_y"))
    new_las.add_extra_dim(laspy.ExtraBytesParams(name="normal_z", type="float",
                                                 description="normal_z"))

    new_las.label = labels
    new_las.orig_idx = points_orig_idx
    new_las.hag = new_hag
    new_las.normal_x = normals[:, 0]
    new_las.normal_y = normals[:, 1]
    new_las.normal_z = normals[:, 2]

    new_las.write(output_dir / f'tree_{tilecode}.laz')