# Pre-processing and Filtering

This notebook takes as input a folder with AHN point cloud tiles (assumed to be 1x1 km), and performs a number of pre-processing and filtering steps. The output is a reduced point cloud containing probable trees and other "unknown" elements.

In [None]:
import set_path

import numpy as np
import laspy
import pathlib

from tqdm.notebook import tqdm

from upcp.utils import ahn_utils
from upcp.region_growing.label_connected_comp import LabelConnectedComp

import gvl.helper_functions as utils

## Settings

In [None]:
# Root folders for reading and writing data.
DATA_FOLDER = pathlib.Path("../data")
OUTPUT_FOLDER = pathlib.Path("../data")

# Input: AHN subtiles created using notebook "0. LAS Splitter.ipynb".
input_dir = DATA_FOLDER.joinpath("ahn4")
# Output: one reduced, labelled point cloud per input tile.
output_dir = OUTPUT_FOLDER.joinpath("ahn4_trees")
# Number of digits in the tilecode format.
N = 4

# DTM corresponding to the AHN subtiles, stored as .npz, also created using
# notebook "0. LAS Splitter.ipynb".
ahn_dtm_folder = DATA_FOLDER.joinpath("ahn4")

# Minimum height above ground (meters); points below it are dropped to cut off
# noise like cars and bushes.
MIN_HAG = 1.5

# Tree filter settings, fine-tune these to your needs to balance FP / FN.
# Every threshold is evaluated per connected component (cluster) in the main loop.
tree_filter = dict(
    # Grid size handed to LabelConnectedComp (maximum distance).
    grid_size=0.4,
    # Minimum number of points in a component, handed to LabelConnectedComp.
    min_component_size=50,
    # A cluster whose highest point (height above ground) is below this is not a tree.
    minmax_hag=3.5,
    # A cluster whose mean absolute normal z-component exceeds this is not a tree.
    max_nz_flat=0.85,
    # A cluster whose mean absolute normal z-component is below this is not a tree.
    min_nz_flat=0.5,
    # A cluster narrower than this is not a tree.
    min_width=1.0,
    # A cluster whose mean reflectance exceeds this is not a tree.
    max_refl=-5.0,
    # A cluster whose mean amplitude exceeds this is not a tree.
    max_ampl=9.0,
    # A cluster whose number-of-returns percentile (see nor_perc) is below this is not a tree.
    min_nor=1.5,
    # Percentile of number_of_returns at which the min_nor check is performed.
    nor_perc=80,
)

# AHN classification codes
AHN_OTHER = 1
AHN_GROUND = 2
AHN_BUILDING = 6
AHN_WATER = 9
AHN_ARTIFACT = 26

# Our classification codes
UNKNOWN, TREE, NOISE, OTHER = 0, 1, 2, 3

In [None]:
# Make sure the output folder exists; missing parent folders are created too.
output_dir.mkdir(exist_ok=True, parents=True)

# Reader for the DTM .npz files, with caching switched off.
ahn_reader = ahn_utils.NPZReader(ahn_dtm_folder, caching=False)

## Main loop

In [None]:
# Collect the AHN tiles to process. Path.glob yields entries in arbitrary order,
# so sort them to make the processing order reproducible between runs.
input_files = sorted(input_dir.glob("ahn*.laz"))

In [None]:
# Collect the tilecodes that already have an output file, so those tiles can be
# skipped (ignore this cell to re-run for all tiles).
existing_files = [*output_dir.glob("trees*.laz")]
existing_codes = {
    utils.get_tilecode_from_filename(done_file.name, n_digits=N)
    for done_file in existing_files
}

In [None]:
# Keep only the tiles whose tilecode has no output file yet.
input_files = [
    candidate
    for candidate in input_files
    if not (
        utils.get_tilecode_from_filename(candidate.name, n_digits=N) in existing_codes
    )
]

In [None]:
pbar = tqdm(input_files, unit="file", smoothing=0)

# For every tile: reduce the cloud to elevated AHN_OTHER points, cluster it,
# reject clusters that do not look like trees, and write the labelled result
# to `output_dir` as "trees_<tilecode>.laz".
for file in pbar:
    tilecode = utils.get_tilecode_from_filename(file.name, n_digits=N)
    pbar.set_postfix_str(tilecode)

    # Load LAS data
    las = laspy.read(file)
    points_xyz = np.vstack((las.x, las.y, las.z)).T

    ### Reduce the point cloud

    # Use only AHN_OTHER class
    mask = las.classification == AHN_OTHER

    # Remove points close to ground. Points for which no ground elevation is
    # available (NaN) are kept; their height is reconstructed per cluster below.
    ground_z = ahn_reader.interpolate(tilecode, points_xyz[mask])
    height_mask = (points_xyz[mask, 2] - ground_z >= MIN_HAG) | np.isnan(ground_z)

    # Indices (into the full tile) of the points that survive both masks
    orig_idx = np.where(mask)[0][height_mask]
    if len(orig_idx) == 0:
        # Nothing left in this tile; no output file is written for it.
        continue

    # Get scalar fields of masked cloud
    xyz = points_xyz[orig_idx]
    refl = las.Reflectance[orig_idx]
    ampl = las.Amplitude[orig_idx]
    nor = las.number_of_returns[orig_idx]

    # New scalar fields
    hag = np.around(xyz[:, 2] - ground_z[height_mask], decimals=2)
    normals = np.around(utils.calculate_normals(xyz), decimals=2)

    ### Label the point cloud

    # Init masks: every point starts as a tree candidate
    tree_mask = np.ones(len(xyz), dtype=bool)
    new_hag = np.copy(hag)  # Because we index and manipulate this array in a for loop

    ## Cluster the point cloud data
    lcc = LabelConnectedComp(
        grid_size=tree_filter["grid_size"],
        min_component_size=tree_filter["min_component_size"],
    )
    point_components = lcc.get_components(xyz)

    # Noise filter: points labelled -1 are not part of any cluster
    tree_mask[point_components == -1] = False

    cc_labels = np.unique(point_components)
    cc_labels = set(cc_labels).difference((-1,))

    # Iterate over the clusters
    for cc in tqdm(cc_labels, smoothing=0, leave=False):
        # select points that belong to the cluster
        cc_mask = np.where(point_components == cc)[0]

        # The height above ground values are unknown when a tree is partially above water.
        # To "standardize" the HAG values per tree, the unknown values are derived from the
        # z-coordinate, shifted by the HAG/z offset of the lowest point with a known HAG.
        # NOTE(review): clusters without any valid HAG value keep NaN as HAG and are never
        # rejected by the height check below; confirm whether a NAP-based fallback is wanted.
        cc_z = xyz[cc_mask][:, 2]
        cc_hag = hag[cc_mask]

        unknown = np.isnan(cc_hag)
        has_valid_hag = not unknown.all()

        if has_valid_hag and unknown.any():
            known_idx = np.where(~unknown)[0]
            # nanargmin without an axis returns a single index
            lowest_known = known_idx[np.nanargmin(cc_z[known_idx])]
            cc_offset = cc_hag[lowest_known] - cc_z[lowest_known]
            if np.isnan(cc_offset):
                print(f"HAG: {cc_hag[lowest_known]}, NAP: {cc_z[lowest_known]}")
            cc_hag[unknown] = cc_z[unknown] + cc_offset
            new_hag[cc_mask] = cc_hag

        ## Filters

        # TODO come up with something clever for x/y flatness, e.g. a similar check on
        # the x normals (reject when their mean lies within -0.03 .. 0.03).

        # Mean absolute z-component of the normals, shared by both flatness checks
        mean_abs_nz = np.abs(normals[cc_mask, 2]).mean()

        # The checks short-circuit: the first one that matches rejects the cluster and
        # the remaining (more expensive) ones are skipped.
        is_rejected = (
            # If most of the points point up, it's not a tree.
            mean_abs_nz > tree_filter["max_nz_flat"]
            # If too little of the points point up, it's not a tree.
            or mean_abs_nz < tree_filter["min_nz_flat"]
            # If reflectance is too high, it's not a tree.
            or np.mean(refl[cc_mask]) > tree_filter["max_refl"]
            # If amplitude is too high, it's not a tree.
            or np.mean(ampl[cc_mask]) > tree_filter["max_ampl"]
            # If number_of_returns is too low, it's not a tree.
            or np.percentile(nor[cc_mask], tree_filter["nor_perc"])
            < tree_filter["min_nor"]
            # Look at shape of cluster, e.g. minimum bounding rectangle + min_width check.
            or utils.get_wl_box(xyz[cc_mask])[0] < tree_filter["min_width"]
            # If the highest point of the object is too low, it's not a tree. Skipped for
            # all-NaN clusters, where nanmax would only warn and return NaN.
            or (has_valid_hag and np.nanmax(cc_hag) < tree_filter["minmax_hag"])
        )
        if is_rejected:
            tree_mask[cc_mask] = False

    ## Set labels
    labels = np.full(len(xyz), UNKNOWN, dtype="uint16")
    labels[tree_mask] = TREE

    ### Save the point cloud

    # New header with the same point format, version, offsets and scales as the input
    header = laspy.LasHeader(
        point_format=las.header.point_format, version=las.header.version
    )
    header.offsets = las.header.offsets
    header.scales = las.header.scales

    new_las = laspy.LasData(header)

    # Copy only the points that survived the reduction step
    new_las.points = las.points[orig_idx]

    new_las.add_extra_dim(
        laspy.ExtraBytesParams(name="label", type="uint16", description="Label")
    )
    new_las.add_extra_dim(
        laspy.ExtraBytesParams(
            name="orig_idx", type="uint32", description="Original index"
        )
    )
    new_las.add_extra_dim(
        laspy.ExtraBytesParams(
            name="hag", type="float", description="Height above ground"
        )
    )

    new_las.label = labels
    new_las.orig_idx = orig_idx
    new_las.hag = new_hag

    new_las.write(output_dir / f"trees_{tilecode}.laz")