# Point Cloud Processing

Alan Youssef

## Task 1: Finding the Ground Level


In this task, our goal is to identify the ground level in LiDAR point cloud data. LiDAR captures 3D points of the environment, including the ground, buildings, trees, and other objects. To analyze or segment these points properly, we first need to separate the ground points from everything else.

We solve this by analyzing the height (Z) values of all points. A histogram of Z-values helps us see where most points are located vertically. The tallest peak in the histogram usually corresponds to the ground because most of the scene is typically ground-level points. 

Using this method, we estimate the ground level and then remove all points at or below it, keeping only the points whose Z-value is strictly greater than the estimate. This allows us to focus only on objects above the ground for further analysis, such as clustering or object detection.

We also visualize the histogram and the 3D point cloud above the ground to verify our results. This task is important because correctly identifying the ground level ensures that subsequent analyses, such as clustering with DBSCAN, are more accurate and meaningful.

In [None]:
#%%
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.spatial import KDTree
from sklearn.cluster import DBSCAN
from mpl_toolkits.mplot3d import Axes3D

#%% utility functions
def show_cloud(points_plt):
    """Display a point cloud as a 3D scatter plot.

    Columns 0, 1 and 2 of ``points_plt`` are used as the x, y and z
    coordinates and drawn with a very small marker size (``s=0.01``).
    """
    axes_3d = plt.axes(projection='3d')
    x_coords, y_coords, z_coords = (points_plt[:, col] for col in range(3))
    axes_3d.scatter(x_coords, y_coords, z_coords, s=0.01)
    plt.show()

def show_scatter(x, y):
    """Draw ``y`` against ``x`` as a 2D scatter plot and show the figure."""
    plt.scatter(x=x, y=y)
    plt.show()

def get_ground_level(pcd, dataset_name="dataset", bins=100):
    """Estimate the ground height of a point cloud from its Z histogram.

    The height values (column 2 of ``pcd``) are binned, and the centre of the
    most populated bin is returned as the ground level. The histogram, with
    the estimate marked by a dashed red line, is saved to
    ``images/histogram_<dataset_name>.png``.

    Args:
        pcd: 2D array whose column 2 holds the height (Z) values.
        dataset_name: Label used in the plot title and the output file name.
        bins: Bin specification passed to ``np.histogram`` (default 100).

    Returns:
        The centre of the histogram bin that contains the most points. If
        several bins tie, ``np.argmax`` picks the first one.

    Raises:
        ValueError: If ``pcd`` contains no points.
    """
    import os

    z_values = pcd[:, 2]  # height data
    if z_values.size == 0:
        # An empty histogram would otherwise yield a meaningless "peak".
        raise ValueError("Cannot estimate ground level of an empty point cloud")

    # Build the histogram and take the centre of its tallest bin.
    counts, bin_edges = np.histogram(z_values, bins=bins)
    peak = np.argmax(counts)
    ground_level = (bin_edges[peak] + bin_edges[peak + 1]) / 2

    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs("images", exist_ok=True)

    # Plot and save the histogram. Reusing the computed edges guarantees the
    # figure shows exactly the bins the estimate was derived from.
    plt.figure()
    plt.hist(z_values, bins=bin_edges)
    plt.axvline(ground_level, color='red', linestyle='--',
                label=f'Ground level: {ground_level:.2f}')
    plt.legend()
    plt.xlabel("Höjd (Z)")
    plt.ylabel("Antal punkter")
    plt.title(f"Histogram av höjdfördelning ({dataset_name})")
    plt.savefig(f"images/histogram_{dataset_name}.png")
    plt.close()

    return ground_level

#%% Lista med dataset-filer
datasets = ["dataset1.npy", "dataset2.npy"]

#%% Kör analysen för varje dataset
for filename in datasets:
    print(f"Processing {filename}...")
    pcd = np.load(filename)
    
    # beräkna marknivå
    est_ground_level = get_ground_level(pcd, dataset_name=filename.split('.')[0])
    print(f"{filename}: Beräknad marknivå = {est_ground_level:.2f}")
    
    # ta bort marknivå för visualisering
    pcd_above_ground = pcd[pcd[:,2] > est_ground_level]
    
    print(f"{filename}: Antal punkter ovanför marknivå = {pcd_above_ground.shape[0]}")
    
    # visa punktskyen
    %matplotlib qt
    show_cloud(pcd_above_ground)

    # Exempel på DBSCAN-klustring för det första datasetet
    if filename == "dataset1.npy":
        unoptimal_eps = 10
        clustering = DBSCAN(eps=unoptimal_eps, min_samples=5).fit(pcd_above_ground)
        
        clusters = len(set(clustering.labels_)) - (1 if -1 in clustering.labels_ else 0)
        colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, clusters)]
        
        plt.figure(figsize=(10,10))
        plt.scatter(pcd_above_ground[:,0], 
                    pcd_above_ground[:,1],
                    c=clustering.labels_,
                    cmap=matplotlib.colors.ListedColormap(colors),
                    s=2)
        plt.title('DBSCAN: %d clusters' % clusters, fontsize=20)
        plt.xlabel('x axis', fontsize=14)
        plt.ylabel('y axis', fontsize=14)
        plt.show()


## Task 2 – Optimizing DBSCAN Clustering

In Task 2, we focused on finding an optimal parameter for DBSCAN called `eps`. This parameter determines how close points need to be to each other to belong to the same cluster. If `eps` is too small, many points will be considered "noise" and not belong to any cluster. If `eps` is too large, separate objects may be merged into a single cluster.

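To find a suitable `eps`, we used an **elbow plot** based on the distance to each point's 5th nearest neighbor, measured in the XY plane. Because the neighbors are queried on the same points that were fitted, each point counts as its own first neighbor, which matches how `min_samples=5` counts the point itself. By sorting these distances and looking for the "elbow" in the graph, we can heuristically choose a value that balances between too many small clusters and too few large clusters. In the code, the elbow is approximated by the 95th percentile of the sorted distances, a value that can be adjusted after inspecting the plot.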

After determining an approximate optimal `eps`, we ran DBSCAN again and visualized the clusters. This step ensures that the parameter works in practice—meaning the clusters represent real objects in the LiDAR data, such as buildings, trees, or cars.

The outcomes of Task 2 are:  
1. An elbow plot showing how we selected `eps`.  
2. A final cluster plot showing meaningful clusters in the data.

These visualizations were saved in the `images` folder so that they can be included in the project README.

In [None]:
#%%
from sklearn.neighbors import NearestNeighbors

def find_optimal_eps(pcd_above_ground, dataset_name="dataset", percentile=95):
    """Suggest a DBSCAN ``eps`` from the sorted k-distance (elbow) curve.

    For every point, the distance to its 5th nearest entry in the XY plane
    (columns 0 and 1) is computed. The sorted distances are saved as an elbow
    plot to ``images/elbow_<dataset_name>.png``, and a percentile of them is
    returned as an approximation of the elbow.

    Args:
        pcd_above_ground: 2D array of points; only columns 0 and 1 are used.
        dataset_name: Label used in the plot title and the output file name.
        percentile: Percentile of the sorted distances returned as ``eps``
            (default 95). Adjust after inspecting the saved plot.

    Returns:
        The requested percentile of the sorted k-distances.

    Raises:
        ValueError: If fewer than 5 points are supplied.
    """
    import os

    n_neighbors = 5
    xy_points = pcd_above_ground[:, :2]  # XY plane only
    if len(xy_points) < n_neighbors:
        # Fail early with a clear message rather than deep inside scikit-learn.
        raise ValueError(
            f"Need at least {n_neighbors} points to compute k-distances, "
            f"got {len(xy_points)}"
        )

    # The query points are the fitted points, so each point is normally
    # returned as its own nearest neighbour (distance 0) in column 0. The last
    # column is therefore the 5th entry *counting the point itself*.
    neigh = NearestNeighbors(n_neighbors=n_neighbors).fit(xy_points)
    distances, _ = neigh.kneighbors(xy_points)
    k_distances = np.sort(distances[:, n_neighbors - 1])

    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs("images", exist_ok=True)

    # Plot and save the elbow curve.
    plt.figure(figsize=(10,6))
    plt.plot(k_distances)
    plt.xlabel("Points sorted by distance")
    plt.ylabel("5th Nearest Neighbor Distance")
    plt.title(f"Elbow plot for {dataset_name}")
    plt.grid(True)
    plt.savefig(f"images/elbow_{dataset_name}.png")
    plt.close()

    # Heuristic stand-in for the elbow: a high percentile of the distances.
    optimal_eps = np.percentile(k_distances, percentile)
    return optimal_eps

#%%
import os

# The helpers and the cluster plot below all save into this folder, and
# savefig does not create it, so make sure it exists before the loop.
os.makedirs("images", exist_ok=True)

for filename in datasets:
    dataset_name = filename.split('.')[0]  # file name without its extension

    print(f"Processing {filename} for Task 2...")
    pcd = np.load(filename)
    est_ground_level = get_ground_level(pcd, dataset_name=dataset_name)
    pcd_above_ground = pcd[pcd[:,2] > est_ground_level]

    # Find the optimal eps
    optimal_eps = find_optimal_eps(pcd_above_ground, dataset_name=dataset_name)
    print(f"{filename}: Optimal eps = {optimal_eps:.2f}")

    # Run DBSCAN with the optimal eps
    # NOTE(review): find_optimal_eps measures distances in the XY plane only,
    # while DBSCAN is fit on every column here. Confirm which space is
    # intended, because eps should be chosen in the space it is applied in.
    clustering = DBSCAN(eps=optimal_eps, min_samples=5).fit(pcd_above_ground)
    labels = clustering.labels_
    noise = labels == -1  # DBSCAN marks noise points with the label -1
    clusters = len(set(labels)) - (1 if noise.any() else 0)
    colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, clusters)]

    # Plot and save the cluster plot
    plt.figure(figsize=(10,10))
    # Draw noise separately in a neutral colour. Feeding the -1 label into a
    # colormap that has one entry per cluster makes N+1 labels compete for N
    # colours, so two clusters would end up sharing a colour.
    if noise.any():
        plt.scatter(pcd_above_ground[noise, 0],
                    pcd_above_ground[noise, 1],
                    c='lightgray',
                    s=2)
    # Skip when there are no clusters so an empty colormap is never built.
    if clusters:
        plt.scatter(pcd_above_ground[~noise, 0],
                    pcd_above_ground[~noise, 1],
                    c=labels[~noise],
                    cmap=matplotlib.colors.ListedColormap(colors),
                    # Centre each integer label on its own colour slot.
                    vmin=-0.5,
                    vmax=clusters - 0.5,
                    s=2)
    plt.title(f'DBSCAN: {clusters} clusters ({filename})', fontsize=20)
    plt.xlabel('x axis', fontsize=14)
    plt.ylabel('y axis', fontsize=14)
    plt.savefig(f"images/clusters_{dataset_name}.png")
    plt.close()



Processing dataset1.npy for Task 2...
dataset1.npy: Optimal eps = 0.39
Processing dataset2.npy for Task 2...
dataset2.npy: Optimal eps = 0.36


![Histogram dataset1](images/histogram_dataset1.png)
![Histogram dataset2](images/histogram_dataset2.png)
![Elbow plot dataset1](images/elbow_dataset1.png)
![Elbow plot dataset2](images/elbow_dataset2.png)
![Clusters dataset1](images/clusters_dataset1.png)
![Clusters dataset2](images/clusters_dataset2.png)
