In [1]:
# ============================
# STAGE-1 (A): Ground vs Non-Ground (ML) â€” CLEAN FINAL SCRIPT
# - Trains on MANY labeled LAZ files (Ground class=2)
# - Predicts on MANY unclassified LAZ files
# - Outputs: Ground=2, everything else=1 (Default)
# - Enforces rule: up to 1m above ground stays Default (class 1)
# ============================

import os, glob, pickle
import numpy as np
import laspy
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# ----------------------------
# EDIT THESE PATHS
# ----------------------------
LABELED_DIR = r"D:/lidarrrrr/anbu/training_labeled"   # folder with your 9+ labeled files
UNCLASSIFIED_FILES = [
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000001.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000002.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000003.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000004.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000005.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000006.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000007.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000008.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000009.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000010.laz"    
]
OUT_DIR   = r"D:/lidarrrrr/anbu/New folder/stage1_outputs"
MODEL_PKL = r"D:/lidarrrrr/anbu/New folder/stage1_ground_model_v2.pkl"

# ----------------------------
# LABELS (from your scope)
# ----------------------------
GROUND_CLASS  = 2
DEFAULT_CLASS = 1

# "Maintain default class -1 for up to 1m from ground level."
# LAS cannot store -1 => we store those as DEFAULT_CLASS (1)
DEFAULT_UP_TO_H = 1.0

# ----------------------------
# FEATURE SETTINGS
# ----------------------------
GRID_CELL = 2.0   # meters; 2.0 is stable and fast

# ----------------------------
# TRAINING SAMPLE SETTINGS
# ----------------------------
TOTAL_EACH_CLASS = 700_000   # total ground and total non-ground across all files
MIN_PER_FILE_EACH = 25_000   # minimum each class per file (if available)

# ----------------------------
# FAST GRID-BASED FEATURES (IMPROVED)
# ----------------------------
def grid_stats(xyz, cell=2.0):
    """
    Returns per-point arrays aligned with xyz:
    zmin, zmax, zstd, zspan (p95-p5) per XY grid cell
    """
    x, y, z = xyz[:,0], xyz[:,1], xyz[:,2]
    minx, miny = x.min(), y.min()

    gx = np.floor((x - minx)/cell).astype(np.int32)
    gy = np.floor((y - miny)/cell).astype(np.int32)
    key = gx.astype(np.int64)*1_000_000 + gy.astype(np.int64)

    order = np.argsort(key)
    key_s = key[order]
    z_s   = z[order]

    uniq, start = np.unique(key_s, return_index=True)

    zmin  = np.full(len(uniq), np.inf,  dtype=np.float32)
    zmax  = np.full(len(uniq), -np.inf, dtype=np.float32)
    zstd  = np.full(len(uniq), 0.0,     dtype=np.float32)
    zspan = np.full(len(uniq), 0.0,     dtype=np.float32)

    for i in range(len(uniq)):
        a = start[i]
        b = start[i+1] if i+1 < len(uniq) else len(key_s)
        zs = z_s[a:b]

        zmin[i] = float(zs.min())
        zmax[i] = float(zs.max())
        zstd[i] = float(zs.std())
        # robust spread (ignores extreme outliers)
        zspan[i] = float(np.percentile(zs, 95) - np.percentile(zs, 5))

    pos = np.searchsorted(uniq, key)
    pos = np.clip(pos, 0, len(uniq)-1)

    return zmin[pos], zmax[pos], zstd[pos], zspan[pos]

def make_features(xyz, intensity, ret_num, n_returns, cell=2.0):
    """
    Features (9 dims):
    z, hag, local_range, slope_proxy, zstd, zspan, intensity, return_number, number_of_returns
    """
    zmin, zmax, zstd, zspan = grid_stats(xyz, cell=cell)
    z = xyz[:,2].astype(np.float32)

    hag = (z - zmin).astype(np.float32)
    local_range = (zmax - zmin).astype(np.float32)
    slope_proxy = hag / (local_range + 1e-6)

    inten = intensity.astype(np.float32)
    rn    = ret_num.astype(np.float32)
    nr    = n_returns.astype(np.float32)

    X = np.stack([z, hag, local_range, slope_proxy, zstd, zspan, inten, rn, nr], axis=1).astype(np.float32)
    return X, hag

# ----------------------------
# LOAD FILE LISTS
# ----------------------------
os.makedirs(OUT_DIR, exist_ok=True)

labeled_files = sorted(glob.glob(os.path.join(LABELED_DIR, "*.la*")))
if not labeled_files:
    raise RuntimeError(f"No labeled .laz/.las files found in: {LABELED_DIR}")

print("Found labeled files:", len(labeled_files))
for f in labeled_files[:10]:
    print(" -", f)

# ----------------------------
# BUILD TRAINING DATASET (BALANCED)
# ----------------------------
rng = np.random.default_rng(42)

per_file_each = max(MIN_PER_FILE_EACH, TOTAL_EACH_CLASS // len(labeled_files))
print("Per-file target per class:", per_file_each)

X_list, y_list = [], []

for fp in labeled_files:
    las = laspy.read(fp)

    xyz = np.vstack([las.x, las.y, las.z]).T.astype(np.float32)
    cls = np.asarray(las.classification, dtype=np.int32)

    intensity = np.asarray(las.intensity)
    ret_num   = np.asarray(las.return_number)
    n_returns = np.asarray(las.number_of_returns)

    y = (cls == GROUND_CLASS).astype(np.int32)   # 1=ground, 0=non-ground

    g_idx = np.where(y == 1)[0]
    n_idx = np.where(y == 0)[0]

    if len(g_idx) < 2000 or len(n_idx) < 2000:
        print("Skipping (too few for sampling):", os.path.basename(fp))
        continue

    g_s = rng.choice(g_idx, min(per_file_each, len(g_idx)), replace=False)
    n_s = rng.choice(n_idx, min(per_file_each, len(n_idx)), replace=False)

    idx = np.concatenate([g_s, n_s])
    rng.shuffle(idx)

    X, _ = make_features(xyz[idx], intensity[idx], ret_num[idx], n_returns[idx], cell=GRID_CELL)
    y_s = y[idx]

    X_list.append(X)
    y_list.append(y_s)

    print(f"Loaded: {os.path.basename(fp)} | samples={len(idx)} | ground={int(y_s.sum())}")

X_all = np.vstack(X_list)
y_all = np.concatenate(y_list)

print("Final training samples:", len(y_all), "| ground:", int(y_all.sum()), "| non-ground:", int((1-y_all).sum()))

# ----------------------------
# TRAIN MODEL
# ----------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, stratify=y_all
)

model = HistGradientBoostingClassifier(
    max_depth=12,
    learning_rate=0.06,
    max_iter=350,
    l2_regularization=1e-3
)
model.fit(X_train, y_train)

pred = model.predict(X_val)
print("\nValidation report (0=non-ground, 1=ground):")
print(classification_report(y_val, pred, digits=4))

with open(MODEL_PKL, "wb") as f:
    pickle.dump(model, f)
print("Saved model:", MODEL_PKL)

# ----------------------------
# PREDICT ON UNCLASSIFIED FILES
# ----------------------------
for in_path in UNCLASSIFIED_FILES:
    las = laspy.read(in_path)

    xyz = np.vstack([las.x, las.y, las.z]).T.astype(np.float32)
    intensity = np.asarray(las.intensity)
    ret_num   = np.asarray(las.return_number)
    n_returns = np.asarray(las.number_of_returns)

    X, hag = make_features(xyz, intensity, ret_num, n_returns, cell=GRID_CELL)

    proba = model.predict_proba(X)[:, 1]          # P(ground)
    is_ground = proba >= 0.5

    new_cls = np.full((len(xyz),), DEFAULT_CLASS, dtype=np.uint8)
    new_cls[is_ground] = GROUND_CLASS

    # enforce: up to 1m above ground stays default
    new_cls[(new_cls == GROUND_CLASS) & (hag > DEFAULT_UP_TO_H)] = DEFAULT_CLASS

    las.classification = new_cls

    base = os.path.splitext(os.path.basename(in_path))[0]
    out_las = os.path.join(OUT_DIR, f"{base}_stage1_ground.las")
    out_laz = os.path.join(OUT_DIR, f"{base}_stage1_ground.laz")

    # Write LAS (CloudCompare safest)
    las.write(out_las)
    print("\nSaved:", out_las)

    # Try LAZ too
    try:
        las.write(out_laz)
        print("Saved:", out_laz)
    except Exception as e:
        print("LAZ write failed (ok). Use LAS. Error:", e)

    u, c = np.unique(new_cls, return_counts=True)
    print("Output class counts:", dict(zip(u.tolist(), c.tolist())))

print("\nDONE Stage-1 Ground.")

Found labeled files: 10
 - D:/lidarrrrr/anbu/training_labeled\DX3011148 ULMIANO000001.laz
 - D:/lidarrrrr/anbu/training_labeled\DX3011148 ULMIANO000002.laz
 - D:/lidarrrrr/anbu/training_labeled\DX3011148 ULMIANO000003.laz
 - D:/lidarrrrr/anbu/training_labeled\DX3011148 ULMIANO000004.laz
 - D:/lidarrrrr/anbu/training_labeled\DX3011148 ULMIANO000005.laz
 - D:/lidarrrrr/anbu/training_labeled\DX3011148 ULMIANO000006.laz
 - D:/lidarrrrr/anbu/training_labeled\DX3011148 ULMIANO000007.laz
 - D:/lidarrrrr/anbu/training_labeled\DX3011148 ULMIANO000008.laz
 - D:/lidarrrrr/anbu/training_labeled\DX3011148 ULMIANO000009.laz
 - D:/lidarrrrr/anbu/training_labeled\pt013390.laz
Per-file target per class: 70000
Loaded: DX3011148 ULMIANO000001.laz | samples=140000 | ground=70000
Loaded: DX3011148 ULMIANO000002.laz | samples=140000 | ground=70000
Loaded: DX3011148 ULMIANO000003.laz | samples=140000 | ground=70000
Loaded: DX3011148 ULMIANO000004.laz | samples=140000 | ground=70000
Loaded: DX3011148 ULMIANO0

In [2]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.2.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.2.0-py3-none-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.3/101.7 MB ? eta -:--:--
    --------------------------------------- 1.3/101.7 MB 4.4 MB/s eta 0:00:23
   - -------------------------------------- 2.6/101.7 MB 5.3 MB/s eta 0:00:19
   - -------------------------------------- 4.5/101.7 MB 6.2 MB/s eta 0:00:16
   -- ------------------------------------- 6.3/101.7 MB 7.0 MB/s eta 0:00:14
   --- ------------------------------------ 8.7/101.7 MB 7.9 MB/s eta 0:00:12
   ---- ----------------------------------- 11.5/101.7 MB 8.8 MB/s eta 0:00:11
   ----- ---------------------------------- 14.7/101.7 MB 9.7 MB/s eta 0:00:10
   ------- -------------------------------- 18.4/101.7 MB 10.6 MB/s eta 0:00:08
   -------- ------------------------------- 22.3/101.7 MB 11.5 MB/s eta 0:00:07
  

In [2]:
import os, glob, pickle
import numpy as np
import laspy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# ----------------------------
# PATHS
# ----------------------------
LABELED_DIR = r"D:/lidarrrrr/anbu/training_labeled"
UNCLASSIFIED_FILES = [
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000001.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000002.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000003.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000004.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000005.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000006.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000007.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000008.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000009.laz",
    r"D:\lidarrrrr\anbu\LAZ\LAZ\DX3042075 CASTELPOGGIO000010.laz"   
]
OUT_DIR   = r"D:/lidarrrrr/anbu/New folder/stage1_outputs_v3"
MODEL_PKL = r"D:/lidarrrrr/anbu/New folder/stage1_ground_model_v3_xgb.pkl"

os.makedirs(OUT_DIR, exist_ok=True)

GROUND_CLASS  = 2
DEFAULT_CLASS = 1
DEFAULT_UP_TO_H = 1.0

GRID_CELL = 2.0

TOTAL_EACH_CLASS = 800_000
MIN_PER_FILE_EACH = 25_000

# ----------------------------
# FEATURES
# ----------------------------
def grid_stats(xyz, cell=2.0):
    x, y, z = xyz[:,0], xyz[:,1], xyz[:,2]
    minx, miny = x.min(), y.min()
    gx = np.floor((x - minx)/cell).astype(np.int32)
    gy = np.floor((y - miny)/cell).astype(np.int32)
    key = gx.astype(np.int64)*1_000_000 + gy.astype(np.int64)

    order = np.argsort(key)
    key_s = key[order]
    z_s   = z[order]

    uniq, start = np.unique(key_s, return_index=True)

    zmin  = np.full(len(uniq), np.inf,  dtype=np.float32)
    zstd  = np.full(len(uniq), 0.0,     dtype=np.float32)
    zspan = np.full(len(uniq), 0.0,     dtype=np.float32)

    for i in range(len(uniq)):
        a = start[i]
        b = start[i+1] if i+1 < len(uniq) else len(key_s)
        zs = z_s[a:b]
        zmin[i] = float(zs.min())
        zstd[i] = float(zs.std())
        zspan[i] = float(np.percentile(zs, 95) - np.percentile(zs, 5))

    pos = np.searchsorted(uniq, key)
    pos = np.clip(pos, 0, len(uniq)-1)

    return zmin[pos], zstd[pos], zspan[pos]

def make_features(xyz, intensity, ret_num, n_returns, cell=2.0):
    zmin, zstd, zspan = grid_stats(xyz, cell=cell)
    z = xyz[:,2].astype(np.float32)

    hag = (z - zmin).astype(np.float32)
    # slope proxy using zspan (robust)
    slope_proxy = hag / (zspan + 1e-6)

    inten = intensity.astype(np.float32)
    rn    = ret_num.astype(np.float32)
    nr    = n_returns.astype(np.float32)

    # IMPORTANT: no raw absolute z (helps generalization)
    # Use zmin (local ground elevation) + hag + roughness/spread + sensor attrs
    X = np.stack([hag, zmin, zstd, zspan, slope_proxy, inten, rn, nr], axis=1).astype(np.float32)
    return X, hag

# ----------------------------
# TRAIN (XGBoost)
# ----------------------------
from xgboost import XGBClassifier

labeled_files = sorted(glob.glob(os.path.join(LABELED_DIR, "*.la*")))
if not labeled_files:
    raise RuntimeError("No labeled files found")

print("Found labeled files:", len(labeled_files))

per_file_each = max(MIN_PER_FILE_EACH, TOTAL_EACH_CLASS // len(labeled_files))
print("Per-file target per class:", per_file_each)

rng = np.random.default_rng(42)
X_list, y_list = [], []

for fp in labeled_files:
    las = laspy.read(fp)
    xyz = np.vstack([las.x, las.y, las.z]).T.astype(np.float32)
    cls = np.asarray(las.classification, dtype=np.int32)

    intensity = np.asarray(las.intensity)
    ret_num   = np.asarray(las.return_number)
    n_returns = np.asarray(las.number_of_returns)

    y = (cls == GROUND_CLASS).astype(np.int32)
    g_idx = np.where(y == 1)[0]
    n_idx = np.where(y == 0)[0]

    if len(g_idx) < 2000 or len(n_idx) < 2000:
        print("Skipping:", os.path.basename(fp))
        continue

    g_s = rng.choice(g_idx, min(per_file_each, len(g_idx)), replace=False)
    n_s = rng.choice(n_idx, min(per_file_each, len(n_idx)), replace=False)

    idx = np.concatenate([g_s, n_s])
    rng.shuffle(idx)

    X, _ = make_features(xyz[idx], intensity[idx], ret_num[idx], n_returns[idx], cell=GRID_CELL)
    y_s = y[idx]

    X_list.append(X)
    y_list.append(y_s)
    print("Loaded:", os.path.basename(fp), "| samples:", len(idx))

X_all = np.vstack(X_list)
y_all = np.concatenate(y_list)

print("Final training samples:", len(y_all), "| ground:", int(y_all.sum()), "| non-ground:", int((1-y_all).sum()))

X_train, X_val, y_train, y_val = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, stratify=y_all
)

model = XGBClassifier(
    n_estimators=600,
    max_depth=10,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    objective="binary:logistic",
    tree_method="hist",
    eval_metric="logloss",
)
model.fit(X_train, y_train)

pred = (model.predict_proba(X_val)[:,1] >= 0.5).astype(np.int32)
print("\nValidation report (0=non-ground, 1=ground):")
print(classification_report(y_val, pred, digits=4))

with open(MODEL_PKL, "wb") as f:
    pickle.dump(model, f)
print("Saved model:", MODEL_PKL)

# ----------------------------
# PREDICT
# ----------------------------
for in_path in UNCLASSIFIED_FILES:
    las = laspy.read(in_path)
    xyz = np.vstack([las.x, las.y, las.z]).T.astype(np.float32)

    intensity = np.asarray(las.intensity)
    ret_num   = np.asarray(las.return_number)
    n_returns = np.asarray(las.number_of_returns)

    X, hag = make_features(xyz, intensity, ret_num, n_returns, cell=GRID_CELL)

    proba = model.predict_proba(X)[:, 1]
    is_ground = proba >= 0.5

    new_cls = np.full((len(xyz),), DEFAULT_CLASS, dtype=np.uint8)
    new_cls[is_ground] = GROUND_CLASS

    # enforce: up to 1m above local ground stays default
    new_cls[(new_cls == GROUND_CLASS) & (hag > DEFAULT_UP_TO_H)] = DEFAULT_CLASS

    las.classification = new_cls

    base = os.path.splitext(os.path.basename(in_path))[0]
    out_las = os.path.join(OUT_DIR, f"{base}_stage1_ground.las")
    out_laz = os.path.join(OUT_DIR, f"{base}_stage1_ground.laz")

    las.write(out_las)
    print("\nSaved:", out_las)

    try:
        las.write(out_laz)
        print("Saved:", out_laz)
    except Exception as e:
        print("LAZ write failed (ok). Error:", e)

    u, c = np.unique(new_cls, return_counts=True)
    print("Output class counts:", dict(zip(u.tolist(), c.tolist())))

print("\nDONE Stage-1 Ground v3.")

Found labeled files: 10
Per-file target per class: 80000
Loaded: DX3011148 ULMIANO000001.laz | samples: 160000
Loaded: DX3011148 ULMIANO000002.laz | samples: 160000
Loaded: DX3011148 ULMIANO000003.laz | samples: 160000
Loaded: DX3011148 ULMIANO000004.laz | samples: 160000
Loaded: DX3011148 ULMIANO000005.laz | samples: 160000
Loaded: DX3011148 ULMIANO000006.laz | samples: 160000
Loaded: DX3011148 ULMIANO000007.laz | samples: 160000
Loaded: DX3011148 ULMIANO000008.laz | samples: 160000
Loaded: DX3011148 ULMIANO000009.laz | samples: 160000
Loaded: pt013390.laz | samples: 160000
Final training samples: 1600000 | ground: 800000 | non-ground: 800000

Validation report (0=non-ground, 1=ground):
              precision    recall  f1-score   support

           0     0.8134    0.8113    0.8124    160000
           1     0.8118    0.8139    0.8128    160000

    accuracy                         0.8126    320000
   macro avg     0.8126    0.8126    0.8126    320000
weighted avg     0.8126    0.81