This notebook constructs an inference-ready dataset from raw spatial sample points.
It applies the same preprocessing pipeline used for training data to ensure consistency with the trained model.

In [1]:
# Latitude/longitude bounding box; the name suggests it is meant for Queensland.
# NOTE(review): values are presumably decimal degrees -- confirm. The constant is
# not referenced by any of the cells visible in this notebook excerpt.
QLD_BOUNDS = dict(
    lat_min=-29.0,
    lat_max=-10.5,
    lon_min=137.9,
    lon_max=153.6,
)

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point

# Build a reproducible set of random sample points inside Queensland and save
# their latitude/longitude to coords_only.csv.

# 1. Load the Australian state boundaries (GeoJSON).
gdf = gpd.read_file("../../data/raw/Dataset/australia_boundary/australian-states.json")

# 2. Keep only Queensland (QLD); the STATE_NAME match is case-insensitive.
qld = gdf[gdf["STATE_NAME"].str.lower() == "queensland"]

# Fail fast when nothing matched. With an empty selection the bounding box
# below is all-NaN, so the rejection-sampling loop could never accept a point.
if qld.empty:
    raise ValueError(
        "No feature with STATE_NAME == 'queensland' was found in the boundary file; "
        "cannot sample points."
    )

# 3. Reproject to EPSG:3577 (GDA94 / Australian Albers), a projected CRS, so the
#    uniform draws below are made in projected x/y rather than in degrees.
qld = qld.to_crs(epsg=3577)

# 4. Bounding box of the QLD geometry, used as the proposal region.
minx, miny, maxx, maxy = qld.total_bounds

# 5. Rejection sampling: draw uniformly inside the bounding box and keep only
#    the points that fall inside a QLD polygon. The draw order (x, then y, one
#    point at a time) is kept as-is so the fixed seed yields the same sample.
num_points = 30000
points = []
rng = np.random.default_rng(seed=42)

while len(points) < num_points:
    x = rng.uniform(minx, maxx)
    y = rng.uniform(miny, maxy)
    pt = Point(x, y)
    if qld.contains(pt).any():
        points.append(pt)

# 6. Wrap the accepted points in a GeoDataFrame (still in EPSG:3577).
points_gdf = gpd.GeoDataFrame(geometry=points, crs="EPSG:3577")

# 7. Convert back to WGS84 (EPSG:4326) for the raster feature extraction step;
#    in that CRS geometry.y is latitude and geometry.x is longitude.
points_gdf = points_gdf.to_crs(epsg=4326)
points_gdf["latitude"] = points_gdf.geometry.y
points_gdf["longitude"] = points_gdf.geometry.x

# 8. Store only the coordinate columns as coords_only.csv.
points_gdf[["latitude", "longitude"]].to_csv("../../tests/data/user/coords_only.csv", index=False)


In [4]:
import pandas as pd
import rasterio
import numpy as np
import os

# Read back the latitude/longitude sample points written by the sampling cell.
coords_df = pd.read_csv(filepath_or_buffer="../../tests/data/user/coords_only.csv")

# Names of the feature columns to extract. Each name is also the stem of the
# GeoTIFF file it is read from (see geotiff_paths below).
FEATURE_COLUMNS = [
    # gravity_* columns
    *("gravity_iso_residual", "gravity_cscba", "gravity_cscba_1vd"),
    # mag_* columns
    *("mag_uc_1_2km", "mag_uc_2_4km", "mag_uc_4_8km", "mag_uc_12_16km"),
    # radio_* columns
    *("radio_th_ppm", "radio_u_ppm", "radio_k_pct"),
    *("radio_th_k_ratio", "radio_u_k_ratio", "radio_u_th_ratio"),
]

# Folder holding one "<feature>.tif" raster per feature column.
geotiff_folder = "../../tests/data/system"

# Map each feature name to the path of its raster, in FEATURE_COLUMNS order.
geotiff_paths = dict(
    (name, os.path.join(geotiff_folder, f"{name}.tif"))
    for name in FEATURE_COLUMNS
)

def extract_feature_values(df, tiff_path):
    """Sample one raster at every point of ``df`` and return the values as a list.

    Args:
        df: Table with ``"longitude"`` and ``"latitude"`` columns; points are
            passed to the raster as ``(longitude, latitude)`` pairs.
        tiff_path: Path of the raster file to open with ``rasterio``.

    Returns:
        A list with one entry per row of ``df``: the first value of each sample
        as a ``float``, or ``np.nan`` when that value equals the dataset's
        ``nodata`` or is itself NaN. If anything fails (opening the file,
        reading the columns, sampling), the error is printed and a list of
        ``np.nan`` of length ``len(df)`` is returned instead.

    NOTE(review): the coordinates are handed to the raster unchanged, so this
    presumably assumes the raster uses longitude/latitude coordinates -- confirm.
    """
    try:
        with rasterio.open(tiff_path) as src:
            lonlat_pairs = list(zip(df["longitude"], df["latitude"]))
            sampled = []
            for band_values in src.sample(lonlat_pairs):
                first_value = band_values[0]
                is_missing = first_value == src.nodata or np.isnan(first_value)
                sampled.append(np.nan if is_missing else float(first_value))
            return sampled
    except Exception as e:
        # Best effort: report the problem and fall back to an all-NaN column.
        print(f"Error extracting {tiff_path}: {e}")
        return [np.nan] * len(df)

# Extract every feature raster into its own column of coords_df.
# A feature whose GeoTIFF does not exist still gets a column (all NaN) so the
# output schema stays the same, but it is recorded and reported below instead
# of being silently presented as a successful extraction.
missing_features = []
for feature, path in geotiff_paths.items():
    if os.path.exists(path):
        coords_df[feature] = extract_feature_values(coords_df, path)
    else:
        coords_df[feature] = np.nan
        missing_features.append(feature)

# Store the coordinates plus all feature columns as full_features.csv.
coords_df.to_csv("../../tests/data/user/full_features.csv", index=False)

if missing_features:
    print(
        "⚠️ full_features.csv generated, but no GeoTIFF was found for: "
        f"{missing_features} (these columns are all NaN)"
    )
else:
    print("✅ full_features.csv generated, including all features")


✅ full_features.csv generated, including all features
