This notebook constructs an inference-ready dataset from raw spatial sample points.
It applies the same preprocessing pipeline used for training data to ensure consistency with the trained model.

In [1]:
# Latitude/longitude limits for Queensland (values look like decimal
# degrees — TODO confirm the datum with whoever consumes this constant).
QLD_BOUNDS = dict(
    lat_min=-29.0,
    lat_max=-10.5,
    lon_min=137.9,
    lon_max=153.6,
)

In [18]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point

# Build a regular grid of sampling points covering Queensland and save their
# geographic coordinates to CSV.

# 1. Load the state boundaries and keep Queensland, projected to EPSG:3577
#    so that distances (and therefore the grid spacing) are in metres.
gdf = gpd.read_file("../../data/raw/Dataset/australia_boundary/australian-states.json")
qld = gdf[gdf["STATE_NAME"].str.lower() == "queensland"].to_crs(epsg=3577)

# 2. Get bounding box (in EPSG:3577 metres)
minx, miny, maxx, maxy = qld.total_bounds

# 3. Define grid spacing (in meters, since EPSG:3577 is in meters)
spacing = 20000  # 20 km grid spacing
x_coords = np.arange(minx, maxx, spacing)
y_coords = np.arange(miny, maxy, spacing)

# 4. Generate grid points over the whole bounding box
grid_points = [Point(x, y) for x in x_coords for y in y_coords]

# 5. Keep only the points that fall inside the Queensland polygon
grid_gdf = gpd.GeoDataFrame(geometry=grid_points, crs="EPSG:3577")
grid_gdf = grid_gdf[grid_gdf.within(qld.union_all())].reset_index(drop=True)

# 6. Extract lat/lon.
#    The grid geometry is in EPSG:3577, so its x/y are metres, not degrees.
#    Reproject to EPSG:4326 first so the "longitude"/"latitude" columns really
#    hold geographic coordinates.
#    NOTE(review): assumes the feature rasters sampled from this CSV use a
#    geographic lon/lat CRS — confirm against the GeoTIFFs.
grid_lonlat = grid_gdf.geometry.to_crs(epsg=4326)
grid_gdf["longitude"] = grid_lonlat.x
grid_gdf["latitude"] = grid_lonlat.y

# Guard against silently writing projected metres under lat/lon names again.
assert grid_gdf["latitude"].between(-90, 90).all(), "latitude out of range"
assert grid_gdf["longitude"].between(-180, 180).all(), "longitude out of range"

# 7. Save to CSV
grid_gdf[["latitude", "longitude"]].to_csv("../../tests/data/user/qld_coords_only.csv", index=False)

print(f"Sampled {len(grid_gdf)} regular grid points within QLD.")


Sampled 4310 regular grid points within QLD.


In [22]:
# Build a regular grid of sampling points covering Tasmania and save their
# geographic coordinates to CSV.

# 1. Select Tasmania from the state boundaries already loaded into `gdf` by
#    the Queensland grid cell, projected to EPSG:3577 so distances are in
#    metres. (Named `tas`: this is not the Queensland boundary.)
tas = gdf[gdf["STATE_NAME"].str.lower() == "tasmania"].to_crs(epsg=3577)

# 2. Get bounding box (in EPSG:3577 metres)
minx, miny, maxx, maxy = tas.total_bounds

# 3. Define grid spacing (in meters, since EPSG:3577 is in meters)
spacing = 2000  # 2 km grid spacing
x_coords = np.arange(minx, maxx, spacing)
y_coords = np.arange(miny, maxy, spacing)

# 4. Generate grid points over the whole bounding box
grid_points = [Point(x, y) for x in x_coords for y in y_coords]

# 5. Keep only the points that fall inside the Tasmania polygon
grid_gdf = gpd.GeoDataFrame(geometry=grid_points, crs="EPSG:3577")
grid_gdf = grid_gdf[grid_gdf.within(tas.union_all())].reset_index(drop=True)

# 6. Extract lat/lon.
#    The grid geometry is in EPSG:3577, so its x/y are metres, not degrees.
#    Reproject to EPSG:4326 first so the "longitude"/"latitude" columns really
#    hold geographic coordinates.
#    NOTE(review): assumes the feature rasters sampled from this CSV use a
#    geographic lon/lat CRS — confirm against the GeoTIFFs.
grid_lonlat = grid_gdf.geometry.to_crs(epsg=4326)
grid_gdf["longitude"] = grid_lonlat.x
grid_gdf["latitude"] = grid_lonlat.y

# Guard against silently writing projected metres under lat/lon names again.
assert grid_gdf["latitude"].between(-90, 90).all(), "latitude out of range"
assert grid_gdf["longitude"].between(-180, 180).all(), "longitude out of range"

# 7. Save to CSV
grid_gdf[["latitude", "longitude"]].to_csv("../../tests/data/user/tas_coords_only.csv", index=False)

print(f"Sampled {len(grid_gdf)} regular grid points within TAS.")

Sampled 16332 regular grid points within TAS.


In [19]:
import pandas as pd
import rasterio
import numpy as np
import os

# Read the Queensland sampling points (one row per point) from the
# coordinates-only CSV.
coords_df = pd.read_csv(
    "../../tests/data/user/qld_coords_only.csv",
)

# Feature names, one per raster. The list order is the order in which the
# feature columns are later appended to the sampling table.
FEATURE_COLUMNS = [
    # gravity
    'gravity_iso_residual',
    'gravity_cscba',
    'gravity_cscba_1vd',
    'gravity_iso_residual_stddev3x3',
    'gravity_cscba_stddev3x3',
    # magnetics
    'mag_uc_1_2km',
    'mag_uc_2_4km',
    'mag_uc_4_8km',
    'mag_uc_8_12km',
    'mag_uc_12_16km',
    'mag_uc_2_4km_1vd',
    'mag_uc_2_4km_thd',
    'mag_uc_2_4km_stddev3x3',
    # radiometrics
    'radio_k_pct',
    'radio_th_ppm',
    'radio_u_ppm',
    'radio_th_k_ratio',
    'radio_u_k_ratio',
    'radio_u_th_ratio',
]

# Each feature is expected as "<feature>.tif" inside this folder.
geotiff_folder = "../../tests/data/system"
geotiff_paths = dict(
    (feature, os.path.join(geotiff_folder, f"{feature}.tif"))
    for feature in FEATURE_COLUMNS
)

def extract_feature_values(df, tiff_path):
    """Sample a raster at every point of ``df`` and return the values as a list.

    Parameters
    ----------
    df : table with "longitude" and "latitude" columns. Each row is handed to
        the raster as an (x, y) = (longitude, latitude) pair, unchanged.
    tiff_path : path of the raster file, opened with ``rasterio.open``.

    Returns
    -------
    list of float
        One entry per row of ``df``: the first value of each sample, or NaN
        when that value equals the dataset's nodata value or is itself NaN.
        If anything raises, the error is printed and a list of ``len(df)``
        NaNs is returned instead.
    """
    try:
        with rasterio.open(tiff_path) as src:
            nodata = src.nodata
            points = list(zip(df["longitude"], df["latitude"]))
            # Only the first element of each sampled array is used.
            first_values = (sample[0] for sample in src.sample(points))
            return [
                np.nan if (v == nodata or np.isnan(v)) else float(v)
                for v in first_values
            ]
    except Exception as e:
        # Best effort: report the failure and fall back to an all-NaN column.
        print(f"Error extracting {tiff_path}: {e}")
        return [np.nan] * len(df)

# Sample every feature raster at the sampling points.
# A raster that is not on disk still gets its column (all NaN) so the output
# columns stay the same, but it is reported rather than skipped silently.
missing_features = []
for feature, path in geotiff_paths.items():
    if os.path.exists(path):
        coords_df[feature] = extract_feature_values(coords_df, path)
    else:
        coords_df[feature] = np.nan
        missing_features.append(feature)

if missing_features:
    print(
        f"Warning: no GeoTIFF found for {len(missing_features)} feature(s); "
        f"columns left as NaN: {missing_features}"
    )

# store as qld_full_features.csv
coords_df.to_csv("../../tests/data/user/qld_full_features.csv", index=False)

print("✅ qld_full_features.csv generated, including all features")


✅ qld_full_features.csv generated, including all features


In [23]:
import pandas as pd
import rasterio
import numpy as np
import os

# Read the Tasmania sampling points (one row per point) from the
# coordinates-only CSV.
coords_df = pd.read_csv(
    "../../tests/data/user/tas_coords_only.csv",
)

# Feature names, one per raster. The list order is the order in which the
# feature columns are later appended to the sampling table.
FEATURE_COLUMNS = [
    # gravity
    'gravity_iso_residual',
    'gravity_cscba',
    'gravity_cscba_1vd',
    'gravity_iso_residual_stddev3x3',
    'gravity_cscba_stddev3x3',
    # magnetics
    'mag_uc_1_2km',
    'mag_uc_2_4km',
    'mag_uc_4_8km',
    'mag_uc_8_12km',
    'mag_uc_12_16km',
    'mag_uc_2_4km_1vd',
    'mag_uc_2_4km_thd',
    'mag_uc_2_4km_stddev3x3',
    # radiometrics
    'radio_k_pct',
    'radio_th_ppm',
    'radio_u_ppm',
    'radio_th_k_ratio',
    'radio_u_k_ratio',
    'radio_u_th_ratio',
]

# Each feature is expected as "<feature>.tif" inside this folder.
geotiff_folder = "../../tests/data/system"
geotiff_paths = dict(
    (feature, os.path.join(geotiff_folder, f"{feature}.tif"))
    for feature in FEATURE_COLUMNS
)

def extract_feature_values(df, tiff_path):
    """Sample a raster at every point of ``df`` and return the values as a list.

    Parameters
    ----------
    df : table with "longitude" and "latitude" columns. Each row is handed to
        the raster as an (x, y) = (longitude, latitude) pair, unchanged.
    tiff_path : path of the raster file, opened with ``rasterio.open``.

    Returns
    -------
    list of float
        One entry per row of ``df``: the first value of each sample, or NaN
        when that value equals the dataset's nodata value or is itself NaN.
        If anything raises, the error is printed and a list of ``len(df)``
        NaNs is returned instead.
    """
    try:
        with rasterio.open(tiff_path) as src:
            nodata = src.nodata
            points = list(zip(df["longitude"], df["latitude"]))
            # Only the first element of each sampled array is used.
            first_values = (sample[0] for sample in src.sample(points))
            return [
                np.nan if (v == nodata or np.isnan(v)) else float(v)
                for v in first_values
            ]
    except Exception as e:
        # Best effort: report the failure and fall back to an all-NaN column.
        print(f"Error extracting {tiff_path}: {e}")
        return [np.nan] * len(df)

# Sample every feature raster at the sampling points.
# A raster that is not on disk still gets its column (all NaN) so the output
# columns stay the same, but it is reported rather than skipped silently.
missing_features = []
for feature, path in geotiff_paths.items():
    if os.path.exists(path):
        coords_df[feature] = extract_feature_values(coords_df, path)
    else:
        coords_df[feature] = np.nan
        missing_features.append(feature)

if missing_features:
    print(
        f"Warning: no GeoTIFF found for {len(missing_features)} feature(s); "
        f"columns left as NaN: {missing_features}"
    )

# store as tas_full_features.csv
coords_df.to_csv("../../tests/data/user/tas_full_features.csv", index=False)

print("tas_full_features.csv generated, including all features")


tas_full_features.csv generated, including all features
