This notebook constructs an inference-ready dataset from raw spatial sample points (e.g., blank area samples).
It applies the same preprocessing pipeline used for training data to ensure consistency with the trained model.

In [5]:
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# --- Step 1: Load blank area sample points ---
# Read the blank-area sample CSV into `df` and report how many rows it holds.
blank_samples_csv = "../../data/processed/negative_blank_samples.csv"
df = pd.read_csv(blank_samples_csv)
print(f"Loaded {len(df.index)} samples.")


Loaded 3979 samples.


In [None]:
# --- Step 2: Load GeoTIFF-based feature data (already extracted) ---
# Prerequisite: the 02_combine_features pipeline has been run and its combined
# output saved to the CSV referenced below.
# NOTE(review): this rebinds `df`, so the frame loaded in Step 1 is not used
# past this point — confirm that is intended.
# NOTE(review): Step 1 reads from "../../data/..." while this path is relative
# to "data/"; verify both resolve from the same working directory.
features_path = Path("data/blank_area_features_combined.csv")
df = pd.read_csv(filepath_or_buffer=features_path)

# --- Step 3: Handle missing values (use same median as training set) ---
# The medians are loaded from a file precomputed on the training set.
# Only columns that have a stored median AND exist in `df` are filled.
train_medians = joblib.load("models/train_feature_medians.pkl")
available_columns = df.columns
missing_cols = [name for name in train_medians.index if name in available_columns]
fill_values = train_medians[missing_cols]
df[missing_cols] = df[missing_cols].fillna(fill_values)

# --- Step 4: Log transformation for radiometric features ---
# For every listed column, add a companion "<col>_log" column containing
# log1p(value). Entries that are missing or negative are not transformed;
# their "_log" value is set to 0.
log_transform_cols = [
    "radio_th_ppm", "radio_u_ppm", "radio_k_pct",
    "radio_u_th_ratio", "radio_th_k_ratio", "radio_u_k_ratio"
]
for col in log_transform_cols:
    values = df[col]
    # Vectorized instead of a per-element lambda: mask invalid entries first
    # (NaN compares False against 0, so it is masked as well), apply log1p to
    # the whole column at once, then put 0 into the masked slots.
    df[f"{col}_log"] = np.log1p(values.where(values >= 0)).fillna(0)

# --- Step 5: Clip magnetic features using training boundaries ---
# Each listed magnetic column gets a companion "<col>_clipped" column whose
# values are limited to the (lower, upper) pair stored for that column.
clip_mag_cols = [
    "mag_uc_1_2km",
    "mag_uc_2_4km",
    "mag_uc_4_8km",
    "mag_uc_8_12km",
    "mag_uc_12_16km",
]
clip_bounds = joblib.load("models/train_clip_bounds.pkl")  # dict of {feature: (lower, upper)}
for mag_col in clip_mag_cols:
    low, high = clip_bounds[mag_col]
    clipped_name = f"{mag_col}_clipped"
    df[clipped_name] = df[mag_col].clip(lower=low, upper=high)

# --- Step 6: Apply trained scalers ---
# Load the list of selected features, then load one saved scaler per feature
# from models/scalers/. A feature whose scaler file does not exist is left
# unscaled; such features are reported so the gap is visible rather than
# silently passed through to the output.
scaler_features = joblib.load("models/final_selected_features.pkl")  # list of selected features
scaler_dict = {}
features_without_scaler = []
for col in scaler_features:
    scaler_file = f"models/scalers/{col}_scaler.pkl"
    if Path(scaler_file).exists():
        scaler_dict[col] = joblib.load(scaler_file)
    else:
        features_without_scaler.append(col)

if features_without_scaler:
    print(
        f"Warning: no scaler file found for {len(features_without_scaler)} "
        f"feature(s); left unscaled: {features_without_scaler}"
    )

for col, scaler in scaler_dict.items():
    # Flatten the transform output to 1-D so the column assignment does not
    # depend on how pandas handles a single-column 2-D result.
    df[col] = np.asarray(scaler.transform(df[[col]])).ravel()

# --- Step 7: Finalize inference dataset ---
# Keep the two coordinate columns followed by the selected features, copy that
# selection into its own frame, and write it to CSV without the index.
coordinate_cols = ["LONGITUDE", "LATITUDE"]
output_cols = coordinate_cols + scaler_features
df_final = df.loc[:, output_cols].copy()
df_final.to_csv(path_or_buf="data/inference_ready_dataset.csv", index=False)
print("Saved inference-ready dataset to data/inference_ready_dataset.csv")