# Feature Engineering for Physics-Informed ViT

In [2]:
import logging

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from src.config import CLASS_ID_SPATTER, CLASS_ID_STREAK, get_dataset_path
from tqdm import tqdm


In [3]:
# Constants for physics-based features
# Layer thickness in mm. It is one factor of the denominator in the energy
# density derived below: power / (speed * hatch * LAYER_THICKNESS).
LAYER_THICKNESS = 0.05  # mm

In [4]:
# Choose the dataset to process and resolve where it lives on disk.
DATASET_KEY = "tcr_phase1_build1"
data_path = get_dataset_path(DATASET_KEY)

# get_dataset_path signals an unknown key by returning None; stop here rather
# than letting the HDF5 open below fail with a less specific error.
if data_path is None:
    missing_path_message = f"Dataset path for key '{DATASET_KEY}' not found."
    raise FileNotFoundError(missing_path_message)

In [None]:
# Load HDF5 data and build the per-layer feature table.
#
# Produces (module-level names kept for later cells):
#   phys_df      raw temporal features plus the derived energy density
#   scaler       MinMaxScaler fitted on phys_df
#   phys_scaled  phys_df scaled column-wise by `scaler`
#   labels_df    per-layer binary spatter / recoater-streak presence flags
#   feature_df   labels joined with scaled features on the layer index
#   correlation  full correlation matrix of feature_df
#
# TODO: Profile whether building the pandas DataFrame is what slows this cell
# down; a plain NumPy version was not noticeably faster.
with h5py.File(data_path, "r") as data:

    # --- 1. Load Temporal and Process Data ---
    temp_features = [
        "temporal/build_plate_temperature",
        "temporal/top_chamber_temperature",
        "temporal/bottom_chamber_temperature",
        "temporal/actual_ventilator_flow_rate",
        "temporal/layer_times",
        "temporal/gas_loop_oxygen",
    ]

    # One column per temporal dataset, named after the last path component.
    phys_df = pd.DataFrame({
        key.split("/")[-1]: data[key][:] for key in temp_features
    })

    # Add derived physics features
    power = data["parts/process_parameters/laser_beam_power"][:]
    speed = data["parts/process_parameters/laser_beam_speed"][:]
    hatch = data["parts/process_parameters/hatch_spacing"][:]

    # Entries whose speed or hatch spacing is zero yield inf/NaN here. Left in
    # the mean they would poison the whole column (MinMaxScaler rejects inf,
    # and NaN makes the dropna() in step 3 discard every row), so only finite
    # values contribute to the average.
    with np.errstate(divide="ignore", invalid="ignore"):
        energy_density = power / (speed * hatch * LAYER_THICKNESS)
    finite_density = energy_density[np.isfinite(energy_density)]
    if finite_density.size == 0:
        raise ValueError(
            "No finite energy density could be computed from the process "
            "parameters (speed and hatch spacing must be non-zero)."
        )
    # NOTE(review): this is a single build-wide mean broadcast to every layer,
    # so the column is constant; it scales to 0 and its correlation with the
    # labels is undefined (NaN). A per-layer value would need a part-to-layer
    # mapping that this cell does not load.
    phys_df["energy_density"] = float(finite_density.mean())

    # Normalize physics features
    scaler = MinMaxScaler()
    phys_scaled = pd.DataFrame(scaler.fit_transform(phys_df), columns=phys_df.columns)

    # --- 2. Load Segmentation Labels for Spatter & Recoater Streaking ---
    spatter_key = f"slices/segmentation_results/{CLASS_ID_SPATTER}"
    streak_key = f"slices/segmentation_results/{CLASS_ID_STREAK}"
    missing_keys = [key for key in (spatter_key, streak_key) if key not in data]
    if missing_keys:
        # Fail with the actual cause instead of silently collecting no labels
        # and tripping over a missing "layer" column later.
        raise KeyError(
            f"Segmentation dataset(s) {missing_keys} not found in '{data_path}'."
        )

    # Resolve the datasets once instead of on every loop iteration.
    spatter_masks = data[spatter_key]
    streak_masks = data[streak_key]

    # Only read layers that exist in every source; indexing past the end of an
    # HDF5 dataset raises an error that the old `except KeyError` never caught.
    n_layers = min(len(phys_scaled), spatter_masks.shape[0], streak_masks.shape[0])
    if n_layers < len(phys_scaled):
        logging.warning(
            "Segmentation results cover %d layers but temporal data has %d; "
            "extra layers will be dropped.",
            n_layers,
            len(phys_scaled),
        )

    labels = []
    for layer in tqdm(range(n_layers)):
        spatter = spatter_masks[layer]
        recoater = streak_masks[layer]
        labels.append({
            "layer": layer,
            # A layer is flagged when at least one pixel of the mask is set.
            "spatter_present": int(spatter.sum() > 0),
            "recoater_streaking_present": int(recoater.sum() > 0),
        })

# Everything below works on in-memory data, so the HDF5 file is already closed
# and is not held open while the plot window is showing.
label_columns = ["spatter_present", "recoater_streaking_present"]
labels_df = pd.DataFrame(labels, columns=["layer", *label_columns])

# --- 3. Merge Physics & Label Data ---
# Rows are aligned on the layer index; layers missing from either side (or
# containing NaN features) are dropped.
feature_df = pd.concat([labels_df.set_index("layer"), phys_scaled], axis=1).dropna()

# --- 4. Preview Feature Correlation ---
correlation = feature_df.corr()

# Keep only feature rows and plot both label columns from one frame so that
# every bar sits under its own tick label. Overlaying two Series with
# different indexes put the first bar of one series under the wrong label.
feature_corr = correlation.drop(index=label_columns)[label_columns].rename(
    columns={
        "spatter_present": "Spatter",
        "recoater_streaking_present": "Recoater",
    }
)

# DataFrame.plot opens a new figure unless it is given explicit axes.
fig, ax = plt.subplots(figsize=(10, 6))
feature_corr.plot(kind="bar", alpha=0.6, rot=45, ax=ax)
ax.set_title("Feature Correlation with Anomalies")
ax.set_ylabel("Correlation coefficient")
ax.legend()
fig.tight_layout()
plt.show()

# # Save feature_df to use in later ViT training
# # NOTE: index=False would drop the layer index from the exported file.
# feature_df.to_csv("/data/feature_dataset.csv", index=False)

NameError: name 'h5py' is not defined

In [None]:
# The CSV export in the cell above is commented out, so do not claim that a
# file was written.
print("Feature engineering complete.")