# EmberSense Week 1 Preprocessing Notebook

This notebook annotates, cleans, and prepares the EmberSense multi-modal dataset (audio events and sensor segments). For a reproducible end-to-end run, use `scripts/preprocessing/run_preprocessing.py`.


In [None]:
import json
import os
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
from IPython.display import Audio, display



In [None]:
# Every directory below is resolved against the project root, which is taken to
# be two levels above the current working directory.
PROJECT_ROOT = Path.cwd().parents[1]

# Raw input locations.
RAW_AUDIO = PROJECT_ROOT.joinpath("data", "raw", "audio")
RAW_SENSORS = PROJECT_ROOT.joinpath("data", "raw", "sensors")

# Annotation tables live here.
LABELS_DIR = PROJECT_ROOT.joinpath("metadata", "labels")

# Destination for processed artifacts; created up front if it does not exist.
PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")


In [None]:
# Read both annotation tables (audio events, then sensor segments) from LABELS_DIR.
audio_labels, sensor_segments = (
    pd.read_csv(LABELS_DIR / csv_name)
    for csv_name in (
        "audio_event_annotations.csv",
        "sensor_segment_annotations.csv",
    )
)

print(f"Loaded {len(audio_labels)} audio events, {len(sensor_segments)} sensor segments")

# Last expression of the cell: preview the first audio annotations.
audio_labels.head()


In [None]:
# Sample rate (Hz) requested from librosa and clip length (seconds) that every
# processed audio signal is forced to.
TARGET_SR = 16_000
TARGET_DURATION = 10.0


def load_audio(row):
    """Load the audio file referenced by an annotation row.

    Args:
        row: Record exposing a ``file_path`` attribute; the path is resolved
            relative to ``PROJECT_ROOT``.

    Returns:
        The signal returned by ``librosa.load`` when called with
        ``sr=TARGET_SR`` and ``mono=True``.

    Raises:
        FileNotFoundError: If the resolved path does not exist.
    """
    # NOTE(review): the whole file is loaded and no start/end offset from the
    # row is applied here -- confirm each file_path is already a single-event
    # clip, otherwise events sharing a file would yield identical audio.
    path = PROJECT_ROOT / row.file_path
    if not path.exists():
        raise FileNotFoundError(path)
    # librosa also returns the sample rate; it was requested explicitly, so it
    # carries no new information and is discarded.
    audio, _ = librosa.load(path, sr=TARGET_SR, mono=True)
    return audio


def normalize_duration(audio: np.ndarray, sr: int = TARGET_SR) -> np.ndarray:
    """Force a signal to exactly ``TARGET_DURATION`` seconds at rate ``sr``.

    Longer signals are truncated, shorter signals are looped (tiled) until
    they reach the target length, and an empty signal is replaced by silence.

    Args:
        audio: Input samples (treated as a 1-D sequence).
        sr: Sample rate in Hz used to convert the target duration to samples.

    Returns:
        A signal with ``int(TARGET_DURATION * sr)`` samples.
    """
    target_len = int(TARGET_DURATION * sr)
    n_samples = len(audio)
    if n_samples > target_len:
        return audio[:target_len]
    if n_samples < target_len:
        if n_samples == 0:
            # There is nothing to loop and the repetition count below would
            # divide by zero, so fall back to silence of the right length.
            return np.zeros(target_len, dtype=np.asarray(audio).dtype)
        reps = int(np.ceil(target_len / n_samples))
        tiled = np.tile(audio, reps)
        return tiled[:target_len]
    return audio


def lufs_normalize(audio: np.ndarray) -> np.ndarray:
    """Scale a signal so that its absolute peak is approximately 0.8.

    Despite the name, this performs peak normalization; no LUFS loudness
    measurement is involved.

    Args:
        audio: Input samples.

    Returns:
        The rescaled signal. An all-zero signal stays all-zero and an empty
        signal is returned as an empty array.
    """
    if np.size(audio) == 0:
        # np.max raises ValueError on an empty array; nothing to scale anyway.
        return np.array(audio)
    # The small epsilon keeps the division finite for an all-zero signal.
    peak = np.max(np.abs(audio)) + 1e-9
    return 0.8 * (audio / peak)



In [None]:
# Turn every annotated audio event into a fixed-length, peak-scaled array on
# disk and remember where each one was written.
audio_out_dir = PROCESSED_DIR / "audio"
# The output directory is the same for every event, so create it once up front
# rather than once per row.
audio_out_dir.mkdir(parents=True, exist_ok=True)

processed_records = []
for _, row in audio_labels.iterrows():
    try:
        audio = load_audio(row)
    except FileNotFoundError as exc:
        # Best effort: report the missing file and continue with the next event.
        print(f"Missing audio: {exc}")
        continue
    audio = normalize_duration(audio)
    audio = lufs_normalize(audio)
    out_path = audio_out_dir / f"{row.event_id}.npy"
    np.save(out_path, audio)
    processed_records.append({"event_id": row.event_id, "output": str(out_path)})

# Last expression of the cell: preview the records of the written files.
pd.DataFrame(processed_records).head()


In [None]:
def load_sensor_parquet(path: Path) -> pd.DataFrame:
    """Read a sensor recording stored as a Parquet file.

    Args:
        path: Location of the Parquet file.

    Returns:
        The file contents as returned by ``pandas.read_parquet``.
    """
    frame = pd.read_parquet(path)
    return frame


# Cut each annotated segment out of its sensor recording, put it on a
# one-second grid, standardize every column, and write the result to disk.
sensor_out_dir = PROCESSED_DIR / "sensors"
# Same directory for every segment: create it once instead of once per row.
sensor_out_dir.mkdir(parents=True, exist_ok=True)

sensor_outputs = []
for _, row in sensor_segments.iterrows():
    path = PROJECT_ROOT / row.file_path
    if not path.exists():
        # Best effort: report the missing recording and continue.
        print(f"Missing sensor file: {path}")
        continue
    df = load_sensor_parquet(path)
    # NOTE(review): slicing by ISO strings and resampling presumably rely on
    # the recording having a DatetimeIndex -- confirm against the parquet schema.
    window = df.loc[row.start_iso:row.end_iso]
    # Lowercase "s" is the seconds alias; the uppercase "S" spelling is
    # deprecated since pandas 2.2. Interpolation fills at most 60 consecutive
    # missing samples.
    window = window.resample("1s").interpolate(limit=60)
    # Columns with zero standard deviation are divided by 1 instead of 0.
    norm = (window - window.mean()) / window.std().replace(0, 1)
    out_path = sensor_out_dir / f"{row.segment_id}.parquet"
    norm.to_parquet(out_path)
    sensor_outputs.append({"segment_id": row.segment_id, "output": str(out_path)})

# Last expression of the cell: preview the records of the written files.
pd.DataFrame(sensor_outputs).head()


In [None]:
from sklearn.model_selection import train_test_split

# Manifest describing the train/val/test partition of the audio event ids.
# Its "seed" entry is the single source of truth for the random state used
# below, so the recorded seed cannot drift from the one actually applied.
split_manifest = {
    "version": "0.1.0",
    "seed": 42,
    "splits": {},
}

# The ids are drawn from every row of audio_labels; they are not filtered by
# whether the corresponding audio file was found during processing.
# First hold out 20% as the test set, then take 10% of the remainder as
# validation (overall roughly 72% / 8% / 20%).
train, test = train_test_split(
    audio_labels["event_id"],
    test_size=0.2,
    random_state=split_manifest["seed"],
)
train, val = train_test_split(
    train,
    test_size=0.1,
    random_state=split_manifest["seed"],
)
split_manifest["splits"]["train"] = train.tolist()
split_manifest["splits"]["val"] = val.tolist()
split_manifest["splits"]["test"] = test.tolist()

split_path = PROJECT_ROOT / "metadata" / "splits" / "dataset_splits.json"
# Create the target directory like the other outputs do; without this the
# write fails with FileNotFoundError on a fresh checkout.
split_path.parent.mkdir(parents=True, exist_ok=True)
split_path.write_text(json.dumps(split_manifest, indent=2))

# Last expression of the cell: show the manifest.
split_manifest


In [None]:
# Aggregate statistics over both annotation tables, written out as JSON.

# Annotated length of each audio event, in seconds.
event_lengths_s = audio_labels["end_s"] - audio_labels["start_s"]

# Annotated length of each sensor segment, as timedeltas.
segment_lengths = pd.to_datetime(sensor_segments["end_iso"]) - pd.to_datetime(
    sensor_segments["start_iso"]
)

summary = {
    # Timestamp.utcnow() is deprecated since pandas 2.2; now(tz="UTC") yields
    # the same timezone-aware UTC timestamp.
    "generated_at": pd.Timestamp.now(tz="UTC").isoformat(),
    "audio_events": {
        "total_records": int(audio_labels.shape[0]),
        "class_counts": audio_labels["label"].value_counts().to_dict(),
        "avg_duration_s": float(event_lengths_s.mean()),
    },
    "sensor_segments": {
        "total_records": int(sensor_segments.shape[0]),
        "class_counts": sensor_segments["label"].value_counts().to_dict(),
        "avg_window_minutes": float(segment_lengths.dt.total_seconds().mean() / 60.0),
    },
}
summary_path = PROJECT_ROOT / "artifacts" / "week1" / "dataset_summary.json"
summary_path.parent.mkdir(parents=True, exist_ok=True)
summary_path.write_text(json.dumps(summary, indent=2))

# Last expression of the cell: show the summary.
summary
