# Notebook for setting up the DataFrames used across the project
Update file roots and output dir before running.

In [4]:
from pathlib import Path
import sys
import pandas as pd
from dataclasses import asdict

# Treat the current working directory as the project root and make its
# ``src`` folder importable, adding it to sys.path only if it is not there yet.
PROJECT_ROOT = Path.cwd()
SRC_ROOT = PROJECT_ROOT.joinpath("src")
_src_entry = str(SRC_ROOT)
if _src_entry not in sys.path:
    sys.path.append(_src_entry)

from hrv_epatch.dataset.loader import iter_recordings
from hrv_epatch.dataset.naming import parse_recording_key
from hrv_epatch.dataset.seizures import build_seizure_events_from_df


## NOTE - Configure roots and paths before running the notebook!

In [5]:
# Input locations (machine-specific; edit before running):
#   TDMS_ROOT - folder holding the ePatch recordings
#   ANN_ROOT  - folder holding the seizure logs
TDMS_ROOT = Path(
    r"E:\ML algoritme tl anfaldsdetektion vha HRV\ePatch data from Aarhus to Lausanne\Patients ePatch data"
)
ANN_ROOT = Path(
    r"E:\ML algoritme tl anfaldsdetektion vha HRV\ePatch data from Aarhus to Lausanne\Seizure log ePatch patients with seizures - excluded seizures removed"
)

# Output directory for the generated tables; created (including parents) if missing.
# Project-local alternative: OUT_DIR = PROJECT_ROOT / "_analysis"
OUT_DIR = Path(r"E:\Speciale - Results\Datastruct")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Parquet files written by the save step at the end of the notebook.
REC_OUT_PATH = OUT_DIR.joinpath("recordings_index.parquet")
EVT_OUT_PATH = OUT_DIR.joinpath("seizure_events.parquet")


## Load data

In [9]:
# Build two index tables from every recording found under TDMS_ROOT:
#   df_rec - one row per recording (identity, file path, time span, sampling rate)
#   df_evt - one row per seizure event, linked to its recording via recording_uid

# Explicit column order for both tables. Passing these to the DataFrame
# constructor keeps the schema intact even when no recordings or no seizure
# events are found; a bare pd.DataFrame([]) would have no columns at all.
RECORDING_COLUMNS = [
    "recording_uid",
    "patient_id",
    "enrollment_id",
    "recording_id",
    "tdms_path",
    "annotation_source",
    "recording_start",
    "recording_end",
    "rec_duration_s",
    "fs",
]
EVENT_COLUMNS = [
    "recording_uid",
    "patient_id",
    "enrollment_id",
    "recording_id",
    "seizure_id",
    "t0",
    "t1",
    "duration_s",
    "absolute_start",
    "absolute_end",
]

recording_rows = []
event_rows = []

for sig, meta, ann in iter_recordings(TDMS_ROOT, ANN_ROOT):
    # Recording time span: duration is the sample count divided by meta.fs,
    # and the end timestamp is the start plus that duration in seconds.
    rec_start = pd.to_datetime(meta.start_time)
    rec_duration_s = len(sig) / meta.fs
    rec_end = rec_start + pd.to_timedelta(rec_duration_s, unit="s")

    tdms_path = Path(meta.path)
    key = parse_recording_key(tdms_path)

    # NOTE(review): `ann` is passed on before the None check below - confirm
    # that build_seizure_events_from_df accepts ann=None.
    events = build_seizure_events_from_df(ann, rec_start, rec_end)

    # Running index of the recording; used as the join key between both tables.
    rec_id = len(recording_rows)

    # Annotation source, guarded against a missing/empty annotation frame
    # or one without a "source_file" column.
    if ann is not None and not ann.empty and "source_file" in ann.columns:
        ann_source = ann["source_file"].iloc[0]
    else:
        ann_source = None

    recording_rows.append({
        "recording_uid": rec_id,
        "patient_id": key.patient_id,
        "enrollment_id": key.enrollment_id,
        "recording_id": key.recording_id,
        "tdms_path": str(tdms_path),
        "annotation_source": ann_source,
        "recording_start": rec_start,
        "recording_end": rec_end,
        "rec_duration_s": rec_duration_s,
        "fs": meta.fs,
    })

    for ev in events:
        # ev.t0 / ev.t1 are used as second offsets from the recording start;
        # the absolute timestamps are derived from them.
        event_rows.append({
            "recording_uid": rec_id,
            "patient_id": key.patient_id,
            "enrollment_id": key.enrollment_id,
            "recording_id": key.recording_id,
            "seizure_id": ev.seizure_id,
            "t0": ev.t0,
            "t1": ev.t1,
            "duration_s": ev.t1 - ev.t0,
            "absolute_start": rec_start + pd.to_timedelta(ev.t0, unit="s"),
            "absolute_end": rec_start + pd.to_timedelta(ev.t1, unit="s"),
        })

# The DataFrames must be built after the for-loop, once all rows are collected.
df_rec = pd.DataFrame(recording_rows, columns=RECORDING_COLUMNS)
df_evt = pd.DataFrame(event_rows, columns=EVENT_COLUMNS)

df_rec.head(), df_evt.head()


(   recording_uid  patient_id enrollment_id  recording_id  \
 0              0           1          None             1   
 1              1           1          None             2   
 2              2           2          None             1   
 3              3           3          None             1   
 4              4           3          None             2   
 
                                            tdms_path annotation_source  \
 0  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 1.xls   
 1  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 1.xls   
 2  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 2.xls   
 3  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 3.xls   
 4  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 3.xls   
 
       recording_start                 recording_end  rec_duration_s     fs  
 0 2016-02-22 11:04:14 2016-02-24 16:09:49.750000000   191135.750000  512.0  
 1 2016-02-24 16:15:00 2

Sanity check to validate that the data was loaded correctly

In [12]:
# Optional sanity checks (currently disabled). Uncomment to print the number of
# recordings and seizure events, the unique patient IDs present in each table,
# and the per-patient recording / seizure counts.
# (Danish labels: "Antal" = "number of", "Unikke patienter" = "unique patients".)
# print("Antal recordings:", len(df_rec))
# print("Antal seizures:", len(df_evt))

# print("\nUnikke patienter i df_rec:", sorted(df_rec["patient_id"].unique()))
# print("Unikke patienter i df_evt:", sorted(df_evt["patient_id"].unique()))

# df_rec.groupby("patient_id")["recording_uid"].count()
# df_evt.groupby("patient_id")["seizure_id"].count()


## Save DataFrame

In [16]:
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path

# Persist both index tables as parquet files.
# Step 1: convert the pandas frames into Arrow tables.
table_rec = pa.Table.from_pandas(df_rec)
table_evt = pa.Table.from_pandas(df_evt)

# Step 2: write each Arrow table to its configured output path.
for table, destination in ((table_rec, REC_OUT_PATH), (table_evt, EVT_OUT_PATH)):
    pq.write_table(table, destination)

# Report where the files were written.
print("Saved:")
for label, destination in (
    ("  recordings ->", REC_OUT_PATH),
    ("  events     ->", EVT_OUT_PATH),
):
    print(label, destination)

Saved:
  recordings -> E:\Speciale - Results\Datastruct\recordings_index.parquet
  events     -> E:\Speciale - Results\Datastruct\seizure_events.parquet
