# Notebook for setting up the DataFrames used across the project
Update the data root paths and the output directory before running.

In [1]:
from pathlib import Path
import sys
import pandas as pd
from dataclasses import asdict

# The notebook's working directory is treated as the project root; its "src"
# folder is put on sys.path (once) so the project-local imports below resolve.
PROJECT_ROOT = Path.cwd()
SRC_ROOT = PROJECT_ROOT.joinpath("src")
if sys.path.count(str(SRC_ROOT)) == 0:
    sys.path.append(str(SRC_ROOT))

from hrv_epatch.dataset.loader import iter_recordings
from hrv_epatch.dataset.naming import parse_recording_key
from hrv_epatch.dataset.seizures import build_seizure_events_from_df


## NOTE - Configure the root directories and paths before running the notebook!

In [2]:
# --- Input locations (machine specific; edit before running) ---
# Both roots are handed to iter_recordings() in the loading cell.
TDMS_ROOT = Path(
    r"E:\ML algoritme tl anfaldsdetektion vha HRV\ePatch data from Aarhus to Lausanne\Patients ePatch data"
)
ANN_ROOT = Path(
    r"E:\ML algoritme tl anfaldsdetektion vha HRV\ePatch data from Aarhus to Lausanne\Seizure log ePatch patients with seizures - excluded seizures removed"
)

# --- Output location ---
# Alternative kept for reference: OUT_DIR = PROJECT_ROOT / "_analysis"
OUT_DIR = Path(r"E:\Speciale - Results\Datastruct")
# Create the output directory (and any missing parents); no error if it exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Target files inside the output directory.
REC_OUT_PATH = OUT_DIR.joinpath("recordings_index.parquet")
EVT_OUT_PATH = OUT_DIR.joinpath("seizure_events.parquet")


## Load data

In [3]:
# Column layouts are declared up front so both frames keep a stable schema:
# without them, the optional video/clinical columns only exist when at least one
# event carries those timings, and an empty result has no columns at all (which
# breaks every later `df_evt["..."]` lookup).
RECORDING_COLUMNS = [
    "recording_uid", "patient_id", "enrollment_id", "recording_id",
    "tdms_path", "annotation_source",
    "recording_start", "recording_end", "rec_duration_s", "fs",
]
EVENT_COLUMNS = [
    "recording_uid", "patient_id", "enrollment_id", "recording_id",
    "seizure_id", "t0", "t1", "duration_s", "absolute_start", "absolute_end",
    "t0_video", "t1_video", "absolute_start_video", "absolute_end_video",
    "t0_clinical", "t1_clinical", "absolute_start_clinical", "absolute_end_clinical",
]

recording_rows = []
event_rows = []

# One pass over all recordings: one index row per recording, one row per seizure.
# rec_id is the running position of the recording and links events to recordings.
for rec_id, (sig, meta, ann) in enumerate(iter_recordings(TDMS_ROOT, ANN_ROOT)):
    # Recording time span, derived from the sample count and the sampling rate.
    rec_start = pd.to_datetime(meta.start_time)
    rec_duration_s = len(sig) / meta.fs
    rec_end = rec_start + pd.to_timedelta(rec_duration_s, unit="s")

    tdms_path = Path(meta.path)
    key = parse_recording_key(tdms_path)

    events = build_seizure_events_from_df(ann, rec_start, rec_end)

    # The annotation source is only available when a non-empty annotation table
    # with a "source_file" column was loaded for this recording.
    has_source = ann is not None and not ann.empty and "source_file" in ann.columns
    ann_source = ann["source_file"].iloc[0] if has_source else None

    # Identity fields shared by the recording row and all of its event rows.
    rec_ids = {
        "recording_uid": rec_id,
        "patient_id": key.patient_id,
        "enrollment_id": key.enrollment_id,
        "recording_id": key.recording_id,
    }

    recording_rows.append({
        **rec_ids,
        "tdms_path": str(tdms_path),
        "annotation_source": ann_source,
        "recording_start": rec_start,
        "recording_end": rec_end,
        "rec_duration_s": rec_duration_s,
        "fs": meta.fs,
    })

    for ev in events:
        # t0/t1 are used as second offsets from rec_start to get wall-clock times.
        row = {
            **rec_ids,
            "seizure_id": ev.seizure_id,
            "t0": ev.t0,
            "t1": ev.t1,
            "duration_s": ev.t1 - ev.t0,
            "absolute_start": rec_start + pd.to_timedelta(ev.t0, unit="s"),
            "absolute_end": rec_start + pd.to_timedelta(ev.t1, unit="s"),
        }

        # Optional alternative timings: filled only when the event provides a
        # start value; otherwise the columns stay missing for this row.
        for kind in ("video", "clinical"):
            t0_alt = getattr(ev, f"t0_{kind}")
            if t0_alt is None:
                continue
            t1_alt = getattr(ev, f"t1_{kind}")
            row[f"t0_{kind}"] = t0_alt
            row[f"t1_{kind}"] = t1_alt
            row[f"absolute_start_{kind}"] = rec_start + pd.to_timedelta(t0_alt, unit="s")
            row[f"absolute_end_{kind}"] = rec_start + pd.to_timedelta(t1_alt, unit="s")

        event_rows.append(row)

# Build each frame once from the collected records, with the fixed column layout.
df_rec = pd.DataFrame(recording_rows, columns=RECORDING_COLUMNS)
df_evt = pd.DataFrame(event_rows, columns=EVENT_COLUMNS)

df_rec.head(), df_evt.head()

(   recording_uid  patient_id enrollment_id  recording_id  \
 0              0           1          None             1   
 1              1           1          None             2   
 2              2           2          None             1   
 3              3           3          None             1   
 4              4           3          None             2   
 
                                            tdms_path annotation_source  \
 0  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 1.xls   
 1  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 1.xls   
 2  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 2.xls   
 3  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 3.xls   
 4  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 3.xls   
 
       recording_start                 recording_end  rec_duration_s     fs  
 0 2016-02-22 11:04:14 2016-02-24 16:09:49.750000000   191135.750000  512.0  
 1 2016-02-24 16:15:00 2

Sanity check to validate that the data was loaded correctly

In [5]:
# Optional sanity checks, currently disabled. Uncomment to print the number of
# recordings and seizure events, list the unique patient ids found in each
# frame, and count recordings / seizures per patient.
# (The Danish labels "Antal" and "Unikke" mean "Number of" and "Unique".)
# print("Antal recordings:", len(df_rec))
# print("Antal seizures:", len(df_evt))

# print("\nUnikke patienter i df_rec:", sorted(df_rec["patient_id"].unique()))
# print("Unikke patienter i df_evt:", sorted(df_evt["patient_id"].unique()))

# df_rec.groupby("patient_id")["recording_uid"].count()
# df_evt.groupby("patient_id")["seizure_id"].count()


## Save DataFrame

In [6]:
# Export both frames as CSV into the output directory chosen in the
# configuration cell. OUT_DIR (and PROJECT_ROOT / the Path import) are
# deliberately NOT redefined here: a second hard-coded OUT_DIR would silently
# override whatever the user configured at the top of the notebook.
OUT_DIR.mkdir(exist_ok=True, parents=True)

# From here on the *_OUT_PATH names point at the CSV files actually written
# (they replace the parquet paths assigned in the configuration cell).
REC_OUT_PATH = OUT_DIR / "recordings_index.csv"
EVT_OUT_PATH = OUT_DIR / "seizure_events.csv"

# index=False: the positional row index is not written to the files.
df_rec.to_csv(REC_OUT_PATH, index=False)
df_evt.to_csv(EVT_OUT_PATH, index=False)

print("Saved:")
print("  recordings ->", REC_OUT_PATH)
print("  events     ->", EVT_OUT_PATH)


Saved:
  recordings -> E:\Speciale - Results\Datastruct\recordings_index.csv
  events     -> E:\Speciale - Results\Datastruct\seizure_events.csv


Validation code:

In [6]:
# Disabled debugging snippets that were used to inspect the recordings,
# seizure counts and annotation-file lookup for patient 31 (enrollment "a").
# Uncomment individual parts to re-run them.
# # how many seizures do we now have for patient 31?
# df_evt[df_evt["patient_id"] == 31].shape

# # look at the first few
# df_evt[df_evt["patient_id"] == 31][["seizure_id", "duration_s", "absolute_start"]].head()

# from pathlib import Path
# import pandas as pd
# from hrv_epatch.dataset.annotations import load_annotations

# # All recordings for patient 31
# df_rec_31 = df_rec[df_rec["patient_id"] == 31]
# print(df_rec_31[["recording_uid", "enrollment_id", "tdms_path", "recording_start", "recording_end"]])

# rec31a = df_rec_31[df_rec_31["enrollment_id"] == "a"].iloc[0]
# print(rec31a)

# from hrv_epatch.dataset.annotations import find_annotation_file
# from hrv_epatch.dataset.naming import RecordingKey

# # reconstruct the key for 31a
# tdms_path_31a = Path(rec31a["tdms_path"])
# key31a = RecordingKey(patient_id=31, enrollment_id="a", recording_id=rec31a["recording_id"])

# print("Candidates for 31a:", [p.name for p in ANN_ROOT.glob("Patient 31*")])
# print("find_annotation_file says:", find_annotation_file(key31a, ANN_ROOT))


In [7]:
# Reload the exported CSV files so the checks below run on what is actually on
# disk. CSV stores no dtypes, so timestamp columns must be parsed explicitly.
rec_csv_path = OUT_DIR / "recordings_index.csv"
evt_csv_path = OUT_DIR / "seizure_events.csv"

df_rec = pd.read_csv(rec_csv_path,
                     parse_dates=["recording_start", "recording_end"])

# Parse every absolute_* timestamp column present in the file, so the optional
# absolute_*_video / absolute_*_clinical columns do not come back as plain
# strings. Only the header row is read (nrows=0) to discover the column names.
evt_date_cols = [
    col for col in pd.read_csv(evt_csv_path, nrows=0).columns
    if col.startswith("absolute_")
]
df_evt = pd.read_csv(evt_csv_path, parse_dates=evt_date_cols)


In [8]:
# Spot check for patient 31: show its recordings with their annotation source,
# then report the seizure count in total and per enrollment.
df_rec_31 = df_rec.loc[df_rec["patient_id"].eq(31)]
print(df_rec_31[[
    "recording_uid",
    "enrollment_id",
    "tdms_path",
    "recording_start",
    "recording_end",
    "annotation_source",
]])

df_evt_31 = df_evt.loc[df_evt["patient_id"].eq(31)]
print("Seizures for patient 31:", len(df_evt_31))
seizures_per_enrollment = df_evt_31.groupby("enrollment_id")["seizure_id"].count()
print(seizures_per_enrollment)


    recording_uid enrollment_id  \
51             51             a   
52             52             b   
53             53             b   

                                            tdms_path     recording_start  \
51  E:\ML algoritme tl anfaldsdetektion vha HRV\eP... 2016-06-07 12:35:25   
52  E:\ML algoritme tl anfaldsdetektion vha HRV\eP... 2018-09-10 11:54:20   
53  E:\ML algoritme tl anfaldsdetektion vha HRV\eP... 2018-09-12 10:58:06   

             recording_end annotation_source  
51 2016-06-09 09:33:35.250  patient 31a.xlsx  
52 2018-09-12 10:06:24.250   patient 31b.xls  
53 2018-09-13 09:24:15.500   patient 31b.xls  
Seizures for patient 31: 21
enrollment_id
a    19
b     2
Name: seizure_id, dtype: int64
