# Notebook for setting up the DataFrames used across the project
Update file roots and output dir before running.

In [1]:
from pathlib import Path
import sys
import pandas as pd
from dataclasses import asdict

def find_project_root(start: Path) -> Path:
    """Locate the project root by searching upward from ``start``.

    The project root is taken to be the nearest directory (``start`` itself
    or one of its parents) that contains ``src/hrv_epatch``, i.e. the package
    imported further down in this cell.

    Parameters
    ----------
    start : Path
        Directory to start searching from (normally the kernel's working
        directory).

    Returns
    -------
    Path
        The first matching directory, or ``start`` unchanged when no parent
        contains ``src/hrv_epatch``.
    """
    for candidate in (start, *start.parents):
        if (candidate / "src" / "hrv_epatch").is_dir():
            return candidate
    # Nothing found: fall back to the start directory itself.
    return start


# Searching upward lets the notebook run from a sub-folder (e.g. notebooks/)
# as well as from the repository root.
PROJECT_ROOT = find_project_root(Path.cwd())
SRC_ROOT = PROJECT_ROOT / "src"
# Make the local ``hrv_epatch`` package importable without installing it.
if str(SRC_ROOT) not in sys.path:
    sys.path.append(str(SRC_ROOT))

from hrv_epatch.dataset.loader import iter_recordings
from hrv_epatch.dataset.naming import parse_recording_key, RecordingKey
from hrv_epatch.dataset.seizures import build_seizure_events_from_df


## NOTE - Configure the root directories and paths before running the notebook!

In [2]:
# --- Input locations (machine-specific: edit before running) ---------------
# Root folder passed to iter_recordings as the TDMS recordings location.
TDMS_ROOT = Path(r"E:\ML algoritme tl anfaldsdetektion vha HRV\ePatch data from Aarhus to Lausanne\Patients ePatch data")
# Root folder passed to iter_recordings as the seizure-annotation location.
ANN_ROOT  = Path(r"E:\ML algoritme tl anfaldsdetektion vha HRV\ePatch data from Aarhus to Lausanne\Seizure log ePatch patients with seizures - excluded seizures removed")
# NOTE: despite the name this is a single CSV file (the trim overview), not a directory.
TRIM_ROOT = Path(r"E:\Speciale - Results\Final-LabView-Trim-Testset\Pictures\Trim-Overview.csv")

# --- Output location --------------------------------------------------------
# Alternative: keep the results inside the repository.
# OUT_DIR = PROJECT_ROOT / "_analysis"
OUT_DIR = Path(r"E:\Speciale - Results\Datastruct")
OUT_DIR.mkdir(exist_ok=True, parents=True)

# The tables are written and read back as CSV further down, so the declared
# output paths use the .csv extension to match what actually ends up on disk.
REC_OUT_PATH = OUT_DIR / "recordings_index.csv"
EVT_OUT_PATH = OUT_DIR / "seizure_events.csv"


## Load data

In [3]:
# Load the per-recording trim overview and turn it into a lookup table
# (trim_map) indexed by recording_uid.
trim_raw = pd.read_csv(TRIM_ROOT)

# The CSV is expected to already use the normalized column names below.
# If it still carries the raw headers, rename them first, e.g.:
# trim_raw = trim_raw.rename(columns={
#     "uid": "recording_uid",
#     "Patient_id": "patient_id",
#     "Recording_uid": "recording_uid_str",
#     "Trim_start": "trim_start_s",
#     "Trim_end": "trim_end_s",
# })

# Fail early with a readable message instead of a bare KeyError later on.
required_cols = ["recording_uid", "recording_uid_str", "trim_start_s", "trim_end_s"]
missing_cols = [c for c in required_cols if c not in trim_raw.columns]
if missing_cols:
    raise KeyError(f"{TRIM_ROOT} is missing expected column(s): {missing_cols}")

trim_df = (
    trim_raw
    .assign(
        # Unparseable values become <NA>/NaN rather than raising.
        recording_uid=lambda d: pd.to_numeric(d["recording_uid"], errors="coerce").astype("Int64"),
        trim_start_s=lambda d: pd.to_numeric(d["trim_start_s"], errors="coerce"),
        # A missing/unparseable end trim would otherwise reach the event
        # builder as NaN. It is treated as "no end trim" (0.0), the same value
        # the notebook uses for recordings without any trim entry.
        # NOTE(review): assumes 0.0 means "nothing trimmed" - confirm.
        trim_end_s=lambda d: pd.to_numeric(d["trim_end_s"], errors="coerce").fillna(0.0),
    )
    # Rows without a usable uid or start trim cannot be matched to a recording.
    .dropna(subset=["recording_uid", "trim_start_s"])
)

# One row per recording_uid (first occurrence wins), indexed for fast lookup.
trim_map = (
    trim_df[["recording_uid", "trim_start_s", "trim_end_s"]]
    .drop_duplicates("recording_uid")
    .set_index("recording_uid")
)

In [5]:
def _event_to_row(ev, rid, key, rec_start):
    """Flatten one seizure event into a dict (one row of the events table).

    Parameters
    ----------
    ev : event object returned by build_seizure_events_from_df
        Read attributes: seizure_id, t0, t1, duration_s, absolute_start,
        absolute_end, trim_start_s, trim_end_s, t0_trim, t1_trim and the
        optional t0/t1[_trim] variants for "video" and "clinical".
    rid : int
        recording_uid of the recording the event belongs to.
    key : parsed recording key (patient_id, enrollment_id, recording_id).
    rec_start : pandas.Timestamp
        Start of the recording; used to turn second offsets into timestamps.

    Returns
    -------
    dict
        Always contains the base columns; the video/clinical columns are only
        present when the corresponding ``t0_*`` attribute is not None.
    """
    row = {
        "recording_uid": rid,
        "patient_id": key.patient_id,
        "enrollment_id": key.enrollment_id,
        "recording_id": key.recording_id,
        "seizure_id": ev.seizure_id,

        # RAW (relative to original rec_start)
        "t0": ev.t0,
        "t1": ev.t1,
        "duration_s": ev.duration_s,
        "absolute_start": ev.absolute_start,
        "absolute_end": ev.absolute_end,

        # trim metadata
        "trim_start_s": ev.trim_start_s,
        "trim_end_s": ev.trim_end_s,

        # TRIMMED (relative to trimmed signal start)
        "t0_trim": ev.t0_trim,
        "t1_trim": ev.t1_trim,
    }

    # RAW + TRIM for each optional annotation source; identical handling.
    for kind in ("video", "clinical"):
        t0 = getattr(ev, f"t0_{kind}")
        if t0 is None:
            continue
        t1 = getattr(ev, f"t1_{kind}")
        row[f"t0_{kind}"] = t0
        row[f"t1_{kind}"] = t1
        row[f"absolute_start_{kind}"] = rec_start + pd.to_timedelta(t0, unit="s")
        row[f"absolute_end_{kind}"] = rec_start + pd.to_timedelta(t1, unit="s")
        row[f"t0_{kind}_trim"] = getattr(ev, f"t0_{kind}_trim")
        row[f"t1_{kind}_trim"] = getattr(ev, f"t1_{kind}_trim")

    return row


# Build two tables in one pass over all recordings:
#   df_rec - one row per recording (identity, timing, sampling rate, trim)
#   df_evt - one row per seizure event (raw and trimmed timings)
recording_rows = []
event_rows = []
recordings_without_trim = []

# uid -> human-readable label from the trim overview (first occurrence wins),
# built once instead of scanning trim_df for every recording.
trim_labels = (
    trim_df.drop_duplicates("recording_uid")
    .set_index("recording_uid")["recording_uid_str"]
)

# recording_uid is purely positional: the n-th recording yielded by
# iter_recordings gets uid n. The trim overview must use the same numbering;
# the printed "file <-> label" pairs below are there to verify that by eye.
for rid, (sig, meta, ann) in enumerate(iter_recordings(TDMS_ROOT, ANN_ROOT)):
    rec_start = pd.to_datetime(meta.start_time)
    rec_duration_s = len(sig) / meta.fs
    rec_end = rec_start + pd.to_timedelta(rec_duration_s, unit="s")

    tdms_path = Path(meta.path)
    key = parse_recording_key(tdms_path)

    if rid in trim_map.index:
        print(rid, tdms_path.name, " <-> ", trim_labels.loc[rid])
        trim_row = trim_map.loc[rid]
        trim_start_s = float(trim_row["trim_start_s"])
        trim_end_s = float(trim_row["trim_end_s"])
    else:
        # No trim information: keep the full recording, but remember it so the
        # fallback is reported instead of passing silently.
        trim_start_s = 0.0
        trim_end_s = 0.0
        recordings_without_trim.append((rid, tdms_path.name))

    events = build_seizure_events_from_df(
        seizure_df=ann,
        rec_start=rec_start,
        rec_end=rec_end,
        trim_start_s=trim_start_s,
        trim_end_s=trim_end_s,
    )

    # annotation_source: taken from the first annotation row when available.
    if ann is not None and not ann.empty and "source_file" in ann.columns:
        ann_source = ann["source_file"].iloc[0]
    else:
        ann_source = None

    recording_rows.append({
        "recording_uid": rid,
        "patient_id": key.patient_id,
        "enrollment_id": key.enrollment_id,
        "recording_id": key.recording_id,
        "tdms_path": str(tdms_path),
        "annotation_source": ann_source,
        "recording_start": rec_start,
        "recording_end": rec_end,
        "rec_duration_s": rec_duration_s,
        "fs": meta.fs,
        # (optional) keep trim metadata at recording-level too
        "trim_start_s": trim_start_s,
        "trim_end_s": trim_end_s,
    })

    event_rows.extend(_event_to_row(ev, rid, key, rec_start) for ev in events)

if recordings_without_trim:
    print(
        f"WARNING: no trim entry for {len(recordings_without_trim)} recording(s); "
        "trim_start_s/trim_end_s defaulted to 0.0:",
        recordings_without_trim,
    )

df_rec = pd.DataFrame(recording_rows)
df_evt = pd.DataFrame(event_rows)


0 Patient 1_1.tdms  <->  P01_R01
1 Patient 1_2.tdms  <->  P01_R02
2 Patient 2_1.tdms  <->  P02_R01
3 Patient 3_1.tdms  <->  P03_R01
4 Patient 3_2.tdms  <->  P03_R02
5 Patient 4_1.tdms  <->  P04_R01
6 Patient 5_1.tdms  <->  P05_R01
7 Patient 6_1.tdms  <->  P06_R01
8 Patient 6_2.tdms  <->  P06_R02
9 Patient 7_1.tdms  <->  P07_R01
10 Patient 7_2.tdms  <->  P07_R02
11 Patient 8a_1.tdms  <->  P08a_R01
12 Patient 8a_2.tdms  <->  P08a_R02
13 Patient 8b_1.tdms  <->  P08b_R01
14 Patient 9_1.tdms  <->  P09_R01
15 Patient 9_2.tdms  <->  P09_R02
16 Patient 10_1.tdms  <->  P10_R01
17 Patient 11_1.tdms  <->  P11_R01
18 Patient 11_2.tdms  <->  P11_R02
19 Patient 12_1.tdms  <->  P12_R01
20 Patient 12_2.tdms  <->  P12_R02
21 Patient 13_1.tdms  <->  P13_R01
22 Patient 13_2.tdms  <->  P13_R02
23 Patient 14_1.tdms  <->  P14_R01
24 Patient 14_2.tdms  <->  P14_R02
25 Patient 15_1.tdms  <->  P15_R01
26 Patient 15_2.tdms  <->  P15_R02
27 Patient 16_1.tdms  <->  P16_R01
28 Patient 17_1.tdms  <->  P17_R01
29 Pa

In [6]:
# Disabled legacy post-processing, kept for reference only: it shifted the
# event times by trim_start_s after the tables had been built.
# NOTE(review): appears superseded by the *_trim fields filled in from the
# event objects in the build loop - confirm before deleting.

# # 1) merge trim_start_s in via recording_uid
# df_evt = df_evt.merge(
#     trim_df[["recording_uid", "trim_start_s", "trim_end_s"]],
#     on="recording_uid",
#     how="left",
#     validate="many_to_one"
# )

# # fallback (if a few recordings lack trim info, assume 0)
# df_evt["trim_start_s"] = df_evt["trim_start_s"].fillna(0.0)


# # correct the clinical times (if present)
# for a, b in [("t0_clinical","t1_clinical"), ("t0","t1"), ("t0_video","t1_video")]:
#     if a in df_evt.columns and b in df_evt.columns:
#         df_evt[a] = df_evt[a] - df_evt["trim_start_s"]
#         df_evt[b] = df_evt[b] - df_evt["trim_start_s"]

# # drop events that now lie before the signal start
# # (the code below actually clamps t0_clinical to 0.0 rather than dropping rows)
# if "t0_clinical" in df_evt.columns:
#     df_evt.loc[df_evt["t0_clinical"] < 0, "t0_clinical"] = 0.0

# # update the absolute times if present
# if "absolute_start_clinical" in df_evt.columns and "t0_clinical" in df_evt.columns:
#     # absolute = rec_start + t0_clinical (in s)  (rec_start is not in df_evt at the moment)
#     # simplest: drop them and recompute later if needed, or merge rec_start in:
#     df_evt = df_evt.merge(
#         df_rec[["recording_uid", "recording_start"]],
#         on="recording_uid",
#         how="left"
#     )
#     df_evt["absolute_start_clinical"] = df_evt["recording_start"] + pd.to_timedelta(df_evt["t0_clinical"], unit="s")
#     df_evt["absolute_end_clinical"]   = df_evt["recording_start"] + pd.to_timedelta(df_evt["t1_clinical"], unit="s")
#     df_evt = df_evt.drop(columns=["recording_start"])


# Preview the first rows of both tables (shown as a tuple of two frames).
df_rec.head(), df_evt.head()

(   recording_uid  patient_id enrollment_id  recording_id  \
 0              0           1          None             1   
 1              1           1          None             2   
 2              2           2          None             1   
 3              3           3          None             1   
 4              4           3          None             2   
 
                                            tdms_path annotation_source  \
 0  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 1.xls   
 1  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 1.xls   
 2  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 2.xls   
 3  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 3.xls   
 4  E:\ML algoritme tl anfaldsdetektion vha HRV\eP...     patient 3.xls   
 
       recording_start                 recording_end  rec_duration_s     fs  \
 0 2016-02-22 11:04:14 2016-02-24 16:09:49.750000000   191135.750000  512.0   
 1 2016-02-24 16:15:00

Sanity check to validate that the data was loaded correctly

In [40]:
# Disabled sanity checks (uncomment to run): row counts of both tables
# ("Antal" = "Number of"), the patient ids present in each table
# ("Unikke patienter" = "Unique patients"), and per-patient counts of
# recordings and seizures.
# print("Antal recordings:", len(df_rec))
# print("Antal seizures:", len(df_evt))

# print("\nUnikke patienter i df_rec:", sorted(df_rec["patient_id"].unique()))
# print("Unikke patienter i df_evt:", sorted(df_evt["patient_id"].unique()))

# df_rec.groupby("patient_id")["recording_uid"].count()
# df_evt.groupby("patient_id")["seizure_id"].count()


## Save DataFrame

In [7]:
# Write both tables as CSV into the OUT_DIR chosen in the configuration cell.
# OUT_DIR is deliberately not redefined here, so a path edited in the
# configuration cell is honoured instead of being overridden by a second
# hard-coded copy.
OUT_DIR.mkdir(exist_ok=True, parents=True)

REC_OUT_PATH = OUT_DIR / "recordings_index.csv"
EVT_OUT_PATH = OUT_DIR / "seizure_events.csv"

# index=False: the row index carries no information (recording_uid is a column).
df_rec.to_csv(REC_OUT_PATH, index=False)
df_evt.to_csv(EVT_OUT_PATH, index=False)

print("Saved:")
print("  recordings ->", REC_OUT_PATH)
print("  events     ->", EVT_OUT_PATH)


Saved:
  recordings -> E:\Speciale - Results\Datastruct\recordings_index.csv
  events     -> E:\Speciale - Results\Datastruct\seizure_events.csv


Validation code:

In [42]:
# Disabled ad-hoc validation for patient 31 (uncomment to run): checks the
# seizure count, lists the patient's recordings, and inspects which
# annotation file is resolved for enrollment "a".

# # how many seizures do we now have for patient 31?
# df_evt[df_evt["patient_id"] == 31].shape

# # look at the first few
# df_evt[df_evt["patient_id"] == 31][["seizure_id", "duration_s", "absolute_start"]].head()

# from pathlib import Path
# import pandas as pd
# from hrv_epatch.dataset.annotations import load_annotations

# # All recordings for patient 31
# df_rec_31 = df_rec[df_rec["patient_id"] == 31]
# print(df_rec_31[["recording_uid", "enrollment_id", "tdms_path", "recording_start", "recording_end"]])

# rec31a = df_rec_31[df_rec_31["enrollment_id"] == "a"].iloc[0]
# print(rec31a)

# from hrv_epatch.dataset.annotations import find_annotation_file
# from hrv_epatch.dataset.naming import RecordingKey

# # reconstruct the key for 31a
# tdms_path_31a = Path(rec31a["tdms_path"])
# key31a = RecordingKey(patient_id=31, enrollment_id="a", recording_id=rec31a["recording_id"])

# print("Candidates for 31a:", [p.name for p in ANN_ROOT.glob("Patient 31*")])
# print("find_annotation_file says:", find_annotation_file(key31a, ANN_ROOT))


In [8]:
# Reload both tables from the CSV files to verify the round trip.
# CSV stores timestamps as text, so every timestamp column has to be named in
# parse_dates to come back as datetime64.
REC_DATETIME_COLS = ["recording_start", "recording_end"]
EVT_DATETIME_COLS = [
    "absolute_start",
    "absolute_end",
    "absolute_start_video",
    "absolute_end_video",
    "absolute_start_clinical",
    "absolute_end_clinical",
]

rec_csv_path = OUT_DIR / "recordings_index.csv"
evt_csv_path = OUT_DIR / "seizure_events.csv"

df_rec = pd.read_csv(rec_csv_path, parse_dates=REC_DATETIME_COLS)

# The video/clinical columns only exist when at least one event carried them,
# and read_csv rejects parse_dates names that are absent - so read the header
# first (nrows=0) and parse only the timestamp columns actually present.
evt_header = pd.read_csv(evt_csv_path, nrows=0).columns
df_evt = pd.read_csv(
    evt_csv_path,
    parse_dates=[col for col in EVT_DATETIME_COLS if col in evt_header],
)


In [22]:
# Disabled checks on the reloaded tables (uncomment to run): recordings and
# seizure counts for patient 31, raw vs. trimmed onset times, and the number
# of event rows containing at least one NaN.
# df_rec_31 = df_rec[df_rec["patient_id"] == 31]
# print(df_rec_31[["recording_uid", "enrollment_id", "tdms_path",
#                  "recording_start", "recording_end", "annotation_source"]])

# df_evt_31 = df_evt[df_evt["patient_id"] == 31]
# print("Seizures for patient 31:", len(df_evt_31))
# print(df_evt_31.groupby("enrollment_id")["seizure_id"].count())

# print(df_evt_31[["t0", "t0_trim", "trim_start_s"]])
# cols_to_check = [c for c in df_evt.columns if c != "enrollment_id"]
# n_nan_rows = df_evt[cols_to_check].isna().any(axis=1).sum()
# print("Rows with at least one NaN (excluding enrollment_id):", n_nan_rows, "Out off", len(df_evt))

# print(df_evt)
# print("Rows with at least one NaN:", n_nan_rows, "Out off", len(df_evt))

# print(df_evt)