# Study 2 – Descriptive Dataset Analysis
This notebook characterises the dataset used in this thesis at patient, recording 
and seizure level. It ensures transparency in the structure, quality and distribution 
of the available ECG + annotation data prior to further SQI/HRV analysis.

In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Make the project's ``src`` directory importable.
# PROJECT_ROOT is taken to be the current working directory, so the notebook
# has to be started from the repository root for SRC_ROOT to point at the code.
PROJECT_ROOT = Path.cwd()
SRC_ROOT = PROJECT_ROOT.joinpath("src")
if not any(entry == str(SRC_ROOT) for entry in sys.path):
    # Only register the directory once, even if this cell is re-run.
    sys.path.append(str(SRC_ROOT))

from hrv_epatch.dataset.loader import iter_recordings
from hrv_epatch.dataset.naming import parse_recording_key
from hrv_epatch.dataset.annotations import load_annotations   # for debugging if needed
from hrv_epatch.dataset.seizures import SeizureEvent, build_seizure_events_from_df
from hrv_epatch.plots.seizure_gantt import plot_recording_seizure_timeline_multiday_clocklabels


# Apply Matplotlib's "ggplot" style sheet to the figures produced in this notebook.
plt.style.use("ggplot")

## 1. Load dataset (TDMS + annotations)
This uses the fully validated TDMS loader + annotation parser from src/hrv_epatch.

In [2]:
# Machine-specific locations of the TDMS recordings and the seizure logs;
# both paths must be adapted when the notebook is run on another machine.
TDMS_ROOT = Path(r"E:\ML algoritme tl anfaldsdetektion vha HRV\ePatch data from Aarhus to Lausanne\Patients ePatch data")
ANN_ROOT  = Path(r"E:\ML algoritme tl anfaldsdetektion vha HRV\ePatch data from Aarhus to Lausanne\Seizure log ePatch patients with seizures - excluded seizures removed")


def _describe_recording(sig, meta, ann):
    """Return the summary dict stored in ``recordings`` for one recording.

    ``sig``, ``meta`` and ``ann`` are one triple as yielded by
    ``iter_recordings``. The duration is the number of samples divided by
    ``meta.fs`` and is treated as seconds (presumably ``meta.fs`` is the
    sampling rate in Hz -- confirm against the loader).
    """
    tdms_path = Path(meta.path)

    start = pd.to_datetime(meta.start_time)
    duration_s = len(sig) / meta.fs
    end = start + pd.to_timedelta(duration_s, unit="s")

    rec_key = parse_recording_key(tdms_path)
    # Seizure events are built from the annotation table and the recording's
    # start/end timestamps.
    seizure_events = build_seizure_events_from_df(ann, start, end)

    return {
        "patient_id": rec_key.patient_id,
        "enrollment_id": rec_key.enrollment_id,
        "recording_id": rec_key.recording_id,
        "recording_start": start,
        "rec_duration_s": duration_s,
        "rec_end": end,
        "seizure_events": seizure_events,
        "tdms_name": tdms_path.name,
        "ann_df": ann,
    }


# One entry per recording; the raw signal itself is not kept in the list.
recordings = []
for sig, meta, ann in iter_recordings(TDMS_ROOT, ANN_ROOT):
    recordings.append(_describe_recording(sig, meta, ann))

# Display the number of loaded recordings as the cell output.
len(recordings)

72

## 2. Patient-level summary
Total hours, number of recordings, number of seizures etc.

In [None]:
# Per-(patient, enrollment) summary: number of recordings, total recorded
# hours, number of seizures and recorded hours per seizure.

# Bucket the recordings by (patient, enrollment) in a single pass instead of
# re-scanning the full list once per patient.
recordings_by_patient = {}
for r in recordings:
    recordings_by_patient.setdefault((r["patient_id"], r["enrollment_id"]), []).append(r)

# enrollment_id can be falsy (it is rendered as "-" below). If it is None for
# one enrollment and a string for another enrollment of the same patient,
# sorting the raw tuples raises TypeError, so a missing enrollment is ordered
# as the empty string.
patients = sorted(recordings_by_patient, key=lambda k: (k[0], k[1] or ""))

summary_columns = [
    "Patient",
    "Enrollment",
    "Recordings",
    "Total hours",
    "Total seizures",
    "Hours per seizure",
]

summary_rows = []
for pid, enr in patients:
    subset = recordings_by_patient[(pid, enr)]
    total_hours = sum(r["rec_duration_s"] for r in subset) / 3600
    num_seiz = sum(len(r["seizure_events"]) for r in subset)

    summary_rows.append({
        "Patient": pid,
        "Enrollment": enr if enr else "-",
        "Recordings": len(subset),
        "Total hours": total_hours,
        "Total seizures": num_seiz,
        # Undefined (NaN) when no seizure was recorded, to avoid dividing by zero.
        "Hours per seizure": total_hours / num_seiz if num_seiz else np.nan,
    })

# Passing the column list keeps the table header even when there are no rows.
df_patient_summary = pd.DataFrame(summary_rows, columns=summary_columns)
df_patient_summary

## 3. Recording-level tables & figures
A. Recording duration distribution

In [None]:
# Length of every recording, converted from seconds to hours.
durations = [rec["rec_duration_s"] / 3600 for rec in recordings]

# Histogram of recording lengths across the whole dataset (20 equal-width bins).
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(durations, bins=20)
ax.set_xlabel("Recording duration (hours)")
ax.set_ylabel("Count")
ax.set_title("Distribution of recording durations")
plt.show()

B. Seizures per recording

In [None]:
# Number of seizure events attached to each recording.
seiz_counts = [len(rec["seizure_events"]) for rec in recordings]

# Integer bin edges 0, 1, ..., max+1 so that every seizure count gets its own bar.
count_edges = range(0, max(seiz_counts) + 2)

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(seiz_counts, bins=count_edges)
ax.set(
    xlabel="Number of seizures per recording",
    ylabel="Count",
    title="Seizures per recording",
)
plt.show()

## 4. Seizure-level characterisation
A. Extract seizure durations

In [None]:
def _event_row(rec, ev):
    """Flatten one seizure event of recording ``rec`` into a table row.

    ``duration`` is ``ev.t1 - ev.t0``. ``absolute_ts`` is the recording start
    plus ``ev.t0`` interpreted as seconds, i.e. ``t0`` is treated as an offset
    from the start of the recording.
    """
    onset_offset = pd.to_timedelta(ev.t0, unit="s")
    return {
        "patient": rec["patient_id"],
        "enrollment": rec["enrollment_id"],
        "recording": rec["recording_id"],
        "duration": ev.t1 - ev.t0,
        "absolute_ts": rec["recording_start"] + onset_offset,
    }


# One row per seizure event, in recording order.
all_events = [
    _event_row(rec, ev)
    for rec in recordings
    for ev in rec["seizure_events"]
]

df_events = pd.DataFrame(all_events)
df_events.head()

B. Seizure duration histogram

In [None]:
# Histogram of the per-event durations stored in df_events (20 equal-width bins).
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(df_events["duration"], bins=20)
ax.set_title("Seizure duration distribution")
ax.set_xlabel("Duration (seconds)")
ax.set_ylabel("Seizures")
plt.show()

C. Time-of-day distribution

In [None]:
# Fractional hour of day of each event timestamp (hour + minute/60; seconds are ignored).
# The column is stored on df_events, so it is also part of any later export of that table.
df_events["hour"] = df_events["absolute_ts"].dt.hour + df_events["absolute_ts"].dt.minute/60

plt.figure(figsize=(10,5))
# Pin the histogram range to the full day. Without `range`, the 24 bins are
# spread between the smallest and largest observed hour, so the bin edges do
# not coincide with the clock-hour ticks and a bar does not represent one hour.
plt.hist(df_events["hour"], bins=24, range=(0, 24))
plt.xlabel("Hour of day")
plt.ylabel("Seizures")
plt.title("Seizure time-of-day distribution")
# Tick every bin edge, including the closing edge at 24.
plt.xticks(range(0, 25))
plt.xlim(0, 24)
plt.show()

D. Inter-seizure intervals (ISI)

In [None]:
# Inter-seizure intervals: time between consecutive event timestamps of the
# same patient, in seconds.
# NOTE(review): events are grouped by patient only, so intervals may span
# different recordings/enrollments of that patient -- confirm this is intended.
isi_list = []

for pid in df_events["patient"].unique():
    timestamps = df_events.loc[df_events["patient"] == pid, "absolute_ts"].sort_values()
    if len(timestamps) < 2:
        # A single event has no neighbour to form an interval with.
        continue
    # diff() leaves a missing value for the first event; drop it.
    gaps_s = timestamps.diff().dt.total_seconds().dropna()
    isi_list.extend(gaps_s)

fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(isi_list, bins=30)
ax.set_xlabel("ISI (seconds)")
ax.set_ylabel("Count")
ax.set_title("Inter-seizure interval distribution")
plt.show()

## 5. Gantt plot of the entire dataset (from earlier)
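**Note:** this section depends on `plot_recording_seizure_timeline_multiday_clocklabels`, imported from `hrv_epatch.plots.seizure_gantt` in the first cell. An earlier note marked the Gantt function as missing — confirm that it is available before running this cell.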

In [None]:
def _gantt_order(rec):
    """Sort key: patient, then enrollment (a missing enrollment sorts as ""), then recording."""
    return rec["patient_id"], rec["enrollment_id"] or "", rec["recording_id"]


# Order the recordings before handing them to the timeline plot.
recordings_sorted = sorted(recordings, key=_gantt_order)

plot_recording_seizure_timeline_multiday_clocklabels(recordings_sorted, max_hours=96, day_grid=True)

## 6. Export figures & tables for LaTeX

In [None]:
# Output folder, relative to the current working directory; created on first use.
OUT = Path("study2_outputs")
OUT.mkdir(exist_ok=True)

# Write each table as CSV without the pandas index column.
for csv_name, table in (
    ("patient_summary.csv", df_patient_summary),
    ("seizure_events.csv", df_events),
):
    table.to_csv(OUT / csv_name, index=False)

print("Export completed.")