In [1]:
# %% [markdown]
# Study 3 – SQI vs seizures (clean notebook)

# %%
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import neurokit2 as nk

from src.hrv_epatch.io.tdms import load_tdms_from_path


In [2]:
# %%
DATASTRUCT_ROOT = Path(r"E:\Speciale - Results\Datastruct")
MAIN_DATA_ROOT  = Path(r"E:\Speciale - Results")
STUDY2_ROOT     = Path(MAIN_DATA_ROOT / "study2")

RECORDINGS_INDEX_CSV = DATASTRUCT_ROOT / "recordings_index.csv"
SEIZURE_EVENTS_CSV   = DATASTRUCT_ROOT / "seizure_events.csv"
WINDOW_QUALITY_CSV   = STUDY2_ROOT / "window_quality_baseline_vs_seizure.csv"

# hvor du evt. vil gemme mellem- og slutresultater til Study3
STUDY3_OUT = Path(MAIN_DATA_ROOT  / "study3")
STUDY3_OUT.mkdir(parents=True, exist_ok=True)

# Hvilke patienter vil du starte med?
PATIENT_SUBSET = None#, 1, 2, 3, 4, 5, 29]  # eller None for alle
WIN_S = 10.0  # vindueslængde i sekunder

In [3]:
# %%
df_rec = pd.read_csv(RECORDINGS_INDEX_CSV)
df_seiz = pd.read_csv(SEIZURE_EVENTS_CSV)
df_win_study2 = pd.read_csv(WINDOW_QUALITY_CSV)

print(df_rec.columns)
print(df_seiz.columns)
print(df_win_study2.columns[:15])
print(df_win_study2.shape)


Index(['recording_uid', 'patient_id', 'enrollment_id', 'recording_id',
       'tdms_path', 'annotation_source', 'recording_start', 'recording_end',
       'rec_duration_s', 'fs', 'trim_start_s', 'trim_end_s'],
      dtype='object')
Index(['recording_uid', 'patient_id', 'enrollment_id', 'recording_id',
       'seizure_id', 't0', 't1', 'duration_s', 'absolute_start',
       'absolute_end', 'trim_start_s', 'trim_end_s', 't0_trim', 't1_trim',
       't0_video', 't1_video', 'absolute_start_video', 'absolute_end_video',
       't0_video_trim', 't1_video_trim', 't0_clinical', 't1_clinical',
       'absolute_start_clinical', 'absolute_end_clinical', 't0_clinical_trim',
       't1_clinical_trim'],
      dtype='object')
Index(['recording_uid', 'patient_id', 'recording_id', 'window_idx',
       'win_start_s', 'win_end_s', 'context', 'std', 'range', 'diff_abs_med',
       'is_flatline', 'is_noiseburst', 'is_clipping'],
      dtype='object')
(1142228, 13)


In [4]:
# %%
if PATIENT_SUBSET is not None:
    df_rec = df_rec[df_rec["patient_id"].isin(PATIENT_SUBSET)].copy()
    df_win_study2 = df_win_study2[df_win_study2["patient_id"].isin(PATIENT_SUBSET)].copy()

df_rec.shape, df_win_study2.shape


((72, 12), (1142228, 13))

In [5]:
# %%
def compute_neurokit_sqi_per_window(
    sig: np.ndarray,
    fs: float,
    win_s: float = 10.0,
    methods=("averageQRS", "zhao2018"),
) -> pd.DataFrame:
    """
    Per-vindue beregning af NeuroKit-SQI.

    Returnerer én række pr. vindue med:
      - window_idx, win_start_s, win_end_s
      - nk_averageQRS (float)
      - nk_zhao2018   (kategori-streng)
    """
    sig = np.asarray(sig, float).ravel()
    win_n = int(win_s * fs)
    n_win = len(sig) // win_n

    rows = []
    for w in range(n_win):
        s = w * win_n
        e = s + win_n
        seg = sig[s:e]

        seg_clean = nk.ecg_clean(seg, sampling_rate=fs, method="neurokit")

        row = {
            "window_idx": w,
            "win_start_s": s / fs,
            "win_end_s": e / fs,
        }

        for m in methods:
            try:
                q = nk.ecg_quality(seg_clean, sampling_rate=fs, method=m)
            except Exception as err:
                # Hvis noget går galt (fx zhao2018 på meget mærkelige vinduer),
                # sæt bare NaN og fortsæt.
                # Du kan fjerne print for mindre støj:
                # print(f"[SQI] Fejl i vindue {w}, method='{m}': {err}")
                row[f"nk_{m}"] = np.nan
                continue

            # averageQRS → array [0..1]; zhao2018 → streng
            if isinstance(q, (list, np.ndarray, pd.Series)):
                row[f"nk_{m}"] = float(np.mean(q))
            else:
                row[f"nk_{m}"] = q

        rows.append(row)

    return pd.DataFrame(rows)


def compute_neurokit_avgqrs_per_window_full(
    sig: np.ndarray,
    fs: float,
    win_s: float = 10.0,
) -> pd.DataFrame:
    """
    Beregner NeuroKit averageQRS for HELE signalet én gang,
    og aggregerer derefter til 10 s vinduer.

    Returnerer:
      window_idx, win_start_s, win_end_s, nk_averageQRS
    """
    sig = np.asarray(sig, float).ravel()

    # 1) Clean hele signalet én gang
    sig_clean = nk.ecg_clean(sig, sampling_rate=fs, method="neurokit")

    # 2) Quality for hele signalet (sample-wise eller beat-wise)
    q = nk.ecg_quality(sig_clean, sampling_rate=fs, method="averageQRS")

    q = np.asarray(q, float).ravel()

    # 3) Klip til helt antal vinduer
    n_samples = len(sig_clean)
    win_n = int(win_s * fs)
    n_win = n_samples // win_n
    n_use = n_win * win_n

    sig_clean = sig_clean[:n_use]
    q = q[:n_use]

    # 4) Lav vindues-inds, og aggreger
    idx = np.arange(n_use)
    win_idx = idx // win_n  # 0,0,...,1,1,... etc

    df_tmp = pd.DataFrame(
        {
            "window_idx": win_idx,
            "nk_averageQRS": q,
        }
    )

    df_agg = df_tmp.groupby("window_idx", as_index=False)["nk_averageQRS"].mean()

    df_agg["win_start_s"] = df_agg["window_idx"] * win_s
    df_agg["win_end_s"] = (df_agg["window_idx"] + 1) * win_s

    return df_agg[["window_idx", "win_start_s", "win_end_s", "nk_averageQRS"]]


In [None]:
# # %% OLD TEST VERSION - Using Zhao2018 method includede, which is slow!
# all_records_sqi = []

# for _, rec in df_rec.iterrows():
#     pid = int(rec["patient_id"])
#     rid = int(rec["recording_id"])
#     tdms_path = rec["tdms_path"]
#     fs = rec["fs"]

#     print(f"== NeuroKit SQI for patient {pid}, recording {rid} ==")

#     sig, meta = load_tdms_from_path(tdms_path)
#     fs_tdms = meta.fs
#     if not np.isclose(fs_tdms, fs):
#         print(f"  [Warning] fs mismatch: meta {fs_tdms}, index {fs}")
#         fs = fs_tdms

#     df_sqi = compute_neurokit_sqi_per_window(sig, fs, win_s=WIN_S)

#     # tilføj id'er så vi kan merge
#     df_sqi["patient_id"] = pid
#     df_sqi["recording_id"] = rid

#     all_records_sqi.append(df_sqi)

# df_nk_sqi_all = pd.concat(all_records_sqi, ignore_index=True)
# df_nk_sqi_all.head()

# # %% -- OLD Pipeline with Zhao2018 (slower) --
# # vi antager at df_win_study2 har window_idx + win_start_s/win_end_s
# merge_cols = ["patient_id", "recording_id", "window_idx", "win_start_s", "win_end_s"]

# df_win_all = df_win_study2.merge(
#     df_nk_sqi_all,
#     on=merge_cols,
#     how="left",
# )

# df_win_all.shape, df_win_all.columns[:20]

# # %%
# if "ours_bad" not in df_win_all.columns:
#     df_win_all["ours_bad"] = (
#         df_win_all.get("is_flatline", False)
#         | df_win_all.get("is_noiseburst", False)
#         | df_win_all.get("is_clipping", False)
#     )

# # NeuroKit "bad" definitioner
# df_win_all["nk_bad_avgqrs"] = df_win_all["nk_averageQRS"] < 0.5  # kan tunes

# good_cats = ["Excellent", "Good"]
# df_win_all["nk_bad_zhao"] = ~df_win_all["nk_zhao2018"].isin(good_cats)


In [18]:
# %%
def attach_seizure_flag_to_windows(df_win_all, df_rec, df_seiz):
    df_meta = df_rec[["patient_id", "recording_id", "recording_start"]].copy()
    df_meta["recording_start"] = pd.to_datetime(df_meta["recording_start"])
    df_meta = df_meta.rename(columns={"recording_start": "absolute_start"})

    df_win = df_win_all.merge(
        df_meta,
        on=["patient_id", "recording_id"],
        how="left",
    )

    df_win["win_abs_start"] = df_win["absolute_start"] + pd.to_timedelta(
        df_win["win_start_s"], unit="s"
    )
    df_win["win_abs_end"] = df_win["absolute_start"] + pd.to_timedelta(
        df_win["win_end_s"], unit="s"
    )

    def flag_group(g):
        pid, rid = g.name
        seiz = df_seiz[
            (df_seiz["patient_id"] == pid)
            & (df_seiz["recording_id"] == rid)
        ]
        if seiz.empty:
            g["is_seizure_window"] = False
            return g

        seiz_start = pd.to_datetime(seiz["absolute_start"])
        seiz_end   = pd.to_datetime(seiz["absolute_end"])

        intervals = pd.IntervalIndex.from_arrays(
            seiz_start, seiz_end, closed="both"
        )

        def overlaps_any(row):
            win_int = pd.Interval(row["win_abs_start"], row["win_abs_end"], closed="both")
            return intervals.overlaps(win_int).any()

        g["is_seizure_window"] = g.apply(overlaps_any, axis=1)
        return g

    df_win = (
        df_win
        .groupby(["patient_id", "recording_id"], group_keys=False)
        .apply(flag_group)
    )

    return df_win

# %%
def frac_bad_by_seizure(df, flag_col):
    out = (
        df.groupby("is_seizure_window")[flag_col]
        .mean()
        .rename({False: "Non-seizure", True: "Seizure"})
    )
    return out


In [None]:


# df_win_all = attach_seizure_flag_to_windows(df_win_all, df_rec, df_seiz)

# df_win_all["is_seizure_window"].value_counts()


In [19]:

# print("Vores værktøj (ours_bad):")
# print(frac_bad_by_seizure(df_win_all, "ours_bad"))

# print("\nNeuroKit avgQRS (nk_bad_avgqrs):")
# print(frac_bad_by_seizure(df_win_all, "nk_bad_avgqrs"))

# print("\nNeuroKit Zhao2018 (nk_bad_zhao):")
# print(frac_bad_by_seizure(df_win_all, "nk_bad_zhao"))


In [20]:
# # %%
# df_seiz_win = df_win_all[df_win_all["is_seizure_window"]]
# df_nonseiz_win = df_win_all[~df_win_all["is_seizure_window"]]

# print("Ours vs NK avgQRS – seizure vinduer:")
# print(pd.crosstab(df_seiz_win["ours_bad"], df_seiz_win["nk_bad_avgqrs"]))

# print("\nOurs vs NK Zhao2018 – seizure vinduer:")
# print(pd.crosstab(df_seiz_win["ours_bad"], df_seiz_win["nk_bad_zhao"]))

# print("\nOurs vs NK avgQRS – non-seizure vinduer:")
# print(pd.crosstab(df_nonseiz_win["ours_bad"], df_nonseiz_win["nk_bad_avgqrs"]))


In [21]:
# # %%
# # fx én samlet summary-tabel
# summary = pd.DataFrame({
#     "ours_seizure_bad_frac":    frac_bad_by_seizure(df_win_all, "ours_bad"),
#     "nk_avgqrs_seizure_bad_frac": frac_bad_by_seizure(df_win_all, "nk_bad_avgqrs"),
#     "nk_zhao_seizure_bad_frac":   frac_bad_by_seizure(df_win_all, "nk_bad_zhao"),
# })

# summary.to_csv(STUDY3_OUT / "study3_sqi_seizure_summary.csv")

# # evt. også konfusionsmatricer som separate CSV'er
# pd.crosstab(df_seiz_win["ours_bad"], df_seiz_win["nk_bad_zhao"]).to_csv(
#     STUDY3_OUT / "study3_confmat_seiz_ours_vs_zhao2018.csv"
# )


In [22]:
# # Save df_win_all with seizure flag
# df_win_all.to_csv(STUDY3_OUT / "study3_window_quality_with_seizure_flag.csv", index=False)

# # Save other relevant dataframes
# df_nk_sqi_all.to_csv(STUDY3_OUT / "study3_neurokit_sqi_per_window.csv", index=False)
# df_seiz.to_csv(STUDY3_OUT / "study3_seizure_events.csv", index=False)


In [None]:
# --- FAST NeuroKit avgQRS for all recordings ---
# New faster approach without Zhao2018 for faster computation
all_records_sqi = []

for _, rec in df_rec.iterrows():
    pid = int(rec["patient_id"])
    rid = int(rec["recording_id"])
    tdms_path = rec["tdms_path"]
    fs = rec["fs"]

    print(f"== NeuroKit SQI for patient {pid}, recording {rid} ==")

    sig, meta = load_tdms_from_path(tdms_path)
    fs_tdms = meta.fs
    if not np.isclose(fs_tdms, fs):
        print(f"  [Warning] fs mismatch: meta {fs_tdms}, index {fs}")
        fs = fs_tdms

    df_sqi_fast = compute_neurokit_avgqrs_per_window_full(sig, fs, win_s=WIN_S)

    # tilføj id'er så vi kan merge
    df_sqi_fast["patient_id"] = pid
    df_sqi_fast["recording_id"] = rid

    all_records_sqi.append(df_sqi_fast)

df_nk_sqi = pd.concat(all_records_sqi, ignore_index=True)
df_nk_sqi.head()

== NeuroKit SQI for patient 1, recording 1 ==
== NeuroKit SQI for patient 1, recording 2 ==
== NeuroKit SQI for patient 2, recording 1 ==
== NeuroKit SQI for patient 3, recording 1 ==
== NeuroKit SQI for patient 3, recording 2 ==
== NeuroKit SQI for patient 4, recording 1 ==
== NeuroKit SQI for patient 5, recording 1 ==
== NeuroKit SQI for patient 6, recording 1 ==
== NeuroKit SQI for patient 6, recording 2 ==
== NeuroKit SQI for patient 7, recording 1 ==
== NeuroKit SQI for patient 7, recording 2 ==
== NeuroKit SQI for patient 8, recording 1 ==
== NeuroKit SQI for patient 8, recording 2 ==
== NeuroKit SQI for patient 8, recording 1 ==
== NeuroKit SQI for patient 9, recording 1 ==
== NeuroKit SQI for patient 9, recording 2 ==
== NeuroKit SQI for patient 10, recording 1 ==
== NeuroKit SQI for patient 11, recording 1 ==
== NeuroKit SQI for patient 11, recording 2 ==
== NeuroKit SQI for patient 12, recording 1 ==
== NeuroKit SQI for patient 12, recording 2 ==
== NeuroKit SQI for patient 1

Unnamed: 0,window_idx,win_start_s,win_end_s,nk_averageQRS,patient_id,recording_id
0,0,0.0,10.0,0.960794,1,1
1,1,10.0,20.0,0.955033,1,1
2,2,20.0,30.0,0.939726,1,1
3,3,30.0,40.0,0.943852,1,1
4,4,40.0,50.0,0.984257,1,1


In [12]:
merge_cols = ["patient_id", "recording_id", "window_idx", "win_start_s", "win_end_s"]

df_win_all = df_win_study2.merge(
    df_nk_sqi,  # <- concat all recordings
    on=merge_cols,
    how="left",
)

In [13]:
# OURS bad
df_win_all["ours_bad"] = (
    df_win_all.get("is_flatline", False)
    | df_win_all.get("is_noiseburst", False)
    | df_win_all.get("is_clipping", False)
)

# NeuroKit bad (threshold kan tunes)
df_win_all["nk_bad_avgqrs"] = df_win_all["nk_averageQRS"] < 0.5

# Seizure flag: use Study2 context (fastest, consistent)
df_win_all["is_seizure_window"] = df_win_all["context"].eq("seizure")

In [14]:
# # %%
# if "ours_bad" not in df_win_all.columns:
#     df_win_all["ours_bad"] = (
#         df_win_all.get("is_flatline", False)
#         | df_win_all.get("is_noiseburst", False)
#         | df_win_all.get("is_clipping", False)
#     )

# # NeuroKit "bad" definitioner
# df_win_all["nk_bad_avgqrs"] = df_win_all["nk_averageQRS"] < 0.5  # kan tunes

# good_cats = ["Excellent", "Good"]

# # df_win_all = attach_seizure_flag_to_windows(df_win_all, df_rec, df_seiz)
# # df_win_all har allerede context = baseline/seizure fra Study2
# df_win_all["is_seizure_window"] = df_win_all["context"].eq("seizure")


# df_win_all["is_seizure_window"].value_counts()

In [23]:
print("Vores værktøj (ours_bad):")
print(frac_bad_by_seizure(df_win_all, "ours_bad"))

print("\nNeuroKit avgQRS (nk_bad_avgqrs):")
print(frac_bad_by_seizure(df_win_all, "nk_bad_avgqrs"))

Vores værktøj (ours_bad):
is_seizure_window
Non-seizure    0.007451
Seizure        0.048316
Name: ours_bad, dtype: float64

NeuroKit avgQRS (nk_bad_avgqrs):
is_seizure_window
Non-seizure    0.000004
Seizure        0.000000
Name: nk_bad_avgqrs, dtype: float64


In [24]:
df_seiz_win = df_win_all[df_win_all["is_seizure_window"]]
df_nonseiz_win = df_win_all[~df_win_all["is_seizure_window"]]

print("Ours vs NK avgQRS – seizure vinduer:")
print(pd.crosstab(df_seiz_win["ours_bad"], df_seiz_win["nk_bad_avgqrs"]))

print("\nOurs vs NK avgQRS – non-seizure vinduer:")
print(pd.crosstab(df_nonseiz_win["ours_bad"], df_nonseiz_win["nk_bad_avgqrs"]))


Ours vs NK avgQRS – seizure vinduer:
nk_bad_avgqrs  False
ours_bad            
False           1300
True              66

Ours vs NK avgQRS – non-seizure vinduer:
nk_bad_avgqrs    False  True 
ours_bad                     
False          1384757      5
True             10396      0


In [25]:
# %%
# fx én samlet summary-tabel
summary = pd.DataFrame({
    "ours_seizure_bad_frac":    frac_bad_by_seizure(df_win_all, "ours_bad"),
    "nk_avgqrs_seizure_bad_frac": frac_bad_by_seizure(df_win_all, "nk_bad_avgqrs"),
})

summary.to_csv(STUDY3_OUT / "study3_sqi_fast_seizure_summary.csv")


In [27]:
# Save df_win_all with seizure flag
df_win_all.to_csv(STUDY3_OUT / "study3_window_quality_fast_with_seizure_flag.csv", index=False)

# Save other relevant dataframes
# df_nk_sqi_all.to_csv(STUDY3_OUT / "study3_neurokit_sqi_whole_window.csv", index=False)
df_seiz.to_csv(STUDY3_OUT / "study3_seizure_events_1.csv", index=False)