In [6]:
# Dataset characterization pipeline for ECG/HRV-based epilepsy study
# --------------------------------------------------------------
# This script builds a reproducible summary of your dataset:
#   - File manifest & provenance
#   - Cohort statistics (patients, seizures, hours, sampling rates)
#   - Annotation audit
#   - R-peak and RR quality metrics
#   - Signal-to-noise & SQI summary
#   - Feature distributions (HR, HRV, CSI, HFVI)
#   - Exclusion report
#
# To run: adjust ROOT_DIR to your dataset and execute all cells sequentially.

import os
import json
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# ----------------------------------
# Configuration
# ----------------------------------
# Dataset root that is scanned recursively for recordings -- adjust to your setup.
ROOT_DIR = (
    r"E:\ML algoritme tl anfaldsdetektion vha HRV"
    r"\ePatch data from Aarhus to Lausanne\Patients ePatch data"
)
# Folder that receives every CSV table and figure produced by the steps below.
OUT_DIR = Path(
    r"E:\ML algoritme tl anfaldsdetektion vha HRV"
    r"\ePatch data from Aarhus to Lausanne\Results"
)
# Create the output folder (including missing parents); harmless on re-runs.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------------
# Step A: File manifest
# ----------------------------------
def build_manifest(root, out_dir=None):
    """Scan *root* recursively and write a manifest of the data files found.

    Only files whose name ends (case-insensitively) in ``.tdms``, ``.csv``,
    ``.xls`` or ``.lvm`` are listed. The manifest is saved as
    ``manifest.csv`` in the output folder and also returned.

    Args:
        root: Directory to scan (string or path-like).
        out_dir: Folder that receives ``manifest.csv``. Defaults to the
            module-level ``OUT_DIR``.

    Returns:
        A DataFrame with one row per matching file and the columns
        ``path`` (relative to *root*), ``bytes`` (file size), ``modified``
        (timestamp built from the file's ``st_mtime`` epoch seconds),
        ``extension`` (suffix as spelled on disk) and ``folder`` (name of the
        directory that directly contains the file). When nothing matches,
        the frame is empty but still carries these columns, so that
        ``groupby("extension")`` keeps working downstream.
    """
    columns = ["path", "bytes", "modified", "extension", "folder"]
    wanted_suffixes = (".tdms", ".csv", ".xls", ".lvm")
    out_dir = OUT_DIR if out_dir is None else Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    rows = []
    for dirpath, dirnames, files in os.walk(root):
        # os.walk yields entries in OS-dependent order; sorting the directory
        # list in place (which also fixes the descent order) and the file
        # names makes the manifest identical from run to run.
        dirnames.sort()
        for fname in sorted(files):
            if not fname.lower().endswith(wanted_suffixes):
                continue
            full = Path(dirpath) / fname
            st = full.stat()
            rows.append({
                "path": str(Path(os.path.relpath(full, root))),
                "bytes": st.st_size,
                "modified": pd.to_datetime(st.st_mtime, unit="s"),
                "extension": full.suffix,
                "folder": full.parent.name,
            })

    # Passing the column list explicitly keeps the schema stable even when
    # no file matched and `rows` is empty.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(out_dir / "manifest.csv", index=False)
    return df


# ----------------------------------
# Step B: Cohort & seizure overview (currently: file counts per extension only)
# ----------------------------------
def summarize_manifest(df):
    """Print the number of manifest rows per file extension and in total.

    Args:
        df: Manifest DataFrame; must contain an ``extension`` column.

    Returns:
        None. The summary is only printed.
    """
    # One row per extension, with the row count in a column named "count".
    per_extension = df.groupby("extension").size().rename("count").reset_index()
    n_files = len(df)
    print("\nFile type distribution:\n", per_extension)
    print("Total files:", n_files)

# ----------------------------------
# Step C: Annotation audit
# ----------------------------------
def audit_annotations(df_annot, min_duration_s=10, out_dir=None):
    """Flag annotations that are shorter than a minimum duration.

    A ``duration`` column (seconds, computed as ``end - start``) is added to
    *df_annot* in place. Rows whose duration is below *min_duration_s* are
    written to ``invalid_annotations.csv`` and left out of the returned frame.

    Args:
        df_annot: Annotation table with datetime-like ``start`` and ``end``
            columns. It is modified in place (``duration`` is added).
        min_duration_s: Shortest duration, in seconds, that is still kept.
        out_dir: Folder that receives ``invalid_annotations.csv``. Defaults
            to the module-level ``OUT_DIR``.

    Returns:
        A copy of the rows that were not flagged. Rows whose duration is
        missing (NaN) compare false against the threshold, so they are not
        flagged and remain in the returned frame.
    """
    out_dir = OUT_DIR if out_dir is None else Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    df_annot["duration"] = (df_annot["end"] - df_annot["start"]).dt.total_seconds()
    too_short = df_annot["duration"] < min_duration_s
    invalid = df_annot[too_short]

    # ":g" prints the default threshold as "10", matching the original message.
    print(f"{len(invalid)} annotations <{min_duration_s:g}s (excluded)")
    invalid.to_csv(out_dir / "invalid_annotations.csv", index=False)

    # Hand the surviving rows back so that "excluded" is actually true for
    # whatever the caller processes next.
    return df_annot[~too_short].copy()

# ----------------------------------
# Step D: RR quality summary
# ----------------------------------
def rr_quality(df_rr, min_rr_s=0.3, max_rr_s=2.5):
    """Mark out-of-range RR intervals and report their share.

    An ``outlier`` boolean column is added to *df_rr* in place; it is True
    where ``rr_s`` is below *min_rr_s* or above *max_rr_s*.

    Args:
        df_rr: Table with an ``rr_s`` column. It is modified in place
            (``outlier`` is added).
        min_rr_s: Lower bound; intervals strictly below it are outliers.
        max_rr_s: Upper bound; intervals strictly above it are outliers.

    Returns:
        The fraction of rows flagged as outliers, as a float. For an empty
        table this is NaN.
    """
    rr = df_rr["rr_s"]
    df_rr["outlier"] = (rr < min_rr_s) | (rr > max_rr_s)
    # The mean of a boolean column is the fraction of True values.
    outlier_rate = float(df_rr["outlier"].mean())
    print(f"RR outlier rate: {outlier_rate:.2%}")
    return outlier_rate

# ----------------------------------
# Step E: Feature summary
# ----------------------------------
def feature_summary(df_features):
    """Save per-label descriptive statistics and box plots of key features.

    Writes ``feature_summary.csv`` (the transposed ``describe()`` output of
    the table grouped by ``label``) to ``OUT_DIR`` and prints its first rows.
    For each of HRmean, HRmax, SDNN, RMSSD, CSI and HFVI that exists as a
    column, a box plot grouped by ``label`` is saved as ``box_<feature>.png``.

    Args:
        df_features: Feature table with a ``label`` column.

    Returns:
        None.
    """
    per_label_stats = df_features.groupby("label").describe().T
    per_label_stats.to_csv(OUT_DIR / "feature_summary.csv")
    print(per_label_stats.head())

    candidates = ("HRmean", "HRmax", "SDNN", "RMSSD", "CSI", "HFVI")
    # Only plot the features that are actually present in this table.
    available = [name for name in candidates if name in df_features.columns]
    for name in available:
        fig = plt.figure(figsize=(5, 4))
        ax = sns.boxplot(x="label", y=name, data=df_features)
        ax.set_title(f"{name} by label")
        fig.savefig(OUT_DIR / f"box_{name}.png", dpi=150)
        # Close each figure explicitly so they do not accumulate in memory.
        plt.close(fig)

# ----------------------------------
# Step F: Exclusion report
# ----------------------------------
def make_exclusion_report(meta):
    """Write the exclusion metadata to ``exclusion_report.csv`` in ``OUT_DIR``.

    Args:
        meta: Anything ``pd.DataFrame`` accepts as its data argument.

    Returns:
        None. A confirmation line is printed after the file is written.
    """
    report_path = OUT_DIR / "exclusion_report.csv"
    pd.DataFrame(meta).to_csv(report_path, index=False)
    print("Saved exclusion report.")

# Example main execution
if __name__ == "__main__":
    # Steps A and B need nothing but the files on disk: scan ROOT_DIR, save
    # manifest.csv to OUT_DIR, then print the per-extension file counts.
    manifest = build_manifest(ROOT_DIR)
    summarize_manifest(manifest)
    # Subsequent steps require your actual annotation and feature dataframes
    # (audit_annotations reads "start"/"end", rr_quality reads "rr_s",
    # feature_summary groups by "label").
    # Example placeholders:
    # audit_annotations(df_annotations)
    # rr_quality(df_rr)
    # feature_summary(df_features)
    # make_exclusion_report(exclusion_meta)



File type distribution:
   extension  count
0     .tdms     72
Total files: 72
