In [1]:

import pandas as pd
import numpy as np
import os
import glob
from datetime import datetime, timedelta

# ==========================================================
# 1. Load Firstbeat file function
# ==========================================================
def load_firstbeat_file(path):
    """
    Read a Firstbeat IBI export and attach a timestamp to every beat.

    Expected file layout:
    - line 1: metadata (ignored)
    - line 2: metadata containing "Start time: dd.mm.yyyy HH:MM:SS"
    - line 3: column header "RR;Artifact corrected RR;Raw artifact;"
    - remaining lines: semicolon-separated values, read as ONE text column

    Args:
        path: Path of the IBI file to read.

    Returns:
        df: DataFrame with columns RR, ArtifactCorrectedRR, RawArtifact
            (numeric) and ts_raw (header start time plus the cumulative
            sum of ArtifactCorrectedRR, which the code treats as
            milliseconds). Rows without a numeric ArtifactCorrectedRR
            are dropped.
        raw_start_ts: datetime parsed from the "Start time:" header line.

    Raises:
        IndexError: if the second line does not contain "Start time:".
        ValueError: if the start time does not match "%d.%m.%Y %H:%M:%S".
    """

    # ----------------------------------------------------
    # 1. Read header lines
    # ----------------------------------------------------
    # Only the second line is needed, so do not load the whole file here;
    # pandas reads the data rows below.
    with open(path, "r") as f:
        f.readline()
        start_line = f.readline()

    # Extract raw start time ("Start time: dd.mm.yyyy HH:MM:SS")
    raw_start_str = start_line.split("Start time:")[1].strip()
    raw_start_ts = datetime.strptime(raw_start_str, "%d.%m.%Y %H:%M:%S")

    # ----------------------------------------------------
    # 2. Read the data as ONE COLUMN of text
    # ----------------------------------------------------
    df = pd.read_csv(
        path,
        header=None,
        skiprows=3,
        names=["raw"],
        dtype=str,
        engine="python",
    )

    # Remove empty lines
    df = df[df["raw"].notna() & (df["raw"].str.strip() != "")]

    # ----------------------------------------------------
    # 3. Split "RR;ArtifactCorrectedRR;RawArtifact"
    # ----------------------------------------------------
    # The trailing ";" on each row produces an extra empty field, hence
    # only the first 3 columns are kept.
    raw_split = df["raw"].str.split(";", expand=True).iloc[:, :3]
    raw_split.columns = ["RR", "ArtifactCorrectedRR", "RawArtifact"]

    # Convert values to numeric (unparseable text becomes NaN)
    for col in ("RR", "ArtifactCorrectedRR", "RawArtifact"):
        raw_split[col] = pd.to_numeric(raw_split[col], errors="coerce")

    # ----------------------------------------------------
    # 4. DROP rows where corrected RR is NaN
    #    (Firstbeat sometimes leaves the last line empty)
    # ----------------------------------------------------
    raw_split = raw_split.dropna(subset=["ArtifactCorrectedRR"]).reset_index(drop=True)

    # ----------------------------------------------------
    # 5. Compute raw timestamps with cumsum over RR intervals
    # ----------------------------------------------------
    # Vectorised: cumulative ms -> whole microseconds -> timedelta.
    # Rounding to integer microseconds matches the resolution of
    # datetime.timedelta and keeps the conversion exact, and the column
    # is datetime-typed even when no rows survive.
    elapsed_us = (raw_split["ArtifactCorrectedRR"].cumsum() * 1000).round().astype("int64")
    raw_split["ts_raw"] = pd.Timestamp(raw_start_ts) + pd.to_timedelta(elapsed_us, unit="us")

    return raw_split, raw_start_ts



# ==========================================================
# 2. Define EXACT extractions you want
# ==========================================================

# Maps each IBI file name to the participant IDs to cut out of that recording.
# Commented-out entries are recordings that are currently skipped.
EXTRACTIONS = {
    #"test39_39_20251113_125113_IBI.csv": [7],
    #"test39_39_20251113_132004_IBI.csv": [8, 10],
    #"test39_39_20251116_140244_IBI.csv": [14],
    "test39_39_20251126_145748_IBI.csv": [23],
}

# Input / output locations
metadata_path = "trials_sensor11.csv"
hr_data_folder = "sensor11/"
output_folder = "participants_split_sensor11/"

os.makedirs(output_folder, exist_ok=True)

# Trial metadata: one row per participant, with the trial date and the
# Firstbeat start/end clock times stored as separate text columns.
meta = pd.read_csv(metadata_path)
meta["Date_dt"] = pd.to_datetime(meta["Date"], format="%d-%b-%y")

# Combine the trial date with each clock-time column into a full timestamp.
_iso_day = meta["Date_dt"].dt.strftime("%Y-%m-%d")
for _ts_col, _clock_col in (
    ("Start_ts", "Start time (firstbeat)"),
    ("End_ts", "End time (firstbeat)"),
):
    meta[_ts_col] = pd.to_datetime(_iso_day + " " + meta[_clock_col])


# ==========================================================
# 3. PROCESS ONLY THE FILES + PARTICIPANTS YOU SPECIFIED
# ==========================================================

for filename, participant_list in EXTRACTIONS.items():

    filepath = os.path.join(hr_data_folder, filename)
    if not os.path.exists(filepath):
        print(f"❌ File not found: {filepath}")
        continue

    print(f"\nProcessing: {filename}")

    # The third "_"-separated token of the file name is the recording
    # date, e.g. 20251113
    file_date_str = filename.split("_")[2]
    file_date = pd.to_datetime(file_date_str, format="%Y%m%d")

    # Metadata rows recorded on that date
    day_meta = meta.loc[meta["Date_dt"] == file_date]

    # Beat-level data with raw (device-clock) timestamps
    df_raw, raw_start_ts = load_firstbeat_file(filepath)

    # -------------------------
    # Cut out one window per requested participant
    # -------------------------
    for pid in participant_list:

        row = day_meta.loc[day_meta["Participant ID"] == pid]
        if row.empty:
            print(f"  ⚠️ Participant {pid} missing from metadata for {filename}")
            continue

        # Use the first matching metadata row
        row = row.iloc[0]
        start_ts, end_ts = row["Start_ts"], row["End_ts"]

        print(f"  → Extracting participant {pid} ({start_ts} to {end_ts})")

        # Shift the device clock so the first beat of the file lands exactly
        # on this participant's start time.
        # NOTE(review): the offset is recomputed per participant from the
        # first beat of the file, so every participant's window begins at
        # the first beat of the recording - confirm this is intended for
        # files that list several participants.
        time_offset = start_ts - df_raw["ts_raw"].iloc[0]
        df_raw["ts"] = df_raw["ts_raw"] + time_offset

        # Beats inside [start_ts, end_ts], both ends inclusive
        p_df = df_raw[df_raw["ts"].between(start_ts, end_ts)]
        if p_df.empty:
            print(f"    ⚠️ No HR data found for participant {pid}")
            continue

        clean_df = p_df.loc[:, ["RR", "ArtifactCorrectedRR", "RawArtifact", "ts"]]

        # One CSV per participant: a short text preamble, a blank line,
        # then the beat table appended below it.
        outpath = os.path.join(output_folder, f"participant{pid}.csv")
        preamble = [
            f"Participant ID: {pid}\n",
            f"Start time: {start_ts}\n",
            f"End time: {end_ts}\n",
            "\n",
        ]
        with open(outpath, "w") as f:
            f.writelines(preamble)

        clean_df.to_csv(outpath, mode="a", index=False)

        print(f"    ✓ Saved {outpath}")



Processing: test39_39_20251126_145748_IBI.csv
  → Extracting participant 23 (2025-11-26 14:57:00 to 2025-11-26 15:12:00)
    ✓ Saved participants_split_sensor11/participant23.csv
