In [1]:
import os 
import pandas as pd 
from pathlib import Path
import numpy as np
import xmltodict
from datetime import datetime, time
from itertools import groupby

path_mastersheet = "/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/mastersheets/"
path_annot = "/Users/alicealbrecht/Desktop/UCSF_PSG_annotation_files"

## First let's adjust HSP MGB mastersheet

In [None]:
mgb_mastersheet = pd.read_csv(os.path.join(path_mastersheet, "hsp_mgb_mastersheet_untouched.csv"))
mgb_mastersheet = mgb_mastersheet[~mgb_mastersheet['annot_path'].isna()]
# Look at all possible annotaiotn file found
possible_annot = possible_annot = mgb_mastersheet['annot_path'].str.extract(r'ses-\d+_(.*)')[0]
possible_annot.unique()

array(['task-psg_annotations.csv', 'task-psg_event_annotations.csv',
       'task_Xltek.csv', nan], dtype=object)

In [6]:
duplicates_mask = mgb_mastersheet.duplicated(subset=['sub_id', 'session'], keep=False)
mgb_mastersheet[duplicates_mask]

Unnamed: 0,sub_id,dataset,site,session,start_time,sfreq_global,duration_samples,duration_sec,edf_path,annot_path,h5_path,events_path,extracted_features_path,sleep_stage_path


## Final code to apply all changes 

In [None]:
import os 
import pandas as pd 
from pathlib import Path
import numpy as np

# --------------------------------

path_mastersheet = "/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/mastersheets/"

# --------------------------------

def get_event_sleep_paths(path_str):
    path = Path(path_str)
    folder = path.parent
    filename = path.name

    if "event" in filename:
        event_path = path
        sleep_path = folder / filename.replace("event", "sleep")
    elif "sleep" in filename:
        sleep_path = path
        event_path = folder / filename.replace("sleep", "event")
    else:
        print("Neither event or sleep")
        event_path = sleep_path = None
    
    return event_path, sleep_path

def check_file_exist(path):
    # Adjust path if needed
    path_str = str(path).replace("/wynton/group/andrews/data/", "/Users/alicealbrecht/wynton_data/")
    exist = Path(path_str).exists()
    return exist


# --------------------------------
# Load the untouched mastersheet 
mgb_mastersheet = pd.read_csv(os.path.join(path_mastersheet, "hsp_mgb_mastersheet_untouched.csv"))

# Replace annot_path with NaN where it ends with "task-psg_pre.csv"
mgb_mastersheet.loc[
    mgb_mastersheet['annot_path'].str.endswith("task-psg_pre.csv", na=False),
    'annot_path'
] = np.nan

for idx, row in mgb_mastersheet.iterrows():
    path_str = row['annot_path']
    
    if pd.isna(path_str):
        continue
    
    if "event" in path_str.lower() or "sleep" in path_str.lower():
        event_path, sleep_path = get_event_sleep_paths(path_str)
   
        if event_path is not None and check_file_exist(event_path):
            mgb_mastersheet.at[idx, 'annot_path'] = str(event_path)
        if sleep_path is not None and check_file_exist(sleep_path):
            mgb_mastersheet.at[idx, 'sleep_stage_path'] = str(sleep_path)

mgb_mastersheet.loc[
    mgb_mastersheet['annot_path'].str.contains("sleep", case=False, na=False),
    'annot_path'
] = np.nan

In [None]:
mgb_mastersheet[mgb_mastersheet['annot_path'].str.contains("sub-s", na=False)]

Unnamed: 0,sub_id,dataset,site,session,start_time,sfreq_global,duration_samples,duration_sec,edf_path,annot_path,h5_path,events_path,extracted_features_path,sleep_stage_path


In [None]:
output_path = os.path.join(path_mastersheet, "hsp_mgb_mastersheet.csv")
mgb_mastersheet.to_csv(output_path, index=False)

## Look at the annotation options

## HSP MGB

In [4]:
import os 
import pandas as pd 
from pathlib import Path
import numpy as np
import xmltodict
from datetime import datetime, time
from itertools import groupby

path_mastersheet = "/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/mastersheets/"
mgb_mastersheet = pd.read_csv(os.path.join(path_mastersheet, "hsp_mgb_mastersheet.csv"))
mgb_mastersheet = mgb_mastersheet[mgb_mastersheet['duration_sec'] < 3*3600]
mgb_mastersheet = mgb_mastersheet[ ~(mgb_mastersheet['annot_path'].isna())] 

In [89]:
mgb_mastersheet = pd.read_csv(os.path.join(path_mastersheet, "hsp_mgb_mastersheet.csv"))
for col in ["edf_path", "annot_path", "h5_path","events_path","extracted_features_path"]:
    mgb_mastersheet[col] = mgb_mastersheet[col].str.replace(
        "/wynton/group/andrews/data",
        "/Users/alicealbrecht/wynton_data"
    )
mgb_mastersheet = mgb_mastersheet[~mgb_mastersheet['annot_path'].isna()]
rows = mgb_mastersheet[100:130].to_dict(orient="records")
# rows = mgb_mastersheet[(mgb_mastersheet["sub_id"] == "S0001111210276") & (mgb_mastersheet["session"] == 2)].to_dict(orient="records")
# row = rows[0]
# row

In [None]:
def datetime_to_sec(t_input):
    """
    Convert time-like input to seconds since midnight (float, including milliseconds).
    """

    if t_input is None or (isinstance(t_input, float) and np.isnan(t_input)):
        return np.nan

    # --- Convert to string to check for negative seconds ---
    t_str = str(t_input).strip()
    if '-' in t_str.split(' ')[-1]:  # negative seconds detected
        return 0.0

    # --- Handle datetime or time objects ---
    if isinstance(t_input, datetime):
        t_obj = t_input.time()
    elif isinstance(t_input, time):
        t_obj = t_input
    else:
        # Remove timezone suffix if present (e.g., +00:00, Z)
        t_str_clean = t_str.split('+')[0].split('Z')[0]

        t_obj = None
        # Try multiple formats
        for fmt in [
            "%Y-%m-%d %H:%M:%S.%f",
            "%Y-%m-%d %H:%M:%S",
            "%I:%M:%S %p",
            "%H:%M:%S.%f",
            "%H:%M:%S"
        ]:
            try:
                t_obj = datetime.strptime(t_str_clean, fmt).time()
                break
            except ValueError:
                continue

        if t_obj is None:
            print(f"[WARNING] Could not parse time: {t_input}")
            return np.nan

    # --- Compute seconds including fractional part ---
    t_sec = (
        t_obj.hour * 3600 +
        t_obj.minute * 60 +
        t_obj.second +
        t_obj.microsecond / 1e6
    )

    return t_sec


def ensure_post_midnight(times_sec: pd.Series) -> pd.Series:
    continuous_sec = []
    prev_sec = None
    rollover = 0  # track cumulative 24h rollovers

    for sec in times_sec:
        if pd.isna(sec):
            continuous_sec.append(np.nan)
            continue
        if prev_sec is not None and sec + rollover < prev_sec:
            # crossed midnight
            rollover += 24 * 3600
        continuous_sec.append(sec + rollover)
        prev_sec = sec + rollover

    return pd.Series(continuous_sec, index=times_sec.index)


def read_annot_MGB(row):
    sub_id = row["sub_id"]
    session = row["session"]
    psg_id = f"sub-{sub_id}_ses-{session}" 
    annot_path = row["annot_path"]
    sleep_stage_path = row["sleep_stage_path"]
    sfreq_global = row["sfreq_global"]
    duration_samples = row["duration_samples"]

    # -------- EVENT ---------
    if pd.isna(annot_path): # Creating empty df_events
        print(f"[WARNING] {psg_id}: No Annot Stage Path found.") # 1 time
        df_events = pd.DataFrame(columns=["onset", "duration", "event_type"])
    else: # Read the slepe stage from the specific file 
        df_annot = pd.read_csv(annot_path)
        required_cols_psg = {"epoch", "time", "duration", "event"}
        if required_cols_psg.issubset(df_annot.columns):
            df_events = df_annot[list(required_cols_psg)].copy()
            df_events = df_events.rename(columns={
                "time" : "clock_time",
                "event": "event_type"
                })
            
            # Convert 'duration' → numeric (replace '-' or missing with 0.0)
            df_events['duration'] = pd.to_numeric(df_events['duration'], errors='coerce').fillna(0.0)
         
            # Shift event file if not matching recording time
            edf_start_sec = datetime_to_sec(row["start_time"]) 
            df_events['clock_time'] = df_events['clock_time'].apply(datetime_to_sec)

            # reordering if a missplaced values   
            times = df_events['clock_time'].values
            drops = [i for i in range(1, len(times)) if times[i] < times[i - 1]]
            if len(drops) == 2:
                drop1, drop2 = drops
                bad_time = times[drop1]  # first drop is the misplaced value
                after_midnight = times[drop2:]  # values after true midnight

                # find the correct insertion index
                for i, t in enumerate(after_midnight):
                    if bad_time <= t:
                        new_pos = drop2 + i
                        break
                    else:
                        new_pos = len(times)

                row_to_move = df_events.iloc[drop1]
                df_events = df_events.drop(index=drop1).reset_index(drop=True)
                if drop1 < new_pos:
                    new_pos -= 1
                df_events = pd.concat([
                    df_events.iloc[:new_pos],
                    pd.DataFrame([row_to_move]),
                    df_events.iloc[new_pos:]
                ]).reset_index(drop=True)                
                print(f"[INFO] {psg_id}: Move event at index {drop1} to new index {new_pos}")


            # after reordering, fix midnight continuity
            df_events['clock_time'] = ensure_post_midnight(df_events['clock_time'])

            first_idx_mask = df_events['clock_time'].notna() & (df_events['clock_time'] != 0)
            first_idx = first_idx_mask.idxmax() if first_idx_mask.any() else 0
            annot_start_sec = df_events['clock_time'].iloc[first_idx]
         
            if (
                (edf_start_sec is not None) and not np.isnan(edf_start_sec) and
                (annot_start_sec is not None) and not np.isnan(annot_start_sec)
                ):
                offset_sec = annot_start_sec - edf_start_sec 
                if abs(offset_sec) <= 1:
                    if offset_sec != 0:
                        print(f"[INFO] {psg_id}: Shift of {offset_sec} s")
                    df_events["onset"] = df_events["clock_time"].astype(float) - (edf_start_sec + offset_sec)
                elif offset_sec < -1:
                    print(f"[ERROR] {psg_id}: Event start before recording ({offset_sec:.3f}s)")
                else:
                    print(f"[WARNING] {psg_id}: offset_event >1s ({offset_sec:.3f}s)")
                    df_events["onset"] = df_events["clock_time"].astype(float) - (edf_start_sec + offset_sec)
            else:
                print(f"[ERROR] {psg_id}: edf_start ({edf_start_sec}) or annot_start ({annot_start_sec}) is not defined")
            
            # Devide in df_events and df_sleep 
            df_events["event_flat"] = df_events["event_type"].str.lower().str.replace(r"[-_ ]", "", regex=True)
            mask_sleep = (
                df_events["event_flat"].str.startswith("sleepstage") |
                df_events["event_flat"].str.startswith("stage")
            )
            df_sleep = df_events[mask_sleep].copy()
            df_events = df_events[~mask_sleep].copy()

            # Final type enforcement for df_events
            df_events['onset'] = df_events['onset'].astype(float)
            df_events['duration'] = df_events['duration'].astype(float)
            df_events['event_type'] = (
                df_events['event_type']
                .astype(str)
                .str.lower()
                .str.replace(" ", "_", regex=False)
            )

            # Sort and reorder columns
            df_events = df_events.drop(columns = ['epoch','clock_time', 'event_flat'])
            df_events = df_events.sort_values('onset', ignore_index=True, ascending=True)

            # -------- SLEEP STAGE ----------
            # Check first sleep stage colck time - muliple of 30 or not
            if df_sleep.empty:
                print(f"[ERROR] {psg_id} No Sleep_Stages in event !!")
                df_sleep = pd.DataFrame(columns=["onset", "duration", "sleep_stage"]) 
                full_sleep_stages = np.full(duration_samples, np.nan) 
            else:
                # Get the map sleep stages 
                stage_dict = {"W": 0, "WAKE": 0, 
                            "N1": 1, 
                            "N2": 2, 
                            "N3": 3, "N4": 3, 
                            "R": 4, "REM": 4}
                df_sleep['sleep_stage'] = (
                    df_sleep["event_flat"]
                    .str.replace(r"^(sleepstage|stage)", "", case=False, regex=True)
                    .str.upper()
                    .map(stage_dict)
                )
                
                # Repeat sleep stage every 30s epoch if only changes are annotated
                epoch_length_sec = 30
                if not (df_sleep["duration"] == epoch_length_sec).all():
                    # Append one extra endpoint for the final stage
                    all_onsets = np.append(df_sleep["onset"].values,
                                        df_sleep["onset"].iloc[-1] + epoch_length_sec)
                    # Repeat stage labels
                    epoch_counts = (np.diff(all_onsets) // epoch_length_sec).astype(int)
                    expanded_stages = np.repeat(df_sleep["sleep_stage"].values, epoch_counts)
                    # Construct the new DataFrame
                    expanded_onsets = (
                        np.arange(len(expanded_stages)) * epoch_length_sec  + df_sleep["onset"].iloc[0]
                    )
                    df_sleep = pd.DataFrame({
                        "onset": expanded_onsets,
                        "duration": epoch_length_sec,
                        "sleep_stage": expanded_stages
                    }).reset_index(drop=True)

                # Drop unecessary columns
                cols_to_drop = ['epoch', 'clock_time', 'event_type', 'event_flat']
                existing_cols = [c for c in cols_to_drop if c in df_sleep.columns]
                df_sleep = df_sleep.drop(columns=existing_cols)

                # If mulitple of 30 second than add all the missing sleep stages
                first_sleep_onset = df_sleep.iloc[0]['onset']
                if first_sleep_onset > 0 and first_sleep_onset % epoch_length_sec == 0:
                    n_missing = int(first_sleep_onset // epoch_length_sec)
                    missing_df = pd.DataFrame({
                        "onset": np.arange(n_missing) * epoch_length_sec,
                        "duration": epoch_length_sec,
                        "sleep_stage": [np.nan] * n_missing
                    })
                    df_sleep = pd.concat([missing_df, df_sleep]).reset_index(drop=True)
                else:
                    if first_sleep_onset != 0:
                        print(f"[INFO] {psg_id}: First sleep stage onset not multiple of 30: {first_sleep_onset}")

                # Deduce full_sleep_stages
                full_sleep_stages = np.full(duration_samples, np.nan) # init
                overflow_values = []
                for i, row in df_sleep.iterrows():
                    start_idx = int(row['onset'] * sfreq_global)
                    end_idx = int((row['onset'] + epoch_length_sec) * sfreq_global) 

                    if start_idx >= duration_samples:
                        overflow_values.extend(df_sleep.loc[i:, 'sleep_stage'].values)
                        break  

                    if end_idx > duration_samples:
                        overflow_values.append(row['sleep_stage'])
                        end_idx = duration_samples
                    
                    full_sleep_stages[start_idx:end_idx] = row['sleep_stage']
                
                if overflow_values and any(not (np.isnan(x) or x == 0) for x in overflow_values):
                    print(f"[WARNING] {psg_id}: Trimming extra sleep_stages: {overflow_values}")

            # -------- CONCAT SLEEP EVENTS ----------
            df_sleep = df_sleep.assign(event_type=np.nan)
            df_events = df_events.assign(sleep_stage=np.nan)
            df_events = pd.concat([df_sleep, df_events], ignore_index=True).sort_values('onset').reset_index(drop=True)
            desired_order = ['onset', 'duration', 'sleep_stage', 'event_type']
            existing_cols = [c for c in desired_order if c in df_events.columns]
            df_events = df_events[existing_cols]
        
        else:
            df_events = pd.DataFrame(columns=["onset", "duration", "event_type"])
            full_sleep_stages = np.full(duration_samples, np.nan) 
            print(f"[OTHER] {psg_id}: NEED XLTEK OR PSG ANNOT + 3 CORRECT COLS", flush=True)
    
    return full_sleep_stages, df_events

In [93]:
for row in rows:
    print(row['sub_id'])
    full_sleep_stages, df_events = read_annot_MGB(row)
    print(np.unique(full_sleep_stages, return_counts=True))
    print('---------')

S0001111326836
(array([ 0.,  1.,  2.,  3.,  4., nan]), array([ 300000,  660000, 2616000,  870000,  762000,  165200]))
---------
S0001111327618
[OK] sub-S0001111327618_ses-1: Shift of 1.0 s
(array([ 0.,  1.,  2.,  4., nan]), array([1234250,  472500, 4170000,  360000,  112500]))
---------
S0001111331020
(array([ 0.,  1.,  2.,  3.,  4., nan]), array([1800000,  576000, 2286000,  666000,  252000,  129600]))
---------
S0001111331190
(array([ 0.,  1.,  2.,  3.,  4., nan]), array([ 312000,  294000, 2466000, 1602000,  690000,  186000]))
---------
S0001111335708
(array([ 0.,  1.,  2.,  3.,  4., nan]), array([ 228000,  726000, 2052000, 1074000,  966000,  109200]))
---------
S0001111336247
(array([ 0.,  1.,  2.,  3.,  4., nan]), array([ 486000,   78000, 3720000,  324000,  438000,  110800]))
---------
S0001111337337
(array([ 0.,  1.,  2.,  3.,  4., nan]), array([ 480000,  708000, 2286000,  888000, 1350000,  139200]))
---------
S0001111337337
(array([ 0.,  1.,  2.,  3.,  4., nan]), array([ 312000, 1

In [84]:
np.unique(full_sleep_stages, return_counts=True)

(array([ 0.,  1.,  2.,  3.,  4., nan]),
 array([ 450000,  138000, 2670000, 1416000,  912000,   98000]))

In [85]:
df_events

Unnamed: 0,onset,duration,sleep_stage,event_type
0,0.0,30.0,,
1,30.0,30.0,,
2,60.0,30.0,,
3,90.0,30.0,,
4,120.0,30.0,,
...,...,...,...,...
1105,28290.0,30.0,3.0,
1106,28320.0,30.0,3.0,
1107,28350.0,30.0,3.0,
1108,28380.0,30.0,0.0,


## HSP BIDMC

#### Looking at the mastersheet problems like weird global sfreq or too short duration 

In [None]:
import os 
import pandas as pd 
from pathlib import Path
import numpy as np
import xmltodict

path_mastersheet = "/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/mastersheets/"
bidmc_mastersheet = pd.read_csv(os.path.join(path_mastersheet, "hsp_bidmc_mastersheet_diagnostic.csv"))

In [3]:
bidmc_mastersheet[bidmc_mastersheet['annot_path'].isna()]

Unnamed: 0,sub_id,dataset,site,session,start_time,sfreq_global,duration_samples,duration_sec,edf_path,annot_path,h5_path,events_path,extracted_features_path,sleep_stage_path


In [150]:
for col in ["edf_path", "annot_path", "h5_path","events_path","extracted_features_path","sleep_stage_path"]:
    bidmc_mastersheet[col] = bidmc_mastersheet[col].str.replace(
        "/wynton/group/andrews/data",
        "/Users/alicealbrecht/wynton_data"
    )
row = bidmc_mastersheet[bidmc_mastersheet['sub_id'] == "I0002150031452"].to_dict(orient="records")
row = row[0]
row

{'sub_id': 'I0002150031452',
 'dataset': 'hsp_bidmc',
 'site': 'bidmc',
 'session': 1,
 'start_time': '2019-06-27 23:06:59+00:00',
 'sfreq_global': 500.0,
 'duration_samples': 12042000,
 'duration_sec': 24084.0,
 'edf_path': '/Users/alicealbrecht/wynton_data/HSP/PSG/bids/BIDMC/sub-I0002150031452/ses-1/eeg/sub-I0002150031452_ses-1_task-PSG_eeg.edf',
 'annot_path': '/Users/alicealbrecht/wynton_data/HSP/PSG/bids/BIDMC/sub-I0002150031452/ses-1/eeg/sub-I0002150031452_ses-1_task-psg_events_annotations.csv',
 'h5_path': '/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/h5_data/hsp_bidmc/hsp_bidmc_ses-1_sub-I0002150031452_signals.h5',
 'events_path': '/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/events/hsp_bidmc/hsp_bidmc_ses-1_sub-I0002150031452_events.csv',
 'extracted_features_path': '/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/extracted_features/hsp_bidmc/hsp_bidmc_ses-1_sub-I0002150031452_extracted_features.csv',
 'sleep_stage_path': '/Users/alicealbrecht/wynton_da

In [151]:
bidmc_sleep = pd.read_csv(row['sleep_stage_path'])
bidmc_annot = pd.read_csv(row['annot_path'])

In [2]:
import os
import pandas as pd

path_mastersheet = "/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/mastersheets/"
bidmc_mastersheet = pd.read_csv(os.path.join(path_mastersheet, "hsp_bidmc_mastersheet_diagnostic.csv"))

for col in ["edf_path", "annot_path", "h5_path","events_path","extracted_features_path","sleep_stage_path"]:
    bidmc_mastersheet[col] = bidmc_mastersheet[col].str.replace(
        "/wynton/group/andrews/data",
        "/Users/alicealbrecht/wynton_data"
    )

In [None]:
from datetime import datetime, time
from itertools import groupby
import os
import pandas as pd
import numpy as np


def datetime_to_sec(t_input):
    """
    Convert time-like input to seconds since midnight (float, including milliseconds).
    """

    if t_input is None or (isinstance(t_input, float) and np.isnan(t_input)):
        return np.nan

    # --- Convert to string to check for negative seconds ---
    t_str = str(t_input).strip()
    if '-' in t_str.split(' ')[-1]:  # negative seconds detected
        return 0.0

    # --- Handle datetime or time objects ---
    if isinstance(t_input, datetime):
        t_obj = t_input.time()
    elif isinstance(t_input, time):
        t_obj = t_input
    else:
        # Remove timezone suffix if present (e.g., +00:00, Z)
        t_str_clean = t_str.split('+')[0].split('Z')[0]

        t_obj = None
        # Try multiple formats
        for fmt in [
            "%Y-%m-%d %H:%M:%S.%f",
            "%Y-%m-%d %H:%M:%S",
            "%I:%M:%S %p",
            "%H:%M:%S.%f",
            "%H:%M:%S"
        ]:
            try:
                t_obj = datetime.strptime(t_str_clean, fmt).time()
                break
            except ValueError:
                continue

        if t_obj is None:
            print(f"[WARNING] Could not parse time: {t_input}")
            return np.nan

    # --- Compute seconds including fractional part ---
    t_sec = (
        t_obj.hour * 3600 +
        t_obj.minute * 60 +
        t_obj.second +
        t_obj.microsecond / 1e6
    )

    return t_sec


def ensure_post_midnight(times_sec: pd.Series) -> pd.Series:
    continuous_sec = []
    prev_sec = None
    rollover = 0  # track cumulative 24h rollovers

    for sec in times_sec:
        if pd.isna(sec):
            continuous_sec.append(np.nan)
            continue
        if prev_sec is not None and sec + rollover < prev_sec:
            # crossed midnight
            rollover += 24 * 3600
        continuous_sec.append(sec + rollover)
        prev_sec = sec + rollover

    return pd.Series(continuous_sec, index=times_sec.index)


def read_annot_BIDMC(row):
    sub_id = row["sub_id"]
    session = row["session"]
    psg_id = f"sub-{sub_id}_ses-{session}" 
    annot_path = row["annot_path"]
    sleep_stage_path = row["sleep_stage_path"]
    sfreq_global = row["sfreq_global"]
    duration_samples = row["duration_samples"]

    # -------- EVENT ---------
    if pd.isna(annot_path): # Creating empty df_events
        print(f"[WARNING] {psg_id}: No Annot Stage Path found.") # 1 time
        df_events = pd.DataFrame(columns=["onset", "duration", "event_type"])
    else: # Read the slepe stage from the specific file 
        df_annot = pd.read_csv(annot_path)

        required_cols = {"Epoch", "Record Time", "Time", "Length", "Description"}
        if required_cols.issubset(df_annot.columns):
            df_events = df_annot[list(required_cols)].copy()
            df_events = df_events.rename(columns={
                "Epoch": "epoch",
                "Record Time": "onset",
                "Time" : "clock_time",
                "Length": "duration",
                "Description": "event_type"
                })
            
            # Convert 'duration' → numeric (replace '-' or missing with 0.0)
            df_events['duration'] = pd.to_numeric(df_events['duration'], errors='coerce').fillna(0.0)

            # Convert 'onset' (HH:MM:SS or timedelta-like) → seconds
            df_events['onset'] = df_events['onset'].apply(datetime_to_sec)

            # Compute drops
            times = df_events['onset'].values
            drops = [i for i in range(1, len(times)) if times[i] < times[i - 1]]
            if len(drops) > 1:
                for drop_idx in drops:
                    drop_amount = times[drop_idx - 1] - times[drop_idx]
                    if drop_amount < 3600: # less than an hour drop 
                        row_to_move = df_events.iloc[drop_idx].copy()
                        df_events = df_events.drop(df_events.index[drop_idx]).reset_index(drop=True)
                        subset = df_events.iloc[:drop_idx].copy()
                        insert_idx = subset['onset'].searchsorted(row_to_move['onset'])

                        # Split df_events and insert the row
                        df_events = pd.concat([
                            df_events.iloc[:insert_idx],
                            pd.DataFrame([row_to_move]),
                            df_events.iloc[insert_idx:]
                        ]).reset_index(drop=True)

                        # print(f"[INFO] {psg_id}: Move event '{row_to_move['event_type']}' at index {drop_idx} to new index {insert_idx}")
            
            # Only one drop → standard post-midnight adjustment
            df_events['onset'] = ensure_post_midnight(df_events['onset'])

            # Shift event file if not matching recording time
            edf_start_sec = datetime_to_sec(row["start_time"]) 
            if edf_start_sec < 5*3600:
                print(f"[WARNING] {psg_id}: EDF start after midnight: {edf_start_sec} seconds.")
                edf_start_sec += 24*3600
            df_events['clock_time'] = df_events['clock_time'].apply(datetime_to_sec)   

            # If first value already after midnight add 24h 
            first_idx_mask = df_events['clock_time'].notna() & (df_events['clock_time'] != 0)
            first_idx = first_idx_mask.idxmax() if first_idx_mask.any() else 0
            df_events['adjusted_clock_time'] = df_events['clock_time']
            if df_events.loc[first_idx, 'adjusted_clock_time'] < 5*3600:
                print(f"[WARNING] {psg_id}: EVENT start after midnight: {df_events.loc[first_idx, 'adjusted_clock_time']} seconds.")
                df_events.loc[first_idx, 'adjusted_clock_time'] += 24*3600
            df_events['adjusted_clock_time'] = ensure_post_midnight(df_events['adjusted_clock_time'])

            # Find first index
            first_after_start_idx = df_events.index[df_events['adjusted_clock_time'] >= edf_start_sec][0]
            subset = df_events.iloc[: first_after_start_idx + 5].copy()
            offset = subset['adjusted_clock_time'].astype(float) - subset['onset'].astype(float)
            subset['time_offset'] = np.where(offset != 0, offset, np.nan)
            offset_diffs = subset['time_offset'].diff().abs()
            print(subset)
            offset_change_idxs = offset_diffs[offset_diffs.gt(1)].index
            n_offsets = len(offset_change_idxs)
            change_first_epoch = False
            first_correct_epoch = 1

            if n_offsets >= 1: # weird time at first
                last_change_idx = offset_change_idxs[-1]   # <-- only use the last one
                print(f"[WARNING] {psg_id}: Detected {n_offsets} time offset jumps. Using the last one at index {last_change_idx}.")
                correct_start = subset.loc[last_change_idx, 'time_offset']
                # rows BEFORE the last change need correction
                to_fix = df_events.iloc[:last_change_idx].copy()
                to_fix['onset'] = (to_fix['adjusted_clock_time'] - correct_start)
                if (to_fix['onset'] > (24*3600)).any():
                    print(to_fix)
                    to_fix['onset'] = to_fix['onset'] - (24*3600)

                # rows AFTER the last change are already correct
                df_events = pd.concat([to_fix, df_events.iloc[last_change_idx:]]).reset_index(drop=True)
                first_correct_epoch = subset.loc[last_change_idx, 'epoch']
                change_first_epoch = True
                print(f"[INFO] {psg_id}: New first epoch: {first_correct_epoch}")


            first_idx_mask = df_events['clock_time'].notna() & (df_events['clock_time'] != 0)
            first_idx = first_idx_mask.idxmax() if first_idx_mask.any() else 0
            annot_start_sec = df_events['clock_time'].iloc[first_idx] - df_events['onset'].iloc[first_idx]
            if (
                (edf_start_sec is not None) and not np.isnan(edf_start_sec) and
                (annot_start_sec is not None) and not np.isnan(annot_start_sec)
                ):
                offset_sec = annot_start_sec - edf_start_sec 
                if abs(offset_sec) <= 1:
                    df_events["onset"] = df_events["onset"].astype(float) + offset_sec
                    if int(offset_sec) != 0:
                        print(f"[OK] {psg_id}: Shift of {offset_sec} s")
                elif offset_sec < -1:
                    print(f"[WARNING] {psg_id}: Event start before recording ({offset_sec:.3f}s)")
                    df_events["onset"] = df_events["onset"].astype(float) + offset_sec
                else:
                    print(f"[WARNING] {psg_id}: offset_event >1s ({offset_sec:.3f}s)")
                    df_events["onset"] = df_events["onset"].astype(float) + offset_sec
            else:
                print(f"[ERROR] {psg_id}: edf_start ({edf_start_sec}) or annot_start ({annot_start_sec}) is not defined")
            
            if change_first_epoch:
                first_pos_onset = df_events.loc[df_events["onset"] > 0, "onset"].iloc[0]
                epoch_shift = int(first_pos_onset // 30)
                first_correct_epoch -= epoch_shift
                if epoch_shift != 0:
                    print(f"[INFO] {psg_id}: Shifted first epoch: {first_correct_epoch}")
            
        else:
            print(f"[WARNING] {psg_id}:  No events extracted - Missing expected columns: {required_cols - set(df_annot.columns)}")
            df_events = pd.DataFrame(columns=["event_type", "onset", "duration"])
    
    # Final type enforcement
    df_events['onset'] = df_events['onset'].astype(float)
    df_events['duration'] = df_events['duration'].astype(float)
    df_events['event_type'] = (
        df_events['event_type']
        .astype(str)
        .str.lower()
    )

    # Sort and reorder columns
    df_events = df_events.drop(columns = ['clock_time'])
    df_events = df_events.sort_values('onset', ignore_index=True, ascending=True)
    desired_order = ['onset', 'duration', 'event_type']
    existing_cols = [c for c in desired_order if c in df_events.columns]
    df_events = df_events[existing_cols]
    
    # -------- SLEEP STAGE ----------
    # Open and read sleep CSV 
    if pd.isna(sleep_stage_path): # Should never be the case for BIDMC
        print(f"[ERROR] {psg_id}: No Sleep Stage Path found.")
        full_sleep_stages = []
    else:
        df_sleep = pd.read_csv(sleep_stage_path)
        start_idx = df_sleep.index[df_sleep["Epoch"] == first_correct_epoch][0]
        df_sleep = df_sleep.iloc[start_idx:].copy()
        stage_col = next((c for c in df_sleep.columns if "stage" in c.lower()), None)
        if stage_col is None:
            print(f"[ERROR] {psg_id}: No column containing 'stage' found in df_sleep")
            return None, None 
        raw_stages = df_sleep[stage_col]
        stage_dict = {"W": 0, "WAKE": 0, 
                      "N1": 1, "1": 1,
                      "N2": 2, "2": 2,
                      "N3": 3, "N4": 3, "3": 3, "4": 3,
                      "R": 4, "REM": 4}
        sleep_stages_mapped = [stage_dict.get(str(s), np.nan) for s in raw_stages]

        ### Adding a check 
        first_epoch = df_sleep.iloc[0]["Epoch"]
        if first_epoch != 1:
            print(f"[WARNING] {psg_id}: First Epoch in df_sleep is: {first_epoch}")

        # Create a df_stages
        epoch_length_sec = 30
        n_epochs = int(duration_samples / (epoch_length_sec * sfreq_global))
        full_sleep_stages = np.array([s for s, g in groupby(sleep_stages_mapped) for _ in range(int(len(list(g)) * epoch_length_sec * sfreq_global))])

        # Warnings and adjustments for length mismatches
        if len(full_sleep_stages) != duration_samples:
            if len(full_sleep_stages) > duration_samples:
                # Adjust full sleep stages AND sleep_stages_mapped
                excess_samples = len(full_sleep_stages) - duration_samples
                samples_per_epoch = int(epoch_length_sec * sfreq_global)
                excess_epochs = int(np.ceil(excess_samples / samples_per_epoch))
                if excess_epochs > 0:
                    to_trim = sleep_stages_mapped[-excess_epochs:]
                    if excess_epochs > 1 and any(not (np.isnan(x) or x == 0) for x in to_trim):
                        print(f"[WARNING] {psg_id}: Trimming {excess_epochs} extra sleep_stages: {to_trim}")
                    sleep_stages_mapped = sleep_stages_mapped[:-excess_epochs]
                full_sleep_stages = full_sleep_stages[:duration_samples]
            else:
                # Too short → pad with NaNs if within one epoch
                missing_len = duration_samples - len(full_sleep_stages)
                if missing_len >= (epoch_length_sec * sfreq_global):
                    missing_epochs = int(np.ceil(missing_len / (epoch_length_sec * sfreq_global)))
                    sleep_stages_mapped = np.concatenate([sleep_stages_mapped, np.full(missing_epochs, np.nan)])
                    print(f"[ERROR] {psg_id}: Sleep_stages too short "
                          f"({len(full_sleep_stages)} vs {duration_samples} samples) adding {missing_epochs} NaN stages.")

                # uncomplete epoch has no sleep stage so just np.nan
                full_sleep_stages = np.concatenate([full_sleep_stages, np.full(missing_len, np.nan)])

        # Create DataFrame for sleep stages
        if (len(sleep_stages_mapped) - n_epochs) == 1:
            sleep_stages_mapped = sleep_stages_mapped[:-1] # remove uncomplete 
        try:
            df_stages = pd.DataFrame({
                'onset': np.arange(0, n_epochs * 30, 30),
                'duration': np.full(n_epochs, 30),
                'sleep_stage': sleep_stages_mapped
            })
        except Exception as e:
            print(f"[ERROR] PSG {psg_id}: n_epoch and sleep_stage not same length ({e})")
            df_stages = pd.DataFrame()
        
    # -------- CONCAT SLEEP EVENTS ----------
    df_stages = df_stages.assign(event_type=np.nan)
    df_events = df_events.assign(sleep_stage=np.nan)
    df_events = pd.concat([df_stages, df_events], ignore_index=True).sort_values('onset').reset_index(drop=True)

    return full_sleep_stages, df_events

In [29]:
too_short = [('I0002150004595', 1), ('I0002150006187', 1), ('I0002150007691', 1), ('I0002150022116', 1), ('I0002150022402', 2), ('I0002150028192', 1), ('I0002150029079', 1), ('I0002150032416', 1), ('I0002150032906', 1), ('I0002150032928', 2), ('I0002150032999', 1), ('I0002150033290', 1), ('I0002150033580', 1), ('I0002150033582', 1), ('I0002150033596', 1), ('I0002150033739', 1), ('I0002150033902', 1), ('I0002150034157', 1), ('I0002150034644', 1), ('I0002150035974', 1), ('I0002150036003', 1), ('I0002150036099', 1), ('I0002150036126', 1), ('I0002150047125', 1), ('I0002150059267', 1), ('I0002150657531', 1), ('I0002150699770', 1), ('I0002150768848', 1), ('I0002150970568', 1), ('I0002151118943', 1), ('I0002151663002', 1), ('I0002151691582', 1), ('I0002151709732', 1), ('I0002151723174', 1)]

errors = [
    ("I0002150003642", 3),
    ("I0002150028860", 1),
    ("I0002150031572", 2),
    ("I0002150033055", 2),
    ("I0002150034864", 1),
    ("I0002150035218", 1),
    ("I0002150035725", 1),
    ("I0002151721922", 1),
]


new_errors = [
("I0002150035326", 1),
("I0002150023701", 1),
("I0002150032478", 1),
("I0002150034890", 1),
("I0002150029545", 1),
("I0002150030664", 1),
("I0002150032693", 1),
("I0002150032976", 1),
("I0002150001677", 1),
("I0002150024270", 1),
("I0002150029507", 2),
("I0002150030529", 1),
("I0002150034832", 1),
("I0002150004978", 1),
("I0002150031002", 1),
("I0002151253113", 1),
("I0002150032324", 1),
("I0002150032329", 1),
("I0002150032528", 1),
("I0002150077588", 1),
]
rows = bidmc_mastersheet[
    bidmc_mastersheet.set_index(['sub_id', 'session']).index.isin(errors)
]
rows = rows.to_dict(orient="records")

for row in rows:
    print(row['sub_id'], row['session'])
    full_sleep_stages, df_events = read_annot_BIDMC(row)
    print(df_events.head(10))
    print("----")

I0002150003642 3
   epoch  clock_time                 event_type  duration  onset  \
0      1     86350.0              SpO2 No Pulse       0.0    0.0   
1      1     86350.0  EtCO2 No Respiration Data       0.0    0.0   
2      1     86350.0           SpO2 Check Probe       0.0    0.0   
3      1     86350.0           SpO2 Check Probe       0.0    0.0   
4      1     86350.0                     Cal On       0.0    0.0   

   adjusted_clock_time  time_offset  
0              86350.0      86350.0  
1              86350.0      86350.0  
2              86350.0      86350.0  
3              86350.0      86350.0  
4              86350.0      86350.0  
   onset  duration  sleep_stage                 event_type
0    0.0      30.0          NaN                        NaN
1    0.0       0.0          NaN           spo2 check probe
2    0.0       0.0          NaN           spo2 check probe
3    0.0       0.0          NaN                     cal on
4    0.0       0.0          NaN                    

In [23]:
df_events[:50]

Unnamed: 0,onset,duration,sleep_stage,event_type
0,0.0,30.0,,
1,0.0,1.4,,leg movement
2,0.0,10.0,,leg movement
3,0.0,10.0,,leg movement
4,0.0,1.6,,leg movement
5,0.0,0.8,,leg movement
6,0.0,1.4,,leg movement
7,0.0,1.4,,leg movement
8,0.0,1.5,,leg movement
9,0.0,10.0,,leg movement


In [148]:
bidmc_annot[:20]

Unnamed: 0,Epoch,Stage,BP,Type,Record Time,Time,Length,Description
0,1,L,UNKNOWN,Log,00:00:00,21:44:14,-,SpO2 Check Probe
1,1,L,UNKNOWN,Log,00:00:00,21:44:14,-,EtCO2 No Respiration Data
2,1,L,UNKNOWN,Log,00:00:00,21:44:14,-,SpO2 Connector Off
3,1,L,UNKNOWN,Log,00:00:02,21:44:16,-,Imp On
4,1,L,UNKNOWN,Log,00:00:12,21:44:26,-,Imp Off
5,3,L,UNKNOWN,Log,00:01:06,21:51:40,-,Imp On
6,3,L,UNKNOWN,Log,00:01:14,21:51:47,-,Imp Off
7,3,L,UNKNOWN,Log,00:01:15,21:51:49,-,Cal On
8,4,L,UNKNOWN,Log,00:01:35,21:52:08,-,EtCO2 No Respiration Data
9,4,L,UNKNOWN,Log,00:01:38,21:52:11,-,EtCO2 No Respiration Data


In [149]:
bidmc_sleep

Unnamed: 0,Epoch,SaO2 Max,SaO2 Min,Heart Rate Max,Heart Rate Min,Heart Rate Avg,R-R Max,R-R Min,R-R Avg,PTT Max,PTT Min,PTT Avg,EtCO2 Max,EtCO2 Min,tCO2 Max,tCO2 Min,Body Position,Stage
0,1,0,0,61,49,57,0,0,0,0,0,0,0,0,0,0,upright,L
1,2,0,0,67,61,64,0,0,0,0,0,0,0,0,0,0,upright,L
2,3,5,0,72,71,71,0,0,0,0,0,0,0,0,0,0,upright,L
3,4,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,upright,L
4,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,upright,L
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,989,100,98,83,68,79,0,0,0,0,0,0,0,0,0,0,supine,L
989,990,100,99,67,62,63,0,0,0,0,0,0,0,0,0,0,supine,L
990,991,99,99,63,54,58,0,0,0,0,0,0,0,0,0,0,supine,L
991,992,100,99,63,54,60,0,0,0,0,0,0,0,0,0,0,supine,L


In [None]:
len(full_sleep_stages)

13185000

In [None]:
bidmc_sleep = pd.read_csv("/Users/alicealbrecht/wynton_data/HSP/PSG/bids/BIDMC/sub-I0002150000375/ses-1/eeg/sub-I0002150000375_ses-1_task-psg_sleep_annotations.csv")
bidmc_annot = pd.read_csv("/Users/alicealbrecht/wynton_data/HSP/PSG/bids/BIDMC/sub-I0002150000375/ses-1/eeg/sub-I0002150000375_ses-1_task-psg_events_annotations.csv")

In [5]:
bidmc_mastersheet = pd.read_csv(os.path.join(path_mastersheet, "hsp_bidmc_mastersheet.csv"))
row = bidmc_mastersheet[(bidmc_mastersheet["sub_id"] == "I0002150000029") & (bidmc_mastersheet["session"] == 1)]
row

Unnamed: 0,sub_id,dataset,site,session,start_time,sfreq_global,duration_samples,duration_sec,edf_path,annot_path,h5_path,events_path,extracted_features_path,sleep_stage_path
1,I0002150000029,hsp_bidmc,bidmc,1,2017-11-25 01:04:23+00:00,200.0,4628400,23142.0,/wynton/group/andrews/data/HSP/PSG/bids/BIDMC/...,/wynton/group/andrews/data/HSP/PSG/bids/BIDMC/...,/wynton/group/andrews/data/PSG_Pipeline_Output...,/wynton/group/andrews/data/PSG_Pipeline_Output...,/wynton/group/andrews/data/PSG_Pipeline_Output...,/wynton/group/andrews/data/HSP/PSG/bids/BIDMC/...


In [6]:
if not pd.api.types.is_numeric_dtype(bidmc_annot['Record Time']):
    print("NOT")
    bidmc_annot['Record Time'] = pd.to_timedelta(bidmc_annot['Record Time'],errors='coerce').dt.total_seconds()

bidmc_annot

NOT


Unnamed: 0,Epoch,Stage,BP,Type,Record Time,Time,Length,Description
0,3,L,UNKNOWN,Log,73.0,01:05:36,-,Imp On
1,3,L,UNKNOWN,Log,82.0,01:05:45,-,Imp Off
2,3,L,UNKNOWN,Log,85.0,01:05:48,-,Imp On
3,4,L,UNKNOWN,Log,93.0,01:05:56,-,Imp Off
4,10,L,UNKNOWN,Log,285.0,01:09:08,-,Cal On
...,...,...,...,...,...,...,...,...
322,771,L,RIGHT,Log,23120.0,07:29:43,-,BLINK 10x
323,771,L,RIGHT,Log,23127.0,07:29:50,-,GRIND TEETH
324,772,L,U,Log,23131.0,07:29:54,-,MAKE A SNORING SOUND
325,772,L,U,Log,23135.0,07:29:58,-,MOVE LEFT FOOT


## XML FILES: MESA / SHHS1 / SHHS2
Basic NSRR XML format code already work for that ! 

In [2]:
import os
import pandas as pd
import numpy as np
import xmltodict

# Annot file
path_annot = "/Users/alicealbrecht/wynton_data/mesa/nsrr/polysomnography/annotations-events-profusion/"
mesa_annot = os.path.join(path_annot, "mesa-sleep-0050-profusion.xml")
with open(mesa_annot, encoding='utf-8') as f:
    info_dict = xmltodict.parse(f.read())

# Mastersheet
path_mastersheet = "/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/mastersheets/"
mesa_mastersheet = pd.read_csv(os.path.join(path_mastersheet, "mesa_mastersheet.csv"))
row = mesa_mastersheet[(mesa_mastersheet["sub_id"] == "mesa-sleep-0050") & (mesa_mastersheet["session"] == 1)]
row

Unnamed: 0,sub_id,dataset,site,session,start_time,sfreq_global,duration_samples,duration_sec,edf_path,annot_path,h5_path,events_path,extracted_features_path,sleep_stage_path
16,mesa-sleep-0050,mesa,mesa-sleep,1,1985-01-01 20:59:59+00:00,256.0,9215744,35999.0,/wynton/group/andrews/data/mesa/nsrr/polysomno...,/wynton/group/andrews/data/mesa/nsrr/polysomno...,/wynton/group/andrews/data/PSG_Pipeline_Output...,/wynton/group/andrews/data/PSG_Pipeline_Output...,/wynton/group/andrews/data/PSG_Pipeline_Output...,


In [None]:
raw_stages = info_dict['CMPStudyConfig']['SleepStages']['SleepStage']
stage_dict = {'0':0, '1':1, '2':2, '3':3, '4':3, '5':4}  
sleep_stages_mapped = [stage_dict.get(str(s), np.nan) for s in raw_stages]
print(len(raw_stages))
print(len(sleep_stages_mapped))

1199
1199


In [None]:
from itertools import groupby

sfreq_global = row["sfreq_global"]
duration_samples = row["duration_samples"]
epoch_length_sec = float(info_dict['CMPStudyConfig']['EpochLength'])
n_epochs = int(duration_samples / (epoch_length_sec * sfreq_global))
full_sleep_stages = np.array([s for s, g in groupby(sleep_stages_mapped) for _ in range(int(len(list(g)) * epoch_length_sec * sfreq_global))])
print(len(full_sleep_stages))

9208320


  n_epochs = int(duration_samples / (epoch_length_sec * sfreq_global))
  full_sleep_stages = np.array([s for s, g in groupby(sleep_stages_maped) for _ in range(int(len(list(g)) * epoch_length_sec * sfreq_global))])


If the epoch is not complete (less thna 30s) then no sleep stages so pad wiht nan values. If more than 30s then print error.


!