In [4]:
import os
import pandas as pd
import numpy as np
import xmltodict

In [5]:
selected_subjects = ['po7185']

In [6]:
mastersheet = pd.read_csv("/Users/alicealbrecht/wynton_PSG_Pipeline_Outputs/mastersheets/mros_ses-1_mastersheet.csv")

In [7]:
mastersheet = mastersheet[mastersheet["sub_id"].isin(selected_subjects)]
mastersheet["annot_path"] = mastersheet["annot_path"].str.replace(
    "/wynton/group/andrews/data/MrOS/",
    "/Users/alicealbrecht/wynton_data/",
    regex=False  # important so slashes aren't treated as regex
)
rows = mastersheet.sort_values("sub_id").to_dict(orient="records")

In [14]:
def read_annot(row, dataset_name):
    sub_id = row["sub_id"]
    annot_path = row["annot_path"]
    duration_sec = row["duration_sec"]
    # Open and read xml 
    with open(annot_path, encoding='utf-8') as f:
            info_dict = xmltodict.parse(f.read())

    # Read and map full sleep stage (no only night) to match AASM guidelines  
    raw_stages = info_dict['CMPStudyConfig']['SleepStages']['SleepStage']
    print("raw stages:", raw_stages)
    stage_dict = {'0':0, '1':1, '2':2, '3':3, '4':3, '5':4}  
    full_sleep_stages = [stage_dict.get(str(s), np.nan) for s in raw_stages]

    # Create a df_stages
    epoch_length_sec = float(info_dict['CMPStudyConfig']['EpochLength'])
    n_epochs = int(duration_sec / epoch_length_sec)

    # Warnings and adjustments for length mismatches
    if len(full_sleep_stages) != n_epochs:
        print(f"[WARNING] Length mismatch: len(full_sleep_stages) is {len(full_sleep_stages)} != n_epochs is {n_epochs}")
        if len(full_sleep_stages) > n_epochs:
            print(f"[WARNING] Trimming extra sleep_stages: {full_sleep_stages[n_epochs:]}")
            full_sleep_stages = full_sleep_stages[:n_epochs]
        else:
            raise ValueError(f"sleep_stages is too short ({len(full_sleep_stages)}) for {n_epochs} epochs.")
    
    # Create DataFrame for sleep stages
    df_stages = pd.DataFrame({
        'onset': np.arange(0, n_epochs * 30, 30),
        'duration': np.full(n_epochs, 30),
        'sleep_stage': full_sleep_stages 
    })

    # Read events and add sleep stages
    events = info_dict['CMPStudyConfig']['ScoredEvents']
    if events is None:
        print(f"[WARNING] Sub {sub_id}: No events (other than sleep stages) in annotation file.")
        df_events = pd.DataFrame(data={
            'onset': [], 'duration': [], 'event_type': [],
            'channel': [], 'lowest_spo2': [], 'desaturation': []
        })
    else:
        events = events['ScoredEvent']
        if not isinstance(events, list):
            events = [events]
        df_events = pd.DataFrame(events)

        # Convert column names to lowercase for consistent processing
        df_events.columns = [col.lower() for col in df_events.columns]

        # Rename columns to match BIDS-style event.csv
        df_events = df_events.rename(columns={
            'start': 'onset',
            'name': 'event_type',
            'input': 'channel',
            'lowestspo2': 'lowest_spo2'
        })

        # Ensure correct types
        df_events['onset'] = df_events['onset'].astype(float)
        df_events['duration'] = df_events['duration'].astype(float)
        df_events['event_type'] = df_events['event_type'].astype(str)
        df_events['event_type'] = df_events['event_type'].str.lower().str.replace(' ', '_')
        if 'channel' in df_events.columns:
            df_events['channel'] = df_events['channel'].astype(str)
            df_events['channel'] = df_events['channel'].str.lower().str.replace(' ', '_') 

        # Specific to MrOS
        if dataset_name.lower() == "mros":
            df_events.loc[df_events.event_type=='hypopnea','event_type'] = 'hypopnea_(airflow_reduction30-50%)'
            df_events.loc[df_events.event_type=='unsure','event_type'] = 'hypopnea_(airflow_reduction>50%)'

        # Sort and reorder columns
        df_events = df_events.sort_values('onset', ignore_index=True, ascending=True)
        desired_order = ['onset', 'duration', 'event_type', 'channel', 'lowest_spo2', 'desaturation']
        existing_cols = [c for c in desired_order if c in df_events.columns]
        df_events = df_events[existing_cols]
    
    # Align columns in both DataFrames
    df_stages = df_stages.assign(event_type=np.nan, channel=np.nan, lowest_spo2=np.nan, desaturation=np.nan)
    df_events = df_events.assign(sleep_stage=np.nan)

    # Concatenate and sort by onset
    df_events = pd.concat([df_stages, df_events], ignore_index=True).sort_values('onset').reset_index(drop=True)

    return full_sleep_stages, df_events, epoch_length_sec

dataset_name = "mros"

In [15]:
dataset_name = "mros"
row = rows[0]
full_sleep_stages, df_events, epoch_length_sec = read_annot(row, dataset_name)


raw stages: ['9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9

In [13]:
full_sleep_stages

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan

In [9]:
dataset_name = "mros"
df_events_all = []
for row in rows: 
    full_sleep_stages, df_events, epoch_length_sec = read_annot(row, dataset_name)
    df_events_all.append(df_events)

In [10]:
df_events_all[0] 

Unnamed: 0,onset,duration,sleep_stage,event_type,channel,lowest_spo2,desaturation
0,0.0,30.0,,,,,
1,0.0,10.0,,spo2_artifact,sao2,,
2,30.0,30.0,,,,,
3,40.1,2.6,,limb_movement_(right),leg_r,,
4,60.0,30.0,,,,,
...,...,...,...,...,...,...,...
2866,35842.6,22.3,,hypopnea_(airflow_reduction>50%),abdominal,,
2867,35850.0,30.0,,,,,
2868,35862.0,31.0,,spo2_desaturation,sao2,94,4
2869,35880.0,30.0,,,,,
