In [14]:
import os
import pandas as pd
import h5py

# (subject ID, session number) pairs whose channel listings should be inspected.
psg_ids = [
    ("S0001114191434", 1),
    ("S0001116639282", 1),
    ("S0001121913093", 1),
    ("S0001122312754", 1),
]

# Root directory; each recording is looked up under sub-<id>/ses-<n>/eeg below it.
base_path = "/wynton/group/andrews/data/HSP/PSG/bids/MGB/"

# Lift the pandas display limits so printed channel lists are never truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Print the channel names of every requested subject/session.
for sub_id, session in psg_ids:
    eeg_path = os.path.join(base_path, f"sub-{sub_id}", f"ses-{session}", "eeg")

    # isdir (rather than exists) also rejects a stray file at that location,
    # which would otherwise make os.listdir raise.
    if not os.path.isdir(eeg_path):
        print(f"[WARNING] Path not found: {eeg_path}")
        continue

    # Look for TSV files (usually *_channels.tsv in BIDS EEG).
    # os.listdir returns entries in arbitrary order, so sort them to make
    # "the first TSV" the same file on every run.
    tsv_files = sorted(f for f in os.listdir(eeg_path) if f.endswith("channels.tsv"))
    if not tsv_files:
        print(f"[WARNING] No channels TSV found in {eeg_path}")
        continue

    # Open the first TSV found (alphabetically).
    tsv_file_path = os.path.join(eeg_path, tsv_files[0])
    df_channels = pd.read_csv(tsv_file_path, sep='\t')

    # Warn and move on, like the other failure paths, instead of raising KeyError.
    if 'name' not in df_channels.columns:
        print(f"[WARNING] No 'name' column in {tsv_file_path}")
        continue

    print(f"\nSubject {sub_id}, session {session}: {tsv_file_path}")
    print(df_channels['name'])  # Only the channel-name column is printed


Subject S0001114191434, session 1: /wynton/group/andrews/data/HSP/PSG/bids/MGB/sub-S0001114191434/ses-1/eeg/sub-S0001114191434_ses-1_task-psg_channels.tsv
0           F3-M2
1           F4-M1
2           C3-M2
3           C4-M1
4           O1-M2
5           O2-M1
6           E1-M2
7           E2-M1
8     CHIN1-CHIN2
9             LAT
10            RAT
11          SNORE
12            POS
13           CPAP
14        AIRFLOW
15            DC5
16            DC8
17          CHEST
18            ABD
19           SaO2
20             IC
21             HR
22            EKG
Name: name, dtype: object

Subject S0001116639282, session 1: /wynton/group/andrews/data/HSP/PSG/bids/MGB/sub-S0001116639282/ses-1/eeg/sub-S0001116639282_ses-1_task-psg_channels.tsv
0          E1-REF
1          E2-REF
2         Fp1-REF
3         Fp2-REF
4          F7-REF
5          F8-REF
6          F3-REF
7          F4-REF
8          T3-REF
9          T4-REF
10         C3-REF
11         C4-REF
12         T5-REF
13         T6-

In [15]:
# Log file to scan; the commented line is an alternative log kept for convenience.
path_log = "/wynton/home/leng/alice-albrecht/projects/PSG_Pipeline/log/h5_shhs1.o1467406"
# path_log = "/wynton/home/leng/alice-albrecht/projects/PSG_Pipeline/log/burdens_shhs2.o1467690"
selected_subjects = []

with open(path_log, "r") as f:
    lines = f.readlines()

# Walk over adjacent line pairs; zip stops one short of the end, so there is
# no risk of indexing past the last line.
for line, next_line in zip(lines, lines[1:]):
    if "No RESP" not in line or "No RESP" not in next_line:
        continue

    # The recording id is whatever follows the last 'sub-' on the line.
    psg_id = line.split('sub-')[-1].strip()

    # Both warnings must refer to the same recording. Without this check a
    # recording with a single "No RESP" warning would be selected merely
    # because the next log line is another recording's warning.
    if next_line.split('sub-')[-1].strip() != psg_id:
        continue

    sub_id = psg_id.split('_')[0]
    session = psg_id.split('ses-')[-1]
    selected_subjects.append((sub_id, int(session)))
    print(line)
    print(next_line)

# Remove duplicates; sorting keeps the resulting order stable between runs
# (a bare set has no guaranteed iteration order).
selected_subjects = sorted(set(selected_subjects))

print(selected_subjects)
print(len(selected_subjects), "subjects/sessions with missing features.")

    No RESP_THERM channels found for subject sub-shhs1-200087_ses-1

    No RESP_NASAL_PRESSURE channels found for subject sub-shhs1-200087_ses-1

    No RESP_THERM channels found for subject sub-shhs1-202970_ses-1

    No RESP_NASAL_PRESSURE channels found for subject sub-shhs1-202970_ses-1

    No RESP_THERM channels found for subject sub-shhs1-203053_ses-1

    No RESP_NASAL_PRESSURE channels found for subject sub-shhs1-203053_ses-1

    No RESP_THERM channels found for subject sub-shhs1-203128_ses-1

    No RESP_NASAL_PRESSURE channels found for subject sub-shhs1-203128_ses-1

    No RESP_THERM channels found for subject sub-shhs1-203247_ses-1

    No RESP_NASAL_PRESSURE channels found for subject sub-shhs1-203247_ses-1

    No RESP_THERM channels found for subject sub-shhs1-203272_ses-1

    No RESP_NASAL_PRESSURE channels found for subject sub-shhs1-203272_ses-1

    No RESP_THERM channels found for subject sub-shhs1-203552_ses-1

    No RESP_NASAL_PRESSURE channels found for sub

In [9]:
# Log file to scan; the commented line is an alternative log kept for convenience.
path_log = "/wynton/home/leng/alice-albrecht/projects/PSG_Pipeline/log/burdens_shhs1.o1467689"
# path_log = "/wynton/home/leng/alice-albrecht/projects/PSG_Pipeline/log/burdens_shhs2.o1467690"
selected_subjects = []
with open(path_log, "r") as log_file:
    for log_line in log_file:
        # Failed subprocess: the id is the text between the last 'sub-' and
        # the first '.mat' on the line.
        if "subprocess.CalledProcessError:" in log_line:
            before_mat = log_line.partition('.mat')[0]
            psg_id = before_mat.rpartition('sub-')[2]
            sub_id = psg_id.partition('_')[0]
            session = psg_id.rpartition('ses-')[2]
            selected_subjects.append((sub_id, int(session)))
        # Missing 'vb' feature: the subject is taken from the text after
        # 'Sub ' and before the first ':'; the session is hard-coded to 1.
        if "Feature 'vb' was not computed" in log_line:
            before_colon = log_line.partition(':')[0]
            sub_id = before_colon.rpartition('Sub ')[2].strip()
            session = 1
            selected_subjects.append((sub_id, session))

# Number of collected entries (duplicates are not removed here).
print(len(selected_subjects))

2834


In [13]:
import xmltodict
from collections import Counter

# XML annotation file to inspect.
annot_path = "/wynton/group/andrews/data/MrOS/mros-sof_mjhe/vs/EDF/bi/bi0002.edf.XML"
with open(annot_path, encoding='utf-8') as f:
    # force_list keeps these two elements as lists even when the file holds a
    # single entry. Without it xmltodict returns a bare dict in that case and
    # the comprehension below would iterate over its keys instead of events.
    info_dict = xmltodict.parse(f.read(), force_list=('ScoredEvent', 'SleepStage'))

events = info_dict['CMPStudyConfig']['ScoredEvents']['ScoredEvent']
sleep_stages = info_dict['CMPStudyConfig']['SleepStages']['SleepStage']

# Tally how often each event name occurs; events without a 'Name' are skipped.
names = [event['Name'] for event in events if 'Name' in event]
Counter(names)

Counter({'SpO2 desaturation': 434,
         'PLM (Left)': 332,
         'PLM (Right)': 266,
         'Limb Movement (Left)': 187,
         'Limb Movement (Right)': 161,
         'Arousal (ASDA)': 109,
         'Hypopnea': 53,
         'SpO2 artifact': 47,
         'Central Apnea': 3})

In [8]:
import os
import pandas as pd 
# Show every row so the StudyType tally at the end of the cell is not truncated.
pd.set_option('display.max_rows', None)

df = pd.read_csv("/Users/alicealbrecht/Desktop/S0001_psg_metadata_2025-09-05.csv")
print(df.shape)

# Rename the key columns, then prefix the subject id with 'S0001' and make
# the session an integer.
df = df.rename(columns={"BDSPPatientID": "sub_id", "SessionID": "session"})
df = df.assign(
    sub_id='S0001' + df['sub_id'].astype(str),
    session=df['session'].astype(int),
)

# Keep studies whose type mentions "diagnostic" (or the "dignostic" spelling),
# case-insensitively, and drop any that mention "oxygen". Missing study types
# never match either pattern (na=False).
study_type_lower = df['StudyType'].str.lower()
is_diagnostic = study_type_lower.str.contains("diagnostic|dignostic", na=False)
mentions_oxygen = study_type_lower.str.contains("oxygen", na=False)
df = df.loc[is_diagnostic & ~mentions_oxygen]

print(df.shape)
df['StudyType'].value_counts()

(26328, 14)
(9468, 14)


StudyType
PSG Diagnostic                   8297
PSG Diagnostic                    870
Diagnostic                        263
Diagnostic PSG                     22
Diagnostic                          4
PSG DIAGNOSTIC                      2
Diagnostic Psg                      1
Diagnostic Oral Appliance           1
Expanded EEG, Diagnostic            1
Diagnostic Extended EEG             1
Diagnostic with OA                  1
Diagnostic EtCO2                    1
Diagnostic PSG with arm leads       1
PSG diagnostic                      1
Diagnostic, < 6 hrs                 1
Dignostic                           1
Name: count, dtype: int64

In [11]:
master = pd.read_csv("/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/mastersheets/hsp_mgb_mastersheet_before_diagnostic.csv")
print(master.shape)

# Give the merge keys the same dtypes on both sides (string id, integer session).
master = master.astype({'sub_id': str, 'session': int})

# Attach StudyType from the metadata frame; the left join keeps every
# mastersheet row, leaving StudyType missing where no metadata row matches.
study_types = df[['sub_id', 'session', 'StudyType']]
master = pd.merge(master, study_types, how='left', on=['sub_id', 'session'])

# Keep studies whose type mentions "diagnostic" (or the "dignostic" spelling),
# case-insensitively, and drop any that mention "oxygen". Rows with a missing
# StudyType never match (na=False) and are therefore removed.
study_type_lower = master['StudyType'].str.lower()
is_diagnostic = study_type_lower.str.contains("diagnostic|dignostic", na=False)
mentions_oxygen = study_type_lower.str.contains("oxygen", na=False)
master = master.loc[is_diagnostic & ~mentions_oxygen]

print(master.shape)
master['StudyType'].value_counts()

(26310, 14)
(9464, 15)


StudyType
PSG Diagnostic                   8293
PSG Diagnostic                    870
Diagnostic                        263
Diagnostic PSG                     22
Diagnostic                          4
PSG DIAGNOSTIC                      2
Diagnostic Psg                      1
Diagnostic EtCO2                    1
Dignostic                           1
Expanded EEG, Diagnostic            1
Diagnostic Extended EEG             1
Diagnostic with OA                  1
Diagnostic Oral Appliance           1
PSG diagnostic                      1
Diagnostic PSG with arm leads       1
Diagnostic, < 6 hrs                 1
Name: count, dtype: int64

In [15]:
# Drop the StudyType column. errors='ignore' keeps the cell re-runnable:
# a second execution is a no-op instead of raising KeyError on the
# already-removed column.
master = master.drop(columns=['StudyType'], errors='ignore')
master.shape

(9464, 14)

In [16]:
# Destination of the mastersheet CSV.
output_path = "/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/mastersheets/hsp_mgb_mastersheet.csv"
# Write the current `master` frame without the row index.
master.to_csv(path_or_buf=output_path, index=False)

In [None]:
# Keep only the rows that have an annotation path.
master = master.dropna(subset=['annot_path'])
print(master.shape)
# Look at all possible annotation file names found: the part of the path that
# follows 'ses-<digits>_' (NaN where the pattern does not match).
possible_annot = master['annot_path'].str.extract(r'ses-\d+_(.*)')[0]
possible_annot.unique()

(9326, 14)


array(['task-psg_annotations.csv', 'task_Xltek.csv', nan], dtype=object)

In [2]:
import pandas as pd
# Load the mastersheet CSV.
master = pd.read_csv("/Users/alicealbrecht/wynton_data/PSG_Pipeline_Outputs/mastersheets/hsp_mgb_mastersheet.csv")
# Annotation file name variant: the part of annot_path after 'ses-<digits>_'
# (NaN where the path is missing or does not match).
annot_suffix = master['annot_path'].str.extract(r'ses-\d+_(.*)')[0]
master['annot_options'] = annot_suffix
# Count how many rows use each variant.
master['annot_options'].value_counts()

annot_options
task-psg_annotations.csv    9314
task_Xltek.csv                11
Name: count, dtype: int64

In [3]:
# Display the (rows, columns) of `master` as it currently stands in the kernel.
master.shape

(9464, 15)