In [20]:
import os
from mlcroissant import Dataset
import wfdb
import numpy as np
from utils import read_event_file_as_list, yml_import, get_question_mapping
import logging
from tqdm import tqdm
import argparse

# Data Import

In [21]:
# Analysis files looked up per sample under <sample>/PSG/Analysedaten/.
# Use an empty list to load no analysis files.
ANALYSIS_DATA_FILES = [
    "Kardio Events.txt",
    "Körperlage.txt",
]

# YAML files looked up per sample under <sample>/YAML/.
# Use an empty list to load no YAML files.
YAML_DATA_FILES = [
    "allgemeiner_schlaffragebogen_1.yml",
    "allgemeiner_schlaffragebogen_1_2.yml",
]

In [22]:
def load_signals(base_path, sample_ids):
    """Read the PSG WFDB record of every sample and index its channels by name.

    Parameters
    ----------
    base_path : str
        Directory containing one sub-directory per sample.
    sample_ids : iterable of str
        Sample identifiers; the record of sample ``s`` is read from
        ``<base_path>/<s>/PSG/<s>``.

    Returns
    -------
    dict
        Maps each record's ``record_name`` to a dict of
        ``{signal name: row of the transposed p_signal array}``, with the
        signal names inserted in sorted order.
    """
    # Build the full list up front so the progress bar knows its total.
    record_paths = []
    for sid in sample_ids:
        record_paths.append(os.path.join(base_path, sid, 'PSG', sid))

    signals_by_record = {}
    for record_path in tqdm(record_paths, desc='Loading signals'):
        record = wfdb.rdrecord(record_path)

        # After transposing, row i belongs to record.sig_name[i].
        per_channel = np.transpose(record.p_signal)

        # Reorder names and rows with the same permutation so they stay paired.
        order = np.argsort(record.sig_name)
        names_sorted = np.array(record.sig_name)[order]
        rows_sorted = per_channel[order]

        signals_by_record[record.record_name] = dict(zip(names_sorted, rows_sorted))

    return signals_by_record

def _load_or_none(loader, path, file_name, sample_id):
    """Return ``loader(path)``; log a warning and return ``None`` if the file is missing."""
    try:
        return loader(path)
    except FileNotFoundError:
        # Routed through logging (not print) so it follows the same handlers
        # and levels as the other messages emitted by the loaders.
        logging.warning("File %s from sample %s not found. Leaving it empty.", file_name, sample_id)
        return None


def load_analysis_and_yaml_files(base_path, sample_ids, analysis_data_files=ANALYSIS_DATA_FILES, yaml_data_files=YAML_DATA_FILES):
    """Load the per-sample analysis files and YAML files.

    Parameters
    ----------
    base_path : str
        Directory containing one sub-directory per sample.
    sample_ids : iterable of str
        Sample identifiers (sub-directory names below ``base_path``).
    analysis_data_files : list of str
        File names read from ``<base_path>/<sample>/PSG/Analysedaten/``.
    yaml_data_files : list of str
        File names read from ``<base_path>/<sample>/YAML/``.

    Returns
    -------
    tuple of (dict, dict)
        ``(analysis_data, yaml_data)``. Both map ``sample_id`` to a dict of
        ``{file name: loaded content}``. A file that does not exist is
        stored as ``None`` and reported with a logged warning; any other
        error propagates to the caller.
    """

    def read_events(path):
        # read_event_file_as_list returns three values; only the first is kept.
        file_as_list, _, _ = read_event_file_as_list(path)
        return file_as_list

    analysis_data = {}
    yaml_data = {}
    logging.info(f'Loading {len(yaml_data_files)} yaml files and {len(analysis_data_files)} analysis files.')

    for sample_id in tqdm(sample_ids, desc='Loading analysis and yaml files'):
        analysis_dir = os.path.join(base_path, sample_id, 'PSG', 'Analysedaten')
        yaml_dir = os.path.join(base_path, sample_id, 'YAML')

        analysis_data[sample_id] = {
            file_name: _load_or_none(read_events, os.path.join(analysis_dir, file_name), file_name, sample_id)
            for file_name in analysis_data_files
        }
        yaml_data[sample_id] = {
            file_name: _load_or_none(yml_import, os.path.join(yaml_dir, file_name), file_name, sample_id)
            for file_name in yaml_data_files
        }

    return analysis_data, yaml_data



In [24]:
dataset = Dataset(jsonld="croissant.json")

# The URL stored in the Croissant metadata is joined onto the current
# working directory to form the dataset root.
base_path = dataset.metadata.url
directory = os.getcwd()
base_path = os.path.join(directory, base_path)

# Every sub-directory of the dataset root is one sample. Plain files in the
# root are skipped so they are not passed on as sample ids, and the ids are
# sorted because os.listdir returns entries in arbitrary order.
sample_ids = sorted(
    entry for entry in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, entry))
)
analysis_data_files = ANALYSIS_DATA_FILES
yaml_data_files = YAML_DATA_FILES

signals = load_signals(base_path, sample_ids)
logging.info(f'Loaded {len(signals)} signals.')

analysis_data, yaml_data = load_analysis_and_yaml_files(base_path, sample_ids, analysis_data_files, yaml_data_files)
logging.info(f'Loaded analysis data for {len(analysis_data)} samples.')
logging.info(f'Loaded YAML data for {len(yaml_data)} samples.')

# Optional: Get the question mapping
yaml_data_mapped = get_question_mapping(yaml_data)

  -  [Metadata(Comprehensive Polysomnography (CPS) Dataset: A Resource for Sleep-Related Arousal Research)] Property "https://schema.org/datePublished" is recommended, but does not exist.
Loading signals: 100%|██████████| 1/1 [00:05<00:00,  5.66s/it]
INFO:root:Loaded 1 signals.
INFO:root:Loading 2 yaml files and 2 analysis files.
Loading analysis and yaml files: 100%|██████████| 1/1 [00:00<00:00, 34.48it/s]
INFO:root:Loaded analysis data for 1 samples.
INFO:root:Loaded YAML data for 1 samples.


# Data Understanding