In [43]:
import os
from mlcroissant import Dataset
import wfdb
from pandas.core.interchange.dataframe_protocol import DataFrame

from utils import read_event_file_as_list, yml_import, get_question_mapping
import logging
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler


# Data Import

In [44]:
# Names of the analysis export files that are read for every sample from
# <base_path>/<sample_id>/PSG/Analysedaten/.
# Alternative selection, currently disabled:
#ANALYSIS_DATA_FILES = ["Kardio Events.txt", "Körperlage.txt"]
ANALYSIS_DATA_FILES = ["Schlafprofil.txt"]

# Names of the YAML files that are read for every sample from
# <base_path>/<sample_id>/YAML/. An empty list means no YAML file is loaded.
# Alternative selection, currently disabled:
#YAML_DATA_FILES = ['allgemeiner_schlaffragebogen_1.yml', 'allgemeiner_schlaffragebogen_1_2.yml']
YAML_DATA_FILES = []

In [45]:
def load_signals(base_path, sample_ids):
    """Read the PSG record of every sample and index its channels by name.

    Each record is read with ``wfdb.rdrecord`` from
    ``<base_path>/<sample_id>/PSG/<sample_id>``.

    Args:
        base_path: Directory that contains one sub-directory per sample.
        sample_ids: Names of the sample sub-directories to load.

    Returns:
        dict mapping each record's ``record_name`` to a dict
        ``{channel name: row of the transposed p_signal}``, with the channels
        inserted in sorted order of their names.
    """
    record_paths = []
    for sample_id in sample_ids:
        record_paths.append(os.path.join(base_path, sample_id, 'PSG', sample_id))

    recordings = {}

    for record_path in tqdm(record_paths, desc='Loading signals'):
        record = wfdb.rdrecord(record_path)

        # Transpose so that row i belongs to record.sig_name[i].
        channel_rows = np.transpose(record.p_signal)

        # Reorder names and rows with the same permutation to keep them paired.
        name_order = np.argsort(record.sig_name)
        ordered_names = np.array(record.sig_name)[name_order]
        ordered_rows = channel_rows[name_order]

        recordings[record.record_name] = dict(zip(ordered_names, ordered_rows))

    return recordings

def load_analysis_and_yaml_files(base_path, sample_ids, analysis_data_files=ANALYSIS_DATA_FILES, yaml_data_files=YAML_DATA_FILES):
    """Load the requested analysis event files and YAML files of every sample.

    Analysis files are read with ``read_event_file_as_list`` from
    ``<base_path>/<sample_id>/PSG/Analysedaten/<file>``; YAML files are read
    with ``yml_import`` from ``<base_path>/<sample_id>/YAML/<file>``.

    Args:
        base_path: Directory that contains one sub-directory per sample.
        sample_ids: Names of the sample sub-directories to load.
        analysis_data_files: File names of the analysis files to read.
        yaml_data_files: File names of the YAML files to read.

    Returns:
        Tuple ``(analysis_data, yaml_data)``. Both are dicts of the form
        ``{sample_id: {file name: content}}``. A file that does not exist is
        reported with a printed warning and stored as ``None``.
    """
    logging.info(f'Loading {len(yaml_data_files)} yaml files and {len(analysis_data_files)} analysis files.')

    analysis_data, yaml_data = {}, {}

    for sample_id in tqdm(sample_ids, desc='Loading analysis and yaml files'):
        sample_analysis = {}
        sample_yaml = {}
        analysis_data[sample_id] = sample_analysis
        yaml_data[sample_id] = sample_yaml

        for file_name in analysis_data_files:
            event_path = os.path.join(base_path, sample_id, 'PSG', 'Analysedaten', file_name)
            try:
                # Only the first of the three returned values is kept.
                events, _, _ = read_event_file_as_list(event_path)
            except FileNotFoundError:
                print(f"Warning: File {file_name} from sample {sample_id} not found. Leaving it empty.")
                events = None
            sample_analysis[file_name] = events

        for file_name in yaml_data_files:
            yaml_path = os.path.join(base_path, sample_id, 'YAML', file_name)
            try:
                content = yml_import(yaml_path)
            except FileNotFoundError:
                print(f"Warning: File {file_name} from sample {sample_id} not found. Leaving it empty.")
                content = None
            sample_yaml[file_name] = content

    return analysis_data, yaml_data



In [46]:
# Resolve the data directory from the Croissant metadata, relative to the
# current working directory.
dataset = Dataset(jsonld="croissant.json")

base_path = dataset.metadata.url
print(f"Base path: {base_path}")
directory = os.getcwd()
base_path = os.path.join(directory, base_path)

# Every sample lives in its own sub-directory of base_path. Plain files and
# hidden folders (e.g. ".ipynb_checkpoints") are skipped because they would be
# handed to wfdb as if they were records. Sorting makes the sample order, and
# therefore the row order of everything derived from it, deterministic.
sample_ids = sorted(
    entry
    for entry in os.listdir(base_path)
    if not entry.startswith('.') and os.path.isdir(os.path.join(base_path, entry))
)
analysis_data_files = ANALYSIS_DATA_FILES
yaml_data_files = YAML_DATA_FILES

signals = load_signals(base_path, sample_ids)
logging.info(f'Loaded {len(signals)} signals.')

analysis_data, yaml_data = load_analysis_and_yaml_files(base_path, sample_ids, analysis_data_files, yaml_data_files)
logging.info(f'Loaded analysis data for {len(analysis_data)} samples.')
logging.info(f'Loaded YAML data for {len(yaml_data)} samples.')

# Optional: Get the question mapping
yaml_data_mapped = get_question_mapping(yaml_data)

  -  [Metadata(Comprehensive Polysomnography (CPS) Dataset: A Resource for Sleep-Related Arousal Research)] Property "https://schema.org/datePublished" is recommended, but does not exist.


Base path: ./data


Loading signals: 100%|██████████| 1/1 [00:06<00:00,  6.09s/it]
Loading analysis and yaml files: 100%|██████████| 1/1 [00:00<00:00, 219.71it/s]


## Get Labels

In [47]:
# Collect the second field of every "Schlafprofil.txt" entry as the label,
# stacking the patients below each other. Assigning the column inside the loop
# would keep only the last patient's labels (or fail on differing lengths).
schlafprofil_labels = []

for patient, patient_labels in analysis_data.items():

    events = patient_labels["Schlafprofil.txt"]
    if events is None:
        # A missing file is stored as None by the loader; fail with a clear message.
        raise ValueError(f"Sample {patient} has no 'Schlafprofil.txt' labels (file was not found while loading).")
    schlafprofil_labels.extend(x[1] for x in events)

# NOTE(review): labels are matched to signal blocks by row position only, so
# with several patients the per-patient label and block counts must be equal
# - confirm before loading more than one sample.
df_label = pd.DataFrame({"Schlafprofil_label": schlafprofil_labels})

df_label.head(5)


Unnamed: 0,Schlafprofil_label
0,A
1,Wach
2,Wach
3,Wach
4,Wach


## Format Data

In [48]:
features = ["C3:A2"]

# Block size: number of consecutive samples that form one row (7680 per row).
# NOTE(review): presumably one scoring epoch of the sleep profile - confirm
# against the sampling rate of the recording.
block_size = 7680

patient_frames = []

for patient, patient_data in signals.items():

    feature_frames = []
    for feature in features:
        samples = patient_data[feature]
        n_rows = len(samples) // block_size
        # Samples after the last complete block are discarded.
        blocks = pd.DataFrame(
            samples[:n_rows * block_size].reshape(n_rows, block_size),
            columns=[f"{feature}_{i + 1}" for i in range(block_size)],
        )

        # Compute every statistic from the raw samples only. Appending the
        # columns one by one to the frame being aggregated would feed earlier
        # statistics (min, max, mean, ...) into the later ones.
        block_min = blocks.min(axis=1)
        block_max = blocks.max(axis=1)
        stats = pd.DataFrame({
            f'{feature}_Min_Wert': block_min,
            f'{feature}_Max_Wert': block_max,
            f'{feature}_Mean_Wert': blocks.mean(axis=1),
            f'{feature}_Median': blocks.median(axis=1),
            f'{feature}_Std_Wert': blocks.std(axis=1),
            f'{feature}_Amplitude': block_max - block_min,
        })

        feature_frames.append(pd.concat([blocks, stats], axis=1))

    if feature_frames:
        # Features of one patient sit side by side (same rows, more columns).
        patient_frames.append(pd.concat(feature_frames, axis=1))

# Patients are stacked below each other (more rows, same columns).
if patient_frames:
    df_main = pd.concat(patient_frames, axis=0, ignore_index=True)
else:
    df_main = pd.DataFrame()

print("Anzahl an Datensätzen: " + str(len(df_main)))
# NOTE(review): labels are attached by row position. The label count can differ
# from the block count; surplus rows end up with NaN and are removed by the
# later dropna. Whether label i really belongs to block i should be verified.
df_main = pd.concat([df_main, df_label],axis=1)
print("Anzahl an Labels: " + str(len(df_label)))

Anzahl an Datensätzen: 969
Anzahl an Labels: 971


In [49]:
# Keep only rows without missing values, i.e. rows that have both signal
# features and a label.
df_main = df_main.dropna()
print(f"Anzahl an Datenstätzen nach Reinigung: {len(df_main)}")

# Preview: first 5 and last 10 columns of the final two rows.
preview_columns = [*range(5), *range(-10, 0)]
df_main.iloc[:, preview_columns].tail(2)


Anzahl an Datenstätzen nach Reinigung: 969


Unnamed: 0,C3:A2_1,C3:A2_2,C3:A2_3,C3:A2_4,C3:A2_5,C3:A2_7678,C3:A2_7679,C3:A2_7680,C3:A2_Min_Wert,C3:A2_Max_Wert,C3:A2_Mean_Wert,C3:A2_Median,C3:A2_Std_Wert,C3:A2_Amplitude,Schlafprofil_label
967,-4.7e-05,-3.3e-05,-1.9e-05,-9e-06,-5e-06,-2.5e-05,-2.3e-05,-1.9e-05,-0.000148,0.00049,-3e-06,-8e-06,4.7e-05,0.000638,Wach
968,-1.2e-05,-5e-06,2e-06,5e-06,5e-06,-5e-06,-4e-06,-6e-06,-8.6e-05,0.000102,-4e-06,-5e-06,1.7e-05,0.000189,Wach


# Data Understanding

# Base Model

## Train Test Split

In [50]:
# The label column is the target; every other column is a feature.
y = df_main['Schlafprofil_label']
X = df_main.drop('Schlafprofil_label', axis=1)

# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible.
data_train, data_test, label_train, label_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Building Model

In [51]:
# Fix the seed: a decision tree permutes the features randomly at each split,
# so without random_state the fitted tree and the reported accuracy can change
# between runs. 42 matches the seed used for the train/test split.
clf = DecisionTreeClassifier(random_state=42)

clf.fit(data_train, label_train)


## Evaluate Model

In [52]:
# Predicted labels for the held-out rows.
data_pred = clf.predict(data_test)

# Evaluate the model: fraction of held-out rows whose label was predicted correctly.
accuracy = accuracy_score(y_true=label_test, y_pred=data_pred)
print(f"Genauigkeit des Modells: {accuracy:.2f}")

Genauigkeit des Modells: 0.46
