In [25]:
import os
from mlcroissant import Dataset
import wfdb

from utils import read_event_file_as_list, yml_import, get_question_mapping
import logging
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler

from sklearn import metrics
from sktime.classification.feature_based import Catch22Classifier
from sktime.datasets import load_basic_motions, load_italy_power_demand
from sktime.transformations.panel.catch22 import Catch22


# Data Import

In [26]:
# Analysis files that load_analysis_and_yaml_files reads for every sample
# from <base_path>/<sample_id>/PSG/Analysedaten/.
#ANALYSIS_DATA_FILES = ["Kardio Events.txt", "Körperlage.txt"]
ANALYSIS_DATA_FILES = ["Schlafprofil.txt"]

# YAML files that load_analysis_and_yaml_files reads for every sample
# from <base_path>/<sample_id>/YAML/. Currently none are loaded.
#YAML_DATA_FILES = ['allgemeiner_schlaffragebogen_1.yml', 'allgemeiner_schlaffragebogen_1_2.yml']
YAML_DATA_FILES = []

In [27]:
def load_signals(base_path, sample_ids):
    """Read one WFDB record per sample and index its channels by name.

    The record for a sample is expected at
    ``<base_path>/<sample_id>/PSG/<sample_id>``.

    Args:
        base_path: Directory that contains one sub-directory per sample.
        sample_ids: Iterable of sample directory names.

    Returns:
        dict: ``{record_name: {channel_name: samples}}`` where the channels of
        each record are ordered alphabetically by name and ``samples`` is the
        corresponding row of the transposed ``p_signal`` matrix.
    """
    record_paths = []
    for sample_id in sample_ids:
        record_paths.append(os.path.join(base_path, sample_id, 'PSG', sample_id))

    channels_by_record = {}
    for record_path in tqdm(record_paths, desc='Loading signals'):
        record = wfdb.rdrecord(record_path)

        # Alphabetical channel order, applied to names and data alike.
        order = np.argsort(record.sig_name)
        channel_names = np.array(record.sig_name)[order]
        # p_signal is transposed so that each row holds one channel.
        channel_rows = np.transpose(record.p_signal)[order]

        channels_by_record[record.record_name] = dict(zip(channel_names, channel_rows))

    return channels_by_record

def load_analysis_and_yaml_files(base_path, sample_ids, analysis_data_files=ANALYSIS_DATA_FILES, yaml_data_files=YAML_DATA_FILES):
    """Load the requested analysis and YAML files for every sample.

    Analysis files are read from ``<base_path>/<sample_id>/PSG/Analysedaten/``
    with ``read_event_file_as_list`` (only the first of its three return
    values is kept). YAML files are read from ``<base_path>/<sample_id>/YAML/``
    with ``yml_import``. A file that does not exist is reported with a printed
    warning and stored as ``None``.

    Args:
        base_path: Directory that contains one sub-directory per sample.
        sample_ids: Iterable of sample directory names.
        analysis_data_files: File names to read from the analysis directory.
        yaml_data_files: File names to read from the YAML directory.

    Returns:
        tuple: ``(analysis_data, yaml_data)``, each a dict of the form
        ``{sample_id: {file_name: content_or_None}}``.
    """

    def _first_of_event_file(path):
        # read_event_file_as_list returns a triple; only the event list is used.
        events, _, _ = read_event_file_as_list(path)
        return events

    def _read_or_none(reader, path, file_name, sample_id):
        # A missing file is tolerated: warn and leave the slot empty.
        try:
            return reader(path)
        except FileNotFoundError:
            print(f"Warning: File {file_name} from sample {sample_id} not found. Leaving it empty.")
            return None

    logging.info(f'Loading {len(yaml_data_files)} yaml files and {len(analysis_data_files)} analysis files.')

    analysis_data, yaml_data = {}, {}
    for sample_id in tqdm(sample_ids, desc='Loading analysis and yaml files'):
        # Analysis files first, then YAML files, so warnings keep their order.
        per_sample_analysis = {}
        for file_name in analysis_data_files:
            file_path = os.path.join(base_path, sample_id, 'PSG', 'Analysedaten', file_name)
            per_sample_analysis[file_name] = _read_or_none(_first_of_event_file, file_path, file_name, sample_id)
        analysis_data[sample_id] = per_sample_analysis

        per_sample_yaml = {}
        for file_name in yaml_data_files:
            file_path = os.path.join(base_path, sample_id, 'YAML', file_name)
            per_sample_yaml[file_name] = _read_or_none(yml_import, file_path, file_name, sample_id)
        yaml_data[sample_id] = per_sample_yaml

    return analysis_data, yaml_data



def get_labels(analysis=None, label_file="Schlafprofil.txt"):
    """Collect the labels of all patients into one flat list.

    For every patient the entries stored under ``label_file`` are walked in
    order and the second element of each entry is taken as the label. Patients
    are visited in the iteration order of the mapping.

    Patients whose label file is missing (key absent or value ``None``, which
    is what ``load_analysis_and_yaml_files`` stores for a file it could not
    find) are skipped with a logged warning instead of raising ``TypeError``.
    Note that a skipped patient contributes no labels, so callers that pair
    labels with signal blocks by position must make sure the same patient is
    excluded on the signal side as well.

    Args:
        analysis: Mapping ``{patient: {file_name: entries}}``. When omitted,
            the notebook-level ``analysis_data`` variable is used.
        label_file: Name of the file whose entries carry the labels.

    Returns:
        list: The labels of all patients, concatenated.
    """
    if analysis is None:
        # Backward compatible default: fall back to the notebook global.
        analysis = analysis_data

    labels = []
    for patient, patient_files in analysis.items():
        entries = patient_files.get(label_file)
        if entries is None:
            logging.warning("No %s available for patient %s; skipping its labels.", label_file, patient)
            continue
        # extend() avoids rebuilding the whole list for every patient.
        labels.extend(entry[1] for entry in entries)

    return labels



In [28]:
dataset = Dataset(jsonld="croissant.json")

# The dataset URL from the Croissant metadata is resolved against the current
# working directory to obtain the local data folder.
base_path = dataset.metadata.url
print(f"Base path: {base_path}")
directory = os.getcwd()
base_path = os.path.join(directory, base_path)

# os.listdir returns entries in arbitrary order and also lists plain files
# (e.g. hidden OS files). Keep only directories and sort them so that the row
# order of every downstream frame - and therefore the seeded train/test split -
# is reproducible across machines and runs.
sample_ids = sorted(
    entry for entry in os.listdir(base_path)
    if os.path.isdir(os.path.join(base_path, entry))
)
analysis_data_files = ANALYSIS_DATA_FILES
yaml_data_files = YAML_DATA_FILES

# NOTE(review): no logging configuration is visible in this notebook, so the
# logging.info messages below may not be shown with the default log level.
signals = load_signals(base_path, sample_ids)
logging.info(f'Loaded {len(signals)} signals.')

analysis_data, yaml_data = load_analysis_and_yaml_files(base_path, sample_ids, analysis_data_files, yaml_data_files)
logging.info(f'Loaded analysis data for {len(analysis_data)} samples.')
logging.info(f'Loaded YAML data for {len(yaml_data)} samples.')

# Optional: Get the question mapping
yaml_data_mapped = get_question_mapping(yaml_data)

  -  [Metadata(Comprehensive Polysomnography (CPS) Dataset: A Resource for Sleep-Related Arousal Research)] Property "https://schema.org/datePublished" is recommended, but does not exist.


Base path: ./data


Loading signals: 100%|██████████| 1/1 [00:08<00:00,  8.32s/it]
Loading analysis and yaml files: 100%|██████████| 1/1 [00:00<00:00, 63.84it/s]


# Data Understanding

# Baseline Model

## Get Labels

In [29]:
# Build a one-column frame holding the labels of all patients.
df_label = pd.DataFrame()

df_label["Schlafprofil_label"] = get_labels()
# Only the last expression of a cell is rendered, so this head() call is
# evaluated but not displayed; the cell output is the shape below.
df_label.head(5)
df_label.shape
# Open question: would the row counts line up if the "A" intervals were removed from the labels?

(883, 1)

## Data Formatting

In [30]:
# Signal channels used as model input; each name is looked up in the
# per-patient channel dictionaries stored in `signals`.
features =["C3:A2"]

# Size of the blocks: number of consecutive samples that form one row.
# NOTE(review): 7680 presumably equals one 30 s scoring interval at 256 Hz - confirm against the recording's sampling rate.
time_interval_block_size = 7680

def create_feature_vektor(viewed_features, block_size, signal_source=None):
    """Cut the selected channels of every patient into fixed-size blocks.

    Each patient contributes one row per block. Every requested channel
    becomes one column whose cells hold the slice of ``block_size`` consecutive
    samples of that block. The last block of a channel is shorter when the
    channel length is not a multiple of ``block_size``. The rows of all
    patients are stacked in the iteration order of ``signal_source``.

    Args:
        viewed_features: Channel names to extract from each patient.
        block_size: Number of samples per block; must be positive.
        signal_source: Mapping ``{patient: {channel_name: samples}}``. When
            omitted, the notebook-level ``signals`` variable is used.

    Returns:
        pandas.DataFrame: One column per channel, one row per block, with a
        fresh 0..n-1 index. Empty when there are no patients.

    Raises:
        ValueError: If ``block_size`` is not positive.
        KeyError: If a patient lacks one of the requested channels.
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")
    if signal_source is None:
        # Backward compatible default: fall back to the notebook global.
        signal_source = signals

    patient_frames = []
    for patient_data in signal_source.values():
        columns = {}
        for feature in viewed_features:
            samples = patient_data[feature]
            # Use the block_size argument (not a global) for the slicing.
            blocks = [samples[start:start + block_size]
                      for start in range(0, len(samples), block_size)]
            # dtype=object keeps every block as a single cell value; wrapping
            # in a Series lets channels of different length align by index.
            columns[feature] = pd.Series(blocks, dtype=object)
        patient_frames.append(pd.DataFrame(columns))

    if not patient_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(patient_frames, axis=0, ignore_index=True)


# Cut the selected channels into blocks: one row per block and patient.
df_main = create_feature_vektor(features, time_interval_block_size)

print("Anzahl an Datensätzen: " + str(len(df_main)))
# Attach the labels column-wise. The two frames are paired by row index, so
# the i-th block is matched with the i-th label; if the row counts differ,
# the unmatched rows contain NaN in the other frame's columns.
df_main = pd.concat([df_main, df_label],axis=1)
print("Anzahl an Labels: " + str(len(df_label)))
print(df_main.shape)
# Last expression of the cell: rendered as the cell output.
df_main.head(10)

#df_main.tail(10)

Anzahl an Datensätzen: 883
Anzahl an Labels: 883
(883, 2)


Unnamed: 0,C3:A2,Schlafprofil_label
0,"[1.2207122528867856e-08, -8.470078775865103e-0...",A
1,"[4.675364341412463e-06, 7.386895963903432e-07,...",Wach
2,"[2.52689391142996e-06, 6.315405128380489e-06, ...",Wach
3,"[-8.508430712348273e-06, -9.811602752513631e-0...",Wach
4,"[-5.680018306390691e-05, -5.70258523143649e-05...",Wach
5,"[1.746852840790365e-05, 1.2252098538810706e-05...",Wach
6,"[3.210498139151665e-06, 2.3875085647247393e-06...",Wach
7,"[-6.554055102076978e-05, -6.86688339137184e-05...",Wach
8,"[-2.2937361846384074e-05, -2.131579310965574e-...",Wach
9,"[-1.7517357281312346e-05, -2.0546738257645517e...",Wach


In [None]:
# Remove rows that lack either a signal block or a label (these appear when
# the number of blocks and the number of labels differ).
df_main = df_main.dropna()
print("Anzahl an Datenstätzen nach Reinigung: " + str(len(df_main)))
# The frame only has one column per feature plus the label column, so a
# positional selection of 15 columns would raise IndexError; show the last
# rows of the whole frame instead.
df_main.tail(2)


## Train-Test Split

In [None]:
# All columns except the label column hold the signal blocks.
feature_blocks = df_main.drop(columns=['Schlafprofil_label'])

# Each cell holds a whole block of samples, which an estimator cannot consume
# directly: it needs a numeric matrix of shape (n_samples, n_features).
# Keep only rows in which every block has the full length (the trailing block
# of a recording can be shorter) and lay the samples out as columns.
complete_rows = feature_blocks.apply(
    lambda column: column.map(np.size) == time_interval_block_size
).all(axis=1)

X = np.stack([
    np.concatenate([np.asarray(block, dtype=float) for block in row])
    for row in feature_blocks[complete_rows].to_numpy()
])
y = df_main.loc[complete_rows, 'Schlafprofil_label']

# Fixed seed so the split is reproducible.
data_train, data_test, label_train, label_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Building Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

#clf = DecisionTreeClassifier()
# Random forests are stochastic; fix the seed (same value as the train/test
# split) so that repeated runs produce the same model and metrics.
clf = RandomForestClassifier(random_state=42)

clf.fit(data_train, label_train)


## Evaluate Model

In [None]:
# Predicted labels for the held-out blocks.
data_pred = clf.predict(data_test)

# Evaluate the model.
accuracy = accuracy_score(label_test, data_pred)
print(f"Genauigkeit des Modells: {accuracy:.2f}")

print("Eindeutige Werte in label_test:", np.unique(label_test))
print("Eindeutige Werte in data_pred:", np.unique(data_pred))

report = classification_report(label_test, data_pred)
print(report)

# The confusion matrix covers every class that occurs in the true OR the
# predicted labels. Taking the tick labels from label_test alone can yield a
# different number of labels than matrix rows, so compute the label set once
# and pass it to both the matrix and the display.
class_labels = np.unique(np.concatenate([np.asarray(label_test), np.asarray(data_pred)]))
cm = confusion_matrix(label_test, data_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap=plt.cm.Blues, values_format='d')
plt.show()