# Imports

In [4]:
import numpy as np

import sys
sys.path.append('../')
from src.utils.data_loader import EKGDataLoader
from src.utils.data_preprocessing import DataPreprocessor

# Variables

In [5]:
DATA_DIR = "../data/raw/ptb_xl_ecg/"
SAMPLING_FREQUENCY = 100
TASK = "superdiagnostic"
SCALER_DIR = f"../data/results/scaler/"
PREPROCESSED_DIR = f"../data/preprocessed/"

# Load Data

In [6]:
# 1. Lade PTB-XL Daten
# 2. Labels aggregieren
# 3. Relevante Daten auswählen und in One-Hot umwandeln
loader = EKGDataLoader(DATA_DIR, SAMPLING_FREQUENCY, TASK)
X, Y, y, mlb = loader.load_ptbxl_data()

Loading PTB-XL data from: ../data/raw/ptb_xl_ecg/ ptbxl_database.csv


In [7]:
Y

Unnamed: 0_level_0,patient_id,age,sex,height,weight,nurse,site,device,recording_date,report,...,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr,scp_codes_len,superdiagnostic,superdiagnostic_len
ecg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15709.0,56.0,1,,63.0,2.0,0.0,CS-12 E,1984-11-09 09:17:34,sinusrhythmus periphere niederspannung,...,,,,,3,records100/00000/00001_lr,records500/00000/00001_hr,3,[NORM],1
2,13243.0,19.0,0,,70.0,2.0,0.0,CS-12 E,1984-11-14 12:55:37,sinusbradykardie sonst normales ekg,...,,,,,2,records100/00000/00002_lr,records500/00000/00002_hr,2,[NORM],1
3,20372.0,37.0,1,,69.0,2.0,0.0,CS-12 E,1984-11-15 12:49:10,sinusrhythmus normales ekg,...,,,,,5,records100/00000/00003_lr,records500/00000/00003_hr,2,[NORM],1
4,17014.0,24.0,0,,82.0,2.0,0.0,CS-12 E,1984-11-15 13:44:57,sinusrhythmus normales ekg,...,,,,,3,records100/00000/00004_lr,records500/00000/00004_hr,2,[NORM],1
5,17448.0,19.0,1,,70.0,2.0,0.0,CS-12 E,1984-11-17 10:43:15,sinusrhythmus normales ekg,...,,,,,4,records100/00000/00005_lr,records500/00000/00005_hr,2,[NORM],1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21833,17180.0,67.0,1,,,1.0,2.0,AT-60 3,2001-05-31 09:14:35,ventrikulÄre extrasystole(n) sinustachykardie ...,...,,,1ES,,7,records100/21000/21833_lr,records500/21000/21833_hr,4,[STTC],1
21834,20703.0,300.0,0,,,1.0,2.0,AT-60 3,2001-06-05 11:33:39,sinusrhythmus lagetyp normal qrs(t) abnorm ...,...,,,,,4,records100/21000/21834_lr,records500/21000/21834_hr,3,[NORM],1
21835,19311.0,59.0,1,,,1.0,2.0,AT-60 3,2001-06-08 10:30:27,sinusrhythmus lagetyp normal t abnorm in anter...,...,,,,,2,records100/21000/21835_lr,records500/21000/21835_hr,2,[STTC],1
21836,8873.0,64.0,1,,,1.0,2.0,AT-60 3,2001-06-09 18:21:49,supraventrikulÄre extrasystole(n) sinusrhythmu...,...,,,SVES,,8,records100/21000/21836_lr,records500/21000/21836_hr,2,[NORM],1


# Label Manipulation

In [8]:
# Old Classes
all_class_names = list(mlb.classes_)
all_class_ids = [i for i, name in enumerate(all_class_names)]
num_all_classes = len(all_class_names)

print(f"All classes: {all_class_names}")
print(f'Class IDs: {all_class_ids}')
print(f"Count classes: {num_all_classes}")

All classes: ['CD', 'HYP', 'MI', 'NORM', 'STTC']
Class IDs: [0, 1, 2, 3, 4]
Count classes: 5


In [9]:
# New Classes
y_relabel = DataPreprocessor.relabel_to_mi_norm_other(y, mlb)

print('y_relabel examples:')
print(y_relabel[:5])

class_counts = np.sum(y_relabel, axis=0)
class_percent = class_counts / y_relabel.shape[0] * 100
class_names = ['MI', 'NORM', 'OTHER']

print('\nDistribution:')
for name, class_counts, percent in zip(class_names, class_counts, class_percent):
    print(f"{name}: {class_counts} ({percent:.2f}%)")
print(f"ALL: {np.sum(class_counts)} ({np.sum(class_percent):.2f}%)")

y_relabel examples:
[[0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]
 [0 1 0]]

Distribution:
MI: 5469 (25.57%)
NORM: 9513 (44.48%)
OTHER: 6406 (29.95%)
ALL: 6406 (100.00%)


# Split

In [10]:
preprocessor = DataPreprocessor()

In [11]:
# 4. Train, Test, Validation Split
X_train, y_train, X_val, y_val, X_test, y_test = preprocessor.data_split(X, y_relabel, Y)

In [12]:
splits = {'Train': y_train, 'Validation': y_val, 'Test': y_test}
class_names = ['MI', 'NORM', 'OTHER']

for split_name, y_split in splits.items():
    y_arr = np.asarray(y_split)

    # Falls One-Hot: zu Integer-Labels machen
    if y_arr.ndim == 2 and y_arr.shape[1] == 3:
        y_idx = np.argmax(y_arr, axis=1)
    else:
        y_idx = y_arr

    counts = np.bincount(y_idx, minlength=3)
    perc = counts / counts.sum() * 100
    print(f"{split_name}:")
    
    for name, c, p in zip(class_names, counts, perc):
        print(f"  {name}: {int(c)} ({p:.2f}%)")
    print(f"  TOTAL: {int(counts.sum())} (100.00%)\n")

Train:
  MI: 4379 (25.63%)
  NORM: 7595 (44.46%)
  OTHER: 5110 (29.91%)
  TOTAL: 17084 (100.00%)

Validation:
  MI: 540 (25.16%)
  NORM: 955 (44.50%)
  OTHER: 651 (30.34%)
  TOTAL: 2146 (100.00%)

Test:
  MI: 550 (25.49%)
  NORM: 963 (44.62%)
  OTHER: 645 (29.89%)
  TOTAL: 2158 (100.00%)



# Standardization

In [13]:
# 5. Data Standardization with trained scaler on training set
X_train_std, X_val_std, X_test_std = preprocessor.preprocess_signals(X_train, X_val, X_test, SCALER_DIR)

# Saving

In [14]:
# 6. Save Processed Data
preprocessor.save_signals(X_train_std, y_train, X_val_std, y_val, X_test_std, y_test, PREPROCESSED_DIR)