In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the full table (features, labels and fold assignment) from CSV.
csv_path = 'data/memmap/mds_ed.csv'
df = pd.read_csv(csv_path)

In [None]:
# Sanity check: (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Columns whose name contains the 'general_' tag; the trailing expression
# displays how many were found.
general_columns = [name for name in df.columns if 'general_' in name]
len(general_columns)

In [None]:
# Columns whose name contains the 'demographics_' tag; the trailing expression
# displays how many were found.
demographics_columns = [name for name in df.columns if 'demographics_' in name]
len(demographics_columns)

In [None]:
# Columns whose name contains the 'biometrics_' tag; the trailing expression
# displays how many were found.
biometrics_columns = [name for name in df.columns if 'biometrics_' in name]
len(biometrics_columns)

In [None]:
# Vital-parameter feature columns.
# NOTE(review): the original filter matched only the misspelled substring
# 'vitalparemeters_'. If the CSV header uses the correct spelling, that filter
# silently returns an empty list and every vital sign disappears from
# `all_features`. Both spellings are accepted here so the selection works
# either way -- confirm the actual prefix against the file header.
vital_prefixes = ('vitalparemeters_', 'vitalparameters_')
vitalparameters_columns = [
    name for name in df.columns
    if any(prefix in name for prefix in vital_prefixes)
]
len(vitalparameters_columns)

In [None]:
# Columns whose name contains the 'labvalues_' tag; the trailing expression
# displays how many were found.
labvalues_columns = [name for name in df.columns if 'labvalues_' in name]
len(labvalues_columns)

In [None]:
# Full tabular feature list, in the order:
# demographics -> biometrics -> vital parameters -> lab values.
all_features = [
    *demographics_columns,
    *biometrics_columns,
    *vitalparameters_columns,
    *labvalues_columns,
]

In [None]:
# Total number of tabular input features (before mask columns are added).
len(all_features)

In [None]:
# Build one missingness-mask column per feature and median-impute the features.
#
# For every feature `c` a float column `c + '_m'` is added to `df`:
# 1.0 where the feature was observed, 0.0 where it was NaN.
# `all_features_with_masks` lists feature and mask names interleaved:
# [f1, f1_m, f2, f2_m, ...].

# De-duplicate while keeping order so a feature listed twice cannot create
# duplicate mask columns.
unique_features = list(dict.fromkeys(all_features))

all_features_with_masks = []
for col in all_features:
    all_features_with_masks.extend((col, col + '_m'))

# Masks MUST be derived from the un-imputed data. Once the imputation below has
# run, `notna()` is True almost everywhere, so recomputing the masks on a
# re-run of this cell would overwrite them with all-ones. Only create masks
# that do not exist yet; never overwrite an existing one.
features_without_mask = [col for col in unique_features if col + '_m' not in df.columns]
if features_without_mask:
    new_masks = df[features_without_mask].notna().astype(float).add_suffix('_m')
    # A single concat instead of inserting columns one at a time in a loop,
    # which fragments the frame and triggers pandas' PerformanceWarning.
    df = pd.concat([df, new_masks], axis=1)

# Imputation statistics come from folds 0-17 only (the training folds used in
# the split cell) so no information from validation/test folds leaks in.
selected_folds = df[df['general_strat_fold'].isin(range(0, 18))]

medians = selected_folds[all_features].median()
# Features with no observed value in the selected folds have a NaN median and
# therefore stay NaN after this step.
df[all_features] = df[all_features].fillna(medians)

In [None]:
# Target columns whose name contains the 'diagnoses_' tag; the trailing
# expression displays how many were found.
diagnoses_columns = [name for name in df.columns if 'diagnoses_' in name]
len(diagnoses_columns)

In [None]:
# Target columns whose name contains the 'deterioration_' tag; the trailing
# expression displays how many were found.
deterioration_columns = [name for name in df.columns if 'deterioration_' in name]
len(deterioration_columns)

In [None]:
# note: column 'general_data' is the index of the waveform from MIMIC-IV-ECG

In [None]:
# All features (e.g. multimodal ECG waveform + tabular).
#
# Split by 'general_strat_fold': folds 0-17 -> train, 18 -> validation,
# 19 -> test. Each fold is materialised once (with a fresh 0..n-1 index) and
# reused for both the inputs and the label arrays.
fold_id = df['general_strat_fold']
train_rows = df[fold_id.isin(range(0, 18))].reset_index(drop=True)
val_rows = df[fold_id.isin([18])].reset_index(drop=True)
test_rows = df[fold_id.isin([19])].reset_index(drop=True)

# Validation and test keep only rows with general_ecg_no_within_stay == 0
# (presumably the first ECG of each stay -- confirm); training keeps every row.
# Because the index was reset, these labels are also valid positions for .iloc.
indexes_val = val_rows[val_rows['general_ecg_no_within_stay'] == 0].index
indexes_test = test_rows[test_rows['general_ecg_no_within_stay'] == 0].index

# Use 'all_features_with_masks' for deep learning (e.g. S4) or 'all_features'
# for tabular models (e.g. XGBoost).
x_train = train_rows[all_features_with_masks].reset_index(drop=True)
x_val = val_rows.iloc[indexes_val][all_features_with_masks].reset_index(drop=True)
x_test = test_rows.iloc[indexes_test][all_features_with_masks].reset_index(drop=True)

# Diagnosis targets as NumPy arrays, row-aligned with x_train / x_val / x_test.
y_train_diagnoses = train_rows[diagnoses_columns].values
y_val_diagnoses = val_rows[diagnoses_columns].iloc[indexes_val].values
y_test_diagnoses = test_rows[diagnoses_columns].iloc[indexes_test].values

# Deterioration targets, same alignment.
y_train_deterioration = train_rows[deterioration_columns].values
y_val_deterioration = val_rows[deterioration_columns].iloc[indexes_val].values
y_test_deterioration = test_rows[deterioration_columns].iloc[indexes_test].values

In [None]:
# Additional: for XGBoost, also drop the samples whose target is the special token -999, e.g. for the last deterioration label:

In [None]:
# Name of the last deterioration target (the one selected by index -1 in the
# label-filtering cell).
deterioration_columns[-1]

In [None]:
# Select one deterioration target and drop the samples whose value for that
# target is the special token -999.
label_index = -1

# 1-D label vector of the chosen target for each split.
train_target = y_train_deterioration[:, label_index]
val_target = y_val_deterioration[:, label_index]
test_target = y_test_deterioration[:, label_index]

# Row positions whose label is not the -999 token.
to_keep_train = np.flatnonzero(train_target != -999)
to_keep_val = np.flatnonzero(val_target != -999)
to_keep_test = np.flatnonzero(test_target != -999)

# Inputs restricted to the kept rows (original row index labels are retained).
x_train_label = x_train.iloc[to_keep_train]
x_val_label = x_val.iloc[to_keep_val]
x_test_label = x_test.iloc[to_keep_test]

# Matching label vectors.
y_train_deterioration_label = train_target[to_keep_train]
y_val_deterioration_label = val_target[to_keep_val]
y_test_deterioration_label = test_target[to_keep_test]