In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM, BatchNormalization, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
from tqdm import tqdm





In [2]:
# Load the CSV file into a DataFrame
csv_path = 'D:\Omar\Friends\European_HealthCare_Hackathon\ecg_hospitalization\data\processed\meta\data_pairs.csv'  # Replace with the path to your CSV file
df = pd.read_csv(csv_path)

In [3]:
# Initialize empty lists to store data
data_list = []
labels_list = []

In [4]:
# Iterate through rows and load .npy files
for index, row in tqdm(df.iterrows()):
    file_path = row['np_file_path']
    label = row['label']

    # Load the .npy file
    loaded_data = np.load(file_path)

    # Append the loaded data and label to the lists
    data_list.append(loaded_data)
    labels_list.append(label)

23292it [08:30, 45.61it/s] 


In [5]:
# Convert lists to NumPy arrays
data_array = np.array(data_list)
labels_array = np.array(labels_list)

In [6]:
# Print shapes for verification
print("Data Array Shape:", data_array.shape)
print("Labels Array Shape:", labels_array.shape)

Data Array Shape: (23292, 8, 5000)
Labels Array Shape: (23292,)


In [7]:
indices_with_nans = [370, 899, 4733, 4936, 5404, 8354, 9146, 9560, 10268, 10879, 11915, 12946, 13441, 14674, 15413, 15702, 16190, 22258, 23204]

In [8]:
data_array = data_array[np.logical_not(np.isin(np.arange(len(data_array)), indices_with_nans))]

In [9]:
labels_array = labels_array[np.logical_not(np.isin(np.arange(len(labels_array)), indices_with_nans))]

In [10]:
num_patients, num_leads, num_time_points = data_array.shape

In [11]:
ecg_data = data_array
labels = labels_array

In [12]:
# Set seed for reproducibility
np.random.seed(42)

In [13]:
# Generate random indices for shuffling
indices = np.arange(len(labels))
np.random.shuffle(indices)

In [14]:
# Shuffle ecg_data and labels using the generated indices
ecg_data_shuffled = ecg_data[indices]
labels_shuffled = labels[indices]

In [15]:
# To be moved up
from scipy.stats import skew, kurtosis
from scipy import signal
from scipy.fft import fft
from biosppy.signals import ecg  # Biosppy is a library for biosignal processing
import pywt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [16]:
# For the classical models

# Function to extract features from each lead
def extract_features(lead):
    # 1. Statistical Features
    mean_value = np.mean(lead)
    median_value = np.median(lead)
    std_dev_value = np.std(lead)
    skewness_value = skew(lead)
    kurtosis_value = kurtosis(lead)

    # 2. Time-Domain Features
    # You might need to preprocess the data to find R-peaks for RR interval calculations
    # Example using biosppy
    rpeaks = ecg.ecg(lead, sampling_rate=500, show=False)['rpeaks']
    rr_interval = np.diff(rpeaks)

    # Calculate features from RR intervals
    rr_mean = np.mean(rr_interval)  
    heart_rate = 60 / rr_mean

    # 3. Frequency-Domain Features
    #power_spectral_density (psd) shape is (num_time_points // 2 + 1)
    f, psd = signal.welch(lead, fs=500)
    dominant_frequency = f[np.argmax(psd)]
    spectral_entropy = -np.sum(psd * np.log2(psd + 1e-10))

    # Combine all features into a single array
    extracted_features = np.array([
        mean_value, median_value, std_dev_value, skewness_value, kurtosis_value,
        rr_mean, heart_rate, dominant_frequency, spectral_entropy
    ])

    return extracted_features

In [17]:
num_features = len(extract_features(ecg_data[0, 0, :]))

In [18]:
patient_features = np.zeros((num_patients, num_features * num_leads))

In [None]:
# Apply feature extraction for each lead and concatenate features for each patient
for patient_index in tqdm(range(num_patients)):
    patient_lead_features = np.zeros((num_leads, num_features))
    
    for lead_index in range(num_leads):
        patient_lead_features[lead_index, :] = extract_features(ecg_data[patient_index, lead_index, :])
    
    # Concatenate features for the current patient
    patient_features[patient_index, :] = patient_lead_features.flatten()


############## consider try - catch #######################

 72%|██████████████████████████████████████████████████████▊                     | 16782/23273 [19:37<07:50, 13.79it/s]

In [60]:
patient_features.shape

(23280, 72)

In [61]:
ecg_data.shape

(23280, 8, 5000)

In [62]:
labels.shape

(23280,)

In [63]:
X_classical = patient_features

In [65]:
X_deep = ecg_data

In [66]:
# Split the data into training and testing sets
X_train_classical, X_test_classical, y_train_classical, y_test_classical = train_test_split(X_classical, labels, test_size=0.2, random_state=42)
X_train_deep, X_test_deep, y_train_deep, y_test_deep = train_test_split(ecg_data, labels, test_size=0.2, random_state=42)

In [39]:
for i in range(len(y_train_classical)):
    if(y_train_classical[i] != y_train_deep[i]):
        Print("WRONG SPLITTING!!!!!!!!!!!  TRAIN")

for i in range(len(y_test_classical)):
    if(y_test_classical[i] != y_test_deep[i]):
        Print("WRONG SPLITTING!!!!!!!!!!!   TESTTTTTTTTTT")

In [40]:
np.unique(y_train_classical)

array([0, 1, 2])

In [41]:
np.unique(y_train_deep)

array([0, 1, 2])

In [67]:
y_test_deep

array([0, 1, 2, ..., 0, 1, 1])

In [68]:
y_train_deep

array([1, 1, 1, ..., 1, 2, 0])

In [43]:
############################################ DEEP ###############################################

In [69]:
# Build the CNN combined with RNN model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=5, activation='relu', input_shape=(8, 5000)))
model.add(MaxPooling1D(pool_size=2))
model.add(BatchNormalization())
model.add(LSTM(128, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))  # Assuming 3 output classes

In [70]:
optimizer = Adam(learning_rate=0.001)

In [71]:
batch_size = 32

In [72]:
# Implement early stopping based on validation loss
#early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [73]:
# Compile the model
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [74]:
# Convert labels to categorical format
le = LabelEncoder()
y_train_deep = to_categorical(le.fit_transform(y_train_deep))
y_test_deep = to_categorical(le.fit_transform(y_test_deep))

In [75]:
# Class proportions
p1 = 0.61
p2 = 0.29
p3 = 0.1

In [76]:
# Calculate weights
w1 = 1 / p1
w2 = 1 / p2
w3 = 1 / p3

In [77]:
# Normalize weights
sum_weights = w1 + w2 + w3
normalized_w1 = w1 / sum_weights
normalized_w2 = w2 / sum_weights
normalized_w3 = w3 / sum_weights

In [79]:
# Train the model
model.fit(X_train_deep, y_train_deep, epochs=50, batch_size=batch_size, validation_data=(X_test_deep, y_test_deep), class_weight={0:normalized_w1, 1:normalized_w2, 2:normalized_w3})

MemoryError: Unable to allocate 5.55 GiB for an array with shape (18624, 8, 5000) and data type float64

In [65]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_deep, y_test_deep)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 2.8677122592926025
Test Accuracy: 0.4640480875968933


In [None]:
y_pred_deep = model.predict(X_test_deep)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test_deep, y_pred_deep)
report = classification_report(y_test_deep, y_pred_deep)

In [None]:
############################################ CLASSICAL #########################################################

In [None]:
# Standardize the features
scaler = StandardScaler()
X_train_classical = scaler.fit_transform(X_train_classical)
X_test_classical = scaler.transform(X_test_classical)

In [None]:
# Train a Random Forest classifier
rf_classifier = RandomForestClassifier(class_weight = {0 : 0.05, 1 : 0.1, 2 : 0.85}, n_estimators=150, random_state=42)
rf_classifier.fit(X_train_classical, y_train_classical)

In [None]:
y_pred_classical = rf_classifier.predict(X_test_classical)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test_classical, y_pred_classical)
report = classification_report(y_test_classical, y_pred_classical)

In [None]:
######################################### ENSEMBLE ###########################################################

In [None]:
# Step 5: Combine predictions using stacking
stacked_features_train = np.column_stack((y_pred_deep, y_pred_classical))

In [None]:
## MOVE above later
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

In [None]:
# Optional: Train a logistic regression meta-model
meta_model = LogisticRegression()
meta_model.fit(stacked_features_train, y_test_deep)

In [None]:
# Use cross_val_predict for out-of-fold predictions
# y_pred_deep ready
# y_pred_classical ready
stacked_features_test = np.column_stack((y_pred_deep, y_pred_classical))

In [None]:
y_pred_ensemble = meta_model.predict(stacked_features_test)

In [None]:
# Step 6: Evaluate the ensemble
ensemble_accuracy = accuracy_score(y_test_deep, y_pred_ensemble)
print(f'Ensemble Accuracy: {ensemble_accuracy}')

In [None]:
meta_model_cnn = keras.Sequential([
    layers.Conv1D(64, 3, activation='relu', input_shape=(stacked_features_train.shape[1], 1)),
    layers.MaxPooling1D(2),
    layers.Conv1D(128, 3, activation='relu'),
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(3, activation='softmax')  # Output layer with 3 classes
])

meta_model_cnn.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
meta_model_cnn.fit(stacked_features_train, y_test_deep, epochs=60, batch_size=32, validation_split=0.15)



y_pred_ensemble_cnn = meta_model_cnn.predict(stacked_features_test)

# Step 8: Evaluate the ensemble
ensemble_accuracy = accuracy_score(y_test_deep, np.argmax(y_pred_ensemble_cnn, axis=1))
print(f'Ensemble Accuracy: {ensemble_accuracy}')