In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM, BatchNormalization, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
from tqdm import tqdm





In [81]:
import librosa

In [2]:
# Load the CSV file into a DataFrame
csv_path = 'D:\Omar\Friends\European_HealthCare_Hackathon\ecg_hospitalization\data\processed\meta\data_pairs.csv'  # Replace with the path to your CSV file
df = pd.read_csv(csv_path)

In [3]:
# Initialize empty lists to store data
data_list = []
labels_list = []

In [4]:
# Iterate through rows and load .npy files
for index, row in tqdm(df.iterrows()):
    file_path = row['np_file_path']
    label = row['label']

    # Load the .npy file
    loaded_data = np.load(file_path)

    # Append the loaded data and label to the lists
    data_list.append(loaded_data)
    labels_list.append(label)

23292it [08:30, 45.61it/s] 


In [5]:
# Convert lists to NumPy arrays
data_array = np.array(data_list)
labels_array = np.array(labels_list)

In [6]:
# Print shapes for verification
print("Data Array Shape:", data_array.shape)
print("Labels Array Shape:", labels_array.shape)

Data Array Shape: (23292, 8, 5000)
Labels Array Shape: (23292,)


In [7]:
indices_with_nans = [370, 899, 4733, 4936, 5404, 8354, 9146, 9560, 10268, 10879, 11915, 12946, 13441, 14674, 15413, 15702, 16190, 22258, 23204]

In [8]:
data_array = data_array[np.logical_not(np.isin(np.arange(len(data_array)), indices_with_nans))]

In [9]:
labels_array = labels_array[np.logical_not(np.isin(np.arange(len(labels_array)), indices_with_nans))]

In [10]:
num_patients, num_leads, num_time_points = data_array.shape

In [11]:
ecg_data = data_array
labels = labels_array

In [12]:
# Set seed for reproducibility
np.random.seed(42)

In [13]:
# Generate random indices for shuffling
indices = np.arange(len(labels))
np.random.shuffle(indices)

In [14]:
# Shuffle ecg_data and labels using the generated indices
ecg_data_shuffled = ecg_data[indices]
labels_shuffled = labels[indices]

In [15]:
# To be moved up
from scipy.stats import skew, kurtosis
from scipy import signal
from scipy.fft import fft
from biosppy.signals import ecg  # Biosppy is a library for biosignal processing
import pywt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [106]:
from python_speech_features import mfcc

In [128]:
# For the classical models

# Function to extract features from each lead
def extract_features(lead):
    # 1. Statistical Features
    mean_value = np.mean(lead)
    median_value = np.median(lead)
    std_dev_value = np.std(lead)
    skewness_value = skew(lead)
    kurtosis_value = kurtosis(lead)

    # 2. Time-Domain Features
    # You might need to preprocess the data to find R-peaks for RR interval calculations
    # Example using biosppy
    rpeaks = ecg.ecg(lead, sampling_rate=500, show=False)['rpeaks']
    rr_interval = np.diff(rpeaks)

    # Calculate features from RR intervals
    rr_mean = np.mean(rr_interval)  
    heart_rate = 60 / rr_mean

    # 3. Frequency-Domain Features
    #power_spectral_density (psd) shape is (num_time_points // 2 + 1)
    f, psd = signal.welch(lead, fs=500)
    dominant_frequency = f[np.argmax(psd)]
    spectral_entropy = -np.sum(psd * np.log2(psd + 1e-10))
    
    # Combine all features into a single array
    initial_features = [
        mean_value, median_value, std_dev_value, skewness_value, kurtosis_value,
        rr_mean, heart_rate, dominant_frequency, spectral_entropy, 
    ]

    n_mfcc=13
#    len_of_signal = lead.shape[0]
#    hop_len = len_of_signal // n_mfcc
#    mfcc = librosa.feature.mfcc(y=lead, sr=500, n_mfcc=n_mfcc, hop_length=hop_len)
#    mfcc = mfcc.flatten()
    # Calculate MFCC features
    mfcc_features = mfcc(signal=lead, samplerate=500, numcep=n_mfcc)
    mfcc_features = np.mean(mfcc_features, axis=0)

    initial_features.extend(mfcc_features.tolist())
    extracted_features = np.array(initial_features)


    
    return extracted_features

In [129]:
num_features = len(extract_features(ecg_data[0, 0, :]))

In [130]:
patient_features = np.zeros((num_patients, num_features * num_leads))

In [131]:
# Apply feature extraction for each lead and concatenate features for each patient
for patient_index in tqdm(range(num_patients)):
    patient_lead_features = np.zeros((num_leads, num_features))
    
    for lead_index in range(num_leads):
        patient_lead_features[lead_index, :] = extract_features(ecg_data[patient_index, lead_index, :])
    
    # Concatenate features for the current patient
    patient_features[patient_index, :] = patient_lead_features.flatten()


############## consider try - catch #######################

100%|████████████████████████████████████████████████████████████████████████████| 23273/23273 [53:35<00:00,  7.24it/s]


In [209]:
patient_features.shape

(23273, 176)

In [210]:
ecg_data.shape

(23273, 8, 5000)

In [211]:
labels.shape

(23273,)

In [212]:
X_classical = patient_features

In [213]:
X_deep = ecg_data

In [214]:
# Split the data into training and testing sets
X_train_classical, X_test_classical, y_train_classical, y_test_classical = train_test_split(X_classical, labels, test_size=0.2, random_state=42)
X_train_deep, X_test_deep, y_train_deep, y_test_deep = train_test_split(ecg_data, labels, test_size=0.2, random_state=42)

In [39]:
for i in range(len(y_train_classical)):
    if(y_train_classical[i] != y_train_deep[i]):
        Print("WRONG SPLITTING!!!!!!!!!!!  TRAIN")

for i in range(len(y_test_classical)):
    if(y_test_classical[i] != y_test_deep[i]):
        Print("WRONG SPLITTING!!!!!!!!!!!   TESTTTTTTTTTT")

In [215]:
############################################ DEEP ###############################################

In [216]:
# Build the CNN combined with RNN model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=5, activation='relu', input_shape=(8, 5000)))
model.add(MaxPooling1D(pool_size=2))
model.add(BatchNormalization())
model.add(LSTM(128, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))  # Assuming 3 output classes

In [217]:
optimizer = Adam(learning_rate=0.001)

In [218]:
batch_size = 32

In [219]:
# Implement early stopping based on validation loss
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [220]:
# Compile the model
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [221]:
# Convert labels to categorical format
le = LabelEncoder()
y_train_deep = to_categorical(le.fit_transform(y_train_deep))
y_test_deep = to_categorical(le.fit_transform(y_test_deep))

In [144]:
# Class proportions
p1 = 0.61
p2 = 0.29
p3 = 0.1

In [145]:
# Calculate weights
w1 = 1 / p1
w2 = 1 / p2
w3 = 1 / p3

In [146]:
# Normalize weights
sum_weights = w1 + w2 + w3
normalized_w1 = w1 / sum_weights
normalized_w2 = w2 / sum_weights
normalized_w3 = w3 / sum_weights

In [224]:
# Train the model
model.fit(X_train_deep, y_train_deep, epochs=50, batch_size=batch_size, validation_data=(X_test_deep, y_test_deep), callbacks=[early_stopping], class_weight={0:0.1, 1:0.2, 2:0.7})

MemoryError: Unable to allocate 5.55 GiB for an array with shape (18618, 8, 5000) and data type float64

In [157]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_deep, y_test_deep)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 1.0561214685440063
Test Accuracy: 0.49924811720848083


In [158]:
y_pred_deep = model.predict(X_test_deep)



In [159]:
y_pred_deep = np.argmax(y_pred_deep, axis=1)

In [160]:
y_test_deep = np.argmax(y_test_deep, axis=1)

In [161]:
# Evaluate the model
accuracy = accuracy_score(y_test_deep, y_pred_deep)
report = classification_report(y_test_deep, y_pred_deep)

In [162]:
accuracy

0.4992481203007519

In [163]:
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.55      0.60      2804
           1       0.34      0.56      0.42      1383
           2       0.17      0.00      0.00       468

    accuracy                           0.50      4655
   macro avg       0.39      0.37      0.34      4655
weighted avg       0.51      0.50      0.49      4655



In [None]:
############################################ CLASSICAL #########################################################

In [164]:
# Standardize the features
scaler = StandardScaler()
X_train_classical = scaler.fit_transform(X_train_classical)
X_test_classical = scaler.transform(X_test_classical)

In [165]:
# Train a Random Forest classifier
rf_classifier = RandomForestClassifier(class_weight = {0 : 0.05, 1 : 0.15, 2 : 0.8}, n_estimators=150, random_state=42)
rf_classifier.fit(X_train_classical, y_train_classical)

In [166]:
y_pred_classical = rf_classifier.predict(X_test_classical)

In [167]:
# Evaluate the model
accuracy = accuracy_score(y_test_classical, y_pred_classical)
report = classification_report(y_test_classical, y_pred_classical)

In [168]:
accuracy

0.637593984962406

In [169]:
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.97      0.77      2804
           1       0.65      0.17      0.27      1383
           2       0.35      0.01      0.02       468

    accuracy                           0.64      4655
   macro avg       0.55      0.38      0.35      4655
weighted avg       0.61      0.64      0.55      4655



In [170]:
######################################### ENSEMBLE ###########################################################

In [171]:
# Step 5: Combine predictions using stacking
stacked_features_train = np.column_stack((y_pred_deep, y_pred_classical))

In [172]:
## MOVE above later
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

In [173]:
# Optional: Train a logistic regression meta-model
meta_model = LogisticRegression()
meta_model.fit(stacked_features_train, y_test_deep)

In [174]:
# Use cross_val_predict for out-of-fold predictions
# y_pred_deep ready
# y_pred_classical ready
stacked_features_test = np.column_stack((y_pred_deep, y_pred_classical))

In [175]:
y_pred_ensemble = meta_model.predict(stacked_features_test)

In [176]:
# Step 6: Evaluate the ensemble
ensemble_accuracy = accuracy_score(y_test_deep, y_pred_ensemble)
print(f'Ensemble Accuracy: {ensemble_accuracy}')

Ensemble Accuracy: 0.6373791621911923


In [177]:
report = classification_report(y_test_deep, y_pred_ensemble)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.97      0.77      2804
           1       0.64      0.17      0.27      1383
           2       0.00      0.00      0.00       468

    accuracy                           0.64      4655
   macro avg       0.42      0.38      0.35      4655
weighted avg       0.57      0.64      0.54      4655



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [184]:
from sklearn.svm import SVC

In [199]:
# Create a Support Vector Machine model
svm_model = SVC(kernel='linear', C=1, random_state=42, class_weight= {0:0.05 , 1:0.15 , 2:0.8})

In [200]:
# Train the model
svm_model.fit(stacked_features_train, y_test_deep)

In [201]:
# Make predictions on the test set
y_pred_svm_ens = svm_model.predict(stacked_features_train)

In [202]:
# Step 6: Evaluate the ensemble
ensemble_accuracy2 = accuracy_score(y_test_deep, y_pred_svm_ens)
print(f'Ensemble Accuracy: {ensemble_accuracy}')

Ensemble Accuracy: 0.6373791621911923


In [203]:
report2 = classification_report(y_test_deep, y_pred_svm_ens)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.97      0.77      2804
           1       0.64      0.17      0.27      1383
           2       0.00      0.00      0.00       468

    accuracy                           0.64      4655
   macro avg       0.42      0.38      0.35      4655
weighted avg       0.57      0.64      0.54      4655



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
