In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, LSTM, BatchNormalization, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
from tqdm import tqdm





In [81]:
import librosa

In [2]:
# Load the CSV file into a DataFrame
csv_path = 'D:\Omar\Friends\European_HealthCare_Hackathon\ecg_hospitalization\data\processed\meta\data_pairs.csv'  # Replace with the path to your CSV file
df = pd.read_csv(csv_path)

In [3]:
# Initialize empty lists to store data
data_list = []
labels_list = []

In [4]:
# Iterate through rows and load .npy files
for index, row in tqdm(df.iterrows()):
    file_path = row['np_file_path']
    label = row['label']

    # Load the .npy file
    loaded_data = np.load(file_path)

    # Append the loaded data and label to the lists
    data_list.append(loaded_data)
    labels_list.append(label)

23292it [08:30, 45.61it/s] 


In [5]:
# Convert lists to NumPy arrays
data_array = np.array(data_list)
labels_array = np.array(labels_list)

In [6]:
# Print shapes for verification
print("Data Array Shape:", data_array.shape)
print("Labels Array Shape:", labels_array.shape)

Data Array Shape: (23292, 8, 5000)
Labels Array Shape: (23292,)


In [7]:
indices_with_nans = [370, 899, 4733, 4936, 5404, 8354, 9146, 9560, 10268, 10879, 11915, 12946, 13441, 14674, 15413, 15702, 16190, 22258, 23204]

In [8]:
data_array = data_array[np.logical_not(np.isin(np.arange(len(data_array)), indices_with_nans))]

In [9]:
labels_array = labels_array[np.logical_not(np.isin(np.arange(len(labels_array)), indices_with_nans))]

In [10]:
num_patients, num_leads, num_time_points = data_array.shape

In [11]:
ecg_data = data_array
labels = labels_array

In [12]:
# Set seed for reproducibility
np.random.seed(42)

In [13]:
# Generate random indices for shuffling
indices = np.arange(len(labels))
np.random.shuffle(indices)

In [14]:
# Shuffle ecg_data and labels using the generated indices
ecg_data_shuffled = ecg_data[indices]
labels_shuffled = labels[indices]

In [15]:
# To be moved up
from scipy.stats import skew, kurtosis
from scipy import signal
from scipy.fft import fft
from biosppy.signals import ecg  # Biosppy is a library for biosignal processing
import pywt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [106]:
from python_speech_features import mfcc

In [109]:
# For the classical models

# Function to extract features from each lead
def extract_features(lead):
    # 1. Statistical Features
    print(type(lead))
    print(lead.shape)
    mean_value = np.mean(lead)
    median_value = np.median(lead)
    std_dev_value = np.std(lead)
    skewness_value = skew(lead)
    kurtosis_value = kurtosis(lead)

    # 2. Time-Domain Features
    # You might need to preprocess the data to find R-peaks for RR interval calculations
    # Example using biosppy
    rpeaks = ecg.ecg(lead, sampling_rate=500, show=False)['rpeaks']
    rr_interval = np.diff(rpeaks)

    # Calculate features from RR intervals
    rr_mean = np.mean(rr_interval)  
    heart_rate = 60 / rr_mean

    # 3. Frequency-Domain Features
    #power_spectral_density (psd) shape is (num_time_points // 2 + 1)
    f, psd = signal.welch(lead, fs=500)
    dominant_frequency = f[np.argmax(psd)]
    spectral_entropy = -np.sum(psd * np.log2(psd + 1e-10))
    
    # Combine all features into a single array
    initial_features = [
        mean_value, median_value, std_dev_value, skewness_value, kurtosis_value,
        rr_mean, heart_rate, dominant_frequency, spectral_entropy, 
    ]
    print(initial_features)

    n_mfcc=13
#    len_of_signal = lead.shape[0]
#    hop_len = len_of_signal // n_mfcc
#    mfcc = librosa.feature.mfcc(y=lead, sr=500, n_mfcc=n_mfcc, hop_length=hop_len)
#    mfcc = mfcc.flatten()
    # Calculate MFCC features
    mfcc_features = mfcc(signal=lead, samplerate=500, numcep=n_mfcc)
    mfcc_features = np.mean(mfcc_features, axis=0)

    print('mfcc', mfcc_features.shape)
    extracted_features = np.array(initial_features.extend(mfcc_features.tolist()))


    
    return extracted_features

In [110]:
extract_features(ecg_data[0, 0, :])

<class 'numpy.ndarray'>
(5000,)
[1.1368683772161604e-17, -0.28153966396249014, 0.9999999997227793, 2.272046014012414, 4.914257393075843, 262.3529411764706, 0.22869955156950672, 3.90625, 1.8457325568259608]
mfcc (999, 13)


array(None, dtype=object)

In [18]:
patient_features = np.zeros((num_patients, num_features * num_leads))

In [19]:
# Apply feature extraction for each lead and concatenate features for each patient
for patient_index in tqdm(range(num_patients)):
    patient_lead_features = np.zeros((num_leads, num_features))
    
    for lead_index in range(num_leads):
        patient_lead_features[lead_index, :] = extract_features(ecg_data[patient_index, lead_index, :])
    
    # Concatenate features for the current patient
    patient_features[patient_index, :] = patient_lead_features.flatten()


############## consider try - catch #######################

100%|████████████████████████████████████████████████████████████████████████████| 23273/23273 [27:32<00:00, 14.09it/s]


In [20]:
patient_features.shape

(23273, 72)

In [21]:
ecg_data.shape

(23273, 8, 5000)

In [22]:
labels.shape

(23273,)

In [23]:
X_classical = patient_features

In [24]:
X_deep = ecg_data

In [25]:
# Split the data into training and testing sets
X_train_classical, X_test_classical, y_train_classical, y_test_classical = train_test_split(X_classical, labels, test_size=0.2, random_state=42)
X_train_deep, X_test_deep, y_train_deep, y_test_deep = train_test_split(ecg_data, labels, test_size=0.2, random_state=42)

In [39]:
for i in range(len(y_train_classical)):
    if(y_train_classical[i] != y_train_deep[i]):
        Print("WRONG SPLITTING!!!!!!!!!!!  TRAIN")

for i in range(len(y_test_classical)):
    if(y_test_classical[i] != y_test_deep[i]):
        Print("WRONG SPLITTING!!!!!!!!!!!   TESTTTTTTTTTT")

In [26]:
np.unique(y_train_classical)

array([0, 1, 2])

In [27]:
np.unique(y_train_deep)

array([0, 1, 2])

In [28]:
y_test_deep

array([0, 1, 0, ..., 1, 0, 0])

In [29]:
y_train_deep

array([0, 1, 1, ..., 0, 2, 0])

In [30]:
############################################ DEEP ###############################################

In [31]:
# Build the CNN combined with RNN model
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=5, activation='relu', input_shape=(8, 5000)))
model.add(MaxPooling1D(pool_size=2))
model.add(BatchNormalization())
model.add(LSTM(128, activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))  # Assuming 3 output classes





In [32]:
optimizer = Adam(learning_rate=0.001)

In [33]:
batch_size = 32

In [34]:
# Implement early stopping based on validation loss
#early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [41]:
# Compile the model
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [36]:
# Convert labels to categorical format
le = LabelEncoder()
y_train_deep = to_categorical(le.fit_transform(y_train_deep))
y_test_deep = to_categorical(le.fit_transform(y_test_deep))

In [37]:
# Class proportions
p1 = 0.61
p2 = 0.29
p3 = 0.1

In [38]:
# Calculate weights
w1 = 1 / p1
w2 = 1 / p2
w3 = 1 / p3

In [39]:
# Normalize weights
sum_weights = w1 + w2 + w3
normalized_w1 = w1 / sum_weights
normalized_w2 = w2 / sum_weights
normalized_w3 = w3 / sum_weights

In [45]:
# Train the model
model.fit(X_train_deep, y_train_deep, epochs=50, batch_size=batch_size, validation_data=(X_test_deep, y_test_deep), class_weight={0:normalized_w1, 1:normalized_w2, 2:normalized_w3})

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50

KeyboardInterrupt: 

In [46]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_deep, y_test_deep)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 3.232426881790161
Test Accuracy: 0.5061224699020386


In [47]:
y_pred_deep = model.predict(X_test_deep)



In [49]:
y_pred_deep = np.argmax(y_pred_deep, axis=1)

In [51]:
y_pred_deep

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [53]:
y_test_deep = np.argmax(y_test_deep, axis=1)

In [54]:
# Evaluate the model
accuracy = accuracy_score(y_test_deep, y_pred_deep)
report = classification_report(y_test_deep, y_pred_deep)

In [56]:
accuracy

0.5061224489795918

In [58]:
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.67      0.66      2804
           1       0.34      0.31      0.32      1383
           2       0.10      0.09      0.10       468

    accuracy                           0.51      4655
   macro avg       0.36      0.36      0.36      4655
weighted avg       0.50      0.51      0.50      4655



In [None]:
############################################ CLASSICAL #########################################################

In [59]:
# Standardize the features
scaler = StandardScaler()
X_train_classical = scaler.fit_transform(X_train_classical)
X_test_classical = scaler.transform(X_test_classical)

In [60]:
# Train a Random Forest classifier
rf_classifier = RandomForestClassifier(class_weight = {0 : 0.05, 1 : 0.1, 2 : 0.85}, n_estimators=150, random_state=42)
rf_classifier.fit(X_train_classical, y_train_classical)

In [61]:
y_pred_classical = rf_classifier.predict(X_test_classical)

In [62]:
# Evaluate the model
accuracy = accuracy_score(y_test_classical, y_pred_classical)
report = classification_report(y_test_classical, y_pred_classical)

In [63]:
accuracy

0.635016111707841

In [64]:
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.97      0.77      2804
           1       0.62      0.18      0.28      1383
           2       0.17      0.00      0.00       468

    accuracy                           0.64      4655
   macro avg       0.48      0.38      0.35      4655
weighted avg       0.59      0.64      0.55      4655



In [65]:
######################################### ENSEMBLE ###########################################################

In [66]:
# Step 5: Combine predictions using stacking
stacked_features_train = np.column_stack((y_pred_deep, y_pred_classical))

In [67]:
## MOVE above later
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

In [68]:
# Optional: Train a logistic regression meta-model
meta_model = LogisticRegression()
meta_model.fit(stacked_features_train, y_test_deep)

In [69]:
# Use cross_val_predict for out-of-fold predictions
# y_pred_deep ready
# y_pred_classical ready
stacked_features_test = np.column_stack((y_pred_deep, y_pred_classical))

In [70]:
y_pred_ensemble = meta_model.predict(stacked_features_test)

In [71]:
# Step 6: Evaluate the ensemble
ensemble_accuracy = accuracy_score(y_test_deep, y_pred_ensemble)
print(f'Ensemble Accuracy: {ensemble_accuracy}')

Ensemble Accuracy: 0.6354457572502685


In [72]:
report = classification_report(y_test_deep, y_pred_ensemble)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.97      0.77      2804
           1       0.62      0.18      0.28      1383
           2       0.00      0.00      0.00       468

    accuracy                           0.64      4655
   macro avg       0.42      0.38      0.35      4655
weighted avg       0.57      0.64      0.55      4655



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


2

In [96]:
from sklearn.svm import SVC

In [99]:
# Create a Support Vector Machine model
svm_model = SVC(kernel='poly', C=1, random_state=42)

In [112]:
check = stacked_features_train[:,0] == stacked_features_train[:,1]

In [114]:
for i in range(len(check)):
    print(stacked_features_train[i])

[0 0]
[0 0]
[0 0]
[1 0]
[1 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[2 0]
[0 0]
[1 0]
[1 0]
[1 0]
[0 0]
[2 0]
[2 0]
[0 0]
[1 0]
[0 0]
[0 1]
[1 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[2 0]
[0 0]
[0 0]
[0 0]
[1 1]
[0 0]
[0 0]
[1 0]
[1 0]
[0 0]
[0 0]
[0 0]
[0 0]
[2 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[2 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 1]
[0 0]
[1 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 1]
[0 0]
[2 0]
[0 0]
[0 0]
[1 0]
[2 0]
[1 0]
[1 0]
[2 0]
[0 0]
[0 0]
[0 0]
[1 0]
[1 0]
[1 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[1 0]
[1 0]
[1 0]
[0 0]
[0 1]
[0 0]
[0 0]
[0 0]
[0 1]
[2 0]
[1 0]
[0 0]
[1 1]
[1 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[1 1]
[0 0]
[0 0]
[1 0]
[0 0]
[1 1]
[0 0]
[1 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[1 1]
[1 0]
[1 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 1]
[1 1]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[1 0]
[1 0]
[1 0]
[0 0]
[0 0]
[0 0]
[0 0]
[1 0]
[0 0]
[1 0]
[0 0]
[0 0]
[1 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 0]
[0 1]
[0 0]
[0 0]
[1 0]
[0 0]
[0 0]
[0 0

In [None]:
# Train the model
svm_model.fit(stacked_features_train, y_test_deep)

In [73]:



meta_model_cnn.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
meta_model_cnn.fit(stacked_features_train, y_test_deep, epochs=60, batch_size=32, validation_split=0.15)



y_pred_ensemble_cnn = meta_model_cnn.predict(stacked_features_test)

# Step 8: Evaluate the ensemble
ensemble_accuracy = accuracy_score(y_test_deep, np.argmax(y_pred_ensemble_cnn, axis=1))
print(f'Ensemble Accuracy: {ensemble_accuracy}')

NameError: name 'keras' is not defined