In [3]:
!pip install wfdb
import pandas as pd
import numpy as np
import wfdb
import ast
import os
from sklearn.preprocessing import MultiLabelBinarizer

# Function to load raw data
def load_raw_data(df, path):
    """
    Load raw ECG data at a fixed sampling frequency of 500 Hz.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing ECG metadata, including file paths.
    path : str
        Base path to the PTB-XL dataset.

    Returns:
    --------
    numpy.ndarray
        Array of raw ECG signals.
    """
    data = [wfdb.rdsamp(path + f) for f in df.filename_hr]
    return np.array([signal for signal, meta in data])

# Base path for the PTB-XL dataset
path = '/kaggle/input/ptb-xl-dataset/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1/'

# Fixed classification type and lead type
classification_type = "superclasses"  # {"binary", "superclasses", "subclasses"}
lead_types = {
    "lead-I": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
    "bipolar-limb": [3, 4, 5, 6, 7, 8, 9, 10, 11],
    "unipolar-limb": [0, 1, 2, 6, 7, 8, 9, 10, 11],
    "limb-leads": [6, 7, 8, 9, 10, 11],
    "precordial-leads": [0, 1, 2, 3, 4, 5],
    "all-lead": [],
}
lead_name = "lead-I"

# Load and convert annotation data
Y = pd.read_csv(path + 'ptbxl_database.csv', index_col='ecg_id')
Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))

# Load raw ECG data
X = load_raw_data(Y, path)

# Load scp_statements.csv for diagnostic aggregation
agg_df = pd.read_csv(path + 'scp_statements.csv', index_col=0)
agg_df = agg_df[agg_df.diagnostic == 1]

# Aggregation functions for superclasses and subclasses
def aggregate_superclass_diagnostic(y_dic):
    return list(set(agg_df.loc[key].diagnostic_class for key in y_dic.keys() if key in agg_df.index))

def aggregate_subclass_diagnostic(y_dic):
    return list(set(agg_df.loc[key].diagnostic_subclass for key in y_dic.keys() if key in agg_df.index))

# Apply diagnostic aggregation
if classification_type == "superclasses":
    Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_superclass_diagnostic)
else:
    Y['diagnostic_superclass'] = Y.scp_codes.apply(aggregate_subclass_diagnostic)

# Split data into train and test sets
test_fold = 10
X_train = X[np.where(Y.strat_fold != test_fold)]
y_train = Y[Y.strat_fold != test_fold].diagnostic_superclass.tolist()

X_test = X[np.where(Y.strat_fold == test_fold)]
y_test = Y[Y.strat_fold == test_fold].diagnostic_superclass.tolist()

# Binary classification (if applicable)
if classification_type == "binary":
    for idx, labels in enumerate(y_train):
        y_train[idx] = 1 if 'NORM' in labels else 0
    for idx, labels in enumerate(y_test):
        y_test[idx] = 1 if 'NORM' in labels else 0

# Reshape data for selected lead type
def preprocess_lead_data(data, lead_indices):
    """
    Remove unwanted leads and reshape the data.

    Parameters:
    -----------
    data : numpy.ndarray
        Raw ECG signal data.
    lead_indices : list
        Indices of leads to remove.

    Returns:
    --------
    numpy.ndarray
        Preprocessed ECG data.
    """
    processed_data = []
    for ecg in data:
        processed_data.append(np.delete(ecg, lead_indices, axis=1))
    return np.array(processed_data)

list_train = preprocess_lead_data(X_train, lead_types[lead_name])
list_test = preprocess_lead_data(X_test, lead_types[lead_name])

# Save processed data
np.save('/kaggle/working/x_train.npy', np.array(list_train))
np.save('/kaggle/working/x_test.npy', np.array(list_test))

# Convert labels to binary matrices
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

# Save labels
np.save('/kaggle/working/y_train.npy', y_train)
np.save('/kaggle/working/y_test.npy', y_test)

print("Preprocessing complete.")

Preprocessing complete.


In [4]:

"""###Import Libraries"""
from tensorflow.keras.layers import (Conv2D,Conv1D, Add,Activation,
                                     Dropout,Dense,Flatten,Input,BatchNormalization,
                                      ReLU,MaxPooling2D,Concatenate,GlobalAveragePooling2D,MaxPooling1D,GlobalAveragePooling1D
                                     )
from tensorflow.keras.models import Sequential
from tensorflow.keras import optimizers, losses, metrics, regularizers, callbacks
from tensorflow.keras.models import Model
import tensorflow as tf
import numpy as np
import os

"""#### Import Data"""

path = '/kaggle/working/'

calssificatin_type = {"binary":1,"superclasses":5,"subclasses":23}
classification_name="superclasses"
no_of_classes=calssificatin_type[classification_name]

lead_type={"lead-I":1, "bipolar-limb":3 , "unipolar-limb":3, "limb-leads":6 , "precordial-leads":6,"all-lead":12}
lead_name= "lead-I"
no_of_leads=lead_type[lead_name]

x_train = np.load(path + 'x_train.npy', allow_pickle=True)
x_test  = np.load(path + 'x_test.npy', allow_pickle=True)
y_train = np.load(path + 'y_train.npy', allow_pickle=True)
y_test  = np.load(path + 'y_test.npy', allow_pickle=True)

print("Original shapes:")
print("x_train:", x_train.shape)
print("x_test:", x_test.shape)

x_train = x_train.transpose(0, 2, 1).reshape(-1, 5000, 1)
x_test = x_test.transpose(0, 2, 1).reshape(-1, 5000, 1)
print("After transpose:")
print("x_train:", x_train.shape)
print("x_test:", x_test.shape)

# Use actual dimensions from the arrays, not hardcoded values
actual_train_samples = x_train.shape[0]
actual_test_samples = x_test.shape[0]

print(f"Using {actual_train_samples} training samples and {actual_test_samples} test samples")


print("Final shapes:")
print("x_train:", x_train.shape)
print("x_test:", x_test.shape)

print("x_train :", x_train.shape)
print("y_train :", y_train.shape)
print("x_test  :", x_test.shape)
print("y_test  :", y_test.shape)
print('Data loaded')


from sklearn.preprocessing import MultiLabelBinarizer



"""#### Model"""



# Define the input shape for 5000 time points and 1 lead
input = Input(shape=(5000, no_of_leads))  # Note: Transpose data to match this shape (5000, 1)

# First convolutional block
conv1 = Conv1D(filters=32, kernel_size=15, strides=1, padding='same')(input)
batch1 = BatchNormalization()(conv1)
relu1 = ReLU()(batch1)

# Second convolutional block
conv2 = Conv1D(filters=64, kernel_size=15, strides=2, padding='same')(relu1)
batch2 = BatchNormalization()(conv2)
relu2 = ReLU()(batch2)
drop2 = Dropout(rate=0.1)(relu2)

# Shortcut connection
max1 = MaxPooling1D(pool_size=5, strides=2, padding='same')(relu1)
conv_ = Conv1D(64, kernel_size=1, strides=1, padding='same')(max1)
conc1 = Add()([conv2, conv_])

# Third convolutional block
batch3 = BatchNormalization()(conc1)
relu3 = ReLU()(batch3)
drop3 = Dropout(rate=0.1)(relu3)
conv3 = Conv1D(filters=128, kernel_size=15, strides=2, padding='same')(drop3)
batch3 = BatchNormalization()(conv3)
relu3 = ReLU()(batch3)
drop3 = Dropout(rate=0.1)(relu3)

# Shortcut connection
max2 = MaxPooling1D(pool_size=5, strides=2, padding='same')(conc1)
conv_ = Conv1D(128, kernel_size=1, strides=1, padding='same')(max2)
conc2 = Add()([conv3, conv_])

# Fourth convolutional block
batch4 = BatchNormalization()(conc2)
relu4 = ReLU()(batch4)
drop4 = Dropout(rate=0.1)(relu4)
conv4 = Conv1D(filters=256, kernel_size=15, strides=2, padding='same')(drop4)
batch4 = BatchNormalization()(conv4)
relu4 = ReLU()(batch4)
drop4 = Dropout(rate=0.1)(relu4)

# Shortcut connection
max3 = MaxPooling1D(pool_size=5, strides=2, padding='same')(conc2)
conv_ = Conv1D(256, kernel_size=1, strides=1, padding='same')(max3)
conc3 = Add()([conv4, conv_])

# Final convolutional block
batch5 = BatchNormalization()(conc3)
relu5 = ReLU()(batch5)

# Global pooling and fully connected layers
X = GlobalAveragePooling1D()(relu5)

X = Dense(units=128, kernel_regularizer=tf.keras.regularizers.L2(0.005))(X)
X = BatchNormalization()(X)
X = ReLU()(X)
X = Dropout(rate=0.1)(X)

X = Dense(units=64, kernel_regularizer=tf.keras.regularizers.L2(0.009))(X)
X = BatchNormalization()(X)
X = ReLU()(X)
X = Dropout(rate=0.15)(X)

# Output layer
output = Dense(no_of_classes, activation='sigmoid')(X)

# Define the model
model = Model(inputs=input, outputs=output)

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC(curve='ROC', multi_label=True)]
)

print(model.summary())


"""#### Train Model"""
import tensorflow as tf 
early    = callbacks.EarlyStopping(monitor="val_loss", patience=6, restore_best_weights=True)
reducelr = callbacks.ReduceLROnPlateau(monitor="val_loss", patience=3)

callback = [early, reducelr]


model.compile(optimizer = optimizers.Adam(learning_rate=0.0005),
              loss = losses.BinaryCrossentropy(),
              metrics = [metrics.BinaryAccuracy(), metrics.AUC(curve='ROC', multi_label=True)])

history = model.fit(x_train, y_train, validation_split=0.12, epochs=100, batch_size=32, callbacks=callback)


# """##### Save Model"""

save_path = 'save_path/'
model.save(save_path + "First_Paper.h5")



"""Evaluate the model"""

from sklearn.metrics import precision_recall_curve, f1_score, roc_auc_score, accuracy_score, auc


def sklearn_metrics(y_true, y_pred):
    y_bin = np.copy(y_pred)
    y_bin[y_bin >= 0.5] = 1
    y_bin[y_bin < 0.5]  = 0

    # Compute area under precision-Recall curve
    auc_sum = 0
    for i in range(no_of_classes):
      precision, recall, thresholds = precision_recall_curve(y_true[:, i], y_pred[:,i])
      auc_sum += auc(recall, precision)

    print("Accuracy        : {:.2f}".format(accuracy_score(y_true.flatten(), y_bin.flatten())* 100))
    print("Macro AUC score : {:.2f}".format(roc_auc_score(y_true, y_pred, average='macro') * 100))
    print('AUPRC           : {:.2f}'.format((auc_sum / no_of_classes) * 100))
    print("Micro F1 score  : {:.2f}".format(f1_score(y_true, y_bin, average='micro') * 100))



from typing import Tuple
import numpy as np
import os

import numpy as np
import warnings
from sklearn.metrics import roc_auc_score, accuracy_score


def Metrics(y_true: np.ndarray, y_scores: np.ndarray) -> Tuple[float, float]:
    """Metrics for class-wise accuracy and mean accuracy.

    Parameters
    ----------
    y_true : np.ndarray
        Ground truth labels.
    y_scores : np.ndarray
        Predicted labels.

    Returns
    -------
    tuple[np.ndarray]
        Tuple of arrays containing class-wise accuracy and mean accuracy.

    """

    y_pred = y_scores >= 0.5
    acc = np.zeros(y_pred.shape[-1])

    for i in range(y_pred.shape[-1]):
        acc[i] = accuracy_score(y_true[:, i], y_pred[:, i])

    return acc.tolist(), np.mean(acc)


def AUC(y_true: np.ndarray, y_pred: np.ndarray, verbose: bool = False) -> float:
    """Computes the macro-averaged AUC score.

    Parameters
    ----------
    y_true : np.ndarray
        Ground truth labels.
    y_scores : np.ndarray
        Predicted probabilities.

    Returns
    -------
    float
        macro-average AUC score.

    """

    aucs = []
    assert (
        len(y_true.shape) == 2 and len(y_pred.shape) == 2
    ), "Predictions and labels must be 2D."
    for col in range(y_true.shape[1]):
        try:
            aucs.append(roc_auc_score(y_true[:, col], y_pred[:, col]))
        except ValueError as e:
            if verbose:
                print(
                    f"Value error encountered for label {col}, likely due to using mixup or "
                    f"lack of full label presence. Setting AUC to accuracy. "
                    f"Original error was: {str(e)}."
                )
            aucs.append((y_pred == y_true).sum() / len(y_pred))
    return aucs


y_pred_train = model.predict(x_train)
y_pred_test  = model.predict(x_test)

print("Train")
sklearn_metrics(y_train, y_pred_train)
print("\nTest")
sklearn_metrics(y_test, y_pred_test)

acc, mean_acc = Metrics(y_test, y_pred_test)
class_auc = AUC(y_test, y_pred_test)

print(f"class wise accuracy: {acc}")

print(f"class wise AUC : {class_auc}")

2025-05-10 14:42:17.496093: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746888137.730210      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746888137.790651      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Original shapes:
x_train: (19634, 5000, 1)
x_test: (2203, 5000, 1)
After transpose:
x_train: (19634, 5000, 1)
x_test: (2203, 5000, 1)
Using 19634 training samples and 2203 test samples
Final shapes:
x_train: (19634, 5000, 1)
x_test: (2203, 5000, 1)
x_train : (19634, 5000, 1)
y_train : (19634, 5)
x_test  : (2203, 5000, 1)
y_test  : (2203, 5)
Data loaded


I0000 00:00:1746888151.377052      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1746888151.377813      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


None
Epoch 1/100


I0000 00:00:1746888162.252032     108 service.cc:148] XLA service 0x78c87c003a60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1746888162.252778     108 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1746888162.252798     108 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1746888163.112454     108 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  2/540[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m39s[0m 74ms/step - auc_1: 0.4199 - binary_accuracy: 0.4672 - loss: 2.4759   

I0000 00:00:1746888171.884690     108 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 66ms/step - auc_1: 0.6784 - binary_accuracy: 0.7200 - loss: 1.5697 - val_auc_1: 0.7445 - val_binary_accuracy: 0.7666 - val_loss: 0.6885 - learning_rate: 5.0000e-04
Epoch 2/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 46ms/step - auc_1: 0.7810 - binary_accuracy: 0.8097 - loss: 0.5611 - val_auc_1: 0.7881 - val_binary_accuracy: 0.7567 - val_loss: 0.5367 - learning_rate: 5.0000e-04
Epoch 3/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 47ms/step - auc_1: 0.7963 - binary_accuracy: 0.8145 - loss: 0.4543 - val_auc_1: 0.8019 - val_binary_accuracy: 0.7982 - val_loss: 0.4604 - learning_rate: 5.0000e-04
Epoch 4/100
[1m540/540[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 49ms/step - auc_1: 0.7995 - binary_accuracy: 0.8173 - loss: 0.4328 - val_auc_1: 0.8053 - val_binary_accuracy: 0.7580 - val_loss: 0.5681 - learning_rate: 5.0000e-04
Epoch 5/100
[1m540/540[0m [32m━━━━━━━

In [5]:
# Save the complete model with weights, architecture, and optimizer state
model_save_path = '/kaggle/working/ecg_classifier.h5'
model.save(model_save_path)

# Also save the multilabel binarizer for consistent label encoding/decoding
import pickle
mlb_save_path = '/kaggle/working/mlb_encoder.pkl'
with open(mlb_save_path, 'wb') as f:
    pickle.dump(mlb, f)

print(f"Model saved to {model_save_path}")
print(f"MultiLabelBinarizer saved to {mlb_save_path}")

Model saved to /kaggle/working/ecg_classifier.h5
MultiLabelBinarizer saved to /kaggle/working/mlb_encoder.pkl


In [6]:
import numpy as np
import wfdb
import tensorflow as tf
import os
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
dataset_path="/kaggle/input/ptb-xl-dataset/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1"
def predict_ecg_from_file(dat_file_path, model_path, mlb_path=None, lead_name="lead-I"):
    """
    Preprocess a .dat file, load the saved model, and predict ECG classes in Kaggle environment.
    
    Parameters:
    -----------
    dat_file_path : str
        Path to the .dat file (without the .dat extension)
    model_path : str
        Path to the saved model (.h5 file)
    mlb_path : str, optional
        Path to the saved MultiLabelBinarizer pickle file for label decoding
    lead_name : str, optional
        Lead configuration to use, default is "lead-I"
        
    Returns:
    --------
    dict
        Dictionary containing prediction probabilities and class names if mlb_path is provided
    """
    # Define lead types mapping (same as in your training code)
    lead_types = {
        "lead-I": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        "bipolar-limb": [3, 4, 5, 6, 7, 8, 9, 10, 11],
        "unipolar-limb": [0, 1, 2, 6, 7, 8, 9, 10, 11],
        "limb-leads": [6, 7, 8, 9, 10, 11],
        "precordial-leads": [0, 1, 2, 3, 4, 5],
        "all-lead": [],
    }
    
    # Step 1: Load the raw ECG data
    try:
        # Read the signal file (wfdb expects path without extension)
        signal, meta = wfdb.rdsamp(dat_file_path)
        print(f"Loaded signal with shape: {signal.shape}")
        
        # Step 2: Preprocess the data
        # Remove unwanted leads according to lead_name configuration
        if lead_types[lead_name]:
            signal = np.delete(signal, lead_types[lead_name], axis=1)
            print(f"After lead selection: {signal.shape}")
        
        # Reshape to match model input requirements
        # Assuming fixed length of 5000 time points
        if signal.shape[0] != 5000:
            print(f"Warning: Signal length is {signal.shape[0]}, expected 5000. Resampling...")
            # Resample to 5000 points if needed (simple interpolation)
            from scipy.interpolate import interp1d
            x = np.linspace(0, 1, signal.shape[0])
            x_new = np.linspace(0, 1, 5000)
            f = interp1d(x, signal, axis=0, kind='linear')
            signal = f(x_new)
        
        # Reshape for model input (1, 5000, n_leads)
        X = signal.transpose(1, 0).reshape(1, 5000, -1)
        print(f"Final input shape: {X.shape}")
        
        # Step 3: Load the model
        model = tf.keras.models.load_model(model_path)
        print("Model loaded successfully")
        
        # Step 4: Make predictions
        predictions = model.predict(X)
        
        # Step 5: Process results
        result = {"raw_predictions": predictions[0].tolist()}
        
        # If mlb_path is provided, decode the labels
        if mlb_path and os.path.exists(mlb_path):
            with open(mlb_path, 'rb') as f:
                mlb = pickle.load(f)
            
            # Get classes with probability > 0.5
            binary_preds = (predictions[0] >= 0.5).astype(int)
            predicted_classes = mlb.inverse_transform(binary_preds.reshape(1, -1))[0]
            
            # Add class probabilities to result
            class_probs = {}
            for i, class_name in enumerate(mlb.classes_):
                class_probs[class_name] = float(predictions[0][i])
            
            result["predicted_classes"] = list(predicted_classes)
            result["class_probabilities"] = class_probs
        
        return result
        
    except Exception as e:
        import traceback
        traceback.print_exc()
        return {"error": str(e)}

# This function creates an example to test the model with an example from the dataset
def test_with_dataset_example(dataset_path, model_path, mlb_path=None, sample_idx=0):
    """
    Load a sample from the original dataset to test the model
    
    Parameters:
    -----------
    dataset_path : str
        Path to the PTB-XL dataset
    model_path : str
        Path to the saved model
    mlb_path : str, optional
        Path to the saved MultiLabelBinarizer
    sample_idx : int, optional
        Index of the sample to use from the test set
        
    Returns:
    --------
    dict
        Prediction results and ground truth
    """
    import pandas as pd
    import ast
    
    # Load metadata
    meta_path = os.path.join(dataset_path, 'ptbxl_database.csv')
    Y = pd.read_csv(meta_path, index_col='ecg_id')
    Y.scp_codes = Y.scp_codes.apply(lambda x: ast.literal_eval(x))
    
    # Load SCP statements for diagnostic classes
    agg_path = os.path.join(dataset_path, 'scp_statements.csv')
    agg_df = pd.read_csv(agg_path, index_col=0)
    agg_df = agg_df[agg_df.diagnostic == 1]
    
    # Get test data (fold 10)
    test_fold = 10
    test_data = Y[Y.strat_fold == test_fold]
    
    if len(test_data) == 0:
        return {"error": "No test data found"}
    
    # Select a sample
    if sample_idx >= len(test_data):
        sample_idx = 0
        
    sample = test_data.iloc[sample_idx]
    file_path = os.path.join(dataset_path, sample.filename_hr)
    
    # Get ground truth labels (superclasses)
    def aggregate_diagnostic(y_dic):
        return list(set(agg_df.loc[key].diagnostic_class for key in y_dic.keys() 
                      if key in agg_df.index))
                      
    true_labels = aggregate_diagnostic(sample.scp_codes)
    
    print(f"Testing with example: {file_path}")
    print(f"True labels: {true_labels}")
    
    # Get prediction
    prediction = predict_ecg_from_file(file_path, model_path, mlb_path)
    prediction["true_labels"] = true_labels
    
    return prediction

# Example usage for Kaggle notebook - just copy and modify this code in your notebook
def example_usage():
    """
    Example of how to use the function in a Kaggle notebook
    """
    # Kaggle paths
    model_path = '/kaggle/working/ecg_classifier.h5'
    mlb_path = '/kaggle/working/mlb_encoder.pkl'
    
    # Option 1: Test with a sample from the original dataset
    results = test_with_dataset_example(dataset_path, model_path, mlb_path,967)
    
    # Option 2: If you have a specific .dat file to test
    # specific_file = '/kaggle/input/test-file/calibrated_ecg.dat'  # path without .dat extension
    # results = predict_ecg_from_file(specific_file, model_path, mlb_path)
    
    # Print results
    print("\n----- PREDICTION RESULTS -----")
    if "error" in results:
        print(f"Error: {results['error']}")
    else:
        if "true_labels" in results:
            print(f"True labels: {results['true_labels']}")
            
        if "predicted_classes" in results:
            print(f"Predicted classes: {results['predicted_classes']}")
            
            # Show probabilities of all classes
            print("\nClass probabilities:")
            for cls, prob in sorted(results["class_probabilities"].items(), 
                                   key=lambda x: x[1], reverse=True):
                print(f"- {cls}: {prob:.4f}")
        else:
            print("Raw predictions:", results["raw_predictions"])
            
example_usage()

Testing with example: /kaggle/input/ptb-xl-dataset/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.1/records500/08000/08344_hr
True labels: ['MI']
Loaded signal with shape: (5000, 12)
After lead selection: (5000, 1)
Final input shape: (1, 5000, 1)
Model loaded successfully
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step

----- PREDICTION RESULTS -----
True labels: ['MI']
Predicted classes: []

Class probabilities:
- NORM: 0.4742
- CD: 0.3691
- MI: 0.3162
- HYP: 0.1057
- STTC: 0.0925


In [8]:
import numpy as np
import os
import tensorflow as tf
import pickle
from scipy.interpolate import interp1d
from collections import Counter

def read_lead_i_long_dat_file(dat_file_path, sampling_rate=500, data_format='16', scale_factor=0.001):
    """
    Read a 30-second pure Lead I .dat file directly and properly scale it
    
    Parameters:
    -----------
    dat_file_path : str
        Path to the .dat file (with or without .dat extension)
    sampling_rate : int
        Sampling rate in Hz (default 500Hz)
    data_format : str
        Data format of the binary file: '16' for 16-bit integers, '32' for 32-bit floats
    scale_factor : float
        Scale factor to convert units (0.001 for converting µV to mV)
        
    Returns:
    --------
    numpy.ndarray
        ECG signal data for Lead I with shape (total_samples,)
    """
    # Ensure the path ends with .dat
    if not dat_file_path.endswith('.dat'):
        dat_file_path += '.dat'
    
    # Expected samples for full 30 seconds
    expected_samples = sampling_rate * 30
    
    # Read the binary data
    try:
        if data_format == '16':
            # 16-bit signed integers (common format for ECG)
            data = np.fromfile(dat_file_path, dtype=np.int16)
        elif data_format == '32':
            # 32-bit floating point (less common)
            data = np.fromfile(dat_file_path, dtype=np.float32)
        else:
            raise ValueError(f"Unsupported data format: {data_format}")
    
        print(f"Read {len(data)} data points from file")
        
        # For pure Lead I data, no need to reshape or extract
        print(f"Processing pure Lead I data (applying scaling factor {scale_factor})")
        
        # Apply scaling to convert µV to mV
        signal = data * scale_factor
        print(f"Signal amplitude range after scaling: {np.min(signal):.4f} to {np.max(signal):.4f}")
        
        # Handle if signal is not exactly 30 seconds
        if len(signal) < expected_samples:
            print(f"Warning: Signal is shorter than expected 30 seconds ({len(signal)} samples)")
            # Pad with zeros if too short
            padded_signal = np.zeros(expected_samples)
            padded_signal[:len(signal)] = signal
            signal = padded_signal
        elif len(signal) > expected_samples:
            print(f"Warning: Signal is longer than expected 30 seconds ({len(signal)} samples)")
            # Truncate if too long
            signal = signal[:expected_samples]
        
        return signal
        
    except Exception as e:
        print(f"Error reading data file: {e}")
        raise

def segment_signal(signal, sampling_rate=500):
    """
    Segment a 30-second signal into three 10-second segments
    
    Parameters:
    -----------
    signal : numpy.ndarray
        The full signal to segment
    sampling_rate : int
        Sampling rate in Hz
        
    Returns:
    --------
    list
        List of three 10-second signal segments
    """
    # Calculate samples per segment (10 seconds)
    segment_samples = sampling_rate * 10
    
    # Expected samples for full 30 seconds
    expected_samples = sampling_rate * 30
    
    # Ensure the signal is 30 seconds long
    if len(signal) != expected_samples:
        print(f"Warning: Signal length is {len(signal)}, expected {expected_samples}. Reshaping...")
        # Resample to 30 seconds
        x = np.linspace(0, 1, len(signal))
        x_new = np.linspace(0, 1, expected_samples)
        f = interp1d(x, signal, kind='linear', bounds_error=False, fill_value="extrapolate")
        signal = f(x_new)
    
    # Split the signal into three 10-second segments
    segments = []
    for i in range(3):
        start_idx = i * segment_samples
        end_idx = (i + 1) * segment_samples
        segment = signal[start_idx:end_idx]
        segments.append(segment)
        
    return segments

def process_segment(segment, sampling_rate=500):
    """
    Process a segment of ECG data to ensure it's properly formatted for the model
    
    Parameters:
    -----------
    segment : numpy.ndarray
        Raw ECG segment
    sampling_rate : int
        Sampling rate of the ECG
        
    Returns:
    --------
    numpy.ndarray
        Processed segment ready for model input
    """
    # Ensure correct length (5000 samples for 10 seconds)
    if len(segment) != 5000:
        x = np.linspace(0, 1, len(segment))
        x_new = np.linspace(0, 1, 5000)
        f = interp1d(x, segment, kind='linear', bounds_error=False, fill_value="extrapolate")
        segment = f(x_new)
    
    # Optional: Apply additional preprocessing if needed for your model
    # For example, if your model was trained on normalized data:
    # if np.std(segment) > 0:
    #     segment = (segment - np.mean(segment)) / np.std(segment)
    
    return segment

def predict_with_voting(dat_file_path, model_path, mlb_path=None, sampling_rate=500, scale_factor=0.001):
    """
    Process a 30-second .dat file, properly scale it, split it into three 10-second segments,
    make predictions on each segment, and determine the final result through voting.
    
    Parameters:
    -----------
    dat_file_path : str
        Path to the .dat file
    model_path : str
        Path to the saved model (.h5 file)
    mlb_path : str, optional
        Path to the saved MultiLabelBinarizer pickle file for label decoding
    sampling_rate : int
        Sampling rate in Hz (default 500Hz)
    scale_factor : float
        Scale factor to convert units (0.001 for converting µV to mV)
        
    Returns:
    --------
    dict
        Dictionary containing voting results, segment predictions, and final class probabilities
    """
    try:
        # Step 1: Read the 30-second ECG data (pure Lead I) and apply scaling
        full_signal = read_lead_i_long_dat_file(
            dat_file_path, 
            sampling_rate=sampling_rate,
            scale_factor=scale_factor
        )
        print(f"Loaded 30-second Lead I signal with {len(full_signal)} samples")
        
        # Step 2: Split into three 10-second segments
        segments = segment_signal(full_signal, sampling_rate)
        print(f"Split into {len(segments)} segments of {len(segments[0])} samples each")
        
        # Step 3: Load the model (load once to improve performance)
        model = tf.keras.models.load_model(model_path)
        print("Model loaded successfully")
        
        # Load MLB if provided
        mlb = None
        if mlb_path and os.path.exists(mlb_path):
            with open(mlb_path, 'rb') as f:
                mlb = pickle.load(f)
        
        # Step 4: Process each segment and collect predictions
        segment_results = []
        all_predictions = []
        all_predicted_classes = []
        
        for i, segment in enumerate(segments):
            print(f"Processing segment {i+1}/3...")
            
            # Process the segment to ensure it's properly formatted
            processed_segment = process_segment(segment)
            
            # Reshape for model input (batch, time, channels)
            X = processed_segment.reshape(1, 5000, 1)
            
            # Make predictions
            predictions = model.predict(X, verbose=0)
            all_predictions.append(predictions[0])
            
            # Process segment results
            segment_result = {"raw_predictions": predictions[0].tolist()}
            
            # Decode labels if MLB is provided
            if mlb is not None:
                # Get classes with probability > 0.5
                binary_preds = (predictions[0] >= 0.5).astype(int)
                predicted_classes = mlb.inverse_transform(binary_preds.reshape(1, -1))[0]
                all_predicted_classes.append(set(predicted_classes))
                
                # Add class probabilities
                class_probs = {}
                for j, class_name in enumerate(mlb.classes_):
                    class_probs[class_name] = float(predictions[0][j])
                
                segment_result["predicted_classes"] = list(predicted_classes)
                segment_result["class_probabilities"] = class_probs
            
            segment_results.append(segment_result)
        
        # Step 5: Implement voting mechanism
        final_result = {"segment_results": segment_results}
        
        # Average the raw predictions
        avg_predictions = np.mean(all_predictions, axis=0)
        final_result["averaged_raw_predictions"] = avg_predictions.tolist()
        
        # Implement voting for class labels
        if mlb is not None:
            # Flatten all predicted classes into a single list
            all_classes = []
            for classes in all_predicted_classes:
                all_classes.extend(classes)
            
            # Count occurrences of each class
            class_counts = Counter(all_classes)
            
            # Classes that appear in at least 2 of 3 segments (majority vote)
            voted_classes = [cls for cls, count in class_counts.items() if count >= 2]
            
            # If no classes meet the threshold, take the ones with highest average probability
            if not voted_classes:
                # Calculate average probability for each class
                avg_class_probs = {}
                for cls_idx, cls_name in enumerate(mlb.classes_):
                    avg_prob = np.mean([pred[cls_idx] for pred in all_predictions])
                    avg_class_probs[cls_name] = avg_prob
                
                # Take classes with average probability > 0.5
                voted_classes = [cls for cls, prob in avg_class_probs.items() if prob >= 0.5]
                
                # If still no classes, take the top one
                if not voted_classes:
                    voted_classes = [max(avg_class_probs.items(), key=lambda x: x[1])[0]]
            
            # Calculate final class probabilities (average across segments)
            final_class_probs = {}
            for cls_idx, cls_name in enumerate(mlb.classes_):
                final_class_probs[cls_name] = float(np.mean([pred[cls_idx] for pred in all_predictions]))
            
            final_result["voted_classes"] = voted_classes
            final_result["final_class_probabilities"] = final_class_probs
        
        return final_result
        
    except Exception as e:
        import traceback
        traceback.print_exc()
        return {"error": str(e)}

def visualize_segment(signal, sampling_rate=500, segment_num=None):
    """
    Helper function to visualize a segment of ECG data
    
    Parameters:
    -----------
    signal : numpy.ndarray
        The ECG signal to visualize
    sampling_rate : int
        Sampling rate of the ECG
    segment_num : int, optional
        Segment number for title
    """
    try:
        import matplotlib.pyplot as plt
        
        # Time axis in seconds
        time = np.arange(len(signal)) / sampling_rate
        
        plt.figure(figsize=(15, 4))
        plt.plot(time, signal)
        title = "ECG Signal"
        if segment_num is not None:
            title += f" - Segment {segment_num}"
        plt.title(title)
        plt.xlabel("Time (seconds)")
        plt.ylabel("Amplitude (mV)")
        plt.grid(True)
        
        filename = f"/kaggle/working/ecg_segment_{segment_num}.png" if segment_num else "/kaggle/working/ecg_full.png"
        plt.savefig(filename)
        plt.close()
        print(f"Visualization saved to {filename}")
        
    except Exception as e:
        print(f"Error visualizing signal: {e}")

# Main execution for Kaggle environment
if __name__ == "__main__":
    # Specific paths for Kaggle
    dat_file_path = '/kaggle/input/test-file/calibrated_ecg.dat'
    model_path = '/kaggle/working/ecg_classifier.h5'
    mlb_path = '/kaggle/working/mlb_encoder.pkl'
    
    print("Starting ECG analysis...")
    print(f"DAT file: {dat_file_path}")
    print(f"Model path: {model_path}")
    print(f"MLB path: {mlb_path}")
    
    # Get predictions with voting - now with proper scaling from µV to mV
    results = predict_with_voting(
        dat_file_path, 
        model_path, 
        mlb_path,
        scale_factor=0.001  # Convert microvolts to millivolts
    )
    
    # Print results
    print("\n----- PREDICTION RESULTS WITH VOTING -----")
    if "error" in results:
        print(f"Error: {results['error']}")
    else:
        if "voted_classes" in results:
            print(f"\nFINAL PREDICTION (VOTED): {results['voted_classes']}")
            
            # Show probabilities of all classes (sorted by probability)
            print("\nFinal Class Probabilities (averaged across segments):")
            for cls, prob in sorted(results["final_class_probabilities"].items(), 
                                    key=lambda x: x[1], reverse=True):
                print(f"- {cls}: {prob:.4f}")
            
            # Show segment-by-segment results
            print("\nResults by segment:")
            for i, segment_result in enumerate(results["segment_results"]):
                print(f"\nSegment {i+1}:")
                if "predicted_classes" in segment_result:
                    if segment_result["predicted_classes"]:
                        print(f"Classes: {segment_result['predicted_classes']}")
                    else:
                        print("No classes exceeded the threshold")
                    
                    # Show top probabilities for each segment
                    print("Top probabilities:")
                    top_probs = sorted(segment_result["class_probabilities"].items(), 
                                      key=lambda x: x[1], reverse=True)[:3]
                    for cls, prob in top_probs:
                        print(f"- {cls}: {prob:.4f}")
                else:
                    print("No class probabilities available")
        else:
            print("Raw predictions (averaged):", results["averaged_raw_predictions"])
            
    print("\nAnalysis complete!")

Starting ECG analysis...
DAT file: /kaggle/input/test-file/calibrated_ecg.dat
Model path: /kaggle/working/ecg_classifier.h5
MLB path: /kaggle/working/mlb_encoder.pkl
Read 15000 data points from file
Processing pure Lead I data (applying scaling factor 0.001)
Signal amplitude range after scaling: -0.7330 to 1.5470
Loaded 30-second Lead I signal with 15000 samples
Split into 3 segments of 5000 samples each
Model loaded successfully
Processing segment 1/3...
Processing segment 2/3...
Processing segment 3/3...

----- PREDICTION RESULTS WITH VOTING -----

FINAL PREDICTION (VOTED): ['CD']

Final Class Probabilities (averaged across segments):
- CD: 0.3410
- NORM: 0.2947
- HYP: 0.2457
- MI: 0.1499
- STTC: 0.1111

Results by segment:

Segment 1:
No classes exceeded the threshold
Top probabilities:
- CD: 0.4235
- NORM: 0.2705
- HYP: 0.2462

Segment 2:
No classes exceeded the threshold
Top probabilities:
- NORM: 0.3566
- HYP: 0.2556
- CD: 0.2008

Segment 3:
No classes exceeded the threshold
Top 