In [1]:
import os
import csv
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Dropout, MaxPooling1D, Flatten
from tensorflow import convert_to_tensor, float32, expand_dims
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, LabelBinarizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from scipy import signal
import pywt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import signal
from __future__ import print_function
import numpy as np
import mlflow
from sklearn.utils.class_weight import compute_class_weight
%matplotlib inline


In [2]:
# Collect the data files of the dataset directory, in sorted file-name order.
# Files ending in '.csv' go to `records`; every other file is treated as an
# annotation file.
# NOTE(review): records[i] and annotations[i] are later used as a pair, which
# presumably relies on the directory holding exactly one annotation file per
# CSV and nothing else - confirm.
path = "Dataset/mitbih_database"
filenames = sorted(next(os.walk(path))[2])
records = []
annotations = []
for f in filenames:
    filename, file_extension = os.path.splitext(f)
    (records if file_extension == '.csv' else annotations).append(
        path + '/' + filename + file_extension
    )

In [3]:
def get_record_signals(index):
    """Load the samples and the annotations of one record.

    Reads ``records[index]`` as a comma-separated file and
    ``annotations[index]`` as a text file whose rows are split on spaces.
    The first row of each file is skipped.

    Parameters
    ----------
    index : int
        Position of the record in the ``records`` / ``annotations`` lists.

    Returns
    -------
    signals : np.ndarray
        Integer values parsed from the second column of the record file,
        in file order.
    labels : np.ndarray
        One row per annotation, built from the second (converted with
        ``int``) and third space-separated fields of the row. Because each
        row mixes an int with a str, NumPy stores both columns as strings.
    """
    with open(records[index], 'r') as sample_file:
        sample_rows = csv.reader(sample_file, delimiter=',', quotechar='|')
        next(sample_rows, None)  # skip the first (header) row
        signals = np.array([int(cols[1]) for cols in sample_rows])

    labels = []
    with open(annotations[index], 'r') as annotation_file:
        annotation_rows = csv.reader(annotation_file, delimiter=',', quotechar='|')
        next(annotation_rows, None)  # skip the first (header) row
        for cols in annotation_rows:
            # Columns are padded with runs of spaces; drop the empty tokens.
            fields = [token for token in cols[0].split(" ") if len(token) > 0]
            labels.append([int(fields[1]), fields[2]])
    labels = np.array(labels)

    return signals, labels

In [4]:
def apply_detrend_and_butterworth(signals, fs=360.0, lowcut=0.5, highcut=40.0, order=4):
    """Remove the linear trend from a signal and band-pass filter it.

    Parameters
    ----------
    signals : array_like
        1-D signal to filter.
    fs : float, optional
        Sampling frequency in Hz (default 360.0).
    lowcut, highcut : float, optional
        Lower and upper band edges in Hz (defaults 0.5 and 40.0).
    order : int, optional
        Order passed to ``signal.butter`` (default 4).

    Returns
    -------
    np.ndarray
        The detrended, band-pass filtered signal, same length as the input.
    """
    # Detrend: removes the DC / linear component.
    data_detrended = signal.detrend(signals)

    # Butterworth band-pass; band edges are normalised to the Nyquist frequency.
    nyq = 0.5 * fs
    b, a = signal.butter(order, [lowcut / nyq, highcut / nyq], btype='band')

    # filtfilt runs the filter forwards and backwards, giving zero phase shift.
    return signal.filtfilt(b, a, data_detrended)

In [5]:
def extract_wavelet_features(data):
    """Return the energy of each detail band of a 4-level 'sym4' decomposition.

    The approximation coefficients (first element returned by
    ``pywt.wavedec``) are skipped; for every remaining coefficient array the
    sum of its squared values is returned.

    Returns
    -------
    list
        One energy value per detail band, in the order ``pywt.wavedec``
        returns the bands.
    """
    detail_bands = pywt.wavedec(data, 'sym4', level=4)[1:]
    return [np.sum(band**2) for band in detail_bands]

In [6]:
def apply_wavelet(data):
    """Denoise a signal by thresholding its 'sym4' wavelet detail coefficients.

    The signal is decomposed to the maximum level allowed for its length,
    every detail band is passed through ``pywt.threshold`` (default mode)
    with a threshold of 4% of that band's largest (signed) coefficient, and
    the signal is reconstructed from the result. The approximation band is
    left untouched.

    Returns
    -------
    np.ndarray
        The reconstructed signal.
    """
    wavelet_name = 'sym4'
    relative_threshold = 0.04  # fraction of each band's maximum coefficient

    filter_len = pywt.Wavelet(wavelet_name).dec_len
    depth = pywt.dwt_max_level(len(data), filter_len)

    bands = pywt.wavedec(data, wavelet_name, level=depth)
    approximation, details = bands[0], bands[1:]
    shrunk = [
        pywt.threshold(band, relative_threshold * max(band))
        for band in details
    ]

    return pywt.waverec([approximation] + shrunk, wavelet_name)

In [7]:
def apply_welch(data):
    """Estimate the power spectral density of ``data`` with Welch's method.

    Uses a sampling frequency of 360 Hz, Hann-windowed segments of 64
    samples and an overlap of half a segment.

    Returns
    -------
    tuple
        ``(frequencies, power)`` as returned by ``signal.welch``.
    """
    segment_len = 64
    frequencies, power = signal.welch(
        data,
        fs=360.0,
        window='hann',
        nperseg=segment_len,
        noverlap=segment_len // 2,
    )
    return frequencies, power

In [8]:
def apply_fft(data):
    """Return the non-negative-frequency half of the amplitude spectrum.

    The FFT of ``data`` is computed for a sampling frequency of 360 Hz; only
    bins whose frequency is >= 0 are kept and magnitudes are divided by the
    number of samples.

    Returns
    -------
    tuple
        ``(magnitudes, frequencies)`` - note the magnitudes come first.
    """
    sample_rate = 360.0
    n_samples = len(data)

    spectrum = np.fft.fft(data)
    freq_axis = np.fft.fftfreq(n_samples, 1 / sample_rate)

    keep = freq_axis >= 0
    magnitudes = np.abs(spectrum[keep]) / n_samples
    return magnitudes, freq_axis[keep]

In [9]:
# Quick sanity check on a single record (index 6 of the sorted file list):
# sample count, raw value range and the annotation symbols that occur.
signals, labels = get_record_signals(6)
print("Number of samples:", signals.shape[0])
print("Value Range:", signals.min(), "->", signals.max())
print("Classes:", np.unique(labels[:, 1]))

Number of samples: 650000
Value Range: 611 -> 1538
Classes: ['+' 'N' 'V' '~']


In [10]:
def zscore_per_beat(x, eps=1e-8):
    """Standardise one beat to zero mean and (approximately) unit variance.

    ``eps`` is added to the standard deviation so that a constant input
    does not cause a division by zero.
    """
    centered = x - np.mean(x)
    spread = np.std(x) + eps
    return centered / spread


In [11]:
def _rr_window(r_pos, rr):
    """Return (start, end) sample indices spanning 0.6*rr before to 0.8*rr after r_pos."""
    return int(r_pos - 0.6 * rr), int(r_pos + 0.8 * rr)


def _beat_channel(segment, n_points=256):
    """Resample a beat segment to n_points, z-score it and wavelet-denoise it."""
    return apply_wavelet(zscore_per_beat(signal.resample(segment, n_points)))


def process_data(window_size):
    """Build the beat dataset from every record in ``records``.

    Each record is detrended and band-pass filtered, and only annotations
    whose symbol is one of A, L, N, R, V are kept. For every kept beat j
    (from the fourth one on) three windows are cut: around beat j-2, beat j-1
    and beat j. Each window extends 0.6 RR before and 0.8 RR after the
    annotated position, where RR is the distance to the preceding kept beat.
    Every window is resampled to 256 points, z-scored and wavelet-denoised;
    the three results are stacked as channels.

    Parameters
    ----------
    window_size : int
        Accepted for backward compatibility; the windows are sized from the
        RR intervals, so this value does not influence the result.

    Returns
    -------
    X : list of np.ndarray
        One array per beat, the three windows stacked along the last axis.
    y : list
        Annotation symbol of the current beat for every entry of ``X``.
    """
    X = []
    y = []

    valid_labels = {'A', 'L', 'N', 'R', 'V'}

    for i in range(len(records)):
        signals, labels = get_record_signals(i)
        signals = apply_detrend_and_butterworth(signals)
        sig_len = len(signals)

        mask = np.isin(labels[:, 1], list(valid_labels))
        filtered_labels = labels[mask]

        # The label array holds strings; parse the sample positions once.
        positions = [int(pos) for pos in filtered_labels[:, 0]]

        for j in range(3, len(filtered_labels)):
            rr_prev_2 = positions[j - 2] - positions[j - 3]
            rr_prev_1 = positions[j - 1] - positions[j - 2]
            rr_curr = positions[j] - positions[j - 1]

            start_prev_2, end_prev_2 = _rr_window(positions[j - 2], rr_prev_2)
            start_prev_1, end_prev_1 = _rr_window(positions[j - 1], rr_prev_1)
            start_curr, end_curr = _rr_window(positions[j], rr_curr)

            # Skip beats whose earliest window starts before the recording
            # or whose latest window runs past its end.
            if start_prev_2 < 0 or end_curr > sig_len:
                continue

            # A non-positive RR interval (e.g. two annotations on the same
            # sample) yields an empty slice, which cannot be resampled.
            if min(rr_prev_2, rr_prev_1, rr_curr) <= 0:
                continue

            fusion = np.stack([
                _beat_channel(signals[start_prev_2:end_prev_2]),
                _beat_channel(signals[start_prev_1:end_prev_1]),
                _beat_channel(signals[start_curr:end_curr]),
            ], axis=-1)

            X.append(fusion)
            y.append(filtered_labels[j][1])

    return X, y


In [12]:
# Hyperparameters shared by the data-preparation and training cells.
WINDOW_SIZE = 180   # passed to process_data() as window_size
EPOCHS = 500        # training epochs per model fit
BATCH_SIZE = 128    # mini-batch size used when fitting the model
TEST_SIZE = 0.25    # fraction of the beats held out by train_test_split
N_RUNS = 5          # number of repeated training runs

In [13]:
# End any MLflow run that is still active (for example one left open by an
# interrupted execution of the training cell) before new runs are started.
mlflow.end_run()

In [14]:
# Beat extraction is deterministic and does not depend on the run index, so
# the dataset is built once instead of re-filtering every record in each run.
X, y = process_data(window_size=WINDOW_SIZE)
X = np.array(X)
y = np.array(y)

# Single source of truth for the optimizer, so the value logged to MLflow
# always matches the one the model is compiled with.
OPTIMIZER = "rmsprop"

# Select the experiment BEFORE the first run is started; a run is attached to
# the experiment that is active at the moment start_run() is called.
mlflow.set_experiment("CONV1D_MIT_BIH_Arrythmia_Classification_Wavelet_ZScore_Relative_Window_2")

for i in range(N_RUNS):
    # The context manager ends the run even if training or evaluation raises.
    with mlflow.start_run():
        mlflow.log_param("model", "CONV1D-32-MaxPool2-CONV1D-32-MaxPool2-Dropout(0.1)-Dense512")
        mlflow.log_param("input_dim", (256, 3))
        mlflow.log_param("epochs", EPOCHS)
        mlflow.log_param("batch_size", BATCH_SIZE)
        mlflow.log_param("optimizer", OPTIMIZER)
        mlflow.log_param("loss", "categorical_crossentropy")
        mlflow.log_param("test_size", TEST_SIZE)
        mlflow.log_param("scaler", "ZScore")
        mlflow.log_param("classes", "A,L,N,R,V")
        mlflow.log_param("window_size", 256)

        # Train/test split. The seed is fixed, so every run evaluates on the
        # same held-out beats.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=42
        )

        # One-hot encode the labels; the encoder is fitted on the training set only.
        lb = LabelBinarizer()
        y_train = lb.fit_transform(y_train)
        y_test = lb.transform(y_test)

        # Under-sample the training set. The sampler is given 2-D input, so
        # the (length, channels) beats are flattened and restored afterwards.
        n_samples = X_train.shape[0]
        X_flat = X_train.reshape(n_samples, -1)
        rus = RandomUnderSampler(sampling_strategy='auto')
        X_resampled_flat, y_resampled = rus.fit_resample(X_flat, y_train)
        X_train = X_resampled_flat.reshape(
            -1,
            X_train.shape[1],
            X_train.shape[2]
        )
        y_train = y_resampled

        # Model: two strided Conv1D + max-pool stages, dropout, dense head.
        cnn = Sequential()
        cnn.add(Conv1D(32, kernel_size=5, strides=3, activation='relu', input_shape=(256, 3)))
        cnn.add(MaxPooling1D(pool_size=2, strides=2))
        cnn.add(Conv1D(32, kernel_size=5, strides=3, activation='relu'))
        cnn.add(MaxPooling1D(pool_size=2, strides=2))
        cnn.add(Dropout(0.10))
        cnn.add(Flatten())
        cnn.add(Dense(512, activation='relu'))
        cnn.add(Dense(5, activation='softmax'))

        cnn.compile(optimizer=OPTIMIZER,
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])

        history = cnn.fit(X_train, y_train,
                          epochs=EPOCHS,
                          batch_size=BATCH_SIZE)

        X_test_tf = convert_to_tensor(X_test, dtype=float32)
        y_test_tf = convert_to_tensor(y_test)

        # Convert softmax outputs / one-hot rows back to class indices.
        y_pred = np.argmax(cnn.predict(X_test_tf), axis=1)
        y_true = np.argmax(y_test, axis=1)

        accuracy = accuracy_score(y_true, y_pred)

        precision, recall, f1, support = precision_recall_fscore_support(
            y_true, y_pred, average=None
        )

        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            y_true, y_pred, average="macro"
        )

        precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
            y_true, y_pred, average="weighted"
        )

        mlflow.log_metric("accuracy", accuracy)

        mlflow.log_metric("precision_macro", precision_macro)
        mlflow.log_metric("recall_macro", recall_macro)
        mlflow.log_metric("f1_macro", f1_macro)

        mlflow.log_metric("precision_weighted", precision_weighted)
        mlflow.log_metric("recall_weighted", recall_weighted)
        mlflow.log_metric("f1_weighted", f1_weighted)

        # Confusion matrix with each row normalised by its number of true samples.
        cm = confusion_matrix(y_true, y_pred)
        cm_norm = cm / cm.sum(axis=1, keepdims=True)

        plt.figure(figsize=(7, 6))
        sns.heatmap(
            cm_norm,
            annot=True,
            fmt=".2f",
            cmap="Blues"
        )
        plt.title("Normalized Confusion Matrix")
        plt.ylabel("True label")
        plt.xlabel("Predicted label")

        plt.tight_layout()
        plt.savefig("confusion_matrix.png")
        plt.close()

        mlflow.log_artifact("confusion_matrix.png")
        report = classification_report(y_true, y_pred)
        with open("classification_report.txt", "w") as f:
            f.write(report)
        mlflow.log_artifact("classification_report.txt")

  return FileStore(store_uri, store_uri)
2026/01/02 12:29:51 INFO mlflow.tracking.fluent: Experiment with name 'CONV1D_MIT_BIH_Arrythmia_Classification_Wavelet_ZScore_Relative_Window_2' does not exist. Creating a new experiment.


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [15]:
# Aggregate the logged runs of the experiment: mean and standard deviation of
# accuracy and macro F1 for every (window_size, epochs, model) combination.
experiment_name = "CONV1D_MIT_BIH_Arrythmia_Classification_Wavelet_ZScore_Relative_Window_2"

runs = mlflow.search_runs(experiment_names=[experiment_name], output_format="pandas")

summary = (
    runs
    .groupby(["params.window_size", "params.epochs", "params.model"])
    .agg(**{
        f"{metric}_{stat}": (f"metrics.{metric}", stat)
        for metric in ("accuracy", "f1_macro")
        for stat in ("mean", "std")
    })
    .reset_index()
)

print(summary)

  params.window_size params.epochs  \
0                256           500   

                                        params.model  accuracy_mean  \
0  CONV1D-32-MaxPool2-CONV1D-32-MaxPool2-Dropout(...       0.977573   

   accuracy_std  f1_macro_mean  f1_macro_std  
0      0.008489        0.94179      0.018112  
