In [7]:
import os
import csv
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer, Dropout
from tensorflow import convert_to_tensor, float32
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import seaborn as sns
import mlflow
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from imblearn.under_sampling import RandomUnderSampler


In [8]:
path = "Dataset/mitbih_database"
filenames = next(os.walk(path))[2]
records=list()
annotations=list()
filenames.sort()
for f in filenames:
    filename, file_extension = os.path.splitext(f)
    if(file_extension=='.csv'):
        records.append(path+'/'+filename+file_extension)
    else:
        annotations.append(path+'/'+filename+file_extension)

In [9]:
def get_record_signals(index):
    signals = []
    labels = []
    with open(records[index],'r') as csvfile:
        filereader = csv.reader(csvfile,delimiter=',',quotechar='|')
        row_index = -1
        for row in filereader:
            if(row_index >= 0):
                signals.insert(row_index, int(row[1]))
            row_index += 1
        signals = np.array(signals)
    with open(annotations[index],'r') as csvfile:
        filereader = csv.reader(csvfile,delimiter=',',quotechar='|')
        row_index = -1
        for row in filereader:
            if(row_index >= 0):
                elements = list(filter(lambda x: len(x) > 0, row[0].split(" ")))
                labels.insert(row_index, [int(elements[1]), elements[2]])
            row_index += 1
        labels = np.array(labels)
    return signals, labels

In [10]:
signals, labels = get_record_signals(6)
print("Number of samples:", len(signals))
print("Value Range:", np.min(signals), "->", np.max(signals))
print("Classes:", np.unique(labels[:, 1]))

Number of samples: 650000
Value Range: 611 -> 1538
Classes: ['+' 'N' 'V' '~']


In [11]:
def process_data(window_size):
    X = []
    y = []

    half_w = window_size // 2
    valid_labels = {'A', 'L', 'N', 'R', 'V'}

    for i in range(len(records)):
        signals, labels = get_record_signals(i)
        sig_len = len(signals)

        mask = np.isin(labels[:, 1], list(valid_labels))
        filtered_labels = labels[mask]

        for label in filtered_labels:
            center = int(label[0])
            start = center - half_w
            end = center + half_w

            if start < 0 or end > sig_len:
                continue

            X.append(signals[start:end])
            y.append(label[1])

    return X, y


In [None]:
#Hyperparameters
WINDOW_SIZE = 250
EPOCHS = 500
BATCH_SIZE = 128
TEST_SIZE = 0.25
N_RUNS = 5

In [13]:
# signals_list = [X[y.index("A")], X[y.index("L")], X[y.index("N")], X[y.index("R")], X[y.index("V")]]
# titles = ["A - Atrial premature beat (APB)", "L - Left bundle branch block beat (LBBB)", "N - Normal beat", "R - Right bundle branch block beat (RBBB)", "V - Premature ventricular contraction (PVC)"]

# fig, ax = plt.subplots(1, 5, figsize=(20, 5))

# for a, sig, title in zip(ax, signals_list, titles):
#     a.plot(range(0, len(sig)), sig)
#     a.set_title(title)
#     a.set_xlabel("Sample")
#     a.grid(True)

# ax[0].set_ylabel("Amplitude")
# plt.tight_layout()
# plt.show()

In [14]:
for i in range(N_RUNS):
    X, y = process_data(window_size=WINDOW_SIZE)
    
    # values, counts = np.unique(y, return_counts=True)
    # plt.bar(values, counts)
    # plt.title('Class Distribution')
    # plt.xlabel('Class')
    # plt.ylabel('Number of Samples')
    # plt.show()
    
    mlflow.start_run()
    mlflow.set_experiment("DNN_MIT_BIH_Arrythmia_Classification")
    mlflow.log_param("model", "DNN-Dense-32-Softmax")
    mlflow.log_param("input_dim", WINDOW_SIZE)
    mlflow.log_param("epochs", EPOCHS)
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("optimizer", "adam")
    mlflow.log_param("loss", "categorical_crossentropy")
    mlflow.log_param("test_size", TEST_SIZE)
    mlflow.log_param("scaler", "MinMaxScaler")
    mlflow.log_param("classes", "A,L,N,R,V")
    mlflow.log_param("window_size", WINDOW_SIZE)
    # Train/test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=42)
    
    #Label Binarization
    lb = preprocessing.LabelBinarizer()
    y_train = lb.fit_transform(y_train)
    y_test = lb.transform(y_test)
    
    # Under-sampling
    rus = RandomUnderSampler(sampling_strategy='auto')
    X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
    
    #Min/Max Scaling
    scaler = MinMaxScaler()
    X_train_res = scaler.fit_transform(X_train_res)
    X_test = scaler.transform(X_test)
    
    # device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # print(f"Using device: {device}")
    # print(type(X_test))
    # print(X_test.dtype)
    # print(X_test.shape)
    # print(type(X_test[0]))
    model = Sequential()
    model.add(InputLayer(input_shape=(WINDOW_SIZE,), name="InputLayer"))
    model.add(Dense(units=32, activation="relu", name=f"HiddenLayer-1"))
    model.add(Dropout(0.1))
    model.add(Dense(5, activation='softmax'))

    model.compile(optimizer='adam',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'],
                    )
    history = model.fit(X_train_res, y_train_res,
            epochs=EPOCHS,
            batch_size=BATCH_SIZE, verbose=0)
        
    X_test_tf = convert_to_tensor(X_test, dtype=float32)
    y_test_tf = convert_to_tensor(y_test)

    y_pred = np.argmax(model.predict(X_test), axis=1)
    y_true = np.argmax(y_test, axis=1)

    accuracy = accuracy_score(y_true, y_pred)

    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, average=None
    )

    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro"
    )

    precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )

    mlflow.log_metric("accuracy", accuracy)

    mlflow.log_metric("precision_macro", precision_macro)
    mlflow.log_metric("recall_macro", recall_macro)
    mlflow.log_metric("f1_macro", f1_macro)

    mlflow.log_metric("precision_weighted", precision_weighted)
    mlflow.log_metric("recall_weighted", recall_weighted)
    mlflow.log_metric("f1_weighted", f1_weighted)

    cm = confusion_matrix(y_true, y_pred)
    cm_norm = cm / cm.sum(axis=1, keepdims=True)

    plt.figure(figsize=(7, 6))
    sns.heatmap(
        cm_norm,
        annot=True,
        fmt=".2f",
        cmap="Blues"
    )
    plt.title("Normalized Confusion Matrix")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")

    plt.tight_layout()
    plt.savefig("confusion_matrix.png")
    plt.close()

    mlflow.log_artifact("confusion_matrix.png")
    report = classification_report(y_true, y_pred)
    with open("classification_report.txt", "w") as f:
        f.write(report)
    mlflow.log_artifact("classification_report.txt")
    mlflow.end_run()    

  return FileStore(store_uri, store_uri)




In [15]:
experiment_name = "DNN_MIT_BIH_Arrythmia_Classification"

runs = mlflow.search_runs(
    experiment_names=[experiment_name],
    output_format="pandas"
)

summary = (
    runs
    .groupby(["params.window_size", "params.epochs"])
    .agg(
        accuracy_mean=("metrics.accuracy", "mean"),
        accuracy_std=("metrics.accuracy", "std"),
        f1_macro_mean=("metrics.f1_macro", "mean"),
        f1_macro_std=("metrics.f1_macro", "std"),
    )
    .reset_index()
)

print(summary)


  params.window_size params.epochs  accuracy_mean  accuracy_std  \
0                150           100       0.821878      0.053662   
1                200           100       0.840625      0.096149   
2                250           100       0.885184      0.024113   
3                250          1000       0.921682      0.043035   
4                250           250       0.876635      0.071668   
5                250            30       0.779502      0.053969   
6                250           500       0.941440      0.005831   
7                300           100       0.832426      0.060128   

   f1_macro_mean  f1_macro_std  
0       0.738300      0.033209  
1       0.763278      0.057163  
2       0.789733      0.023002  
3       0.841315      0.059127  
4       0.802491      0.055297  
5       0.711360      0.025527  
6       0.866624      0.009167  
7       0.731862      0.046838  
