In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import signal
import time
import os



# Data
# Every input path is resolved against the working directory captured here.
current_dir = os.getcwd()


def _read_relative_csv(relative_path, **read_csv_kwargs):
    """Read the CSV found at ``relative_path`` joined onto ``current_dir``.

    Any extra keyword arguments are forwarded unchanged to ``pandas.read_csv``.
    """
    return pd.read_csv(os.path.join(current_dir, relative_path), **read_csv_kwargs)


# Signal files are read with header=None, i.e. their first row is treated as data.
data_100 = _read_relative_csv('../data/100.csv', header=None)
data_101 = _read_relative_csv('../data/101.csv', header=None)
data_107 = _read_relative_csv('../data/107.csv', header=None)
data_201 = _read_relative_csv('../data/201.csv', header=None)
data_301 = _read_relative_csv('../data/301.csv', header=None)
data_403 = _read_relative_csv('../data/403.csv', header=None)
data_404 = _read_relative_csv('../data/404.csv', header=None)
data_405 = _read_relative_csv('../data/405.csv', header=None)
data_407 = _read_relative_csv('../data/407.csv', header=None)
data_413 = _read_relative_csv('../data/413.csv', header=None)


# labels
# One annotation table per record (first row of each file is the header).
labels_100 = _read_relative_csv('../labels/100_labels.csv')
labels_101 = _read_relative_csv('../labels/101_labels.csv')
labels_107 = _read_relative_csv('../labels/107_labels.csv')
labels_201 = _read_relative_csv('../labels/201_labels.csv')
labels_301 = _read_relative_csv('../labels/301_labels.csv')
labels_403 = _read_relative_csv('../labels/403_labels.csv')
labels_404 = _read_relative_csv('../labels/404_labels.csv')
labels_405 = _read_relative_csv('../labels/405_labels.csv')
labels_407 = _read_relative_csv('../labels/407_labels.csv')
labels_413 = _read_relative_csv('../labels/413_labels.csv')

# Custom
# Only five of the records have a custom table.
custom_100 = _read_relative_csv('../custom/100_custom.csv')
custom_101 = _read_relative_csv('../custom/101_custom.csv')
custom_107 = _read_relative_csv('../custom/107_custom.csv')
custom_201 = _read_relative_csv('../custom/201_custom.csv')
custom_403 = _read_relative_csv('../custom/403_custom.csv')


# df name
# Tag each label table with its record id as a plain Python attribute.
# NOTE: such an attribute is not carried over to frames derived from these
# (copies, slices, concat results), so it must be read from these originals.
labels_100.name = '100'
labels_101.name = '101'
labels_107.name = '107'
labels_201.name = '201'
labels_301.name = '301'
labels_403.name = '403'
labels_404.name = '404'
labels_405.name = '405'
labels_407.name = '407'
labels_413.name = '413'


def resize_heartbeat(heartbeat, size=187):
    """Return ``heartbeat`` forced to exactly ``size`` samples.

    A beat that is already long enough is cut after ``size`` samples; a
    shorter one is extended at its end with zeros (``np.pad`` default mode).
    """
    shortfall = size - len(heartbeat)
    if shortfall <= 0:
        # Long enough already: keep the leading `size` samples only.
        return heartbeat[:size]
    # Too short: append `shortfall` zeros, nothing in front.
    return np.pad(heartbeat, (0, shortfall))

# Butterworth bandpass
def filter_signal(data, low, high, target=256):
    """Band-pass ``data`` between ``low`` and ``high`` without phase shift.

    ``low`` and ``high`` are the band edges and ``target`` the sampling rate,
    all in the same unit; the edges are expressed as a fraction of the
    Nyquist frequency (``target / 2``) before the filter is designed.

    A 2nd-order Butterworth design is run forwards and backwards along
    axis 0 with ``filtfilt``, so the result is not delayed relative to the
    input (a single-pass ``lfilter`` would shift the phase).
    """
    # Band edges normalised to the Nyquist frequency: [high-pass edge, low-pass edge].
    normalised_band = [2 * edge / target for edge in (low, high)]

    numerator, denominator = signal.butter(2, normalised_band, btype='bandpass')

    return signal.filtfilt(numerator, denominator, data, axis=0)

# Revised dataset
def generate_data(custom, labels, input_data):
    """Cut one processed heartbeat out of the signal for every row of ``custom``.

    Parameters
    ----------
    custom : pandas.DataFrame
        Beats to extract; must provide ``Samples``, ``Label`` and ``Form``.
    labels : pandas.DataFrame
        Annotation table of the record. Each ``custom.Samples`` value is
        looked up in ``labels.Samples`` and the rows directly before and
        after the match delimit the window, so the table is expected to
        carry a default 0..n-1 index. If the frame has a ``name`` attribute
        it is stored in the ``Filename`` column (``None`` otherwise).
    input_data : pandas.DataFrame
        Raw signal; it is flattened row by row into a single 1-D array.

    Returns
    -------
    pandas.DataFrame
        One row per extracted beat with the columns ``Label``, ``Form``,
        ``Filename``, ``QRS`` and ``Heartbeat`` (the processed samples).

    Beats that cannot be extracted are skipped rather than aborting the whole
    record: no matching annotation (``IndexError``), no previous/next
    annotation (``KeyError``), or a window that is empty or too short for the
    filter (``ValueError``). Any other exception is a genuine bug and is
    allowed to propagate instead of silently emptying the result.
    """
    # Row-major flatten; same sample order as concatenating the rows one by one.
    data = input_data.to_numpy().ravel()
    samples = labels.Samples
    record_name = getattr(labels, 'name', None)

    datapoint = []

    for qrs, label, form in zip(custom.Samples, custom.Label, custom.Form):
        try:
            # First annotation row whose sample position equals this beat.
            qrs_index = labels.loc[samples == qrs].index[0]
            if label != 'N':
                # Non-normal beat: from 20 samples after the previous
                # annotation up to the next annotation.
                left = int(samples[qrs_index - 1] + 20)
                right = int(samples[qrs_index + 1])
            else:
                # Normal beat: 1/2.5 of the preceding interval before the
                # QRS and 1/1.5 of the following interval after it.
                centre = samples[qrs_index]
                left = int(centre - ((centre - samples[qrs_index - 1]) / 2.5))
                right = int(centre + ((samples[qrs_index + 1] - centre) / 1.5))

            heartbeat = data[left:right] * -1  # sign of the signal is flipped

            filtered_heartbeat = filter_signal(heartbeat, 1, 30, 256)  # Butterworth filter

            # Normalise signal 0-1. np.ptp() is used because the ndarray.ptp()
            # method no longer exists in NumPy 2.0.
            normalised_heartbeat = (filtered_heartbeat - filtered_heartbeat.min()) / np.ptp(filtered_heartbeat)

            # Stretch/compress every beat to 256 samples ...
            resampled_heartbeat = signal.resample(normalised_heartbeat, 256)

            # ... then remove or pad samples to the final size.
            resized_heartbeat = resize_heartbeat(resampled_heartbeat, 187)
        except (KeyError, IndexError, ValueError):
            # Expected per-beat failure (see docstring): drop this beat only.
            continue

        datapoint.append({'Label': label, 'Form': form, 'Filename': record_name, 'QRS': qrs,
                          'Heartbeat': resized_heartbeat})

    return pd.DataFrame(datapoint)


# Plot data n * m
def plot_data(data, n, m):
    """Show every heartbeat in ``data`` on successive figures of ``n`` x ``m`` panels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``Heartbeat``, ``Label``, ``Filename`` and ``QRS``.
        Rows are taken by position, so a filtered frame with gaps in its
        index can be passed directly.
    n, m : int
        Number of panel rows and columns per figure.

    Raises
    ------
    ValueError
        If ``n * m`` is not positive.

    Each panel is titled ``Label|Filename|QRS``. One figure is produced per
    ``n * m`` beats; panels left over on the last figure are blanked.
    """
    per_page = n * m
    if per_page <= 0:
        raise ValueError('n and m must both be positive')

    total = len(data)
    # One figure per page of beats (not one per beat), so indexing never
    # runs past the end of the frame.
    for start in range(0, total, per_page):
        # squeeze=False always yields a 2-D array of axes, even for a 1x1 grid.
        fig, axes = plt.subplots(n, m, squeeze=False)
        for offset, ax in enumerate(axes.flatten()):
            position = start + offset
            if position >= total:
                ax.axis('off')  # unused panel on the last page
                continue
            beat = data.iloc[position]
            ax.plot(beat['Heartbeat'])
            ax.set_title(str(beat['Label']) + '|' + str(beat['Filename']) + '|' + str(beat['QRS']))
        # plt.tight_layout()
        plt.show()
        plt.close(fig)  # release the figure so long runs do not accumulate memory



In [2]:
"""
    203 - Keep only N1, N2, N3 = 7620 beats
    405 - N1(24771) N3(17120) V2(4824) V3(1123)
"""

# Custom
# Records with a custom table: that table chooses the beats, the label table
# supplies the neighbouring annotations.
df100 = generate_data(custom=custom_100, labels=labels_100, input_data=data_100)
df101 = generate_data(custom=custom_101, labels=labels_101, input_data=data_101)
df107 = generate_data(custom=custom_107, labels=labels_107, input_data=data_107)
df201 = generate_data(custom=custom_201, labels=labels_201, input_data=data_201)
df403 = generate_data(custom=custom_403, labels=labels_403, input_data=data_403)

# All
# Remaining records: the label table doubles as the list of beats to extract.
df301 = generate_data(custom=labels_301, labels=labels_301, input_data=data_301)
df404 = generate_data(custom=labels_404, labels=labels_404, input_data=data_404)
df405 = generate_data(custom=labels_405, labels=labels_405, input_data=data_405)
df407 = generate_data(custom=labels_407, labels=labels_407, input_data=data_407)
df413 = generate_data(custom=labels_413, labels=labels_413, input_data=data_413)

# 403: of the 'N' beats only form 'N2' is kept; those rows are placed ahead
# of the beats carrying any other label.
df403n = df403[(df403.Label == 'N') & (df403.Form == 'N2')]
df403 = pd.concat([df403n, df403[df403.Label != 'N']])

# These records lose all of their 'N' beats ...
df201 = df201[df201.Label != 'N']
df301 = df301[df301.Label != 'N']
df404 = df404[df404.Label != 'N']
# ... while 405 is restricted to the forms N1, V2 and V3 ...
df405 = df405[df405.Form.isin(['N1', 'V2', 'V3'])]
# ... and the last two again lose their 'N' beats.
df407 = df407[df407.Label != 'N']
df413 = df413[df413.Label != 'N']

train = [df101, df107, df301, df403, df407]
test = [df404, df405, df413]

# Report the class balance of every appended record, then stack the records
# behind the seed frame (df100 for training, df201 for testing).
for record_frame in train:
    print(record_frame.Label.value_counts())
training = pd.concat([df100, *train], ignore_index=True)

for record_frame in test:
    print(record_frame.Label.value_counts())
testing = pd.concat([df201, *test], ignore_index=True)

# Remove empty heartbeats first, then rows whose Form is NaN, and renumber.
training = (
    training
    .loc[lambda frame: frame.Heartbeat.str.len() > 0]
    .loc[lambda frame: frame['Form'].notna()]
    .reset_index(drop=True)
)
testing = (
    testing
    .loc[lambda frame: frame.Heartbeat.str.len() > 0]
    .loc[lambda frame: frame['Form'].notna()]
    .reset_index(drop=True)
)



Label
N    2129
V       3
Name: count, dtype: int64
Label
N    6216
V     857
Name: count, dtype: int64
Label
S    4821
V      10
Name: count, dtype: int64
Label
N    22598
V     9168
Name: count, dtype: int64
Label
V    5871
Name: count, dtype: int64
Label
V    24066
Name: count, dtype: int64
Label
N    24766
V     5947
Name: count, dtype: int64
Label
S    1373
Name: count, dtype: int64


In [3]:
# Assign class labels
# Beat-type letter -> integer class id; a letter missing from the mapping
# becomes NaN. Running this twice on the same frames would map the ids again
# and turn them into NaN, so re-create `training`/`testing` before re-running.
label_mapping = {'N': 0, 'S': 1, 'V': 2, 'F': 3, 'Q': 4}
training = training.assign(Label=training['Label'].map(label_mapping))
testing = testing.assign(Label=testing['Label'].map(label_mapping))

def heartbeat_to_columns(df):
    """Spread the ``Heartbeat`` arrays of ``df`` over one column per sample.

    Returns a new DataFrame with one row per heartbeat: the sample columns
    (named 0, 1, 2, ...) come first and the ``Label`` values of ``df`` are
    appended as the final column. All heartbeats must have the same length.
    """
    # Stack the per-row arrays into one 2-D array (rows = beats).
    stacked_beats = np.array(list(df['Heartbeat']))

    # One column per sample position, then the class label at the end.
    wide = pd.DataFrame(stacked_beats.tolist())
    return wide.assign(Label=df['Label'].values)

# One row per heartbeat: sample columns followed by the class label.
train_final, test_final = heartbeat_to_columns(training), heartbeat_to_columns(testing)

# Pool both sets, shuffle all rows with a fixed seed so the order is
# reproducible, and renumber the rows.
combined_df = (
    pd.concat([train_final, test_final], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# 80% of the shuffled rows go to training, the remainder to testing.
train_size = int(len(combined_df) * 0.8)
test_size = len(combined_df) - train_size

# Positional cut at the 80% mark.
train_split = combined_df.iloc[:train_size]
test_split = combined_df.iloc[train_size:]

# Write both parts as bare CSV (no index column, no header row).
train_split.to_csv('../../train_canine_shuffled.csv', header=False, index=False)
test_split.to_csv('../../test_canine_shuffled.csv', header=False, index=False)