In [1]:
import csv
import os

import biosppy.signals.ecg as ecg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Read the raw train / test tables, using the 'id' column as the row index.
raw_data_train, raw_data_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('Data/train.csv', 'Data/test.csv')
)

In [3]:
# Convert both tables to float32 arrays; the training features exclude the
# 'y' column.
train_data_X = (
    raw_data_train
    .drop(labels='y', axis='columns')
    .to_numpy(dtype='float32')
)
test_data = raw_data_test.to_numpy(dtype='float32')

In [4]:
def get_average_beats(data, sampling_rate=300):
    """Reduce every recording in ``data`` to per-sample heartbeat statistics.

    For each row the NaN entries are removed, R-peaks are located with
    ``ecg.engzee_segmenter`` and the heartbeat templates around those peaks
    are cut out with ``ecg.extract_heartbeats``. The templates of one row are
    then collapsed sample-wise into their mean, standard deviation and median.

    Parameters
    ----------
    data : np.ndarray
        2-D array with one recording per row. NaN values in a row are dropped
        before processing.
    sampling_rate : int or float, optional
        Sampling frequency in Hz handed to both biosppy calls. Defaults to
        300, the value that used to be hard-coded here.

    Returns
    -------
    peaks : list of np.ndarray
        One entry per row with the detected R-peak indices. Rows without any
        detected peak get the sentinel ``np.array([-1])``.
    means, variances, medians : np.ndarray
        Arrays of shape ``(rows, template_len)``. NOTE: ``variances`` holds
        the output of ``np.std`` (a standard deviation), not a variance; the
        name is kept for compatibility with the cells that consume it. Rows
        for which no beat could be extracted are left as all zeros.
    """
    # Template window in seconds before / after each R-peak. These mirror the
    # defaults of biosppy's extract_heartbeats and are passed explicitly so
    # the pre-allocated width below cannot drift from what biosppy returns
    # (60 + 120 = 180 samples at 300 Hz).
    before, after = 0.2, 0.4
    template_len = int(before * sampling_rate) + int(after * sampling_rate)

    rows, columns = data.shape

    means = np.zeros((rows, template_len))
    variances = np.zeros((rows, template_len))
    medians = np.zeros((rows, template_len))
    peaks = []
    print(f"The data has:\nData points:{rows}, Features:{columns}\n")

    for r in range(rows):
        signal = data[r]
        signal = signal[~np.isnan(signal)]

        if r % 1000 == 0:
            print(f"Data digested up to row: {r}")

        r_peaks = ecg.engzee_segmenter(signal, sampling_rate=sampling_rate)['rpeaks']

        if r_peaks.size == 0:
            # Keep one entry per row so that peaks[i] always lines up with data[i].
            peaks.append(np.array([-1]))
            print(f"No peaks found for {r}")
        else:
            peaks.append(r_peaks)

        beats = ecg.extract_heartbeats(
            signal,
            r_peaks,
            sampling_rate=sampling_rate,
            before=before,
            after=after,
        )['templates']

        if len(beats) == 0:
            # Nothing to aggregate: the statistics for this row stay at zero.
            print(f"No beats found for {r}")
            continue

        means[r] = np.mean(beats, axis=0)
        variances[r] = np.std(beats, axis=0)
        medians[r] = np.median(beats, axis=0)

    return peaks, means, variances, medians

In [5]:
# Peaks plus per-sample mean / std / median beat statistics for the training set.
(
    peaks_train,
    means_train,
    vars_train,
    medians_train,
) = get_average_beats(data=train_data_X)

The data has:
Data points:5117, Features:17807

Data digested up to row: 0
No peaks found for 447
No beats found for 447
No peaks found for 579
No beats found for 579
No peaks found for 593
No beats found for 593
No peaks found for 955
No beats found for 955
Data digested up to row: 1000
No peaks found for 1247
No beats found for 1247
No peaks found for 1383
No beats found for 1383
Data digested up to row: 2000
No peaks found for 2493
No beats found for 2493
No peaks found for 2931
No beats found for 2931
Data digested up to row: 3000
No peaks found for 3009
No beats found for 3009
No peaks found for 3145
No beats found for 3145
No peaks found for 3519
No beats found for 3519
No peaks found for 3884
No beats found for 3884
Data digested up to row: 4000
No peaks found for 4025
No beats found for 4025
No peaks found for 4340
No beats found for 4340
No peaks found for 4625
No beats found for 4625
No peaks found for 4684
No beats found for 4684
No peaks found for 4693
No beats found for 46

In [6]:
# Peaks plus per-sample mean / std / median beat statistics for the test set.
(
    peaks_test,
    means_test,
    vars_test,
    medians_test,
) = get_average_beats(data=test_data)

The data has:
Data points:3411, Features:17807

Data digested up to row: 0
No peaks found for 155
No beats found for 155
No peaks found for 172
No beats found for 172
No peaks found for 450
No beats found for 450
No peaks found for 509
No beats found for 509
No peaks found for 646
No beats found for 646
No peaks found for 704
No beats found for 704
No peaks found for 950
No beats found for 950
No peaks found for 970
No beats found for 970
Data digested up to row: 1000
No peaks found for 1040
No beats found for 1040
No peaks found for 1147
No beats found for 1147
No peaks found for 1740
No beats found for 1740
Data digested up to row: 2000
No peaks found for 2223
No beats found for 2223
No peaks found for 2267
No beats found for 2267
No peaks found for 2304
No beats found for 2304
No peaks found for 2501
No beats found for 2501
No peaks found for 2894
No beats found for 2894
Data digested up to row: 3000
No peaks found for 3169
No beats found for 3169
No peaks found for 3381
No beats fo

In [7]:
# Write every statistics matrix to its own comma-separated file with ten
# decimal places, training files first and test files second.
for csv_path, stats in (
    ("Data/train_means.csv", means_train),
    ("Data/train_vars.csv", vars_train),
    ("Data/train_medians.csv", medians_train),
    ("Data/test_means.csv", means_test),
    ("Data/test_vars.csv", vars_test),
    ("Data/test_medians.csv", medians_test),
):
    np.savetxt(csv_path, stats, delimiter=",", fmt="%.10f")

In [9]:
def save_peaks(peaks, file_name):
    """Write a ragged collection of peak arrays to a text file.

    Each element of ``peaks`` becomes one line holding its values converted
    with ``str`` and joined by commas; an empty element produces an empty
    line. No shape information is stored. An existing file is overwritten.
    """
    with open(file_name, 'w') as out:
        out.writelines(
            ",".join(str(value) for value in arr) + "\n"
            for arr in peaks
        )

In [10]:
# Store the detected peaks of both splits, one recording per line.
for split_peaks, peaks_path in (
    (peaks_train, 'Data/train_peaks.csv'),
    (peaks_test, 'Data/test_peaks.csv'),
):
    save_peaks(split_peaks, peaks_path)

In [69]:
# Data available at this point in the notebook:
#   train_data_X, test_data
#   peaks_train, means_train, vars_train, medians_train
#   peaks_test, means_test, vars_test, medians_test

# For the first few training recordings, compare the signal length (measured
# in 180-sample units, the width of the statistics arrays produced by
# get_average_beats) with the number of detected peaks.
# Sized from the data rather than a hard-coded row count.
found = np.zeros(len(peaks_train))
for i in range(5):
    npeaks = len(peaks_train[i])
    print(npeaks)

    curr_row = train_data_X[i]
    curr_row = curr_row[~np.isnan(curr_row)]
    normalized_signal_length = len(curr_row) / 180
    print(normalized_signal_length)

    if peaks_train[i][0] == -1:
        # np.array([-1]) is the "no peaks found" sentinel, not one real peak,
        # so the ratio is undefined for this recording.
        found[i] = np.nan
    else:
        found[i] = normalized_signal_length / npeaks

print(found)

66
17807
37
17807
30
17807
70
17807
45
17807
[1.37390572 1.28423423 1.52314815 ... 0.         0.         0.        ]
