In [165]:
import csv
import os

import biosppy.signals.ecg as ecg
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis
from scipy import signal
import numpy as np
import pandas as pd

In [27]:
#import raw features
raw_train_data = pd.read_csv('Data/train.csv', index_col='id')
train_data_X = raw_train_data.drop(columns=['y']).to_numpy(dtype='float32')
train_data_y = raw_train_data[['y']].to_numpy(dtype='float32').ravel()

#import means vars medians
raw_train_data_means = np.loadtxt("Data/train_means.csv", delimiter=",")
train_data_means = raw_train_data_means.astype(np.float32)

raw_train_data_vars = np.loadtxt("Data/train_vars.csv", delimiter=",")
train_data_vars = raw_train_data_vars.astype(np.float32)

raw_train_data_medians = np.loadtxt("Data/train_medians.csv", delimiter=",")
train_data_medians = raw_train_data_medians.astype(np.float32)

def get_peaks(file_name):
    peaks = []
    with open(file_name, 'r') as file:
        for line in file:
            line = line.strip()
            arr = np.array(list(map(int, line.strip().split(','))))
            peaks.append(arr)
            
    return peaks

train_r_peaks = get_peaks("Data/train_peaks.csv")
test_r_peaks = get_peaks("Data/test_peaks.csv")

print("Finished fetching preprocessed data")

Finished fetching preprocessed data


In [103]:
def get_derivative(signal):
    return np.diff(signal)

def get_mobility(signal, var):
    return np.sqrt(np.var(get_derivative(signal)) / var)

In [318]:
def get_features(data_raw, data_peaks, data_means, data_vars, data_medians, idx):
    
    raw = data_raw[idx]
    raw = raw[~np.isnan(raw)]
    r_peaks = data_peaks[idx]
    
    # RAW Data ----------------------------------------------------------------------
    
    raw_mean = np.mean(raw)
    raw_std = np.std(raw)
    raw_median = np.median(raw)
    raw_var = np.var(raw)
    
    raw_mav = np.mean(np.abs(raw))
    raw_rms = np.sqrt(np.mean(raw**2))
    raw_wl = np.sum(np.abs(get_derivative(raw))) / len(raw)
    
    raw_max = np.max(np.abs(raw))
    raw_abssum = np.sum(np.abs(raw)) / len(raw)
    raw_energy = np.sum(raw**2) /len(raw)
    
    raw_skeness = skew(raw)
    raw_kurtosis = kurtosis(raw)
    
    raw_mobility = get_mobility(raw, raw_var)
    raw_complexity = get_mobility(get_derivative(raw), np.var(get_derivative(raw))) / get_mobility(raw, raw_var)
    
    
    f, Pxx = signal.welch(raw, 300, nperseg=1024)
    spectral_centroid = np.sum(f * Pxx) / np.sum(Pxx)
    
    arith_mean = np.mean(Pxx)
    geo_mean = np.exp(np.mean(np.log(Pxx)))  # Geometric mean (log-mean approach)
    sfm = geo_mean / arith_mean
    
    roll_off_percentage = 0.85
    total_energy = np.sum(Pxx)
    cumulative_energy = np.cumsum(Pxx)
    roll_off_index = np.where(cumulative_energy >= roll_off_percentage * total_energy)[0][0]
    spectral_roll_off = f[roll_off_index]
    
    
    # Averaged Data ----------------------------------------------------------------------
    
    mu = data_means[idx]
    var = data_vars[idx]
    md = data_medians[idx]
    
    if(len(r_peaks) == 1 and r_peaks[0] == -1):
        r_nums = np.float64(0.0)
        
        mu_mav = np.float64(0.0)
        mu_rms = np.float64(0.0)
        mu_wl = np.float64(0.0)
        
        R_index = np.float64(0.0)
        Q_index = np.float64(0.0)
        S_index = np.float64(0.0)
        P_index = np.float64(0.0)
        T_index = np.float64(0.0)
        
        mu_max = np.float64(0.0)
        mu_abssum = np.float64(0.0)
        mu_energy = np.float64(0.0)
    else:
        r_nums = len(r_peaks) / (len(raw)*300)
        
        mu_mav = np.mean(np.abs(mu))
        mu_rms = np.sqrt(np.mean(mu**2))
        mu_wl = np.sum(np.abs(get_derivative(mu))) / 180
        
        R_index = np.argmax(mu)
        
        if R_index > 0:
            reversed_slice = mu[:R_index][::-1]
            min_index = np.argmin(reversed_slice)
            Q_index = R_index - min_index - 1
        else:
            Q_index = np.float64(0.0)
        
        if R_index > 0:
            slice = mu[:R_index]
            min_index = np.argmin(slice)
            S_index = R_index + min_index 
        else:
            S_index = np.float64(0.0)
            
        if Q_index > 0:
            reversed_slice = mu[:Q_index][::-1]
            max_index = np.argmax(reversed_slice)
            P_index = Q_index - max_index - 1
        else:
            P_index = np.float64(0.0)
        
        if S_index > 0:
            slice = mu[:S_index]
            max_index = np.argmin(slice)
            T_index = S_index + max_index 
        else:
            T_index = np.float64(0.0)
        
        mu_max = np.max(np.abs(mu))
        mu_abssum = np.sum(np.abs(mu)) / len(mu)
        mu_energy = np.sum(mu**2) /len(mu)

    average_mean = np.mean(mu)
    average_var = np.mean(var)
    average_median = np.mean(md)
    
    if(len(r_peaks) <= 2):
        skewness_mu = np.float64(0.0)
        kurtosis_mu = np.float64(0.0)
    else:
        skewness_mu = skew(mu)
        kurtosis_mu = kurtosis(mu)
    
    std_mean = np.std(mu)
    std_var = np.std(var)
    std_median = np.std(md)
    
    if(len(r_peaks) <= 2):
        rr_intervals = np.float64(0.0)
        sdnn = np.float64(0.0)
        rmssd = np.float64(0.0)
    else:
        rr_intervals = np.diff(r_peaks) / 300
        sdnn = np.std(rr_intervals)
        rmssd = np.sqrt(np.mean(np.square(np.diff(rr_intervals))))
    
    feature_array = np.array([
                        spectral_centroid, sfm, spectral_roll_off, r_nums,
                        raw_mean, raw_std, raw_median, raw_skeness, raw_kurtosis, raw_var, raw_mobility, raw_complexity,
                        raw_mav, raw_rms, raw_wl, raw_max, raw_abssum, raw_energy,
                        R_index.astype(np.float32), Q_index.astype(np.float32), S_index.astype(np.float32),
                        P_index.astype(np.float32), T_index.astype(np.float32),
                        mu_mav, mu_rms, mu_wl, mu_max, mu_abssum, mu_energy,
                        average_mean, average_var, average_median,
                        skewness_mu, kurtosis_mu,
                        std_mean, std_var, std_median,
                        sdnn, rmssd
                     ])
    
    return feature_array

In [320]:
abba = get_features(train_data_X, train_r_peaks, train_data_means, train_data_vars, train_data_medians, 504)

In [322]:
stacked_arrays = []
for i in range(5117):
    abba = get_features(train_data_X, train_r_peaks, train_data_means, train_data_vars, train_data_medians, i)
    stacked_arrays.append(abba)
    
features = np.vstack(stacked_arrays)
print(features.shape)

(5117, 39)


In [270]:
print(features[4984])
print(features.dtype)

contains_nan = np.any(np.isnan(features))
print(contains_nan)

[ 9.60148479e+00  2.68462161e-03  1.75781250e+01  5.81683159e-01
  9.58626556e+01  1.10000000e+01 -5.76741604e-02  9.57839065e+00
  9.18964941e+03  2.26179570e-01  1.95005846e+00  5.70144997e+01
  9.58644180e+01  7.67397454e+00  5.55000000e+02  5.70144979e+01
  9.18998656e+03  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00]
float64
False


In [272]:
print(features.shape)
print(train_data_y.shape)

(5117, 38)
(5117,)


In [324]:
y_reshaped = train_data_y.reshape(-1, 1)
result = np.hstack([y_reshaped, features])
print(result.shape)
print(result[30])

(5117, 40)
[ 3.00000000e+00  8.09257722e+00  1.46454293e-02  1.52343750e+01
  1.48111577e-05  1.93982964e+01  3.32239777e+02 -4.70000000e+01
  1.10453693e+00  3.72997120e+00  1.10383266e+05  2.30824038e-01
  2.78094506e+00  2.21401505e+02  3.32805603e+02  4.04160701e+01
  1.71500000e+03  2.21401506e+02  1.10759560e+05  6.00000000e+01
  2.00000000e+01  8.00000000e+01  5.00000000e+00  1.48000000e+02
  9.82021179e+01  2.01381653e+02  2.25709147e+01  1.06473535e+03
  9.82021159e+01  4.05545722e+04  7.19642181e+01  3.32658936e+02
  3.10444450e+01  3.51544103e+00  1.33462243e+01  1.88084351e+02
  8.15171509e+01  2.02075974e+02  4.57376290e-01  7.27783815e-01]


In [326]:
np.savetxt("Data/features.csv", result, delimiter=",", fmt="%.10f")

In [296]:
gre = np.loadtxt("Data/features.csv", delimiter=",")
fe = gre.astype(np.float32)
print(fe.shape)
print(fe[30])

(5117, 39)
[ 3.00000000e+00  8.09257698e+00  1.46454293e-02  1.52343750e+01
  1.93982964e+01  3.32239777e+02 -4.70000000e+01  1.10453689e+00
  3.72997117e+00  1.10383266e+05  2.30824038e-01  2.78094506e+00
  2.21401505e+02  3.32805603e+02  4.04160690e+01  1.71500000e+03
  2.21401505e+02  1.10759562e+05  6.00000000e+01  2.00000000e+01
  8.00000000e+01  5.00000000e+00  1.48000000e+02  9.82021179e+01
  2.01381653e+02  2.25709152e+01  1.06473535e+03  9.82021179e+01
  4.05545703e+04  7.19642181e+01  3.32658936e+02  3.10444450e+01
  3.51544094e+00  1.33462248e+01  1.88084351e+02  8.15171509e+01
  2.02075974e+02  4.57376301e-01  7.27783799e-01]


In [304]:
#import raw features
raw_train_data = pd.read_csv('Data/test.csv', index_col='id')
test_data_X = raw_train_data.to_numpy(dtype='float32')

#import means vars medians
raw_test_data_means = np.loadtxt("Data/test_means.csv", delimiter=",")
test_data_means = raw_test_data_means.astype(np.float32)

raw_test_data_vars = np.loadtxt("Data/test_vars.csv", delimiter=",")
test_data_vars = raw_test_data_vars.astype(np.float32)

raw_test_data_medians = np.loadtxt("Data/test_medians.csv", delimiter=",")
test_data_medians = raw_test_data_medians.astype(np.float32)

In [328]:
stacked = []
for i in range(3411):
    abba = get_features(test_data_X, test_r_peaks, test_data_means, test_data_vars, test_data_medians, i)
    stacked.append(abba)
    
test_features = np.vstack(stacked)
print(features.shape)

(5117, 39)


In [330]:
np.savetxt("Data/test_features.csv", test_features, delimiter=",", fmt="%.10f")