# In this notebook we create a training set and a test set, each stored as a CSV file, for the base model of our project. From the recorded signals, segmented into one-minute windows, we extract two heartbeat features: bpm (beats per minute) and sdnn (standard deviation of the NN (R-R) intervals), each paired with a binary label (apnea or not).

Import libraries

In [7]:
import matplotlib.pyplot  as plt 
import numpy as np 
import wfdb
from utils import *
import heartpy as hp
import pandas as pd 
import os
import re
import csv

Useful constants

In [8]:
# Number of signal samples in one minute: 60 s at the 100 Hz sample rate
# that is passed to heartpy when the recordings are processed.
NB_SAMPLES_PER_MINUTE = 60 * 100

# Names of the recordings that go into the training set.
train_files = [
    "a01", "c01", "b01",
    "a02", "c02", "b02",
    "a03", "c03", "b03",
    "a04", "c04", "b04",
    "a05", "c05",
    "a06", "c06",
    "a07", "c07",
    "a08", "c08",
    "a09", "a10", "a11", "a12",
    "a13", "a14", "a15", "a16",
]

# Names of the recordings that go into the test set.
test_files = ["b05", "c09", "c10", "a17", "a18", "a19", "a20"]

# Directory the recordings are loaded from.
directory = "apnea-ecg-database-1.0.0"

This function generates the CSV with two features, the bpm and the sdnn, for each 60-second segment.

In [9]:
# writes <output_name>.csv in the current directory
def create_data_csv(source_directory, files, output_name):
    """Build one CSV of (label, bpm, sdnn) rows from the given recordings.

    For every name in ``files`` the function:

    1. reads ``outputs/<name>.txt`` and turns each line into a label
       (1 when the element at index 3 of the line split on runs of spaces
       is ``"A"``, otherwise 0);
    2. loads the record ``<source_directory>/<name>`` with wfdb;
    3. runs heartpy's segment-wise analysis on the signal and keeps the
       ``bpm`` and ``sdnn`` measure of each segment;
    4. pairs labels and measures position by position, truncated to the
       shorter of the two.

    The rows of all recordings are written together to
    ``<output_name>.csv``.

    Parameters
    ----------
    source_directory : str
        Directory containing the wfdb records.
    files : iterable of str
        Record names; each one is printed when its processing starts.
    output_name : str
        Output path without the ``.csv`` suffix.
    """
    all_rows = []
    for filename in files:
        print(filename)

        # Load the annotation file; one label is derived from each line.
        with open("outputs/" + filename + ".txt", "r") as annotation_file:
            annotation_lines = annotation_file.readlines()
        minute_labels = [
            1 if re.split(" +", row)[3] == "A" else 0 for row in annotation_lines
        ]

        # The value at index 2 of the last annotation line, plus one minute
        # of samples, caps how much of the signal is used: the signal may
        # not have the same length as the annotations.
        signal_end = int(re.split(" +", annotation_lines[-1])[2]) + NB_SAMPLES_PER_MINUTE

        # The first label is dropped; the signal handed to heartpy below
        # starts 3000 samples in (30 s at sample_rate=100.0).
        # NOTE(review): the two offsets look meant to go together -- confirm.
        minute_labels = minute_labels[1:]

        record = wfdb.rdrecord(source_directory + "/" + filename)
        ecg = record.p_signal[:signal_end]
        # Flatten to 1-D; this reshape only succeeds for a single-column signal.
        ecg = ecg.reshape(len(ecg))

        # Segment-wise heart-rate analysis: segment_width=60 with no overlap
        # between consecutive segments.
        _, measures = hp.process_segmentwise(
            ecg[3000:],
            sample_rate=100.0,
            segment_width=60,
            segment_overlap=0,
            segment_min_size=0,
            replace_outliers=True,
        )

        # Keep only as many entries as both the labels and the measures have.
        n_rows = min(len(minute_labels), len(measures["bpm"]))
        minute_labels = minute_labels[:n_rows]
        bpm = measures["bpm"][:n_rows]
        sdnn = measures["sdnn"][:n_rows]

        # handle_nans is not defined in this notebook; its return value is
        # discarded, so it presumably fixes the sequences in place -- TODO confirm.
        for feature in (bpm, sdnn):
            handle_nans(feature)

        # One row per segment: label, bpm, sdnn.
        all_rows.extend(np.c_[minute_labels, bpm, sdnn])

    np.savetxt(output_name + ".csv", all_rows, delimiter=",", fmt="%d, %f, %f")

Generate the csv files

In [10]:
# Training set first, then test set; each call writes "<output_name>.csv".
create_data_csv(
    source_directory=directory,
    files=train_files,
    output_name="base_model_train_set",
)
create_data_csv(
    source_directory=directory,
    files=test_files,
    output_name="base_model_test_set",
)


a01


  result = super().mean(axis=axis, dtype=dtype, **kwargs)[()]
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.
  a = np.asanyarray(a)


c01
b01


A theoretically impossible result was found during the iteration
process for finding a smoothing spline with fp = s: s too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.


a02
c02


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


b02
a03
c03
b03
a04
c04
b04
a05
c05
a06
c06
a07
c07
a08
c08
a09
a10
a11
a12
a13
a14
a15
a16
b05
c09
c10
a17
a18
a19
a20


Insert the column names into the CSV files

In [11]:
# Header row written at the top of each generated CSV.
columns = ["apnea", "bpm", "sdnn"]


def insert_column_names(filename):
    """Make ``columns`` the first row of the CSV file at ``filename``.

    The file is read in full with ``csv.reader`` and written back with
    ``csv.writer``, the header row first and the existing rows after it.

    If the first row of the file already equals ``columns`` the file is
    left untouched, so running the cell again does not stack a second
    header row on top of the first.

    Parameters
    ----------
    filename : str
        Path of the CSV file to rewrite in place.
    """
    # The csv module asks for newline="" on files passed to reader/writer.
    with open(filename, "r", newline="") as infile:
        rows = list(csv.reader(infile))

    # Header already present: nothing to do.
    if rows and rows[0] == columns:
        return

    with open(filename, "w", newline="") as outfile:
        csv.writer(outfile).writerows([columns, *rows])
# Add the header row to both generated files.
for csv_path in ("base_model_train_set.csv", "base_model_test_set.csv"):
    insert_column_names(csv_path)

Here we compute the proportion of apnea labels among all the labels in our dataset

In [12]:
# prints the share of apnea labels; no file is written
def compute_ratio(source_directory, files):
    """Print the fraction of labels equal to apnea over the given recordings.

    For every name in ``files`` the file ``outputs/<name>.txt`` is read and
    each line contributes one label: 1 when the element at index 3 of the
    line split on runs of spaces is ``"A"``, otherwise 0. The printed value
    is the sum of all labels divided by their count.

    Parameters
    ----------
    source_directory : str
        Not used by this function.
    files : iterable of str
        Record names whose annotation files are read.

    Raises
    ------
    ZeroDivisionError
        If no label at all was read (for instance when ``files`` is empty).
    """
    apnea_flags = []
    for filename in files:
        # load the outputs
        with open("outputs/" + filename + ".txt", "r") as annotation_file:
            for row in annotation_file.readlines():
                fields = re.split(" +", row)
                apnea_flags.append(int(fields[3] == "A"))

    print("ratio of apnea : ", sum(apnea_flags) / len(apnea_flags))

# Every recording, training names first and test names after them.
all_recordings = [*train_files, *test_files]

compute_ratio(directory, all_recordings)


ratio of apnea :  0.38216485772953945
