In [None]:
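# ECG classification (normal vs. atrial fibrillation) using wavelet features and a Gradient Boosting Classifier, compared with the same model trained on a SMOTE-oversampled training set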

In [1]:
import collections
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io as sio
import tensorflow as tf
from numpy import mean

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score,f1_score,recall_score,precision_score
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle

from ecgdetectors import Detectors

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE, KMeansSMOTE
from imblearn.under_sampling import RandomUnderSampler

import keras
from keras.models import Sequential, load_model
from keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D, Flatten
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, Flatten
from keras.layers import Dense, Dropout, LeakyReLU
# from keras.utils import plot_model
from keras.optimizers import SGD
from keras import regularizers
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.callbacks import ReduceLROnPlateau
from keras.utils.np_utils import to_categorical
from keras import backend as K
from keras.layers.normalization import BatchNormalization


In [2]:
# -*- coding: utf-8 -*-
"""
Diese Datei sollte nicht verändert werden und wird von uns gestellt und zurückgesetzt.

Funktionen zum Laden und Speichern der Dateien


@author: Maurice Rohr
"""
import csv
import scipy.io as sio
import os


### Achtung! Diese Funktion nicht verändern.

def load_references(folder='../training/'):
    '''
    Load every ECG recording listed in ``REFERENCE.csv`` inside ``folder``.

    Each CSV row holds a record name (column 0) and its label (column 1);
    the signal itself is read from ``<record name>.mat`` in the same folder.

    Parameters
    ----------
    folder : str, optional
        Location of the training data. The default is '../training/'.
        A trailing path separator is optional.

    Returns
    -------
    ecg_leads : list of numpy arrays
        ECG signals (first row of the 'val' entry of each .mat file).
    ecg_labels : list of str
        Same length as ecg_leads. Values: 'N','A','O','~'
    fs : int
        Sampling frequency in Hz (fixed at 300).
    ecg_names : list of str
        Record names as listed in REFERENCE.csv.

    '''
    ecg_leads = list()
    ecg_labels = list()
    ecg_names = list()
    fs = 300
    # os.path.join keeps the default folder working unchanged and also accepts
    # folders given without a trailing separator.
    reference_path = os.path.join(folder, 'REFERENCE.csv')
    # newline='' is what the csv module expects for files handed to csv.reader.
    with open(reference_path, newline='') as csv_file:  # list of file names and their labels
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            if not row:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            data = sio.loadmat(os.path.join(folder, row[0] + '.mat'))  # import the ECG file
            ecg_lead = data['val'][0]
            label = row[1]
            ecg_leads.append(ecg_lead)
            ecg_labels.append(label)
            ecg_names.append(row[0])
    print(str(len(ecg_leads)) + "\t Dateien wurden geladen.")

    return ecg_leads, ecg_labels, fs, ecg_names


### Achtung! Diese Funktion nicht verändern.

def save_predictions(predictions):
    '''
    Write the predictions to ``PREDICTIONS.csv`` in the current working directory.

    Parameters
    ----------
    predictions : list of tuples
        ("file name", label). Label : ['N','A','O','~']

    Returns
    -------
    None.

    '''
    target = "PREDICTIONS.csv"

    # Remove a previous result file before writing the new one.
    if os.path.exists(target):
        os.remove(target)

    with open(target, mode='w', newline='') as out_file:
        writer = csv.writer(out_file, delimiter=',')
        # Only the first two fields of each entry (name, label) are written.
        writer.writerows([entry[0], entry[1]] for entry in predictions)
        print(f"{len(predictions)}\t Labels wurden geschrieben.")

In [3]:
## N ==0 , A == 1

def F1_SCORE(y_test, y_pred):
    """Compute, print and return the binary F1 score of the positive class.

    Labels are expected to be encoded as 0 (normal, 'N') and 1 (atrial
    fibrillation, 'A'); class 1 is the positive class. Pairs containing any
    other value are not counted.

    Parameters
    ----------
    y_test : sequence of int
        Ground-truth labels.
    y_pred : sequence of int
        Predicted labels, same length and order as ``y_test``.

    Returns
    -------
    float
        F1 = TP / (TP + (FP + FN) / 2). Returns 0.0 when there are no true
        positives, false positives or false negatives at all, where the
        score is undefined.

    Raises
    ------
    ValueError
        If ``y_test`` and ``y_pred`` differ in length.

    Notes
    -----
    Prints ``F1 TN FP FN TP`` on one line.
    """
    if len(y_test) != len(y_pred):
        raise ValueError(
            "y_test and y_pred must have the same length, got "
            + str(len(y_test)) + " and " + str(len(y_pred))
        )

    TP = 0
    TN = 0
    FP = 0
    FN = 0

    for gt_class, pred_class in zip(y_test, y_pred):
        if gt_class == 1 and pred_class == 1:
            TP += 1
        elif gt_class == 0 and pred_class == 0:
            TN += 1
        elif gt_class == 0 and pred_class == 1:
            FP += 1
        elif gt_class == 1 and pred_class == 0:
            FN += 1

    denominator = TP + 0.5 * (FP + FN)
    # Without any positive label or prediction the ratio is 0/0; report 0.0
    # instead of raising ZeroDivisionError.
    F1 = TP / denominator if denominator else 0.0

    print(F1, TN, FP, FN, TP)
    return F1

In [5]:
# Load the ECG recordings, their diagnosis labels, the sampling frequency (Hz) and the record names.
ecg_leads, ecg_labels, fs, ecg_names = load_references()

# --- SDNN per record ---------------------------------------------------------
detectors = Detectors(fs)  # QRS detector initialised with the sampling frequency

# Collect one value per record in a plain list and convert once at the end;
# np.append inside the loop would copy the whole array on every iteration.
sdnn_per_record = []
for ecg_lead in ecg_leads:
    r_peaks = detectors.hamilton_detector(ecg_lead)   # detection of the QRS complexes
    rr_intervals_ms = np.diff(r_peaks) / fs * 1000    # beat-to-beat intervals in milliseconds
    # With fewer than two detected beats there is no interval at all. np.std of
    # an empty array yields NaN together with RuntimeWarnings, so NaN is stored
    # explicitly instead.
    sdnn_per_record.append(np.std(rr_intervals_ms) if len(rr_intervals_ms) else np.nan)

# SDNN of every record (same order as ecg_leads) and the per-class subsets.
sdnn_total = np.array(sdnn_per_record, dtype=float)
record_labels = np.array(ecg_labels, dtype=str)
sdnn_normal = sdnn_total[record_labels == 'N']   # "normal" recordings
sdnn_afib = sdnn_total[record_labels == 'A']     # atrial-fibrillation recordings

# --- Signal matrix -----------------------------------------------------------
# Recordings differ in length; shorter ones are padded with 0 so that every
# row has the same number of columns.
df = pd.DataFrame(ecg_leads).fillna(0)
df['file_name'] = pd.Series(ecg_names)
df['label'] = pd.Series(ecg_labels)
# Keep only the two classes of interest: normal ('N') and atrial fibrillation ('A').
dataset = df[df["label"].isin(['N', 'A'])].reset_index(drop=True)
print(dataset)

# --- Train / test split ------------------------------------------------------
# The last two columns are file_name and label; everything before is signal.
X = dataset.iloc[:, :-2].values
# Encode the labels as integers (N -> 0, A -> 1) in a new array, leaving
# `dataset` untouched.
y = dataset["label"].map({'N': 0, 'A': 1}).values

# 75 % train / 25 % test, stratified so both splits keep the class ratio.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=10, stratify=y)

# Class distribution of the test split (keys are plain Python ints).
result = collections.Counter(y_test.tolist())
print(result)

6000	 Dateien wurden geladen.


  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


         0     1     2     3     4     5     6     7     8     9  ...  18278  \
0        8     8     8     8     8     8     7     7     7     8  ...    0.0   
1     -504  -602  -704  -803  -886  -926  -921  -900  -892  -887  ...    0.0   
2       23    24    26    32    40    49    57    61    61    58  ...    0.0   
3      -55   -69   -81   -93  -104  -115  -124  -133  -138  -137  ...    0.0   
4      -14   -16   -18   -20   -21   -25   -38   -53   -58   -60  ...    0.0   
5      -10   -15   -22   -28   -29   -27   -25   -21   -13    10  ...    0.0   
6      492   589   690   794   885   915   926   925   914   907  ...    0.0   
7     -196  -248  -302  -335  -357  -372  -381  -381  -375  -366  ...    0.0   
8      -25   -50   -68   -84   -97  -106  -112  -116  -119  -122  ...    0.0   
9      -36   -74  -104  -135  -167  -201  -227  -245  -260  -273  ...    0.0   
10      41    50    62    75    90    98   105   111   116   119  ...    0.0   
11     -59   -64   -68   -71   -74   -77

Counter({0: 896, 1: 130})


In [9]:
def get_ecg_features(ecg_data, ecg_labels, waveletname):
    """Build one feature vector per signal from its wavelet decomposition.

    Every signal is decomposed with ``pywt.wavedec``; ``get_features`` is
    applied to each coefficient array and the results are concatenated into
    a single feature list for that signal.

    NOTE(review): ``pywt`` and ``get_features`` are neither imported nor
    defined in the cells visible here; they must already exist in the kernel.

    Parameters
    ----------
    ecg_data : iterable of 1-D sequences
        The signals to decompose.
    ecg_labels : sequence
        One hashable, mutually comparable label per signal.
    waveletname : str
        Wavelet name passed to ``pywt.wavedec`` (e.g. 'db4').

    Returns
    -------
    list_features : list of list
        Feature vector of every signal, in input order.
    list_labels : list of int
        Index of each label within the sorted distinct labels of
        ``ecg_labels``.
    """
    # Sorting makes the label -> index mapping deterministic. The iteration
    # order of a bare set is not guaranteed for string labels, so separate
    # calls (train vs. test) could otherwise disagree on the encoding.
    # The mapping is still derived per call: both calls must see the same
    # set of distinct labels to produce matching indices.
    label_to_index = {label: index for index, label in enumerate(sorted(set(ecg_labels)))}
    list_labels = [label_to_index[label] for label in ecg_labels]

    list_features = []
    for signal in ecg_data:
        features = []
        for coeff in pywt.wavedec(signal, waveletname):
            features += get_features(coeff)
        list_features.append(features)
    return list_features, list_labels

# Wavelet features for the train and test splits using the 'db4' wavelet.
# get_ecg_features also returns the labels re-encoded as integer indices.
X_train_ecg, Y_train_ecg = get_ecg_features(x_train, y_train, 'db4')
X_test_ecg, Y_test_ecg = get_ecg_features(x_test, y_test, 'db4')


In [None]:
=======================================GET ECG FEATURE=================================================================

In [16]:
# Baseline: gradient boosting trained on the wavelet features of the original
# (imbalanced) training split. random_state is fixed so that re-running the
# cell reproduces the same model.
cls = GradientBoostingClassifier(n_estimators=10000, random_state=42)
cls.fit(X_train_ecg, Y_train_ecg)

cls_pred_test = cls.predict(X_test_ecg)

# All metrics are computed against Y_test_ecg, the label encoding that
# get_ecg_features produced for the test split.
f1_cls_test = f1_score(Y_test_ecg, cls_pred_test)
print('The f1_cls:', f1_cls_test)
cls_cm_test = confusion_matrix(Y_test_ecg, cls_pred_test)
print('cls_cm: ',cls_cm_test)

print('cls_cm_test:\n')
# F1_SCORE prints "F1 TN FP FN TP"; the trailing ';' keeps the cell from
# echoing any return value in addition.
F1_SCORE(Y_test_ecg, cls_pred_test);

The f1_cls: 0.4102564102564103
cls_cm:  [[871  25]
 [ 90  40]]
cls_cm_test:

0.41025641025641024 871 25 90 40


In [None]:
====================================OVERSAMPLING AND GET ECG FEATURE=======================================================================

In [7]:
from imblearn.over_sampling import SMOTE
# Oversample the training split only (the test split is left untouched) with a
# fixed seed. Inputs are passed as plain lists.
# NOTE(review): SMOTE is applied to the raw, zero-padded signals before wavelet
# feature extraction - confirm that synthesising raw signals is intended.
x_over, y_over = SMOTE(random_state=42).fit_resample(list(x_train), list(y_train))

In [8]:
# Class distribution of the oversampled training labels.
num_over = collections.Counter(y_over)
print(num_over)

Counter({0: 2685, 1: 2685})


In [13]:
# Wavelet features ('db4') for the oversampled training set.
X_over_ecg, Y_over_ecg = get_ecg_features(x_over, y_over, 'db4')

In [14]:
# Same model as the baseline, trained on the wavelet features of the
# SMOTE-oversampled training set and evaluated on the unchanged test split.
# random_state is fixed so that re-running the cell reproduces the same model.
cls = GradientBoostingClassifier(n_estimators=10000, random_state=42)
cls.fit(X_over_ecg, Y_over_ecg)

cls_over_pred = cls.predict(X_test_ecg)

f1_cls = f1_score(Y_test_ecg, cls_over_pred)
print('The f1_cls:', f1_cls)
cls_cm = confusion_matrix(Y_test_ecg, cls_over_pred)
print('cls_cm: ',cls_cm)

print('cls_cm:\n')
# F1_SCORE prints "F1 TN FP FN TP"; the trailing ';' keeps the cell from
# echoing any return value in addition.
F1_SCORE(Y_test_ecg, cls_over_pred);


The f1_cls: 0.460093896713615
cls_cm:  [[862  34]
 [ 81  49]]
cls_cm:

0.460093896713615 862 34 81 49


In [None]:
==============================================END==================================================================