# TODO
- try different wavelets and classifier parameters
- stacking classifier?
- sample from classes such that they all have same cardinality
- try with CNN (or RNN)
- hiearchical classification: first classify 3 vs all, then 0 vs 1 vs 2

# Imports & setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from feature_extraction import *
import scipy
from scipy import fft
from scipy import signal
from collections import Counter
import pywt
from biosppy.signals import ecg
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

In [2]:
n_cores = 1
random_state = 71
sampling_rate = 300
data_directory = 'data/'

waveletname = 'db31'
level = 5 #10

# Data import

In [3]:
X_train_df = pd.read_csv(data_directory + 'X_train.csv', index_col='id')
y_train_df = pd.read_csv(data_directory + 'y_train.csv', index_col='id')
X_test_df = pd.read_csv(data_directory + 'X_test.csv', index_col='id')

# Length adjustments

### Drop trailing NaN

In [4]:
def drop_trailing_na(df: pd.DataFrame):
    return [df.loc[i].dropna().to_numpy() for i in range(df.shape[0])]

X_train_full = drop_trailing_na(X_train_df)
y_train_full = y_train_df['y'].to_numpy()
X_test = drop_trailing_na(X_test_df)

# Noise handling

In [5]:
def wavelet_transform(signal):
    return pywt.wavedec(signal, waveletname, level=level)

def wavelet_noise_cancellation(signal):
    coeffs = wavelet_transform(signal)
    return pywt.waverec(coeffs, waveletname)

def wavelet_noise_cancellation_bulk(data):
    result = []
    for signal in data:
        result.append(wavelet_noise_cancellation(signal))
    return result

"""X_train_full_filtered = wavelet_noise_cancellation_bulk(X_train_full)
X_test_filtered = wavelet_noise_cancellation_bulk(X_test)"""

'X_train_full_filtered = wavelet_noise_cancellation_bulk(X_train_full)\nX_test_filtered = wavelet_noise_cancellation_bulk(X_test)'

# Separation in training and validation

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=random_state)

# Features extraction

### Features to extract

In [7]:
def calculate_entropy(list_values):
    value, probabilities = np.unique(list_values, return_counts=True)
    entropy = scipy.stats.entropy(probabilities)
    return [entropy]

def calculate_crossings(list_values):
    zero_crossing_indices = np.nonzero(np.diff(np.array(list_values) > 0))[0]
    no_zero_crossings = len(zero_crossing_indices)
    mean_crossing_indices = np.nonzero(np.diff(np.array(list_values) > np.nanmean(list_values)))[0]
    no_mean_crossings = len(mean_crossing_indices)
    return [no_zero_crossings, no_mean_crossings]
 
def calculate_statistics(list_values):
    n5 = np.nanpercentile(list_values, 5)
    n25 = np.nanpercentile(list_values, 25)
    n75 = np.nanpercentile(list_values, 75)
    n95 = np.nanpercentile(list_values, 95)
    median = np.nanpercentile(list_values, 50)
    mean = np.nanmean(list_values)
    std = np.nanstd(list_values)
    var = np.nanvar(list_values)
    rms = np.nanmean(np.sqrt(list_values**2))
    return [n5, n25, n75, n95, median, mean, std, var, rms]

def get_array_features(arr):
    features = []
    features += calculate_entropy(arr)
    features += calculate_crossings(arr)
    features += calculate_statistics(arr)
    return features

def get_wavelet_features(signal):
    features = []
    list_coeff = wavelet_transform(signal)
    for coeff in list_coeff:
        features += get_array_features(coeff)
    return features

def calculate_consecutive_diff(x):
    return np.ediff1d(x)

def get_values(template, r_peaks, peaks):
    result = []
    for i in range(len(peaks)):
        result.append(template[i][peaks[i] - r_peaks[i] + 60])
    return np.array(result)

def get_ecg_values(signal):
    result = ecg.ecg(signal, sampling_rate=sampling_rate, show=False)
    template = result['templates']

    p_peaks, p_start, p_end = getPPositions(result)

    q_peaks, q_start = getQPositions(result)
    for i in range(len(q_start)):
        if q_start[i] == p_peaks[i]:
            q_start[i] = int(p_end[i] + abs(q_peaks[i] - p_end[i]) / 2)

    r_peaks = result['rpeaks'].tolist()

    s_peaks, s_end = getSPositions(result)
    
    t_peaks, t_start, t_end = getTPositions(result)
    
    beats = fft.fft(template)
    heart_rate = sampling_rate * (60.0 / np.diff(result['rpeaks']))
    heart_rate = np.append(heart_rate, heart_rate[-1]).reshape(-1, 1)

    # They are of length = # heart beats - 1 !!!
    RRinterval = calculate_consecutive_diff(r_peaks)
    PPinterval = calculate_consecutive_diff(p_peaks)
    TPinterval = p_start[1:] - t_end[:-1]

    Pduration = p_end - p_start
    PRsegment = q_start - p_end
    PRinterval = q_start - p_start
    QRScomplex = s_end - q_start
    QTinterval = t_end - q_start
    STsegment = t_start - s_end
    STTsegment = t_end - s_end

    p_values = get_values(template, r_peaks, p_peaks)
    q_values = get_values(template, r_peaks, q_peaks)
    r_values = get_values(template, r_peaks, r_peaks)
    s_values = get_values(template, r_peaks, s_peaks)
    t_values = get_values(template, r_peaks, t_peaks)

    PQ_diff = q_peaks - p_peaks
    PR_diff = r_peaks - p_peaks
    PS_diff = s_peaks - p_peaks
    PT_diff = t_peaks - p_peaks
    QR_diff = r_peaks - q_peaks
    QS_diff = s_peaks - q_peaks
    QT_diff = t_peaks - q_peaks
    RS_diff = s_peaks - r_peaks
    RT_diff = t_peaks - r_peaks
    ST_diff = t_peaks - s_peaks

    return heart_rate, np.real(beats), np.imag(beats), \
        RRinterval, PPinterval, Pduration, PRsegment, PRinterval, QRScomplex, QTinterval, STsegment, STTsegment, TPinterval, \
        p_values, q_values, r_values, s_values, t_values, \
        PQ_diff, PR_diff, PS_diff, PT_diff, QR_diff, QS_diff, QT_diff, RS_diff, RT_diff, ST_diff
    ## Useful values if you want to use time-series-like arrays
    # return p_start, p_peaks, p_end, q_start, q_peaks, r_peaks, s_peaks, s_end, t_start, t_peaks, t_end

def get_features(signal):
    features = []

    ecg_values_list = get_ecg_values(signal)
    for ecg_values in ecg_values_list:
        features += get_array_features(ecg_values)

    features += get_wavelet_features(signal)

    return features

def get_dataset_features(data):
    list_features = []
    for signal in data:
        list_features.append(get_features(signal))
    return list_features

In [8]:
X_train_extracted = get_dataset_features(X_train)

In [9]:
X_val_extracted = get_dataset_features(X_val)

In [29]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
selection = SelectKBest(mutual_info_classif, k=260).fit(X_train_extracted, y_train.T.ravel())
X_train_extracted2 = selection.transform(X_train_extracted)
X_val_extracted2 = selection.transform(X_val_extracted)
cls = XGBClassifier(seed=random_state)
cls.fit(X_train_extracted2, y_train.T.ravel())

y_train_pred = cls.predict(X_train_extracted2)
y_val_pred = cls.predict(X_val_extracted2)
train_score = f1_score(y_train, y_train_pred, average='micro')
val_score = f1_score(y_val, y_val_pred, average='micro')

print(train_score, val_score)

1.0 0.7998046875


# Classification

### Classifier definition

In [10]:
#cls = GradientBoostingClassifier(n_estimators=100, verbose=1, random_state=random_state)
cls = XGBClassifier(seed=random_state) #(n_estimators=100, gamma=1, reg_alpha=3, reg_lambda=0, max_depth=10, min_child_weight=0, colsample_bytree=0.85, seed=random_state)

## Classifier fit and evaluation

In [11]:
cls.fit(X_train_extracted, y_train.T.ravel())

y_train_pred = cls.predict(X_train_extracted)
y_val_pred = cls.predict(X_val_extracted)

In [12]:
train_score = f1_score(y_train, y_train_pred, average='micro')
val_score = f1_score(y_val, y_val_pred, average='micro')

print(train_score, val_score)

1.0 0.8046875


# Final classification

In [9]:
X_train_full_extracted = get_dataset_features(X_train_full)
X_test_extracted = get_dataset_features(X_test)

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC
svc = SVC(random_state=random_state)
svc_model = Pipeline([ ('scaler', StandardScaler()), ('svc', svc) ])

from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(random_state=random_state)
etc_model = Pipeline([ ('scaler', StandardScaler()), ('etc', etc) ])

from sklearn.ensemble import BaggingClassifier
bc = BaggingClassifier(estimator=svc_model, random_state=random_state)
bc_model = Pipeline([ ('scaler', StandardScaler()), ('bc', bc) ])

from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(estimator=svc_model, random_state=random_state)
abc_model = Pipeline([ ('scaler', StandardScaler()), ('abc', abc) ])

estimators = [
    ('xgbc', cls),
    ('svc', svc_model),
    #('gb', gb_model),
    ('etc', etc_model),
    #('gpr', gpr_model),
    ('bc', bc_model),
    ('abc', abc_model)
]

final_pipeline = Pipeline([ ('model', LogisticRegression()) ])
classifier = StackingClassifier(estimators, final_pipeline, n_jobs=n_cores)

In [14]:
y_test_pred = classifier.fit(X_train_full_extracted, y_train_full.T.ravel()).predict(X_test_extracted)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Just XGBClassifier
# y_test_pred = cls.fit(X_train_full_extracted, y_train_full.T.ravel()).predict(X_test_extracted)

### Writing results

In [None]:
table = pd.DataFrame({'id': np.arange(0, y_test_pred.shape[0]), 'y': y_test_pred.flatten()})
table.to_csv(data_directory + 'y_test_pred.csv', index=False)