In [1]:
import numpy as np
from scipy import signal
import pandas as pd
import pickle
import re
import importlib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import matplotlib.pyplot as plt
from matplotlib import style
import plotly.graph_objects as go

import util
import plot

In [None]:
importlib.reload(util)

In [2]:
def moving_mean(data, window_size=200):
    weights = np.ones(window_size) / window_size
    return np.convolve(data, weights, mode='same')


In [23]:
def extract_psd(file_names, seg_num=5, fs_new=2.4):
    b, a = signal.butter(3, 0.1)
    df = pd.DataFrame()
    psd_1, psd_2 = [], []
    
    for file in file_names:
        with open('data/processed/' + file + '.pkl', 'rb') as f:
            res = pickle.load(f)
            apn = res['apn']

        with open('features/HR_' + file + '.pkl', 'rb') as f:
            res = pickle.load(f)
            hr = res['hr']
            t_hr = res['t'] # in minute

        # Remove outliers (defined as hr > 2)
        idx_valid = (hr < 2) & (hr > 0.5)
        hr = hr[idx_valid]
        t_hr = t_hr[idx_valid]

        # Filter out noise
        hr_smth = signal.filtfilt(b, a, hr)

        # Resample data
        t_interp = np.arange(t_hr[0], t_hr[-1], 1 / fs_new / 60)
        hr_interp = np.interp(t_interp, t_hr, hr_smth)
        
        group = util.ecg_diagnose(apn) if file[0] == 'x' else file[0].upper() 
        for minute in range(len(apn) - 4):
            # 5-min window
            hr_cur = hr_interp[(t_interp > minute) & (t_interp < minute + 5)]
            freq, psd_ = signal.periodogram(
                x=hr_cur, 
                fs=fs_new)
            psd_1.append(psd_)
            
            # 3-min window
            hr_cur = hr_interp[(t_interp > minute + 1) & (t_interp < minute + 4)]
            _, psd_ = signal.periodogram(
                x=hr_cur, 
                fs=fs_new)
            psd_2.append(psd_)
            
            df = df.append({
                'apn': apn[minute + 2],
                'file': file,
                'group': group,
            }, ignore_index=True)
        
    df['apn'] = df['apn'].astype(int)
#     psd_1, psd_2 = np.vstack(psd_1), np.vstack(psd_2)
    return df, freq, psd_1, psd_2

In [24]:
train_df = pd.read_csv('resources\File_train.csv')
df, freq, psd_1, psd_2 = extract_psd(train_df['file'])

In [25]:
def feature_psd(psd_list, freq, f_thres):
    fea = []
    for psd in psd_list:
        area_total = psd.sum()
        area_lf = psd[freq < f_thres].sum()
        area_hf = psd[freq > f_thres].sum()
        fea.append([
            psd.max(),
            freq[np.argmax(psd)],
            area_total,
            area_lf,
            area_hf,
            area_lf / area_total,
            area_hf / area_total,
            area_lf / area_hf,
        ])
        
    return fea

## Optimize threshold for 5-min psd

In [26]:
res = []
for f_thres in (2 * np.logspace(-3, -1, 30)):
    fea = feature_psd(psd_1, freq, f_thres)
    fea_df = pd.concat([df, pd.DataFrame(fea)], axis=1)
    feature_col = fea_df.drop(labels=['apn', 'group', 'file'], axis=1).columns
    mdl = LogisticRegression(solver='lbfgs', max_iter=1e4)
    acc_train, acc_val = util.model_evaluation_CV(mdl, fea_df, train_df, feature_col, n=4, normalize=True)
    res.append([f_thres, acc_val])
        
res = np.vstack(res)

IndexError: boolean index did not match indexed array along dimension 0; dimension is 358 but corresponding boolean dimension is 360

In [16]:
idx = np.argmax(res[:, 1])
res[idx, :]

array([0.0097878 , 0.80472232])

## Optimize threshold for 3-min psd

In [17]:
res = []
for f_thres in (2 * np.logspace(-3, -1, 30)):
    fea = feature_psd(psd_2, freq, f_thres)
    fea_df = pd.concat([df, pd.DataFrame(fea)], axis=1)
    feature_col = fea_df.drop(labels=['apn', 'group', 'file'], axis=1).columns
    mdl = LogisticRegression(solver='lbfgs', max_iter=1e4)
    acc_train, acc_val = util.model_evaluation_CV(mdl, fea_df, train_df, feature_col, n=4, normalize=True)
    res.append([f_thres, acc_val])
        
res = np.vstack(res)

In [18]:
idx = np.argmax(res[:, 1])
res[idx, :]

array([0.0097878 , 0.78938044])

## Combined features

In [None]:
fea_1 = feature_psd(psd_1, freq, 0.01)
fea_2 = feature_psd(psd_2, freq, 0.01)
fea_df = pd.concat([df, pd.DataFrame(np.hstack((fea_1, fea_2)))], axis=1)
feature_col = fea_df.drop(labels=['apn', 'group', 'file'], axis=1).columns

In [None]:
fea_df

In [None]:
mdl = LogisticRegression(solver='lbfgs', max_iter=1e4)
acc_train, acc_val = util.model_evaluation_CV(mdl, fea_df, train_df, feature_col, n=4, normalize=True)

In [None]:
acc_val

In [None]:
fea_df = pd.concat([df, pd.DataFrame(fea)], axis=1)
feature_col = fea_df.drop(labels=['apn', 'group', 'file'], axis=1).columns

In [None]:
feature_col

In [None]:
df