## Imports

In [8]:
## Link: https://github.com/WJMatthew/WESAD/blob/master/data_wrangling.py
import os
import re
import pickle
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import scipy.signal as scisig
from scipy.signal import find_peaks
import matplotlib.pyplot as plt
import matplotlib as mpl
import heartpy as hp
import biosppy
import neurokit2 as nk
from heartpy.datautils import *
from heartpy.peakdetection import *
mpl.rcParams['agg.path.chunksize'] = 10000
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.pipeline import Pipeline
import csv 
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

# Feature Extraction

Features coded:

ECG: mean, std, min, max, bpm, ibi, sdnn, sdsd, rmssd, pnn20, pnn50, 


PPG/BVP: mean, std, min, max, peak_freq

TEMP:mean, std, min, max, drange, slope

RESP: mean, std, min, max, rate; Inh: mean, std; Exh: mean, std, I/E


EDA: mean, std, min, max, slipe, drange; SCR: mean, std, min, max; SCL: mean, std, min, max

ACC x,y,z; chest, wrist:  mean, std, min, max, abs_integral, peak_freq
Acc net: mean, std, min, max, abs_integral, peak_freq


Features not coded yet:


To replicate this study with similar modalities to RADWear, I will drop the following signals: EDA_c, EMG, TEMP_C

## Parameters

In [3]:
# E4 (wrist) Sampling Frequencies
fs_dict = {'ACC': 32, 'BVP': 64, 'EDA': 4, 'TEMP': 4, 'label': 700, 'Resp': 700, 'ECG': 700, 
           'chest': 700}
# Window size
WINDOW_IN_SECONDS = 60
stride = 1

# Labels
label_dict = {'baseline': 1, 'stress': 2, 'amusement': 0}
# Int to label mappings
int_to_label = {1: 'baseline', 2: 'stress', 0: 'amusement'}
# Feature names
feat_names = None
# Where to save the data
savePath = 'data/WESAD'
loadPath_GN = 'data/GN-WESAD'
# Where to get the data
subject_feature_path = '/subject_feats'

if not os.path.exists(savePath):
    os.makedirs(savePath)
if not os.path.exists(savePath + subject_feature_path):
    os.makedirs(savePath + subject_feature_path)

In [4]:
# Data import
os.listdir(savePath)
df = pd.read_csv(savePath +'/oct5_feats4.csv', index_col=0)
pd.set_option('display.max_columns', None) 


We want to drop columns in df that are not in RADWear to match modalities. 

In [5]:
# drop _c columns
columns_list = df.columns.tolist()
drop_list = []
#df.drop(columns=['Resp_C'])
for column in columns_list:
    if 'EMG' in column or 'EDA_C' in column or 'Temp_C' in column or 'TEMP_C' in column or 'SCR_C' in column or 'SCL_C' in column:
        drop_list.append(column)

reduced_df = df.drop(columns=drop_list)
df = reduced_df

## Generate correlation between features and labels

In [6]:
df = pd.read_csv(savePath +'/oct5_feats4.csv', index_col=0)


In [7]:
df.columns

Index(['net_acc_C_mean', 'net_acc_C_std', 'net_acc_C_min', 'net_acc_C_max',
       'net_acc_mean', 'net_acc_std', 'net_acc_min', 'net_acc_max', 'EDA_mean',
       'EDA_std',
       ...
       'Resp_C_Inhal_std', 'Resp_C_Exhal_mean', 'Resp_C_Exhal_std',
       'Resp_C_I/E', 'TEMP_drange', 'TEMP_C_drange', 'TEMP_slope',
       'TEMP_C_slope', 'subject', 'label'],
      dtype='object', length=120)

In [None]:
vals = abs(df.corr()['label']).sort_values(ascending=False)
if False:
    for i in range(len(vals)):
        print(vals.index[i], vals[i])

corr = df.corr()
plot_corr = True
if plot_corr:
    plt.figure(figsize=(16,10))
    sns.heatmap(corr,xticklabels=True, yticklabels=True,square=True)

In [None]:
# df = df.loc[:, df.columns != 'Resp_C_rate'] #I don't know why this is here. 

features = df.loc[:, df.columns != 'label'].columns
print_feats_list = False
if print_feats_list:
    for ft_idx in range(len(features)):
        print(features[ft_idx], ft_idx)

## split data into train and test set

In [None]:

X = df.drop('label', axis=1).values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

# Modeling

## Cross Validation

### K-Fold Cross Validation

### Leave-One-Out Cross Validation

# Models 

The models will be included in this study are: 

DT, RF, SVM, AB, LDA and kNN. 

Completed: LDA, RF, SVM, AB, DT, kNN 

Incomplete: NaN 

In [None]:
fb_model_list = ['DT', 'RF', 'SVM', 'LDA', 'KNN', 'AdaBoost']


## Linear Discriminant Analysis

In [57]:
def run_LDA(X_train, X_test, y_train, y_test):
    sc = StandardScaler()  
    X_train = sc.fit_transform(X_train)  
    X_test = sc.transform(X_test)  

    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, y_train)
    y_pred = lda.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)  
    print(cm)  
    print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))  
    lda_baseline_acc = accuracy_score(y_test, y_pred)
    return lda_baseline_acc


lda_baseline_acc = run_LDA(X_train, X_test, y_train, y_test)
print('lda accuracy: ', lda_baseline_acc)


[[ 8 11  1]
 [ 4 63  1]
 [ 0  0 28]]
Accuracy: 0.853448275862069
[[ 8 11  1]
 [ 4 63  1]
 [ 0  0 28]]
Accuracy: 0.853448275862069
lda accuracy:  0.853448275862069
shit lda accuracy:  0.853448275862069


### Feature Importance (Top 20)

## Random Forest Classifier

In [58]:
def run_RF(X_train, X_test, y_train, y_test, max_depth=4, random_state=0):
    classifier = RandomForestClassifier(max_depth=max_depth, random_state=random_state)
    classifier.fit(X_train, y_train)  
    y_pred = classifier.predict(X_test)  
    cm = confusion_matrix(y_test, y_pred)  
    print(cm)  
    print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))  
    rf_baseline_acc = accuracy_score(y_test, y_pred)
    importances = classifier.feature_importances_
    forest_importances = pd.Series(importances, index=features).sort_values(ascending=False)

    return rf_baseline_acc, forest_importances
rf_baseline_acc , forest_importances= run_RF(X_train, X_test, y_train, y_test)



[[13  6  1]
 [ 0 67  1]
 [ 0  0 28]]
Accuracy: 0.9310344827586207
[[13  6  1]
 [ 0 67  1]
 [ 0  0 28]]
Accuracy: 0.9310344827586207
rf accuracy:  0.9310344827586207
rf accuracy:  0.9310344827586207


### Feature Importance (Top 20)

In [None]:
fig, ax = plt.subplots(figsize=(12, 5))
forest_importances[:20].plot.barh(ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

## Support Vector Machine

### Linear SVM

In [None]:
def run_svm( X_train, X_test, y_train, y_test, C=1, random_state=0, kernel='linear'):
    # Create a SVC classifier using a linear kernel
    clf = SVC(kernel=kernel, C=C, random_state=random_state)
    # Train the classifier
    clf.fit(X_train, y_train)

    #Predict the response for test dataset
    y_pred = clf.predict(X_test)
    lm_svc=(classification_report(y_test, y_pred, digits=4))
    print(lm_svc)
    svm_baseline_acc = accuracy_score(y_test, y_pred)
    pd.Series(abs(clf.coef_[0]), index=features).nlargest(10).plot(kind='barh') # Feature Importance (Top 20)
    return svm_baseline_acc
svm_baseline_acc = run_svm(X_train, X_test, y_train, y_test)
svm2_baseline_acc = run_svm(X_train, X_test, y_train, y_test, C=0.9) # linear svm lambda 2

## Adaboost 

In [None]:
def run_ab( X_train, X_test, y_train, y_test, n_estimators=100, random_state=0):
    clf = AdaBoostClassifier(n_estimators=100, random_state=0)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    ab_baseline_acc = accuracy_score(y_test, y_pred)
    ab2_baseline_acc = clf.score(X_test, y_test)
    ab_imp = clf.feature_importances_
    ab_imp = pd.Series(ab_imp, index=features).sort_values(ascending=False)
    # plot importances
    fig, ax = plt.subplots(figsize=(12, 5))
    ab_imp[:20].plot.barh(ax=ax)
    ax.set_title("Feature importances using MDI")
    ax.set_ylabel("Mean decrease in impurity")
    fig.tight_layout()
    return ab_baseline_acc, ab2_baseline_acc
ab_baseline_acc, ab2_baseline_acc = run_ab(X_train, X_test, y_train, y_test)

## Decision Tree 

In [None]:
def run_dt(X_train, X_test, y_train, y_test):
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    dt_baseline_acc = accuracy_score(y_test, y_pred)

    # plot importances
    dt_importances = pd.Series(clf.feature_importances_, index=features).sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(12, 5))
    dt_importances[:20].plot.barh(ax=ax)
    ax.set_title("Feature importances using MDI")
    ax.set_ylabel("Mean decrease in impurity")
    fig.tight_layout()

    return dt_baseline_acc
dt_baseline_acc = run_dt(X_train, X_test, y_train, y_test)

print('decision tree baseline accuracy: ' + str(dt_baseline_acc))

### Feature Importance (Top 20)

## k-Nearest Neighbors 

In [None]:
def run_knn(X_train, X_test, y_train, y_test, n_neighbors=3):
    nca = NeighborhoodComponentsAnalysis(random_state=42)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    #knn.fit(X_train, y_train)
    #y_pred = knn.predict(X_test)
    #knn_baseline_acc = accuracy_score(y_test, y_pred)

    ####################
    #  Feature importance cannot be discovered for the kNN model. 
    #####################
    #y_pred = knn.predict(X_test)
    #knn_baseline_acc = accuracy_score(y_test, y_pred)
    nca_pipe = Pipeline([('nca', nca), ('knn', knn)])
    nca_pipe.fit(X_train, y_train)
    knn_baseline_acc = nca_pipe.score(X_test, y_test)
    #print('knn baseline accuracy: ' + str(knn_baseline_acc))
    return knn_baseline_acc
knn_baseline_acc = run_knn(X_train, X_test, y_train, y_test)

# Adding Noise

## Signal to Noise Ratio

For a non-constant signal $S$ and noise $N$, the signal to noise ratio is defined as the following:
$$ SNR = \frac{\mathbb{E}[S^2]}{\mathbb{E}[N^2]} $$

The expected value $\mathbb{E}[X]$ of any continuous random variable $X$ is $\int_{-\infty}^{\infty} x p(x) dx $, where $p(x)$ is its associated probability density function.

For homoskedastic noise, we can use closed form expressions to compute $E[N^2]$.

- For Gaussian distributed noise $N$ ~ $n(\mu, \sigma^2)$, notice that $\text{V}[N] = \mathbb{E}[N^2] - (\mathbb{E}[N])^2,$ so $\mathbb{E}[N^2] = \text{V}[N] + (\mathbb{E}[N])^2 = \sigma^2 + \mu$. In our case $\mu = 0$, so $\mathbb{E}[N^2] = \sigma^2$.

- For uniformly distributed noise $N$ ~ $u(\alpha, \beta)$, by the same logic as above $\mathbb{E}[N^2] = \left(\frac{\alpha - \beta}{2}\right)^2$.

- For frequency-domain noise $N$ of the form $A\sin(2\pi x \frac{1}{f}) + y, \mathbb{E}[N^2] \approx y^2 + \frac{A^2}{2}$. Note the $\approx$ since we cannot guarantee that the signal will end precisely on the end of the sin wave.

For heteroskedastic noise, because there is no closed form expression, we simply take `N.mean()` where $N$ is our noise

## Denoising Using Principal Component Analysis

Let $X$ represent our set of physiological signals and $\textbf{x}_i$ denote the *i*-th column of $X$. In our case, $\textbf{x}_i$ is one of the ECG, BVP, EDA, ACC, etc. The post-noise signal we observe $\textbf{x}_i = \widetilde{\textbf{x}}_i + \xi_i$ is composed of the original raw signal and Gaussian distributed noise with $\mathbb{E}[\xi] = 0$ and $V[\xi] = E[N^2] = \sigma^2 = \frac{\mathbb{E}[S^2]}{SNR}$. Literature has indicated that a principal component analysis of $\textbf{x}_i$ can produce an estimate of $\widetilde{\textbf{x}}_i$ that is closer than the noisy measurements are (citation needed).

## Calculate Distribution Parameters from SNR

Given a signal $S$, we can specify a signal to noise ratio $SNR = \frac{\mathbb{E}[S^2]}{\mathbb{E}[N^2]}$ and use this to calculate $\mathbb{E}[N^2]$ because $SNR$ and $\mathbb{E}[S^2]$ are known. So $\mathbb{E}[N^2] = \frac{\mathbb{E}[S^2]}{SNR}$.

Then, for any homoskedastic noise following a well-defined probability density function (PDF), we can solve for the parameters of the PDF using the known value $\mathbb{E}[N^2]$.

- For Gaussian distributed noise $N$ ~ $n(\mu, \sigma^2)$, notice that $\text{V}[N] = \mathbb{E}[N^2] - (\mathbb{E}[N])^2,$ so $\mathbb{E}[N^2] = \text{V}[N] + (\mathbb{E}[N])^2 = \sigma^2 + \mu$. In our case $\mu = 0$, so $\mathbb{E}[N^2] = \sigma^2$. Thus, $\sigma^2 = \frac{\mathbb{E}[S^2]}{SNR}$.

## Gaussian Noise

The Gaussian probability density function is of the following form:
\begin{equation}
f(x) = \frac{1}{\sigma \sqrt{2 \pi}}exp\left(-\frac{1}{2}\left(\frac{x-\mu}{\sigma}\right)^2\right)
\end{equation}

### Estimating $\mu$ and $\sigma$ of the Gaussian

#### Greatest $n$-Differential with Homoskedasticity Approach

For a signal $S$, the greatest $n$-differential with homoskedasticity approach constructs a Gaussian distribution such that $\mu$ = 0 and $\sigma = \alpha \cdot max(|S_i - S_{i+n}|)$, where $max(|S_i - S_{i+n}|)$ denotes the maximum absolute difference of the signal between index $i$ and $i+n$ in the entire signal, and $\alpha$ is a parameter that multiplicatively scales the intensity of the added noise. We can choose to set $n$ to any value, although we have empirically found $n = 5$ to be the best. We set $\mu$ to $0$ so we don't vertically shift the original signal after adding noise. 

In conclusion, we randomly sample from the following probability density function:
$$
f(x) = \frac{1}{\alpha \cdot max(|S_i - S_{i+n}|) \sqrt{2 \pi}}exp\left(-\frac{1}{2}\left(\frac{x}{\alpha \cdot max(|S_i - S_{i+n}|)}\right)^2\right)
$$

This noise exhbits homoskedasticity because it does not vary with time.

In [None]:
fig, axs = plt.subplots(nrows=1, ncols=5, figsize=(16,4))
axs[0].plot(np.ravel(patients_new_noise1[0]['signal']['chest']['Resp']))
axs[0].set_title('SNR: 0.01')
axs[1].plot(np.ravel(patients_new_noise2[0]['signal']['chest']['Resp']))
axs[1].set_title('SNR: 0.05')
axs[2].plot(np.ravel(patients_new_noise3[0]['signal']['chest']['Resp']))
axs[2].set_title('SNR: 0.1')
axs[3].plot(np.ravel(patients_new_noise4[0]['signal']['chest']['Resp']))
axs[3].set_title('SNR: 0.2')
axs[4].plot(np.ravel(patients_new_noise5[0]['signal']['chest']['Resp']))
axs[4].set_title('SNR: 0.5')


# Data Preparation (pt. 2)

Prepare the data again, this time with the noisy data

In [5]:
#snrs = [0.00001, 0.0001,  0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6] # this is biggest list we'd need
snrs = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6] # this is what we ran #0.00001,
n_samples = 10 # number of samples taken per SNR
loadPath = '../data/GN-WESAD'
#savePath = '../data/GN-WESAD'
subject_feature_path = '/subject_feats'
n_samples = 10 
subject_ids = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]
fb_model_list = ['DT', 'RF', 'SVM', 'LDA', 'KNN', 'AdaBoost']
cases = ['WESAD', 'GN-WESAD', 'PR-WESAD']

In [None]:
class rparser_with_noise:
    # Code adapted from https://github.com/arsen-movsesyan/springboard_WESAD/blob/master/parsers/readme_parser.py
    VALUE_EXTRACT_KEYS = {
        "age": {
            'search_key': 'Age',
            'delimiter': ':'
        },
        "height": {
            'search_key': 'Height',
            'delimiter': ':'
        },
        "weight": {
            'search_key': 'Weight',
            'delimiter': ':'
        },
        "gender": {
            'search_key': 'Gender',
            'delimiter': ':'
        },
        "dominant_hand": {
            'search_key': 'Dominant',
            'delimiter': ':'
        },
        "coffee_today": {
            'search_key': 'Did you drink coffee today',
            'delimiter': '? '
        },
        "coffee_last_hour": {
            'search_key': 'Did you drink coffee within the last hour',
            'delimiter': '? '
        },
        "sport_today": {
            'search_key': 'Did you do any sports today',
            'delimiter': '? '
        },
        "smoker": {
            'search_key': 'Are you a smoker',
            'delimiter': '? '
        },
        "smoke_last_hour": {
            'search_key': 'Did you smoke within the last hour',
            'delimiter': '? '
        },
        "feel_ill_today": {
            'search_key': 'Do you feel ill today',
            'delimiter': '? '
        }
    }
    
    DATA_PATH = 'data/WESAD/'
    parse_file_suffix = '_readme.txt'
    
    
    def __init__(self):
        
        self.readme_locations = {subject_directory: self.DATA_PATH + subject_directory + '/' 
                          for subject_directory in os.listdir(self.DATA_PATH)
                              if re.match('^S[0-9]{1,2}$', subject_directory)}
        
        # Check if parsed readme file is available ( should be as it is saved above )
        if not os.path.isfile('data/readmes.csv'):
            print('Parsing Readme files')
            self.parse_all_readmes()
        else:
            print('Files already parsed.')
            
        self.merge_with_feature_data_with_noise()
        
        
    def parse_readme(self, subject_id):
        with open(self.readme_locations[subject_id] + subject_id + self.parse_file_suffix, 'r') as f:

            x = f.read().split('\n')

        readme_dict = {}

        for item in x:
            for key in self.VALUE_EXTRACT_KEYS.keys():
                search_key = self.VALUE_EXTRACT_KEYS[key]['search_key']
                delimiter = self.VALUE_EXTRACT_KEYS[key]['delimiter']
                if item.startswith(search_key):
                    d, v = item.split(delimiter)
                    readme_dict.update({key: v})
                    break
        return readme_dict


    def parse_all_readmes(self):
        
        dframes = []

        for subject_id, path in self.readme_locations.items():
            readme_dict = self.parse_readme(subject_id)
            df = pd.DataFrame(readme_dict, index=[subject_id])
            dframes.append(df)

        df = pd.concat(dframes)
        df.to_csv(self.DATA_PATH + 'readmes.csv')

        
    def merge_with_feature_data_with_noise(self):
        # Confirm feature files are available
        if os.path.isfile('data/may14_feats4_with_noise.csv'):
            feat_df = pd.read_csv('data/may14_feats4_with_noise.csv', index_col=0)
            print(feat_df.info())
        else:
            print('No feature data available. Exiting...')
            return
           
        # Combine data and save
        df = pd.read_csv(f'{self.DATA_PATH}readmes.csv', index_col=0)

        dummy_df = pd.get_dummies(df)
        
        dummy_df['subject'] = dummy_df.index.str[1:].astype(int)

        dummy_df = dummy_df[['age', 'height', 'weight', 'gender_ female', 'gender_ male',
                           'coffee_today_YES', 'sport_today_YES', 'smoker_NO', 'smoker_YES',
                           'feel_ill_today_YES', 'subject']]

        merged_df = pd.merge(feat_df, dummy_df, on='subject')

        merged_df.to_csv('data/noise_snr_0.6.csv')
rp_with_noise = rparser_with_noise()

# Modeling (pt. 2)

Model again, this time with the noisy data

In [None]:
df = pd.read_csv('data/noise_snr_0.15.csv', index_col=0)
pd.set_option('display.max_columns', None) 

In [None]:
s = df.isna().sum()
for i in range(len(s)):
    if s[i] > 0:
        print(s.index[i], s[i])

In [None]:
df = df.loc[:, df.columns != 'Resp_C_rate']

In [None]:
features = df.loc[:, df.columns != 'label'].columns

In [None]:
X = df.drop('label', axis=1).values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

## Linear Discriminant Analysis

In [None]:
sc = StandardScaler()  
X_train = sc.fit_transform(X_train)  
X_test = sc.transform(X_test)  


lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))  

## Random Forest Classifier

In [None]:
classifier = RandomForestClassifier(max_depth=4, random_state=0)
classifier.fit(X_train, y_train)  
y_pred = classifier.predict(X_test)  

In [None]:
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))  

### Feature Importance (Top 20)

In [None]:
importances = classifier.feature_importances_
forest_importances = pd.Series(importances, index=features).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 5))
forest_importances[:20].plot.barh(ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

## Support Vector Machine

In [None]:
# Create a SVC classifier using a linear kernel
clf = SVC(kernel='linear', C=1, random_state=0)
# Train the classifier
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_out = clf.predict(X_test)
lm_svc=(classification_report(y_test, y_out, digits=4))
print(lm_svc)

# Compare Results

Compare the results of the noisy data models and the clean data models

- Plots of SNR (x-axis) vs. accuracy (y-axis)
- Compare feature importances across different noise regimes
    - Develop dynamic evaluation method based on original feature importance / added noise

## Test Each Model Architecture

results_table = pd.DataFrame(columns=[
    'SNR', 'Accuracy', 'F1-Score', 'dataset'])
    snr     %           F1           WESAD      Noise function

In [None]:
#read data
gn_wesad_day = '2023-11-12'
model = {type}
fb_model_list = ['DT', 'RF', 'SVM', 'LDA', 'KNN', 'AdaBoost'] #redudant
e2e_model_list = ['LSTM', 'CNN', 'GRU', 'RNN'] # placeholder for now

In [None]:
def gn_wesad_path(n_i, snr):
    return f'{savePath}/n_{n_i}/snr_{snr}{subject_feature_path}/{gn_wesad_day}_feats.csv'
def fetch_data(n_i, snr):
    file_path = gn_wesad_path(n_i, snr)
    df = pd.read_csv(file_path, index_col=0)
    return df
def e2e_model(df, model):

    #this is mostly going to be fetching the data. 
    #maybe check if results are already there. if yes, return them.
    #if not, run the model via sbatch script and return the results.
    e2e_results_path = '../data/e2e_results' # this is just a placeholder for now
    accuracy = []
    precision = []
    dataset = []
    snr = []
    n_i = []
    noise_fcn = []

    return [accuracy, precision, dataset, snr, n_i]
def fb_model(df, model):
    #unlike e2e, this is going to be a bit more complicated.
    # it will run the model instead of just fetching the results.

    fb_results_path = gn_wesad_path(n_i, snr) # this is just a placeholder for now
    accuracy = []
    precision = []
    dataset = []
    snr = []
    n_i = []
    noise_fcn = []
    return [accuracy, precision, dataset, snr, n_i]

def get_model(df, model):

    if model in e2e_model_list:
        return e2e_model(df, model)
    elif model in fb_model_list:
        return fb_model(df, model)
    

    accuracy = []
    precision = []
    dataset = []
    snr = []
    n_i = []
    noise_fcn = []

    return [accuracy, precision, dataset, snr, n_i]

def run_wesad_models(loadPath, fb_model_list):

    results_table = pd.DataFrame()
    for n_i in range(n_samples):
        
        for model in fb_model_list:
            
            
            model_output = get_model(df, model)
        
            results_table.loc[model] = pd.Series({'SNR':model_output[0], 'Accuracy':,
                                           'F1 Score':, 'dataset':})
    return results_table
def run_GN_models(snrs, n_samples, loadPath, fb_model_list):
    
    # the difference mainly between WESAD and GN-WESAD is that GN-WESAD has multiple snrs. 
    results_table = pd.DataFrame()    
    
    for n_i in range(n_samples):
        for snr in snrs:
            
            df = fetch_data(n_i, snr)
            
            for model in fb_model_list:
                
                model_output = get_model(df, model)
        
                results_table.loc[model] = pd.Series({'SNR':model_output[0], 'Accuracy':,
                                           'F1 Score':, 'dataset':})
    
    date = 
    file_name = 
    df = pd.read_csv(loadPath+ )
    pd.set_option('display.max_columns', None)
    
    # split data into features and labels


    for n_i in range(n_samples):
        for snr in snrs:
            for model in fb_model_list:
                
                model_output = get_model(df, model)

                results_table.loc[str(model)]['Accuracy'] = model_output[0]
                results_table.loc[str(model)]['Precision'] = model_output[1]
                results_table.loc[str(model)]['dataset'] = model_output[2]
                results_table.loc[str(model)]['snr'] = model_output[3]
                results_table.loc[str(model)]['n_i'] = model_output[4]
                   
    return results_table


print(fb_model_list)
lda_accuracy = []
rf_accuracy = []
svm_accuracy = []
ft_imp_matrix = []
# For each signal to noise ratio
for case in cases:
    if case == 'GN-WESAD':
        loadPath = '../data/GN-WESAD'
        GN_model_results = run_GN_models(snrs, subject_ids, n_samples, loadPath, fb_model_list, feature_list)
    if case == 'WESAD':
        loadPath = '../data/WESAD'
    if case == 'PR-WESAD':
        loadPath = '../data/PR-WESAD'

for i in range(len(snrs)):
    # Get data
    df = pd.read_csv('data/noise_snr_'+str(snrs[i])+'.csv', index_col=0)
    # Since Resp_C_rate is null for the first four, simply get rid of it
    df = df.loc[:, df.columns != 'Resp_C_rate']
    
    # Get features, label
    X = df.drop('label', axis=1).values
    y = df['label'].values

    # Get train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  
    
    # Scale the data
    sc = StandardScaler()  
    X_train = sc.fit_transform(X_train)  
    X_test = sc.transform(X_test)  
    
    # Test LDA
    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, y_train)
    y_pred = lda.predict(X_test)
    lda_accuracy.append(accuracy_score(y_test, y_pred))
    
    # Test RF
    classifier = RandomForestClassifier(max_depth=4, random_state=0)
    classifier.fit(X_train, y_train)  
    y_pred = classifier.predict(X_test)  
    rf_accuracy.append(accuracy_score(y_test, y_pred))
    importances = classifier.feature_importances_
    forest_importances = pd.Series(importances, index=features).sort_values(ascending=False)
    ft_imp_matrix.append(importances)

    fig, ax = plt.subplots(figsize=(12, 5))
    forest_importances[:20].plot.barh(ax=ax)
    ax.set_title("Feature importances using MDI with SNR: " + str(snrs[i]))
    ax.set_ylabel("Mean decrease in impurity")
    fig.tight_layout()
    plt.show();
    
    # Test SVM
    clf = SVC(kernel='linear', C=1, random_state=0)
    clf.fit(X_train, y_train)
    y_out = clf.predict(X_test)    
    svm_accuracy.append(accuracy_score(y_test, y_pred))

    # test kNN

    # test DT

    # test AdaBoost
# list of all models
#     

In [None]:
df = pd.read_csv('data/noise_snr_0.15.csv', index_col=0)
pd.set_option('display.max_columns', None) 

In [None]:
s = df.isna().sum()
for i in range(len(s)):
    if s[i] > 0:
        print(s.index[i], s[i])

In [None]:
df = df.loc[:, df.columns != 'Resp_C_rate']

In [None]:
features = df.loc[:, df.columns != 'label'].columns

In [None]:
X = df.drop('label', axis=1).values
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

## Linear Discriminant Analysis

In [None]:
sc = StandardScaler()  
X_train = sc.fit_transform(X_train)  
X_test = sc.transform(X_test)  


lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred = lda.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))  

## Random Forest Classifier

In [None]:
classifier = RandomForestClassifier(max_depth=4, random_state=0)
classifier.fit(X_train, y_train)  
y_pred = classifier.predict(X_test)  

In [None]:
cm = confusion_matrix(y_test, y_pred)  
print(cm)  
print('Accuracy: ' + str(accuracy_score(y_test, y_pred)))  

In [None]:

def gn_wesad_path(n_i, snr):
    return f'{savePath}/n_{n_i}/snr_{snr}{subject_feature_path}/{gn_wesad_day}_feats.csv'

def e2e_model(df, model):

    #this is mostly going to be fetching the data. 
    #maybe check if results are already there. if yes, return them.
    #if not, run the model via sbatch script and return the results.
    e2e_results_path = '../data/e2e_results' # this is just a placeholder for now
    accuracy = []
    precision = []
    dataset = []
    snr = []
    n_i = []
    noise_fcn = []

    return [accuracy, precision, dataset, snr, n_i]
def fb_model(df, model):
    #unlike e2e, this is going to be a bit more complicated.
    # it will run the model instead of just fetching the results.

    fb_results_path = gn_wesad_path(n_i, snr) # this is just a placeholder for now
    accuracy = []
    precision = []
    dataset = []
    snr = []
    n_i = []
    noise_fcn = []
    return [accuracy, precision, dataset, snr, n_i]

def get_model(df, model):

    if model in e2e_model_list:
        return e2e_model(df, model)
    elif model in fb_model_list:
        return fb_model(df, model)
    

    accuracy = []
    precision = []
    dataset = []
    snr = []
    n_i = []
    noise_fcn = []

    return [accuracy, precision, dataset, snr, n_i]

def run_wesad_models(loadPath, fb_model_list):

    results_table = pd.DataFrame()
    for n_i in range(n_samples):
        
        for model in fb_model_list:
            
            
            model_output = get_model(df, model)
        
            results_table.loc[model] = pd.Series({'SNR':model_output[0], 'Accuracy':,
                                           'F1 Score':, 'dataset':})
    return results_table
def run_GN_models(snrs, n_samples, loadPath, fb_model_list):
    
    # the difference mainly between WESAD and GN-WESAD is that GN-WESAD has multiple snrs. 
    results_table = pd.DataFrame()    
    
    for n_i in range(n_samples):
        for snr in snrs:
            file_path = gn_wesad_path(n_i, snr)
            df = pd.read_csv(file_path, index_col=0)
            
            for model in fb_model_list:
                
                model_output = get_model(df, model)
        
                results_table.loc[model] = pd.Series({'SNR':model_output[0], 'Accuracy':,
                                           'F1 Score':, 'dataset':})
    
    date = 
    file_name = 
    df = pd.read_csv(loadPath+ )
    pd.set_option('display.max_columns', None)
    
    # split data into features and labels


    for n_i in range(n_samples):
        for snr in snrs:
            for model in fb_model_list:
                
                model_output = run_FB_models(df, model)

                results_table.loc[str(model)]['Accuracy'] = model_output[0]
                results_table.loc[str(model)]['Precision'] = model_output[1]
                results_table.loc[str(model)]['dataset'] = model_output[2]
                results_table.loc[str(model)]['snr'] = model_output[3]
                results_table.loc[str(model)]['n_i'] = model_output[4]
                   
    return results_table


print(fb_model_list)
lda_accuracy = []
rf_accuracy = []
svm_accuracy = []
ft_imp_matrix = []
# For each signal to noise ratio
for case in cases:
    if case == 'GN-WESAD':
        loadPath = '../data/GN-WESAD'
        GN_model_results = run_GN_models(snrs, subject_ids, n_samples, loadPath, fb_model_list, feature_list)
    if case == 'WESAD':
        loadPath = '../data/WESAD'
    if case == 'PR-WESAD':
        loadPath = '../data/PR-WESAD'

for i in range(len(snrs)):
    # Get data
    df = pd.read_csv('data/noise_snr_'+str(snrs[i])+'.csv', index_col=0)
    # Since Resp_C_rate is null for the first four, simply get rid of it
    df = df.loc[:, df.columns != 'Resp_C_rate']
    
    # Get features, label
    X = df.drop('label', axis=1).values
    y = df['label'].values

    # Get train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  
    
    # Scale the data
    sc = StandardScaler()  
    X_train = sc.fit_transform(X_train)  
    X_test = sc.transform(X_test)  
    
    # Test LDA
    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, y_train)
    y_pred = lda.predict(X_test)
    lda_accuracy.append(accuracy_score(y_test, y_pred))
    
    # Test RF
    classifier = RandomForestClassifier(max_depth=4, random_state=0)
    classifier.fit(X_train, y_train)  
    y_pred = classifier.predict(X_test)  
    rf_accuracy.append(accuracy_score(y_test, y_pred))
    importances = classifier.feature_importances_
    forest_importances = pd.Series(importances, index=features).sort_values(ascending=False)
    ft_imp_matrix.append(importances)

    fig, ax = plt.subplots(figsize=(12, 5))
    forest_importances[:20].plot.barh(ax=ax)
    ax.set_title("Feature importances using MDI with SNR: " + str(snrs[i]))
    ax.set_ylabel("Mean decrease in impurity")
    fig.tight_layout()
    plt.show();
    
    # Test SVM
    clf = SVC(kernel='linear', C=1, random_state=0)
    clf.fit(X_train, y_train)
    y_out = clf.predict(X_test)    
    svm_accuracy.append(accuracy_score(y_test, y_pred))

    # test kNN

    # test DT

    # test AdaBoost
# list of all models
#     

In [None]:
import scipy.stats as ss
for ft in range(0, len(ft_imp_matrix)):
    ft_imp_matrix[ft] = [sorted(ft_imp_matrix[ft], reverse=True).index(x) for x in ft_imp_matrix[ft]]

In [None]:
ft_imp_df = pd.DataFrame(ft_imp_matrix, columns=features)

In [None]:
ft_imp_df.head()

In [None]:
ft_imp_df.columns.values

In [None]:
ft_imp_df.reset_index().pivot(index=ft_imp_df, columns=ft_imp_df.columns.values)

## Tabulate results 

In [32]:
results_table = pd.DataFrame(columns=['SNR', 'Accuracy', 'F1-Score', 'dataset'])
results_table.loc[str('SVM')] = pd.Series({'SNR':1, 'Accuracy':5,
                                           'F1 Score':2, 'dataset':'WESAD'})
results_table.loc[str('RF')] = pd.Series({'SNR':1, 'Accuracy':5,
                                          'F1 Score':2, 'dataset':'WESAD'})

'''
fb_model_list = ['DT', 'RF', 'SVM', 'LDA', 'KNN', 'AdaBoost']

for model in fb_model_list:
    for i in range(len(snrs)):
        results_table.loc[str(model) + str(snrs[i])] = pd.Series({'SNR':snrs[i], 'Accuracy':svm_accuracy[i], 'F1 Score':2, 'dataset':'WESAD'})
'''
display(results_table)



<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,SNR,Accuracy,F1-Score,dataset
SVM,1,5,,WESAD
RF,1,5,,WESAD


## Plot SNR vs. Accuracy

In [None]:
sns.set_style('dark')
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(14,7))
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

min_accuracy = min(min(lda_accuracy), min(rf_accuracy), min(svm_accuracy)) - 0.01
max_accuracy = max(lda_baseline_acc, rf_baseline_acc, svm_baseline_acc) + 0.01

axs[0].plot(snrs, lda_accuracy, label='Accuracy (with noise)', marker='o');
axs[0].plot(snrs, [lda_baseline_acc]*len(snrs), linestyle='dashed', label='Accuracy (without noise)')
axs[0].set_title('Linear Discriminant Analysis', fontsize=20);
axs[0].set_xlabel('Signal to Noise Ratio', fontsize=15);
axs[0].set_ylabel('Classification Accuracy', fontsize=15);
axs[0].legend();
axs[0].set_ylim([min_accuracy, max_accuracy]);

axs[1].plot(snrs, rf_accuracy, label='Accuracy (with noise)', marker='o');
axs[1].plot(snrs, [rf_baseline_acc]*len(snrs), linestyle='dashed', label='Accuracy (without noise)')
axs[1].set_title('Random Forest', fontsize=20);
axs[1].set_xlabel('Signal to Noise Ratio', fontsize=15);
axs[1].legend();
axs[1].set_ylim([min_accuracy, max_accuracy]);

axs[2].plot(snrs, svm_accuracy, label='Accuracy (with noise)', marker='o');
axs[2].plot(snrs, [svm_baseline_acc]*len(snrs), linestyle='dashed', label='Accuracy (without noise)')
axs[2].set_title('Support Vector Machine', fontsize=20);
axs[2].set_xlabel('Signal to Noise Ratio', fontsize=15);
axs[2].legend();
axs[2].set_ylim([min_accuracy, max_accuracy]);

# add knn, dt and adaboost

axs[3].plot(snrs, knn_baseline_acc, label='Accuracy (with noise)', marker='o');
axs[3].plot(snrs, [knn_baseline_acc]*len(snrs), linestyle='dashed', label='Accuracy (without noise)')
axs[3].set_title('kNN', fontsize=20);
axs[3].set_xlabel('Signal to Noise Ratio', fontsize=15);
axs[3].legend();
axs[3].set_ylim([min_accuracy, max_accuracy]);

axs[4].plot(snrs, dt_baseline_acc, label='Accuracy (with noise)', marker='o');
axs[4].plot(snrs, [dt_baseline_acc]*len(snrs), linestyle='dashed', label='Accuracy (without noise)')
axs[4].set_title('Decision Tree', fontsize=20);
axs[4].set_xlabel('Signal to Noise Ratio', fontsize=15);
axs[4].legend();
axs[4].set_ylim([min_accuracy, max_accuracy]);

axs[5].plot(snrs, ab_baseline_acc, label='Accuracy (with noise)', marker='o');
axs[5].plot(snrs, [ab_baseline_acc]*len(snrs), linestyle='dashed', label='Accuracy (without noise)')
axs[5].set_title('AdaBoost', fontsize=20);
axs[5].set_xlabel('Signal to Noise Ratio', fontsize=15);
axs[5].legend();
axs[5].set_ylim([min_accuracy, max_accuracy]);


In [34]:
import logging
logging.basicConfig(level=logging.INFO)
logging.warning("Feature extraction for ecg failed. \n This happened for participant ")




# Testing