In [59]:
# from google.colab import drive
# drive.mount('/content/drive')

In [60]:
# # navigate to root directory of current file in order to access other files relatively
# %cd /content/drive/MyDrive/Colab\ Notebooks/thesis-writing-1/eda-signal-classifier

In [61]:
# !pip install PyWavelets

In [62]:
import datetime
import math
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
import requests
import re
import tensorflow as tf

from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# import and load model architectures as well as decoder
from models.cueva import LSTM_FE
from models.llanes_jurado import LSTM_CNN
from utilities.preprocessors import correct_signals
from utilities.loaders import load_meta_data, concur_load_data, charge_raw_data, _combine_data

from utilities.visualizers import (
    view_time_frame,
    view_wavelet_coeffs,
    analyze,
    data_split_metric_values,
    view_value_frequency,
    multi_class_heatmap,
    view_metric_values,
    view_classified_labels,
    view_label_freq,
    disp_cat_feat,
    plot_all_features,
    describe_col,
    ModelResults,
    view_all_splits_results)

from utilities.feature_extractors import (
    concur_extract_features_from_all,
    extract_features,
    extract_features_hybrid,
    extract_features_per_hour)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Downloading dataset

If your project requires downloading a large file, you may run into memory issues when a plain download loads the entire file into memory at once. To overcome this, download large files in a streaming fashion, so the content of a large response is read in chunks rather than all at once.

In [63]:
# download_dataset("https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/w8fxrg4pv5-2.zip")

# Loading dataset

In [64]:
# # Extract data from zip file
# with zipfile.ZipFile('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/EDABE dataset.zip', 'r') as zip_ref:
#     zip_ref.extractall('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)')

In [65]:
# ahixac_eda_df_128hz = pd.read_csv('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Train/ahixac_expert1.csv', sep=';')
# ahixac_eda_df_128hz

In [66]:
# ahixac_eda_df_128hz.columns = ['time', 'raw_signal', 'clean_signal', 'label', 'auto_signal', 'pred_art', 'post_proc_pred_art']

In [67]:
# start_time = ahixac_eda_df_128hz.iloc[0]['time']
# start_time

In [68]:
# ahixac_eda_df_128hz.set_index(pd.date_range(start=start_time, periods=ahixac_eda_df_128hz.shape[0], freq=get_time_frequency(128)), inplace=True)
# ahixac_eda_df_128hz

# Downsampling 128hz signals to 16hz

In [69]:
# ahixac_eda_df_16hz = interpolate_signals(ahixac_eda_df_128hz, sample_rate=128, start_time=start_time, target_hz=16)
# ahixac_eda_df_16hz

# Low-pass filtering raw 128hz and 16hz signals

In [70]:
# ahixac_eda_df_128hz['filtered_signal'] = butter_lowpass_filter(ahixac_eda_df_128hz['raw_signal'], cutoff=1.0, samp_freq=128, order=6)
# ahixac_eda_df_16hz['filtered_signal'] = butter_lowpass_filter(ahixac_eda_df_16hz['raw_signal'], cutoff=1.0, samp_freq=16, order=6)

In [71]:
# ahixac_eda_df_128hz

In [72]:
# ahixac_eda_df_128hz.iloc[63]

In [73]:
# timestamp_list = ahixac_eda_df_128hz.index.tolist()[::64]
# timestamp_list

In [74]:
# timestamp_list[-1].timestamp()

In [75]:
# ahixac_eda_df_16hz

In [76]:
# ahixac_eda_df_16hz[:8]

In [77]:
# view_time_frame(ahixac_eda_df_128hz, samp_freq=128, cols_to_use=['raw_signal', 'filtered_signal'], img_title='subject ahixac 128hz time frame')
# view_time_frame(ahixac_eda_df_16hz, samp_freq=16, cols_to_use=['raw_signal', 'filtered_signal'], img_title='subject ahixac 16hz time frame')

# Iterate through signals per hour

In [78]:
# data_128hz = extract_features_per_hour(ahixac_eda_df_128hz, hertz=128, window_size=0.5, verbose=True)
# data_128hz

In [79]:
# data_16hz = extract_features_per_hour(ahixac_eda_df_16hz, hertz=16, window_size=0.5, verbose=True)
# data_16hz

#### if we had a 128hz dataset with derived timestamps that increase every 0.5s such as this [0.0, 0.5, 1.0, 1.5, ..., 6506.0] then our segments would be:
```
[0.0, 0.5)
[0.5, 1.0)
[1.0, 1.5)
...
[6505.5, 6506.0)
```

#### 832830 / 64 is 13012.96875, which becomes 13013 when rounded up with `math.ceil()`

In [80]:
# math.ceil(13012.96875), math.floor(13012.96875)

In [81]:
# for feature_segments, labels in data_128hz:
#     print(labels.value_counts())

#### here in the first hour of our data the number of artifacts out of all 7200 0.5s segments is 716 or roughly 9.9% of our data, and the number of non-artifacts out of all 7200 0.5s segments is 6484 or roughly 90% of our data

#### For the second hour of our data the number of artifacts out of all 5813 0.5s segments is 208 or roughly 3.58% of our data, and the number of non-artifacts out of all 5813 0.5s segments is 5605 or roughly 96.42% of our data

In [82]:
# for feature_segments, labels in data_16hz:
#     print(labels.value_counts())

#### Here the numbers of artifact and non-artifact labels are almost the same as for the 128hz data; they differ slightly because we interpolated our 128hz data down to 16hz, thus losing some of our labels

In [83]:
# ahixac_eda_data = rejoin_data(data_128hz, data_16hz)
# ahixac_eda_data

#### concatenating calculated features from 128hz and 16hz data of the first hour

In [84]:
# ahixac_eda_data[0].columns

# Now we ought to do these for all subjects

# scanning train folder

In [85]:
# os.listdir returns directory entries in arbitrary order, so sort them to
# keep the subject order (and anything derived from it) reproducible
train_files = sorted(os.listdir('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Train/'))
train_files

['ahixac_expert1.csv',
 'akakip_expert2.csv',
 'aqamom_expert2.csv',
 'aretez_expert1.csv',
 'asifex_expert2.csv',
 'axeyoh_expert2.csv',
 'efawep_expert2.csv',
 'egemow_expert2.csv',
 'ejofeq_expert2.csv',
 'erecij_expert1.csv',
 'esirur_expert1.csv',
 'ewehov_expert2.csv',
 'exozef_expert2.csv',
 'idagah_expert2.csv',
 'ihikay_expert1.csv',
 'ihinot_expert1.csv',
 'imocac_expert2.csv',
 'iqiyat_expert2.csv',
 'obujoh_expert2.csv',
 'ohayeh_expert1.csv',
 'ohufow_expert1.csv',
 'ojotew_expert1.csv',
 'onivuk_expert1.csv',
 'opunad_expert1.csv',
 'otecab_expert2.csv',
 'otuqom_expert1.csv',
 'owegud_expert2.csv',
 'oxisux_expert1.csv',
 'tchgij_expert2.csv',
 'ufoyek_expert2.csv',
 'uqozew_expert1.csv',
 'urogif_expert1.csv',
 'uzefow_expert1.csv']

# Concurrently read each .csv file and use functions that will spit out the features

In [86]:
# train_eda_data = concur_extract_features_from_all('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Train/', train_files, arch="ml")
# train_eda_data

#### The above code takes about 204 minutes (3 hrs and 24 minutes) to run

In [87]:
# # save each feature dataframe as a .csv file in the folder created earlier with the same names
# for subject_name, (feature_segments, labels) in train_eda_data:
#     feature_segments.to_csv(f'./data/Artifact Detection Data/train/{subject_name}_features.csv')
#     labels.to_csv(f'./data/Artifact Detection Data/train/{subject_name}_labels.csv')

In [88]:
# os.listdir returns directory entries in arbitrary order, so sort them to
# keep the subject order (and anything derived from it) reproducible
test_files = sorted(os.listdir('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Test/'))
test_files

['afegip_expert1.csv',
 'ajeric_expert2.csv',
 'ekamis_expert2.csv',
 'iguted_expert1.csv',
 'inefoh_expert1.csv',
 'otafeh_expert1.csv',
 'oxused_expert2.csv',
 'pqbqpr_expert2.csv',
 'uhepah_expert1.csv',
 'ukudab_expert2.csv']

In [89]:
# test_eda_data = concur_extract_features_from_all('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Test/', test_files, arch="ml")
# test_eda_data

In [90]:
# # save each feature dataframe as a .csv file in the folder created earlier with the same names
# for subject_name, (feature_segments, labels)  in test_eda_data:
#     feature_segments.to_csv(f'./data/Artifact Detection Data/test/{subject_name}_features.csv')
#     labels.to_csv(f'./data/Artifact Detection Data/test/{subject_name}_labels.csv')

# This section attempts outlier removal, i.e. dropping rows that consist purely of zeros from the features generated by the cells above

In [91]:
# ahixac_features = pd.read_csv(f'./data/Artifact Detection Data/train/ahixac_expert1_features.csv', index_col=0)
# ahixac_features

#### recall that `axis=1` applies the reduction across the columns (one result per row), while `axis=0` applies it down the rows (one result per column)

In [92]:
# non_zero_rows = (ahixac_features != 0).any(axis=1)
# non_zero_rows

In [93]:
# ahixac_features.index[~non_zero_rows]

In [94]:
# ahixac_features[non_zero_rows]

In [95]:
# non_zero_rows_alt = ~(ahixac_features == 0).all(axis=1)
# non_zero_rows_alt

In [96]:
# import re

# # what I want is to individually open all the feature segment files as well as their corresponding label files
# for train_subject_name in train_files:
#     train_subject_name = re.sub(r".csv", "", train_subject_name)
#     subject_features = pd.read_csv(f'./data/Artifact Detection Data/train/{train_subject_name}_features.csv', index_col=0)
#     subject_labels = pd.read_csv(f'./data/Artifact Detection Data/train/{train_subject_name}_labels.csv', index_col=0)

#     # allow modifications to the dataframe here i.e.
#     # removing rows with purely 0.0 values for every feature/column
#     # as these outliers can negatively impact the training of the 
#     # ml model
#     non_zero_rows = (subject_features != 0).any(axis=1)

#     # keep only the rows that are non zero rows
#     # this goes also for rows in the subjects labels
#     subject_features[non_zero_rows].to_csv(f'./data/Artifact Detection Data/train/{train_subject_name}_features.csv')
#     subject_labels[non_zero_rows].to_csv(f'./data/Artifact Detection Data/train/{train_subject_name}_labels.csv')

# This section uses the LSTM feature extractor model to convert EDA signals into LSTM features that a scikit-learn SVM can use as input. This implements the high-level feature engineering for the hybrid LSTM-SVM model

In [97]:
# load, for every subject, the windowed signals, the labels and the subject
# name, plus a subject-to-id lookup, using the "cueva" feature configuration
# NOTE(review): no data_split is passed, so the loader's default is used;
# presumably the train split (33 subjects come back) — confirm in utilities.loaders
train_subjects_signals, train_subjects_labels, train_subjects_names, train_subject_to_id = concur_load_data(feat_config="cueva",)

length of x_signals: 672486
window size: 640
length of x_signals: 749774
window size: 640
length of x_signals: 708696
window size: 640
length of x_signals: 786318
window size: 640
length of x_signals: 751975
window size: 640
length of x_signals: 819275
window size: 640
length of x_signals: 750519
window size: 640
length of x_signals: 732054
window size: 640
length of x_signals: 755310
window size: 640
length of x_signals: 747558
window size: 640
length of x_signals: 755310
window size: 640
length of x_signals: 782183
window size: 640
length of x_signals: 832830
window size: 640
length of x_signals: 794070
window size: 640
length of x_signals: 871590
window size: 640
length of x_signals: 925752
window size: 640
number of rows created: 10498
number of rows created: 11064
number of rows created: 11429
number of rows created: 11671
number of rows created: 11706
number of rows created: 11792
number of rows created: 11717
number of rows created: 11792
number of rows created: 11740
number of 

In [98]:
len(train_subjects_signals)

33

In [99]:
train_subjects_labels

[array([[0],
        [0],
        [0],
        ...,
        [1],
        [1],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [1],
        [1],
        [1]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]),
 array([[0],
        [0],
        [0],
        ...,
        [0],
     

In [100]:
train_subjects_names

['imocac_expert2',
 'ahixac_expert1',
 'obujoh_expert2',
 'otecab_expert2',
 'onivuk_expert1',
 'uqozew_expert1',
 'ejofeq_expert2',
 'tchgij_expert2',
 'aretez_expert1',
 'asifex_expert2',
 'otuqom_expert1',
 'uzefow_expert1',
 'ojotew_expert1',
 'iqiyat_expert2',
 'efawep_expert2',
 'opunad_expert1',
 'ihinot_expert1',
 'owegud_expert2',
 'idagah_expert2',
 'akakip_expert2',
 'ewehov_expert2',
 'oxisux_expert1',
 'ohayeh_expert1',
 'urogif_expert1',
 'ufoyek_expert2',
 'esirur_expert1',
 'ohufow_expert1',
 'exozef_expert2',
 'aqamom_expert2',
 'erecij_expert1',
 'axeyoh_expert2',
 'egemow_expert2',
 'ihikay_expert1']

In [101]:
# same loading step as for the training subjects, but explicitly requesting
# the "test" split with the "cueva" feature configuration
test_subjects_signals, test_subjects_labels, test_subjects_names, test_subject_to_id = concur_load_data(feat_config="cueva", data_split="test")

length of x_signals: 765045
window size: 640
length of x_signals: 762960
window size: 640
length of x_signals: 770814
window size: 640
length of x_signals: 817326
window size: 640
length of x_signals: 840582
window size: 640
length of x_signals: 871590
window size: 640
length of x_signals: 801735
window size: 640
length of x_signals: 856086
window size: 640
length of x_signals: 914989
window size: 640
length of x_signals: 980118
window size: 640
number of rows created: 12034
number of rows created: 11944
number of rows created: 11912
number of rows created: 12761
number of rows created: 12518
number of rows created: 13125
number of rows created: 13609
number of rows created: 14287
number of rows created: 13367
number of rows created: 15305
subjects signals, labels, names and subject to id lookup loaded


In [102]:
len(test_subjects_signals)

10

In [103]:
# NOTE(review): both imports below already appear in the notebook's first
# import cell, so they are redundant here
import tensorflow as tf
from models.cueva import LSTM_FE 

# read the saved meta data of the LSTM feature extractor (presumably its
# hyperparameters — confirm against the .json file), rebuild the model
# from it and then restore the saved weights into the model
lstm_fe_hp = load_meta_data('./saved/misc/cueva_lstm-fe_meta_data.json')
lstm_fe = LSTM_FE(**lstm_fe_hp)
lstm_fe.load_weights('./saved/weights/cueva_lstm-fe_21_0.7489.weights.h5')

In [104]:
lstm_fe.summary()

In [105]:
# grab the layer named 'lstm-layer-2' and display its symbolic output
# tensor so its shape can be inspected
lstm_layer_2 = lstm_fe.get_layer('lstm-layer-2')
lstm_layer_2.output

<KerasTensor shape=(None, 32), dtype=float32, sparse=False, name=keras_tensor_85>

In [106]:
lstm_fe.inputs

[<KerasTensor shape=(None, 640, 1), dtype=float32, sparse=None, name=keras_tensor_79>]

In [107]:
# build a model that shares the inputs of lstm_fe but stops at the output
# of 'lstm-layer-2', so predicting with it yields that layer's activations
lstm_fe_main = tf.keras.Model(inputs=lstm_fe.inputs, outputs=lstm_layer_2.output)
lstm_fe_main

<Functional name=functional_25, built=True>

In [108]:
lstm_fe_main.summary()

# `subjects_signals[0]`, for instance, previously had a shape of (10701, 640, 1); after feature extraction its shape will be (10701, 32), since the number of output units of the last LSTM layer was set to 32

In [109]:
# folder that receives one `<subject>_hof.csv` file per training subject
train_hof_dir = './data/Hybrid Artifact Detection Data/train'

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output folder exists before the slow prediction loop starts rather than
# failing on the first write
os.makedirs(train_hof_dir, exist_ok=True)

for index, train_subject_name in enumerate(train_subjects_names):
    # feed the subject's windowed signals through the truncated LSTM model;
    # its output is used as the higher order features (HOF) of the subject
    print(f'subject: {train_subject_name}')
    print(f'initial shape: {train_subjects_signals[index].shape}')
    train_subject_hof = lstm_fe_main.predict(train_subjects_signals[index])
    print(f'output shape: {train_subject_hof.shape}')

    # name the columns HOF_1 ... HOF_n, where n is the number of columns
    # of the higher order features matrix
    columns = [f'HOF_{i}' for i in range(1, train_subject_hof.shape[1] + 1)]
    train_subject_hof_df = pd.DataFrame(train_subject_hof, columns=columns)

    # save the LSTM features only; labels are not written by this cell
    train_subject_hof_df.to_csv(f'{train_hof_dir}/{train_subject_name}_hof.csv')

subject: imocac_expert2
initial shape: (11792, 640, 1)
[1m369/369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 45ms/step
output shape: (11792, 32)
subject: ahixac_expert1
initial shape: (13003, 640, 1)
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 51ms/step
output shape: (13003, 32)
subject: obujoh_expert2
initial shape: (12792, 640, 1)
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 44ms/step
output shape: (12792, 32)
subject: otecab_expert2
initial shape: (11792, 640, 1)
[1m369/369[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 45ms/step
output shape: (11792, 32)
subject: onivuk_expert1
initial shape: (11717, 640, 1)
[1m367/367[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 46ms/step
output shape: (11717, 32)
subject: uqozew_expert1
initial shape: (13609, 640, 1)
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 46ms/step
output shape: (13609, 32)
subject: ejofeq_expert2
initial shape: (10498, 640, 

In [110]:
# folder that receives one `<subject>_hof.csv` file per test subject
test_hof_dir = './data/Hybrid Artifact Detection Data/test'

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output folder exists before the slow prediction loop starts rather than
# failing on the first write
os.makedirs(test_hof_dir, exist_ok=True)

for index, test_subject_name in enumerate(test_subjects_names):
    # feed the subject's windowed signals through the truncated LSTM model;
    # its output is used as the higher order features (HOF) of the subject
    print(f'subject: {test_subject_name}')
    print(f'initial shape: {test_subjects_signals[index].shape}')
    test_subject_hof = lstm_fe_main.predict(test_subjects_signals[index])
    print(f'output shape: {test_subject_hof.shape}')

    # name the columns HOF_1 ... HOF_n, where n is the number of columns
    # of the higher order features matrix
    columns = [f'HOF_{i}' for i in range(1, test_subject_hof.shape[1] + 1)]
    test_subject_hof_df = pd.DataFrame(test_subject_hof, columns=columns)

    # save the LSTM features only; labels are not written by this cell
    test_subject_hof_df.to_csv(f'{test_hof_dir}/{test_subject_name}_hof.csv')

subject: oxused_expert2
initial shape: (11944, 640, 1)
[1m374/374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 45ms/step
output shape: (11944, 32)
subject: ekamis_expert2
initial shape: (13609, 640, 1)
[1m426/426[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 49ms/step
output shape: (13609, 32)
subject: otafeh_expert1
initial shape: (11912, 640, 1)
[1m373/373[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 48ms/step
output shape: (11912, 32)
subject: uhepah_expert1
initial shape: (12034, 640, 1)
[1m377/377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 41ms/step
output shape: (12034, 32)
subject: ajeric_expert2
initial shape: (12761, 640, 1)
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 44ms/step
output shape: (12761, 32)
subject: inefoh_expert1
initial shape: (13367, 640, 1)
[1m418/418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 40ms/step
output shape: (13367, 32)
subject: pqbqpr_expert2
initial shape: (15305, 640, 

# this next section will implement low level feature engineering for hybrid lstm-svm model

In [111]:
# train_files[:1]

In [112]:
# train_hybrid_eda_data = concur_extract_features_from_all('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Train/', train_files, arch="hybrid")
# train_hybrid_eda_data

In [113]:
# # save each feature dataframe as a .csv file in the folder created earlier with the same names
# for subject_name, (feature_segments, labels) in train_hybrid_eda_data:
#     feature_segments.to_csv(f'./data/Hybrid Artifact Detection Data/train/{subject_name}_lof.csv')
#     labels.to_csv(f'./data/Hybrid Artifact Detection Data/train/{subject_name}_labels.csv')

In [114]:
# test_hybrid_eda_data = concur_extract_features_from_all('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Test/', test_files, arch="hybrid")
# test_hybrid_eda_data

In [115]:
# # save each feature dataframe as a .csv file in the folder created earlier with the same names
# for subject_name, (feature_segments, labels) in test_hybrid_eda_data:
#     feature_segments.to_csv(f'./data/Hybrid Artifact Detection Data/test/{subject_name}_lof.csv')
#     labels.to_csv(f'./data/Hybrid Artifact Detection Data/test/{subject_name}_labels.csv')

In [116]:
# # once notebook reaches end remove data to clear space
# # NOTE(review): the second path is the folder the dataset was extracted
# # into; os.remove only deletes files, so removing it would need
# # shutil.rmtree (and an `import shutil`) instead
# # NOTE(review): the zip path below differs from the one used in the
# # extraction cell — confirm which location actually holds the archive
# os.remove('./data/EDABE dataset.zip')
# os.remove('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)')