In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# # navigate to root directory of current file in order to access other files relatively
# %cd /content/drive/MyDrive/Colab\ Notebooks/thesis-writing-1/eda-signal-classifier

In [None]:
# !pip install PyWavelets

In [None]:
import datetime
import math
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
import requests
import re
import tensorflow as tf

from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# import and load model architectures as well as decoder
from models.cueva import LSTM_FE
from models.llanes_jurado import LSTM_CNN
from utilities.preprocessors import correct_signals
from utilities.loaders import load_meta_data, concur_load_data, charge_raw_data, _combine_data

from utilities.visualizers import (
    view_time_frame,
    view_wavelet_coeffs,
    analyze,
    data_split_metric_values,
    view_value_frequency,
    multi_class_heatmap,
    view_metric_values,
    view_classified_labels,
    view_label_freq,
    disp_cat_feat,
    plot_all_features,
    describe_col,
    ModelResults,
    view_all_splits_results)

from utilities.feature_extractors import (
    concur_extract_features_from_all,
    extract_features,
    extract_features_hybrid,
    extract_features_per_hour)

%load_ext autoreload
%autoreload 2

# Downloading dataset

If your project requires downloading a large file, loading the entire response into memory at once may cause issues. To avoid them, download large files in a streaming fashion, so that the content of a large response is never read all at once.

In [None]:
# download_dataset("https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/w8fxrg4pv5-2.zip")

# Loading dataset

In [None]:
# # Extract data from zip file
# with zipfile.ZipFile('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/EDABE dataset.zip', 'r') as zip_ref:
#     zip_ref.extractall('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)')

In [None]:
# ahixac_eda_df_128hz = pd.read_csv('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Train/ahixac_expert1.csv', sep=';')
# ahixac_eda_df_128hz

In [None]:
# ahixac_eda_df_128hz.columns = ['time', 'raw_signal', 'clean_signal', 'label', 'auto_signal', 'pred_art', 'post_proc_pred_art']

In [None]:
# start_time = ahixac_eda_df_128hz.iloc[0]['time']
# start_time

In [None]:
# ahixac_eda_df_128hz.set_index(pd.date_range(start=start_time, periods=ahixac_eda_df_128hz.shape[0], freq=get_time_frequency(128)), inplace=True)
# ahixac_eda_df_128hz

# Downsampling 128hz signals to 16hz

In [None]:
# ahixac_eda_df_16hz = interpolate_signals(ahixac_eda_df_128hz, sample_rate=128, start_time=start_time, target_hz=16)
# ahixac_eda_df_16hz

# Low-pass filtering raw 128hz and 16hz signals

In [None]:
# ahixac_eda_df_128hz['filtered_signal'] = butter_lowpass_filter(ahixac_eda_df_128hz['raw_signal'], cutoff=1.0, samp_freq=128, order=6)
# ahixac_eda_df_16hz['filtered_signal'] = butter_lowpass_filter(ahixac_eda_df_16hz['raw_signal'], cutoff=1.0, samp_freq=16, order=6)

In [None]:
# ahixac_eda_df_128hz

In [None]:
# ahixac_eda_df_128hz.iloc[63]

In [None]:
# timestamp_list = ahixac_eda_df_128hz.index.tolist()[::64]
# timestamp_list

In [None]:
# timestamp_list[-1].timestamp()

In [None]:
# ahixac_eda_df_16hz

In [None]:
# ahixac_eda_df_16hz[:8]

In [None]:
# view_time_frame(ahixac_eda_df_128hz, samp_freq=128, cols_to_use=['raw_signal', 'filtered_signal'], img_title='subject ahixac 128hz time frame')
# view_time_frame(ahixac_eda_df_16hz, samp_freq=16, cols_to_use=['raw_signal', 'filtered_signal'], img_title='subject ahixac 16hz time frame')

# Iterate through signals per hour

In [None]:
# data_128hz = extract_features_per_hour(ahixac_eda_df_128hz, hertz=128, window_size=0.5, verbose=True)
# data_128hz

In [None]:
# data_16hz = extract_features_per_hour(ahixac_eda_df_16hz, hertz=16, window_size=0.5, verbose=True)
# data_16hz

#### if we had a 128hz dataset with derived timestamps that increase every 0.5s such as this [0.0, 0.5, 1.0, 1.5, ..., 6506.0] then our segments would be:
```
[0.0, 0.5)
[0.5, 1.0)
[1.0, 1.5)
...
[6505.5, 6506.0)
```

#### 832830 / 64 is 13012.96875 or when "`math.ceil()`ed" is 13013

In [None]:
# math.ceil(13012.96875), math.floor(13012.96875)

In [None]:
# for feature_segments, labels in data_128hz:
#     print(labels.value_counts())

#### here in the first hour of our data the number of artifacts out of all 7200 0.5s segments is 716 or roughly 9.9% of our data, and the number of non-artifacts out of all 7200 0.5s segments is 6484 or roughly 90% of our data

#### For the second hour of our data the number of artifacts out of all 5813 0.5s segments is 208 or roughly 3.58% of our data, and the number of non-artifacts out of all 5813 0.5s segments is 5605 or roughly 96.42% of our data

In [None]:
# for feature_segments, labels in data_16hz:
#     print(labels.value_counts())

#### The 16hz data has almost the same number of artifact and non-artifact labels as the 128hz data. The counts are not exactly equal because interpolating the 128hz data down to 16hz loses some of the labels

In [None]:
# ahixac_eda_data = rejoin_data(data_128hz, data_16hz)
# ahixac_eda_data

#### concatenating calculated features from 128hz and 16hz data of the first hour

In [None]:
# ahixac_eda_data[0].columns

# Now we ought to do these for all subjects

# scanning train folder

In [None]:
# List the entries of the EDABE Train folder. `os.listdir` returns names in
# arbitrary order, so sort them to keep the subject order identical across
# runs and machines.
train_files = sorted(os.listdir('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Train/'))
train_files

# Concurrently read each .csv file and use functions that will spit out the features

In [None]:
# train_eda_data = concur_extract_features_from_all('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Train/', train_files, arch="ml")
# train_eda_data

#### Above code takes about 204 minutes or 3 hrs and 24 minutes to run

In [None]:
# # save each feature dataframe as a .csv file in the folder created earlier with the same names
# for subject_name, (feature_segments, labels) in train_eda_data:
#     feature_segments.to_csv(f'./data/Artifact Detection Data/train/{subject_name}_features.csv')
#     labels.to_csv(f'./data/Artifact Detection Data/train/{subject_name}_labels.csv')

In [None]:
# List the entries of the EDABE Test folder. `os.listdir` returns names in
# arbitrary order, so sort them to keep the subject order identical across
# runs and machines.
test_files = sorted(os.listdir('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Test/'))
test_files

In [None]:
# test_eda_data = concur_extract_features_from_all('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Test/', test_files, arch="ml")
# test_eda_data

In [None]:
# # save each feature dataframe as a .csv file in the folder created earlier with the same names
# for subject_name, (feature_segments, labels)  in test_eda_data:
#     feature_segments.to_csv(f'./data/Artifact Detection Data/test/{subject_name}_features.csv')
#     labels.to_csv(f'./data/Artifact Detection Data/test/{subject_name}_labels.csv')

# This section removes outlier datapoints, i.e. rows consisting purely of zeros, from the features generated by the cells above

In [None]:
# ahixac_features = pd.read_csv(f'./data/Artifact Detection Data/train/ahixac_expert1_features.csv', index_col=0)
# ahixac_features

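#### recall that `axis=1` evaluates across the columns of each row (one result per row), while `axis=0` evaluates down the rows of each column (one result per column)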

In [None]:
# non_zero_rows = (ahixac_features != 0).any(axis=1)
# non_zero_rows

In [None]:
# ahixac_features.index[~non_zero_rows]

In [None]:
# ahixac_features[non_zero_rows]

In [None]:
# non_zero_rows_alt = ~(ahixac_features == 0).all(axis=1)
# non_zero_rows_alt

In [None]:
# import re

# # what I want is to individually open all the feature segment files as well as their corresponding label files
# for train_subject_name in train_files:
#     train_subject_name = re.sub(r".csv", "", train_subject_name)
#     subject_features = pd.read_csv(f'./data/Artifact Detection Data/train/{train_subject_name}_features.csv', index_col=0)
#     subject_labels = pd.read_csv(f'./data/Artifact Detection Data/train/{train_subject_name}_labels.csv', index_col=0)

#     # allow modifications to the dataframe here i.e.
#     # removing rows with purely 0.0 values for every feature/column
#     # as these outliers can negatively impact the training of the 
#     # ml model
#     non_zero_rows = (subject_features != 0).any(axis=1)

#     # keep only the rows that are non zero rows
#     # this goes also for rows in the subjects labels
#     subject_features[non_zero_rows].to_csv(f'./data/Artifact Detection Data/train/{train_subject_name}_features.csv')
#     subject_labels[non_zero_rows].to_csv(f'./data/Artifact Detection Data/train/{train_subject_name}_labels.csv')

# This section attempts to use lstm feature extractor model to convert eda signals to lstm features that a scikit learn svm can use as input. This will implement high level feature engineering for the hybrid lstm-svm model

In [None]:
# Load the subjects' signals, labels and names, plus the subject -> id
# mapping, using the "cueva" feature configuration.
(
    train_subjects_signals,
    train_subjects_labels,
    train_subjects_names,
    train_subject_to_id,
) = concur_load_data(feat_config="cueva")

In [None]:
# how many per-subject signal arrays the training load returned
len(train_subjects_signals)

In [None]:
# display the labels that were returned alongside the training signals
train_subjects_labels

In [None]:
# display the training subject names; they are used later to name the saved feature files
train_subjects_names

In [None]:
# Same load as for the training subjects, but for the "test" data split.
(
    test_subjects_signals,
    test_subjects_labels,
    test_subjects_names,
    test_subject_to_id,
) = concur_load_data(feat_config="cueva", data_split="test")

In [None]:
# how many per-subject signal arrays the test load returned
len(test_subjects_signals)

In [None]:
# `tf`, `LSTM_FE` and `load_meta_data` are already imported in the imports
# cell at the top of the notebook, so they are not re-imported here.

# Read the saved meta data of the LSTM feature extractor (presumably the
# hyperparameters it was trained with - confirm), build the model from it,
# then restore the saved weights into the model.
lstm_fe_hp = load_meta_data('./saved/misc/cueva_lstm-fe_meta_data.json')
lstm_fe = LSTM_FE(**lstm_fe_hp)
lstm_fe.load_weights('./saved/weights/cueva_lstm-fe_21_0.7489.weights.h5')

In [None]:
# print the summary of the restored LSTM feature extractor model
lstm_fe.summary()

In [None]:
# fetch the layer named 'lstm-layer-2' and display its output tensor;
# that output becomes the output of the truncated model built further down
lstm_layer_2 = lstm_fe.get_layer('lstm-layer-2')
lstm_layer_2.output

In [None]:
# display the input tensors of the restored model
lstm_fe.inputs

In [None]:
# Truncated model: same inputs as the restored LSTM model, but it stops at
# the output of the 'lstm-layer-2' layer instead of the original output.
lstm_fe_main = tf.keras.Model(
    inputs=lstm_fe.inputs,
    outputs=lstm_layer_2.output,
)
lstm_fe_main

In [None]:
# print the summary of the truncated model that ends at 'lstm-layer-2'
lstm_fe_main.summary()

# A subject's signals, e.g. `subjects_signals[0]`, previously had a shape of (10701, 640, 1); after feature extraction the shape becomes (10701, 32), since the number of output units of the LSTM was set to 32

In [None]:
# folder receiving one `<subject>_hof.csv` file per training subject
train_hof_dir = './data/Hybrid Artifact Detection Data/train'

# pandas' `to_csv` does not create missing parent folders, so make sure
# the output folder exists before the prediction loop starts
os.makedirs(train_hof_dir, exist_ok=True)

for index, train_subject_name in enumerate(train_subjects_names):
    # run the subject's signals through the truncated LSTM model; its
    # output is used as the subject's higher order features (HOF)
    print(f'subject: {train_subject_name}')
    print(f'initial shape: {train_subjects_signals[index].shape}')
    train_subject_hof = lstm_fe_main.predict(train_subjects_signals[index])
    print(f'output shape: {train_subject_hof.shape}')

    # one column name per feature: HOF_1 ... HOF_n, where n is the number
    # of columns of the higher order features matrix
    columns = [f'HOF_{i}' for i in range(1, train_subject_hof.shape[1] + 1)]
    train_subject_hof_df = pd.DataFrame(train_subject_hof, columns=columns)

    # save the lstm features only; no labels are written by this loop
    train_subject_hof_df.to_csv(f'{train_hof_dir}/{train_subject_name}_hof.csv')

In [None]:
# folder receiving one `<subject>_hof.csv` file per test subject
test_hof_dir = './data/Hybrid Artifact Detection Data/test'

# pandas' `to_csv` does not create missing parent folders, so make sure
# the output folder exists before the prediction loop starts
os.makedirs(test_hof_dir, exist_ok=True)

for index, test_subject_name in enumerate(test_subjects_names):
    # run the subject's signals through the truncated, already trained LSTM
    # model; its output is used as the subject's higher order features (HOF)
    print(f'subject: {test_subject_name}')
    print(f'initial shape: {test_subjects_signals[index].shape}')
    test_subject_hof = lstm_fe_main.predict(test_subjects_signals[index])
    print(f'output shape: {test_subject_hof.shape}')

    # one column name per feature: HOF_1 ... HOF_n, where n is the number
    # of columns of the higher order features matrix
    columns = [f'HOF_{i}' for i in range(1, test_subject_hof.shape[1] + 1)]
    test_subject_hof_df = pd.DataFrame(test_subject_hof, columns=columns)

    # save the lstm features only; no labels are written by this loop
    test_subject_hof_df.to_csv(f'{test_hof_dir}/{test_subject_name}_hof.csv')

# this next section will implement low level feature engineering for hybrid lstm-svm model

In [None]:
# train_files[:1]

In [None]:
# train_hybrid_eda_data = concur_extract_features_from_all('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Train/', train_files, arch="hybrid")
# train_hybrid_eda_data

In [None]:
# # save each feature dataframe as a .csv file in the folder created earlier with the same names
# for subject_name, (feature_segments, labels) in train_hybrid_eda_data:
#     feature_segments.to_csv(f'./data/Hybrid Artifact Detection Data/train/{subject_name}_lof.csv')
#     labels.to_csv(f'./data/Hybrid Artifact Detection Data/train/{subject_name}_labels.csv')

In [None]:
# test_hybrid_eda_data = concur_extract_features_from_all('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Test/', test_files, arch="hybrid")
# test_hybrid_eda_data

In [None]:
# # save each feature dataframe as a .csv file in the folder created earlier with the same names
# for subject_name, (feature_segments, labels) in test_hybrid_eda_data:
#     feature_segments.to_csv(f'./data/Hybrid Artifact Detection Data/test/{subject_name}_lof.csv')
#     labels.to_csv(f'./data/Hybrid Artifact Detection Data/test/{subject_name}_labels.csv')

In [None]:
# # once notebook reaches end remove data to clear space
# os.remove('./data/EDABE dataset.zip')
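# shutil.rmtree('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)')  # this path is a directory: os.remove only deletes files, so use shutil.rmtree (requires `import shutil`)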