# This notebook will just attempt to correct some columns of the generated dataframes from the feature extractor functions

In [1]:
import datetime
import math
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
import requests
import re
import tensorflow as tf

from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# import and load model architectures as well as decoder
from models.cueva import LSTM_FE
from models.llanes_jurado import LSTM_CNN
from utilities.preprocessors import correct_signals
from utilities.loaders import load_meta_data, save_meta_data, concur_load_data, charge_raw_data, _combine_data, load_model, load_lookup_array

from utilities.visualizers import (
    view_time_frame,
    view_wavelet_coeffs,
    analyze,
    data_split_metric_values,
    view_value_frequency,
    multi_class_heatmap,
    view_metric_values,
    view_classified_labels,
    view_label_freq,
    disp_cat_feat,
    plot_all_features,
    describe_col,
    ModelResults,
    view_all_splits_results)

from utilities.feature_extractors import (
    concur_extract_features_from_all,
    extract_features,
    extract_features_hybrid,
    extract_features_per_hour)

%load_ext autoreload
%autoreload 2

In [2]:
train_files = os.listdir('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Train/')
train_files

['ahixac_expert1.csv',
 'akakip_expert2.csv',
 'aqamom_expert2.csv',
 'aretez_expert1.csv',
 'asifex_expert2.csv',
 'axeyoh_expert2.csv',
 'efawep_expert2.csv',
 'egemow_expert2.csv',
 'ejofeq_expert2.csv',
 'erecij_expert1.csv',
 'esirur_expert1.csv',
 'ewehov_expert2.csv',
 'exozef_expert2.csv',
 'idagah_expert2.csv',
 'ihikay_expert1.csv',
 'ihinot_expert1.csv',
 'imocac_expert2.csv',
 'iqiyat_expert2.csv',
 'obujoh_expert2.csv',
 'ohayeh_expert1.csv',
 'ohufow_expert1.csv',
 'ojotew_expert1.csv',
 'onivuk_expert1.csv',
 'opunad_expert1.csv',
 'otecab_expert2.csv',
 'otuqom_expert1.csv',
 'owegud_expert2.csv',
 'oxisux_expert1.csv',
 'tchgij_expert2.csv',
 'ufoyek_expert2.csv',
 'uqozew_expert1.csv',
 'urogif_expert1.csv',
 'uzefow_expert1.csv']

In [3]:
test_files = os.listdir('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Test/')
test_files

['afegip_expert1.csv',
 'ajeric_expert2.csv',
 'ekamis_expert2.csv',
 'iguted_expert1.csv',
 'inefoh_expert1.csv',
 'otafeh_expert1.csv',
 'oxused_expert2.csv',
 'pqbqpr_expert2.csv',
 'uhepah_expert1.csv',
 'ukudab_expert2.csv']

In [4]:
train_subjects_names = list(set([re.sub(r".csv", "", file) for file in train_files]))
train_subjects_names

['uqozew_expert1',
 'ihikay_expert1',
 'ewehov_expert2',
 'ahixac_expert1',
 'ihinot_expert1',
 'efawep_expert2',
 'ohayeh_expert1',
 'imocac_expert2',
 'tchgij_expert2',
 'onivuk_expert1',
 'otecab_expert2',
 'otuqom_expert1',
 'opunad_expert1',
 'akakip_expert2',
 'uzefow_expert1',
 'esirur_expert1',
 'asifex_expert2',
 'exozef_expert2',
 'aqamom_expert2',
 'egemow_expert2',
 'iqiyat_expert2',
 'ufoyek_expert2',
 'obujoh_expert2',
 'urogif_expert1',
 'oxisux_expert1',
 'ohufow_expert1',
 'erecij_expert1',
 'owegud_expert2',
 'ejofeq_expert2',
 'ojotew_expert1',
 'aretez_expert1',
 'axeyoh_expert2',
 'idagah_expert2']

In [5]:
test_subjects_names = list(set([re.sub(r".csv", "", file) for file in test_files]))
test_subjects_names

['otafeh_expert1',
 'ajeric_expert2',
 'ekamis_expert2',
 'ukudab_expert2',
 'uhepah_expert1',
 'iguted_expert1',
 'pqbqpr_expert2',
 'oxused_expert2',
 'inefoh_expert1',
 'afegip_expert1']

In [6]:
all_models_results = load_meta_data('./results/all_models_results.json')
all_models_results

{'cueva_second_phase-1-5-weighted-svm': {'selected_feats': ['HOF_2',
   'HOF_5',
   'HOF_7',
   'HOF_9',
   'HOF_10',
   'HOF_12',
   'HOF_13',
   'HOF_17',
   'HOF_18',
   'HOF_19',
   'HOF_20',
   'HOF_21',
   'HOF_22',
   'HOF_26',
   'HOF_32',
   'raw_128hz_std',
   'raw_128hz_range',
   'raw_128hz_shannon_entropy',
   'raw_128hz_1d_shannon_entropy',
   'filt_128hz_min',
   'filt_128hz_1d_range',
   'filt_128hz_1d_max_abs',
   'ar_coeff_1_128hz',
   'ar_coeff_2_128hz',
   'raw_128hz_skewness',
   'raw_128hz_kurt',
   'third_16thofa_sec_max',
   'first_16thofa_sec_mean',
   'second_16thofa_sec_mean',
   'third_16thofa_sec_std',
   'first_16thofa_sec_median',
   'second_16thofa_sec_median',
   'third_16thofa_sec_range',
   'second_32thofa_sec_max',
   'first_32thofa_sec_mean',
   'second_32thofa_sec_mean',
   'first_32thofa_sec_std',
   'second_32thofa_sec_std',
   'first_32thofa_sec_median',
   'second_32thofa_sec_median',
   'first_32thofa_sec_range',
   'raw_16hz_min',
   'raw_16h

In [7]:
models = {
    'cueva_second_phase-svm':{

    },
}

In [8]:
def load_miscs():
    """
    loads miscellaneous variables to be used by the model
    """

    global models

    print('loading miscellaneous...')
    cueva_second_phase_svm_red_feats = load_lookup_array(f'./data/Artifact Detection Data/reduced_cueva_second_phase_svm_feature_set.txt')
    models['cueva_second_phase-svm']['selected_feats'] = cueva_second_phase_svm_red_feats

    print('miscellaneous loaded.')

def load_preprocessors():
    """
    prepares and loads the saved encoders, normalizers of
    the dataset to later transform raw user input from
    client-side
    """
    global models

    print('loading preprocessors...')

    # pre-load here scaler of hossain used during training
    cueva_second_phase_svm_scaler = load_model('./saved/misc/cueva_second_phase_svm_scaler.pkl')

    models['cueva_second_phase-svm']['scaler'] = cueva_second_phase_svm_scaler

    print('preprocessors loaded.')

def load_models():
    """
    prepares and loads sample input and custom model in
    order to use trained weights/parameters/coefficients
    """
    global models
    
    print('loading models...')
    cueva_second_phase_svm = load_model('./saved/models/cueva_second_phase_svm_clf.pkl')
    models['cueva_second_phase-svm']['model'] = cueva_second_phase_svm

    print('models loaded.')

In [9]:
load_miscs()
load_preprocessors()
load_models()

loading miscellaneous...
miscellaneous loaded.
loading preprocessors...
preprocessors loaded.
loading models...
models loaded.


In [10]:
models

{'cueva_second_phase-svm': {'selected_feats': ['HOF_2',
   'HOF_7',
   'HOF_8',
   'HOF_9',
   'HOF_10',
   'HOF_12',
   'HOF_14',
   'HOF_16',
   'HOF_17',
   'HOF_18',
   'HOF_19',
   'HOF_20',
   'HOF_21',
   'HOF_22',
   'HOF_26',
   'HOF_27',
   'HOF_32',
   'raw_128hz_min',
   'raw_128hz_amp',
   'raw_128hz_std',
   'raw_128hz_range',
   'raw_128hz_shannon_entropy',
   'raw_128hz_1d_min',
   'raw_128hz_1d_range',
   'raw_128hz_1d_shannon_entropy',
   'filt_128hz_median',
   'filt_128hz_1d_min',
   'ar_coeff_1_128hz',
   'ar_coeff_2_128hz',
   'raw_128hz_skewness',
   'third_16thofa_sec_max',
   'first_16thofa_sec_mean',
   'second_16thofa_sec_std',
   'third_16thofa_sec_std',
   'first_16thofa_sec_median',
   'second_16thofa_sec_median',
   'third_16thofa_sec_range',
   'first_32thofa_sec_mean',
   'second_32thofa_sec_mean',
   'first_32thofa_sec_std',
   'first_32thofa_sec_median',
   'second_32thofa_sec_median',
   'second_32thofa_sec_range',
   'raw_16hz_min',
   'raw_16hz_ran

In [11]:
data_splits = ["test", "train"]

In [12]:
for model_name in ["cueva_second_phase-svm"]:
    selector_config, estimator_name = model_name.split('-', 1)
    
    for data_split in data_splits:
        loader_args = {
            'feat_config': selector_config, 
            'data_split': data_split,
            'exc_lof': False
        } if selector_config == 'cueva_second_phase' else {
            'feat_config': selector_config, 
            'data_split': data_split,
            'exc_lof': False
        }

        subjects_features, subjects_labels, subjects_names, subject_to_id = concur_load_data(**loader_args)
        print(f'selector config: {selector_config}')
        print(f'estimator name: {estimator_name}')
        # print(f'subjects features shape: {subjects_features.shape}')
        # print(f'subjects labels shape: {subjects_labels.shape}\n')
        
        # loop through each generated features dataframes from test subjects signals and feed repeatedly to a trained ml models
        for index, subject_name in enumerate(subjects_names):
            # print(f'subject features columns: {subjects_features[subjects_features['subject_id'] == index].columns}')
            print(f'subject: {subject_name}')
            # once features are extracted features selected during
            # tuning will be used in testing as done also during training

            selected_feats = models[model_name]['selected_feats']

            if loader_args.get('exc_lof') == None or loader_args.get('exc_lof') == False:
                subject_features = subjects_features.loc[subjects_features['subject_id'] == index, selected_feats]
                subject_labels = subjects_labels[subjects_labels['subject_id'] == index].drop(columns=['subject_id'])
                print(f'subject features shape: {subject_features.shape}')
                print(f'subject labels shape: {subject_labels.shape}\n')

            # this will only fire if exc_lof is not None or is false
            else:
                # if user excludes lower order features, higher order features will only be loaded
                subject_features = subjects_features[subjects_features['subject_id'] == index].drop(columns=['subject_id'])
                subject_labels = subjects_labels[subjects_labels['subject_id'] == index].drop(columns=['subject_id'])
                
                print(f'subject features shape: {subject_features.shape}')
                print(f'subject labels shape: {subject_labels.shape}\n')

            # convert features and labels into numpy matrices
            X = subject_features.to_numpy()
            Y = subject_labels.to_numpy().ravel()

            # if hossain is the researcher chosen the scaler used during training
            # will be used to scale the test subject features
            if selector_config == "hossain" or selector_config == "cueva_second_phase-svm":    
                scaler = models[model_name]['scaler']
                X = scaler.transform(X)

            model = models[model_name]['model']
            Y_pred = model.predict(X)
            Y_pred_prob = model.predict_proba(X)
            print(f"predicted Y: {Y_pred}")
            print(f"unique values and counts: {np.unique(Y_pred, return_counts=True)}")
            print(f"true Y: {Y}")
            print(f"unique values and counts: {np.unique(Y, return_counts=True)}")

            # compute performance metric values for test subject
            acc = accuracy_score(y_true=Y, y_pred=Y_pred)
            prec = precision_score(y_true=Y, y_pred=Y_pred, average="weighted")
            rec = recall_score(y_true=Y, y_pred=Y_pred, average="weighted")
            f1 = f1_score(y_true=Y, y_pred=Y_pred, average="weighted")
            roc_auc = roc_auc_score(y_true=Y, y_score=Y_pred_prob[:, 1], average="weighted", multi_class="ovo")
            conf_matrix = confusion_matrix(Y, Y_pred).tolist()
            true_neg = conf_matrix[0][0]
            false_pos = conf_matrix[0][1]
            false_neg = conf_matrix[1][0]
            true_pos = conf_matrix[1][1]
            tpr = true_pos / (true_pos + false_neg)
            tnr = true_neg / (true_neg + false_pos)
            fpr = false_pos / (false_pos + true_neg)
            fnr = false_neg / (false_neg + true_pos)

            print(f"{data_split} acc: {acc} \
                \n{data_split} prec: {prec} \
                \n{data_split} rec: {rec} \
                \n{data_split} f1: {f1} \
                \n{data_split} roc_auc: {roc_auc} \
                \n{data_split} conf_matrix: {conf_matrix} \
                \n{data_split} tpr: {tpr} \
                \n{data_split} tnr: {tnr} \
                \n{data_split} fpr: {fpr} \
                \n{data_split} fnr: {fnr}")
            
            results = models[model_name].get(f'{data_split}_results', [])
            results.append(
                (subject_name, {
                    f'{data_split}_acc': acc,
                    f'{data_split}_prec': prec, 
                    f'{data_split}_rec': rec,
                    f'{data_split}_f1': f1,
                    f'{data_split}_roc_auc': roc_auc,
                    f'{data_split}_conf_matrix': conf_matrix,
                    f'{data_split}_tpr': tpr,
                    f'{data_split}_tnr': tnr,
                    f'{data_split}_fpr': fpr,
                    f'{data_split}_fnr': fnr,
                })
            )
            models[f'{model_name}'][f'{data_split}_results'] = results

subjects features, labels, names and subject to id lookup loaded
selector config: cueva_second_phase
estimator name: svm
subject: otafeh_expert1
subject features shape: (11912, 60)
subject labels shape: (11912, 1)

predicted Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0], dtype=int64), array([11912], dtype=int64))
true Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0, 1], dtype=int64), array([11311,   601], dtype=int64))
test acc: 0.9495466756212223                 
test prec: 0.9016388891833147                 
test rec: 0.9495466756212223                 
test f1: 0.9249728672395165                 
test roc_auc: 0.7356679132751223                 
test conf_matrix: [[11311, 0], [601, 0]]                 
test tpr: 0.0                 
test tnr: 1.0                 
test fpr: 0.0                 
test fnr: 1.0
subject: ajeric_expert2
subject features shape: (12761, 60)
subject labels shape: (12761, 1)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


predicted Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0], dtype=int64), array([12761], dtype=int64))
true Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0, 1], dtype=int64), array([11360,  1401], dtype=int64))
test acc: 0.8902123658020531                 
test prec: 0.7924780562268885                 
test rec: 0.8902123658020531                 
test f1: 0.8385069006684069                 
test roc_auc: 0.7859117858471314                 
test conf_matrix: [[11360, 0], [1401, 0]]                 
test tpr: 0.0                 
test tnr: 1.0                 
test fpr: 0.0                 
test fnr: 1.0
subject: ekamis_expert2
subject features shape: (13609, 60)
subject labels shape: (13609, 1)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


predicted Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0, 1], dtype=int64), array([13607,     2], dtype=int64))
true Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0, 1], dtype=int64), array([12418,  1191], dtype=int64))
test acc: 0.9123374237636858                 
test prec: 0.8326160158960425                 
test rec: 0.9123374237636858                 
test f1: 0.8706556102438002                 
test roc_auc: 0.7300089426266873                 
test conf_matrix: [[12416, 2], [1191, 0]]                 
test tpr: 0.0                 
test tnr: 0.9998389434691577                 
test fpr: 0.00016105653084232566                 
test fnr: 1.0
subject: ukudab_expert2
subject features shape: (14287, 60)
subject labels shape: (14287, 1)

predicted Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0], dtype=int64), array([14287], dtype=int64))
true Y: [0 0 0 ... 1 1 1]
unique values and counts: (array([0, 1], dtype=int64), array([12703,  1584], dtype=int64))
test

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


predicted Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0], dtype=int64), array([12034], dtype=int64))
true Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0, 1], dtype=int64), array([11098,   936], dtype=int64))
test acc: 0.9222203756024597                 
test prec: 0.8504904211763419                 
test rec: 0.9222203756024597                 
test f1: 0.8849041784917946                 
test roc_auc: 0.8730398986188318                 
test conf_matrix: [[11098, 0], [936, 0]]                 
test tpr: 0.0                 
test tnr: 1.0                 
test fpr: 0.0                 
test fnr: 1.0
subject: iguted_expert1
subject features shape: (13125, 60)
subject labels shape: (13125, 1)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


predicted Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0], dtype=int64), array([13125], dtype=int64))
true Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0, 1], dtype=int64), array([11645,  1480], dtype=int64))
test acc: 0.8872380952380953                 
test prec: 0.7871914376417235                 
test rec: 0.8872380952380953                 
test f1: 0.8342258876905627                 
test roc_auc: 0.6910428440462789                 
test conf_matrix: [[11645, 0], [1480, 0]]                 
test tpr: 0.0                 
test tnr: 1.0                 
test fpr: 0.0                 
test fnr: 1.0
subject: pqbqpr_expert2
subject features shape: (15305, 60)
subject labels shape: (15305, 1)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


predicted Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0], dtype=int64), array([15305], dtype=int64))
true Y: [0 0 0 ... 0 0 0]
unique values and counts: (array([0, 1], dtype=int64), array([14314,   991], dtype=int64))
test acc: 0.935249918327344                 
test prec: 0.8746924097313036                 
test rec: 0.935249918327344                 
test f1: 0.903958089803005                 
test roc_auc: 0.6377581973968031                 
test conf_matrix: [[14314, 0], [991, 0]]                 
test tpr: 0.0                 
test tnr: 1.0                 
test fpr: 0.0                 
test fnr: 1.0
subject: oxused_expert2
subject features shape: (11944, 60)
subject labels shape: (11944, 1)



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 

In [None]:
copy = models.copy()

In [None]:
for key, value in copy.items():
    if copy[key].get('model') is not None:
        del copy[key]['model']

    if copy[key].get('scaler') is not None:
        del copy[key]['scaler']

In [None]:
copy

In [None]:
all_models_results

In [None]:
all_models_results['cueva_second_phase-svm'] = copy['cueva_second_phase-svm']
all_models_results

In [None]:
save_meta_data('./results/all_models_results.json', all_models_results)

In [83]:
# model_names = ['cueva_second_phase-1-5-weighted-svm',
#     'cueva_second_phase-1-9-weighted-svm',
#     'cueva_second_phase-1-2-weighted-svm',
#     'taylor-lr',
#     'taylor-rf',
#     'taylor-svm',
#     'hossain-lr',
#     'hossain-gbt',
#     'hossain-svm',
#     'jurado-lstm-cnn'
# ]
# data_splits = ["test", "train"]

In [84]:
# for model_name in model_names:
#     selector_config, estimator_name = model_name.split('-', 1)
    
#     for data_split in data_splits:
        
#         # # loop through each generated features dataframes from test subjects signals and feed repeatedly to a trained ml models
#         # subjects_names = train_subjects_names if data_split == "train" else test_subjects_names
#         # for index, subject_name in enumerate(subjects_names):
#         #     # print(f'subject features columns: {subjects_features[subjects_features['subject_id'] == index].columns}')
#         #     print(f'subject: {subject_name}')
#         for i, result in enumerate(results[model_name][f'{data_split}_results']):
#             print(result[0])
#             print(results[model_name][f'{data_split}_results'][i][0])

#             conf_matrix = result[1][f'{data_split}_conf_matrix']
#             true_neg = conf_matrix[0][0]
#             false_pos = conf_matrix[0][1]
#             false_neg = conf_matrix[1][0]
#             true_pos = conf_matrix[1][1]
#             tpr = true_pos / (true_pos + false_neg)
#             tnr = true_neg / (true_neg + false_pos)
#             fpr = false_pos / (false_pos + true_neg)
#             fnr = false_neg / (false_neg + true_pos)

#             print(f"{data_split} tpr: {tpr} \
#                 \n{data_split} tnr: {tnr} \
#                 \n{data_split} fpr: {fpr} \
#                 \n{data_split} fnr: {fnr}")
            
#             results[model_name][f'{data_split}_results'][i][1][f'{data_split}_tpr'] = tpr
#             results[model_name][f'{data_split}_results'][i][1][f'{data_split}_tnr'] = tnr
#             results[model_name][f'{data_split}_results'][i][1][f'{data_split}_fpr'] = fpr
#             results[model_name][f'{data_split}_results'][i][1][f'{data_split}_fnr'] = fnr

In [85]:
# results

In [86]:
# save_meta_data('./results/all_models_results.json', results)

In [87]:
# ahixac_lof = pd.read_csv(f'./data/Hybrid Artifact Detection Data/train/ahixac_expert1_lof.csv', index_col=0)
# ahixac_lof

In [88]:
# ahixac_lof['raw_128hz_skewness.1']

In [89]:
# ahixac_lof['filt_16hz_skewness.1']

In [90]:
# ren_ahixac_lof = ahixac_lof.rename(columns={
#         'raw_128hz_skewness.1': 'raw_128hz_kurt',
#         'filt_128hz_skewness.1': 'filt_128hz_kurt',
#         'raw_16hz_skewness.1': 'raw_16hz_kurt',
#         'filt_16hz_skewness.1': 'filt_16hz_kurt',
# })

In [91]:
# ren_ahixac_lof['raw_128hz_kurt']

In [92]:
# for index, train_subject_name in enumerate(train_subjects_names):
#     print(f'subject: {train_subject_name}')

#     # save both lstm features and lstm labels
#     train_subject_lof = pd.read_csv(f'./data/Hybrid Artifact Detection Data/train/{train_subject_name}_lof.csv', index_col=0)
#     train_subject_lof.rename(columns={
#         'raw_128hz_skewness.1': 'raw_128hz_kurt',
#         'filt_128hz_skewness.1': 'filt_128hz_kurt',
#         'raw_16hz_skewness.1': 'raw_16hz_kurt',
#         'filt_16hz_skewness.1': 'filt_16hz_kurt',
#     }, inplace=True)
#     train_subject_lof.to_csv(f'./data/Hybrid Artifact Detection Data/train/{train_subject_name}_lof.csv')

In [93]:
# for index, test_subject_name in enumerate(test_subjects_names):
#     print(f'subject: {test_subject_name}')

#     # save both lstm features and lstm labels
#     test_subject_lof = pd.read_csv(f'./data/Hybrid Artifact Detection Data/test/{test_subject_name}_lof.csv', index_col=0)
#     test_subject_lof.rename(columns={
#         'raw_128hz_skewness.1': 'raw_128hz_kurt',
#         'filt_128hz_skewness.1': 'filt_128hz_kurt',
#         'raw_16hz_skewness.1': 'raw_16hz_kurt',
#         'filt_16hz_skewness.1': 'filt_16hz_kurt',
#     }, inplace=True)
#     test_subject_lof.to_csv(f'./data/Hybrid Artifact Detection Data/test/{test_subject_name}_lof.csv')