# This notebook corrects some columns of the dataframes generated by the feature extractor functions

In [1]:
import datetime
import math
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile
import requests
import re
import tensorflow as tf

from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# import and load model architectures as well as decoder
from models.cueva import LSTM_FE
from models.llanes_jurado import LSTM_CNN
from utilities.preprocessors import correct_signals
from utilities.loaders import load_meta_data, save_meta_data, concur_load_data, charge_raw_data, _combine_data, load_model, load_lookup_array

from utilities.visualizers import (
    view_time_frame,
    view_wavelet_coeffs,
    analyze,
    data_split_metric_values,
    view_value_frequency,
    multi_class_heatmap,
    view_metric_values,
    view_classified_labels,
    view_label_freq,
    disp_cat_feat,
    plot_all_features,
    describe_col,
    ModelResults,
    view_all_splits_results)

from utilities.feature_extractors import (
    concur_extract_features_from_all,
    extract_features,
    extract_features_hybrid,
    extract_features_per_hour)

%load_ext autoreload
%autoreload 2

In [2]:
# Read the saved results CSV for 'inefoh_expert1'; the file's first column
# becomes the dataframe index. The bare name on the last line displays the frame.
res_test_df = pd.read_csv(
    filepath_or_buffer='./results/inefoh_expert1.csv',
    index_col=0,
)
res_test_df

Unnamed: 0,time,raw_signal,clean_signal,label,auto_signal,pred_art,post_proc_pred_art,new_signal,stress_level
0,0.000000,0.000000,-0.164015,0.0,0.000000,0.0,0.0,0.000000,2.0
1,0.007812,0.000000,-0.164015,0.0,0.000000,0.0,0.0,0.000000,2.0
2,0.015625,0.000000,-0.164015,0.0,0.000000,0.0,0.0,0.000000,2.0
3,0.023438,0.000000,-0.164015,0.0,0.000000,0.0,0.0,0.000000,2.0
4,0.031250,0.000000,-0.164015,0.0,0.000000,0.0,0.0,0.000000,2.0
...,...,...,...,...,...,...,...,...,...
856081,6688.132812,0.000222,0.000222,0.0,0.000222,0.0,0.0,0.000222,0.0
856082,6688.140625,0.000222,0.000222,0.0,0.000222,0.0,0.0,0.000222,0.0
856083,6688.148438,0.000222,0.000222,0.0,0.000222,0.0,0.0,0.000222,0.0
856084,6688.156250,0.000222,0.000222,0.0,0.000222,0.0,0.0,0.000222,0.0


In [3]:
# Tally how many rows carry each distinct value of the 'stress_level' column
res_test_df.loc[:, 'stress_level'].value_counts()

stress_level
2.0    540160
1.0    280320
0.0     35606
Name: count, dtype: int64

In [None]:
# 0 is baseline level of stress, 1 is medium level of stress, and 2 is high level of stress

In [None]:
# List the files in the EDABE 'Train' directory.
# os.listdir returns entries in arbitrary (platform/filesystem dependent) order,
# so the listing is sorted to keep this cell's output reproducible across runs.
train_files = sorted(os.listdir('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Train/'))
train_files

In [None]:
# List the files in the EDABE 'Test' directory.
# os.listdir returns entries in arbitrary (platform/filesystem dependent) order,
# so the listing is sorted to keep this cell's output reproducible across runs.
test_files = sorted(os.listdir('./data/Electrodermal Activity artifact correction BEnchmark (EDABE)/Test/'))
test_files

In [None]:
# Derive the unique training subject names by stripping the ".csv" extension
# from each file name.
# - The dot is escaped and the pattern is anchored with `$` so only a literal
#   trailing ".csv" is removed; an unescaped, unanchored ".csv" would also delete
#   any character followed by "csv" in the middle of a name (e.g. "xcsv").
# - A set removes duplicates, and sorting makes the resulting list's order
#   deterministic (plain set iteration order of strings varies between runs).
train_subjects_names = sorted({re.sub(r"\.csv$", "", file) for file in train_files})
train_subjects_names

In [None]:
# Derive the unique test subject names by stripping the ".csv" extension
# from each file name.
# - The dot is escaped and the pattern is anchored with `$` so only a literal
#   trailing ".csv" is removed; an unescaped, unanchored ".csv" would also delete
#   any character followed by "csv" in the middle of a name (e.g. "xcsv").
# - A set removes duplicates, and sorting makes the resulting list's order
#   deterministic (plain set iteration order of strings varies between runs).
test_subjects_names = sorted({re.sub(r"\.csv$", "", file) for file in test_files})
test_subjects_names

In [None]:
# Load the saved results of all models through the project's load_meta_data
# helper, then display what was loaded.
all_models_results_path = './results/all_models_results.json'
all_models_results = load_meta_data(all_models_results_path)
all_models_results

In [7]:
# models = {
#     'cueva_second_phase-svm':{

#     },
# }

In [8]:
# def load_miscs():
#     """
#     loads miscellaneous variables to be used by the model
#     """

#     global models

#     print('loading miscellaneous...')
#     cueva_second_phase_svm_red_feats = load_lookup_array(f'./data/Artifact Detection Data/reduced_cueva_second_phase_svm_feature_set.txt')
#     models['cueva_second_phase-svm']['selected_feats'] = cueva_second_phase_svm_red_feats

#     print('miscellaneous loaded.')

# def load_preprocessors():
#     """
#     prepares and loads the saved encoders, normalizers of
#     the dataset to later transform raw user input from
#     client-side
#     """
#     global models

#     print('loading preprocessors...')

#     # pre-load here scaler of hossain used during training
#     cueva_second_phase_svm_scaler = load_model('./saved/misc/cueva_second_phase_svm_scaler.pkl')

#     models['cueva_second_phase-svm']['scaler'] = cueva_second_phase_svm_scaler

#     print('preprocessors loaded.')

# def load_models():
#     """
#     prepares and loads sample input and custom model in
#     order to use trained weights/parameters/coefficients
#     """
#     global models
    
#     print('loading models...')
#     cueva_second_phase_svm = load_model('./saved/models/cueva_second_phase_svm_clf.pkl')
#     models['cueva_second_phase-svm']['model'] = cueva_second_phase_svm

#     print('models loaded.')

In [None]:
# load_miscs()
# load_preprocessors()
# load_models()

In [None]:
# models

In [11]:
# data_splits = ["test", "train"]

In [None]:
# for model_name in ["cueva_second_phase-svm"]:
#     selector_config, estimator_name = model_name.split('-', 1)
    
#     for data_split in data_splits:
#         loader_args = {
#             'feat_config': selector_config, 
#             'data_split': data_split,
#             'exc_lof': False
#         } if selector_config == 'cueva_second_phase' else {
#             'feat_config': selector_config, 
#             'data_split': data_split,
#             'exc_lof': False
#         }

#         subjects_features, subjects_labels, subjects_names, subject_to_id = concur_load_data(**loader_args)
#         print(f'selector config: {selector_config}')
#         print(f'estimator name: {estimator_name}')
#         # print(f'subjects features shape: {subjects_features.shape}')
#         # print(f'subjects labels shape: {subjects_labels.shape}\n')
        
#         # loop through each generated features dataframes from test subjects signals and feed repeatedly to a trained ml models
#         for index, subject_name in enumerate(subjects_names):
#             # print(f'subject features columns: {subjects_features[subjects_features['subject_id'] == index].columns}')
#             print(f'subject: {subject_name}')
#             # once features are extracted features selected during
#             # tuning will be used in testing as done also during training

#             selected_feats = models[model_name]['selected_feats']

#             if loader_args.get('exc_lof') == None or loader_args.get('exc_lof') == False:
#                 subject_features = subjects_features.loc[subjects_features['subject_id'] == index, selected_feats]
#                 subject_labels = subjects_labels[subjects_labels['subject_id'] == index].drop(columns=['subject_id'])
#                 print(f'subject features shape: {subject_features.shape}')
#                 print(f'subject labels shape: {subject_labels.shape}\n')

#             # this will only fire if exc_lof is neither None nor False (e.g. it is set to True)
#             else:
#                 # if user excludes lower order features, higher order features will only be loaded
#                 subject_features = subjects_features[subjects_features['subject_id'] == index].drop(columns=['subject_id'])
#                 subject_labels = subjects_labels[subjects_labels['subject_id'] == index].drop(columns=['subject_id'])
                
#                 print(f'subject features shape: {subject_features.shape}')
#                 print(f'subject labels shape: {subject_labels.shape}\n')

#             # convert features and labels into numpy matrices
#             X = subject_features.to_numpy()
#             Y = subject_labels.to_numpy().ravel()

#             # if hossain is the researcher chosen the scaler used during training
#             # will be used to scale the test subject features
#             if selector_config == "hossain" or selector_config == "cueva_second_phase-svm":    
#                 scaler = models[model_name]['scaler']
#                 X = scaler.transform(X)

#             model = models[model_name]['model']
#             Y_pred = model.predict(X)
#             Y_pred_prob = model.predict_proba(X)
#             print(f"predicted Y: {Y_pred}")
#             print(f"unique values and counts: {np.unique(Y_pred, return_counts=True)}")
#             print(f"true Y: {Y}")
#             print(f"unique values and counts: {np.unique(Y, return_counts=True)}")

#             # compute performance metric values for test subject
#             acc = accuracy_score(y_true=Y, y_pred=Y_pred)
#             prec = precision_score(y_true=Y, y_pred=Y_pred, average="weighted")
#             rec = recall_score(y_true=Y, y_pred=Y_pred, average="weighted")
#             f1 = f1_score(y_true=Y, y_pred=Y_pred, average="weighted")
#             roc_auc = roc_auc_score(y_true=Y, y_score=Y_pred_prob[:, 1], average="weighted", multi_class="ovo")
#             conf_matrix = confusion_matrix(Y, Y_pred).tolist()
#             true_neg = conf_matrix[0][0]
#             false_pos = conf_matrix[0][1]
#             false_neg = conf_matrix[1][0]
#             true_pos = conf_matrix[1][1]
#             tpr = true_pos / (true_pos + false_neg)
#             tnr = true_neg / (true_neg + false_pos)
#             fpr = false_pos / (false_pos + true_neg)
#             fnr = false_neg / (false_neg + true_pos)

#             print(f"{data_split} acc: {acc} \
#                 \n{data_split} prec: {prec} \
#                 \n{data_split} rec: {rec} \
#                 \n{data_split} f1: {f1} \
#                 \n{data_split} roc_auc: {roc_auc} \
#                 \n{data_split} conf_matrix: {conf_matrix} \
#                 \n{data_split} tpr: {tpr} \
#                 \n{data_split} tnr: {tnr} \
#                 \n{data_split} fpr: {fpr} \
#                 \n{data_split} fnr: {fnr}")
            
#             results = models[model_name].get(f'{data_split}_results', [])
#             results.append(
#                 (subject_name, {
#                     f'{data_split}_acc': acc,
#                     f'{data_split}_prec': prec, 
#                     f'{data_split}_rec': rec,
#                     f'{data_split}_f1': f1,
#                     f'{data_split}_roc_auc': roc_auc,
#                     f'{data_split}_conf_matrix': conf_matrix,
#                     f'{data_split}_tpr': tpr,
#                     f'{data_split}_tnr': tnr,
#                     f'{data_split}_fpr': fpr,
#                     f'{data_split}_fnr': fnr,
#                 })
#             )
#             models[f'{model_name}'][f'{data_split}_results'] = results

In [None]:
# copy = models.copy()

In [None]:
# for key, value in copy.items():
#     if copy[key].get('model') is not None:
#         del copy[key]['model']

#     if copy[key].get('scaler') is not None:
#         del copy[key]['scaler']

In [None]:
# copy

In [None]:
# all_models_results

In [None]:
# all_models_results['cueva_second_phase-svm'] = copy['cueva_second_phase-svm']
# all_models_results

In [None]:
# save_meta_data('./results/all_models_results.json', all_models_results)

In [83]:
# model_names = ['cueva_second_phase-1-5-weighted-svm',
#     'cueva_second_phase-1-9-weighted-svm',
#     'cueva_second_phase-1-2-weighted-svm',
#     'taylor-lr',
#     'taylor-rf',
#     'taylor-svm',
#     'hossain-lr',
#     'hossain-gbt',
#     'hossain-svm',
#     'jurado-lstm-cnn'
# ]
# data_splits = ["test", "train"]

In [84]:
# for model_name in model_names:
#     selector_config, estimator_name = model_name.split('-', 1)
    
#     for data_split in data_splits:
        
#         # # loop through each generated features dataframes from test subjects signals and feed repeatedly to a trained ml models
#         # subjects_names = train_subjects_names if data_split == "train" else test_subjects_names
#         # for index, subject_name in enumerate(subjects_names):
#         #     # print(f'subject features columns: {subjects_features[subjects_features['subject_id'] == index].columns}')
#         #     print(f'subject: {subject_name}')
#         for i, result in enumerate(results[model_name][f'{data_split}_results']):
#             print(result[0])
#             print(results[model_name][f'{data_split}_results'][i][0])

#             conf_matrix = result[1][f'{data_split}_conf_matrix']
#             true_neg = conf_matrix[0][0]
#             false_pos = conf_matrix[0][1]
#             false_neg = conf_matrix[1][0]
#             true_pos = conf_matrix[1][1]
#             tpr = true_pos / (true_pos + false_neg)
#             tnr = true_neg / (true_neg + false_pos)
#             fpr = false_pos / (false_pos + true_neg)
#             fnr = false_neg / (false_neg + true_pos)

#             print(f"{data_split} tpr: {tpr} \
#                 \n{data_split} tnr: {tnr} \
#                 \n{data_split} fpr: {fpr} \
#                 \n{data_split} fnr: {fnr}")
            
#             results[model_name][f'{data_split}_results'][i][1][f'{data_split}_tpr'] = tpr
#             results[model_name][f'{data_split}_results'][i][1][f'{data_split}_tnr'] = tnr
#             results[model_name][f'{data_split}_results'][i][1][f'{data_split}_fpr'] = fpr
#             results[model_name][f'{data_split}_results'][i][1][f'{data_split}_fnr'] = fnr

In [85]:
# results

In [86]:
# save_meta_data('./results/all_models_results.json', results)

In [87]:
# ahixac_lof = pd.read_csv(f'./data/Hybrid Artifact Detection Data/train/ahixac_expert1_lof.csv', index_col=0)
# ahixac_lof

In [88]:
# ahixac_lof['raw_128hz_skewness.1']

In [89]:
# ahixac_lof['filt_16hz_skewness.1']

In [90]:
# ren_ahixac_lof = ahixac_lof.rename(columns={
#         'raw_128hz_skewness.1': 'raw_128hz_kurt',
#         'filt_128hz_skewness.1': 'filt_128hz_kurt',
#         'raw_16hz_skewness.1': 'raw_16hz_kurt',
#         'filt_16hz_skewness.1': 'filt_16hz_kurt',
# })

In [91]:
# ren_ahixac_lof['raw_128hz_kurt']

In [92]:
# for index, train_subject_name in enumerate(train_subjects_names):
#     print(f'subject: {train_subject_name}')

#     # rename the duplicated '*_skewness.1' columns to '*_kurt' and overwrite the train subject's lof csv
#     train_subject_lof = pd.read_csv(f'./data/Hybrid Artifact Detection Data/train/{train_subject_name}_lof.csv', index_col=0)
#     train_subject_lof.rename(columns={
#         'raw_128hz_skewness.1': 'raw_128hz_kurt',
#         'filt_128hz_skewness.1': 'filt_128hz_kurt',
#         'raw_16hz_skewness.1': 'raw_16hz_kurt',
#         'filt_16hz_skewness.1': 'filt_16hz_kurt',
#     }, inplace=True)
#     train_subject_lof.to_csv(f'./data/Hybrid Artifact Detection Data/train/{train_subject_name}_lof.csv')

In [93]:
# for index, test_subject_name in enumerate(test_subjects_names):
#     print(f'subject: {test_subject_name}')

#     # rename the duplicated '*_skewness.1' columns to '*_kurt' and overwrite the test subject's lof csv
#     test_subject_lof = pd.read_csv(f'./data/Hybrid Artifact Detection Data/test/{test_subject_name}_lof.csv', index_col=0)
#     test_subject_lof.rename(columns={
#         'raw_128hz_skewness.1': 'raw_128hz_kurt',
#         'filt_128hz_skewness.1': 'filt_128hz_kurt',
#         'raw_16hz_skewness.1': 'raw_16hz_kurt',
#         'filt_16hz_skewness.1': 'filt_16hz_kurt',
#     }, inplace=True)
#     test_subject_lof.to_csv(f'./data/Hybrid Artifact Detection Data/test/{test_subject_name}_lof.csv')