In [None]:
from pathlib import Path
from glob import glob

In [None]:
import datasets

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory that is globbed for per-participant CSV files and handed to
# `datasets.load_dataset` as the dataset location.
base_path = './data/ecg_final'

In [None]:
# Participant ids are the CSV file stems. `glob` yields paths in arbitrary,
# filesystem-dependent order, so sort them: the notebook later takes
# `participants[:10]` as the training subset and that slice must be the same
# on every machine and every run.
participants = sorted(Path(path).stem for path in glob(f'{base_path}/*.csv'))

In [None]:
# Load the dataset found under `base_path`, forwarding the first ten
# participants as `train_participants`. `trust_remote_code=True` is required
# for loaders that ship their own Python script -- presumably the case here;
# verify against the contents of `base_path`.
dataset = datasets.load_dataset(
    base_path,
    trust_remote_code=True,
    train_participants=participants[:10],
)

In [None]:
def encode(samples):
    """Translate a batch of activity labels into integer stress categories.

    Args:
        samples: Mapping with a ``'label'`` entry holding an iterable of
            activity names (one per row of the batch).

    Returns:
        ``{'category': [...]}`` with one integer per input label:
        0 = baseline, 1 = mental stress, 2 = high physical stress,
        3 = moderate physical stress, 4 = low physical stress and
        -1 for any label that belongs to none of the groups.
    """
    # Position in this tuple is the category code assigned to its members.
    label_groups = (
        # 0: baseline
        ('Sitting', 'Recov1', 'Recov2', 'Recov3', 'Recov4', 'Recov5', 'Recov6'),
        # 1: mental stress
        ('TA', 'SSST_Sing_countdown', 'Pasat', 'Raven', 'TA_repeat', 'Pasat_repeat'),
        # 2: high physical stress
        ('Treadmill1', 'Treadmill2', 'Treadmill3', 'Treadmill4',
         'Walking_fast_pace', 'Cycling', 'stairs_up_and_down'),
        # 3: moderate physical stress
        ('Walking_own_pace', 'Dishes', 'Vacuum'),
        # 4: low physical stress
        ('Standing', 'Lying_supine', 'Recov_standing'),
    )

    code_by_label = {
        name: code
        for code, names in enumerate(label_groups)
        for name in names
    }

    # Unknown labels fall back to -1 so callers can filter them out.
    return {
        'category': [code_by_label.get(label, -1) for label in samples['label']],
    }


In [None]:
# Add the integer `category` column batch-wise, then discard every row whose
# label was not part of any known group (encoded as -1 by `encode`).
dataset = (
    dataset
    .map(encode, num_proc=4, batched=True, batch_size=2048)
    .filter(lambda row: row['category'] != -1)
)

In [None]:
# dataset = dataset.select_columns([
#     "label",

#     "hrv_mean",
#     "hrv_min",
#     "hrv_max",
#     "hrv_std",
#     "hrv_rms",
#     "hr_max",
#     "rr_mean",
#     "rr_min",
#     "rr_max",
#     "rr_std",
#     "nn50",
#     "pnn50",
#     "rmssd",
#     "MeanNN",
#     "SDNN",
#     "SDANN1",
#     "SDNNI1",
#     "SDANN2",
#     "SDNNI2",
#     "SDANN5",
#     "SDNNI5",
#     "RMSSD",
#     "SDSD",
#     "CVNN",
#     "CVSD",
#     "MedianNN",
#     "MadNN",
#     "MCVNN",
#     "IQRNN",
#     "SDRMSSD",
#     "Prc20NN",
#     "Prc80NN",
#     "pNN50",
#     "pNN20",
#     "MinNN",
#     "MaxNN",
#     "HTI",
#     "TINN",
#     "twa",

#     # new
#     'vhf_entropy',  # 0.28
#     'lp_vhf_entropy', # 0.28
#     'lp_vhf_max',   # 0.25
#     'vhf_max',   # 0.25
#     'lp_vhf_mean',   # 0.24
#     'lp_vhf_std',   # 0.24
#     'lp_vhf_energy', # 0.22
#     'lp_vhf_power', # 0.22
#     'lp_vhf_median',      # 0.21
#     'vhf_std',      # 0.21
#     'vhf_power',    # 0.21
#     'vhf_mean',    # 0.21
#     'tp_entropy',   # 0.21
#     'vhf_median', # 0.19
#     'lp_vhf_covariance', # 0.17
#     'lp_lf_min', # 0.17
#     'w',            # 0.17
#     'PSS',          # 0.17
#     'wmax',         # 0.16
#     'hr_min',       # 0.16
#     'lp_uhf_entropy', # 0.16
#     'wen',          # 0.15
#     'hr_mean',      # 0.15
#     'PIP',          # 0.15
#     'hf_entropy',   # 0.15
#     'uhf_entropy',  # 0.14
#     'IALS',         # 0.14
#     'FuzzyEn',      # 0.14
#     'SampEn',       # 0.13
# ])

In [None]:
# Materialise the 'fit' split as a DataFrame and sanitise it:
#   1. treat +/-inf as missing,
#   2. drop columns that are missing everywhere,
#   3. fill the remaining gaps with 0.
df = (
    dataset['fit']
    .to_pandas()
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=1, how='all')
    .fillna(0)
)

In [None]:
# Large default canvas so that annotated heatmaps over many columns stay legible.
sns.set_theme(
    rc={'figure.figsize': (40, 20)},
)

In [None]:
# Keep only the columns that contain at least one non-zero entry; an
# all-zero column carries no information for the correlation analysis.
df = df.loc[:, df.ne(0).any(axis=0)]

In [None]:
# df = df[
# ]]

## Non-Linearity

In [None]:
import phik
from phik import resources, report

In [None]:
from pandas.plotting import scatter_matrix 

In [None]:
# Lift pandas' display truncation in both directions so that full matrices
# are rendered in the notebook output.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
# Rows belonging to category 0 (baseline) or category 1 (mental stress).
sample = df[df['category'].isin([0, 1])]

In [None]:
# Correlation of every remaining column with the category code, shown in
# ascending order. The raw `label` column is dropped first.
(
    sample
    .drop(columns=['label'])
    .corrwith(sample['category'])
    .sort_values()
)

In [None]:
# For each individual category-1 (mental stress) activity, pair its rows with
# all category-0 (baseline) rows and show the phik matrix of that subset.
for label in df.loc[df['category'] == 1, 'label'].unique():
    print(label)
    sample = df[df['category'].eq(0) | df['label'].eq(label)]
    display(sample.phik_matrix())

In [None]:
# phik matrix restricted to category 0 (baseline) and category 1 (mental stress).
# The subset is selected on `category`: `label` holds the raw activity names
# (strings), so comparing it with 0/1 matches no rows and yields an empty frame.
phik_matrix = df[df['category'].isin([0, 1])].phik_matrix(njobs=6)
phik_matrix

In [None]:
# Enlarge the canvas for the annotated phik heatmaps.
# Alternative, even larger canvas: {'figure.figsize': (160, 80)}
sns.set_theme(
    rc={'figure.figsize': (80, 40)},
)

In [None]:
# Annotated heatmap of the phik matrix computed in the earlier cell.
sns.heatmap(
    phik_matrix,
    annot=True,
    cmap='Blues',
)

In [None]:
# phik matrix for category 0 (baseline) vs. category 2 (high physical stress).
# Select on `category`; `label` contains activity-name strings, so comparing it
# with integers selects nothing.
df[df['category'].isin([0, 2])].phik_matrix(njobs=6)

In [None]:
# Heatmap of the phik matrix for category 0 (baseline) vs. category 2 (high
# physical stress). Rows are selected on the integer `category` column because
# `label` holds activity-name strings and never equals 0 or 2.
sns.heatmap(
    df[df['category'].isin([0, 2])].phik_matrix(njobs=6),
    cmap='Blues',
    annot=True,
)

In [None]:
# phik significance matrix for category 0 (baseline) vs. category 1 (mental
# stress). Rows are selected on `category`: `label` holds activity-name strings,
# so the previous integer comparison against it produced an empty frame.
significance_matrix = df[df['category'].isin([0, 1])].significance_matrix(njobs=6)
significance_matrix

In [None]:
# Annotated heatmap of the significance matrix computed in the earlier cell.
sns.heatmap(
    significance_matrix,
    annot=True,
    cmap='Blues',
)

## Linearity

In [None]:
# Pearson correlation heatmap for category 0 (baseline) vs. category 1 (mental stress).
# - Rows are selected on `category`; `label` holds activity-name strings and
#   never equals an integer code.
# - Only numeric columns are correlated: the string `label` column makes
#   `DataFrame.corr()` raise on pandas >= 2.0.
sns.heatmap(
    df[df['category'].isin([0, 1])].select_dtypes(include='number').corr(),
    cmap="Blues",
    annot=True,
)

In [None]:
# Pearson correlation heatmap for category 0 (baseline) vs. category 2 (high
# physical stress).
# - Rows are selected on `category`; `label` holds activity-name strings and
#   never equals an integer code.
# - Only numeric columns are correlated: the string `label` column makes
#   `DataFrame.corr()` raise on pandas >= 2.0.
sns.heatmap(
    df[df['category'].isin([0, 2])].select_dtypes(include='number').corr(),
    cmap="Blues",
    annot=True,
)

In [None]:
# Pearson correlation heatmap for category 1 (mental stress) vs. category 2
# (high physical stress).
# - Rows are selected on `category`; `label` holds activity-name strings and
#   never equals an integer code.
# - Only numeric columns are correlated: the string `label` column makes
#   `DataFrame.corr()` raise on pandas >= 2.0.
sns.heatmap(
    df[df['category'].isin([1, 2])].select_dtypes(include='number').corr(),
    cmap="Blues",
    annot=True,
)

In [None]:
# Pearson correlation heatmap over categories 0 (baseline), 1 (mental stress)
# and 2 (high physical stress).
# - Rows are selected on `category`; `label` holds activity-name strings and
#   never equals an integer code.
# - Only numeric columns are correlated: the string `label` column makes
#   `DataFrame.corr()` raise on pandas >= 2.0.
sns.heatmap(
    df[df['category'].isin([0, 1, 2])].select_dtypes(include='number').corr(),
    cmap="Blues",
    annot=True,
)

In [None]:
# Pearson correlation heatmap over all rows. Only numeric columns are
# correlated: `df` still carries the string `label` column, which makes
# `DataFrame.corr()` raise on pandas >= 2.0.
sns.heatmap(
    df.select_dtypes(include='number').corr(),
    cmap="Blues",
    annot=True,
)