# Step 1: Dataset Inspection
## 1.1 Structure Analysis | 1.2 Data Location Mapping | 1.3 Outputs

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')

BASE_PATH = Path('/home/alvaro-ibarra/smartwatch-stress-detection')
WESAD_PATH = BASE_PATH / 'WESAD'
EPM_PATH = BASE_PATH / 'EPM-E4' / 'empatica_wearable_data' / 'raw'
PHYSIONET_PATH = BASE_PATH / 'wearable-device-dataset' / 'wearable-device-dataset-from-induced-stress-and-structured-exercise-sessions-1.0.1' / 'Wearable_Dataset'
OUTPUT_PATH = BASE_PATH / 'outputs' / 'tables'

## 1.1 WESAD Structure Analysis

In [2]:
# WESAD: Subject mapping and E4 data availability
wesad_subjects = [d for d in os.listdir(WESAD_PATH) if d.startswith('S') and os.path.isdir(WESAD_PATH / d)]
wesad_subjects = sorted(wesad_subjects, key=lambda x: int(x[1:]))

wesad_data = []
e4_signals = ['ACC.csv', 'BVP.csv', 'EDA.csv', 'HR.csv', 'IBI.csv', 'TEMP.csv']

for subj in wesad_subjects:
    e4_path = WESAD_PATH / subj / f'{subj}_E4_Data'
    pkl_path = WESAD_PATH / subj / f'{subj}.pkl'
    
    row = {
        'subject_id': subj,
        'dataset': 'WESAD',
        'pkl_exists': pkl_path.exists(),
        'e4_folder_exists': e4_path.exists()
    }
    
    for signal in e4_signals:
        signal_path = e4_path / signal
        row[signal.replace('.csv', '')] = signal_path.exists()
    
    wesad_data.append(row)

wesad_df = pd.DataFrame(wesad_data)
print(f"WESAD: {len(wesad_subjects)} subjects")
print(f"Subject IDs: {wesad_subjects}")
wesad_df

WESAD: 15 subjects
Subject IDs: ['S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11', 'S13', 'S14', 'S15', 'S16', 'S17']


Unnamed: 0,subject_id,dataset,pkl_exists,e4_folder_exists,ACC,BVP,EDA,HR,IBI,TEMP
0,S2,WESAD,True,True,True,True,True,True,True,True
1,S3,WESAD,True,True,True,True,True,True,True,True
2,S4,WESAD,True,True,True,True,True,True,True,True
3,S5,WESAD,True,True,True,True,True,True,True,True
4,S6,WESAD,True,True,True,True,True,True,True,True
5,S7,WESAD,True,True,True,True,True,True,True,True
6,S8,WESAD,True,True,True,True,True,True,True,True
7,S9,WESAD,True,True,True,True,True,True,True,True
8,S10,WESAD,True,True,True,True,True,True,True,True
9,S11,WESAD,True,True,True,True,True,True,True,True


In [3]:
# WESAD: Sampling rates verification
def get_e4_sampling_rate(filepath):
    try:
        with open(filepath, 'r') as f:
            lines = f.readlines()
            return float(lines[1].strip())
    except:
        return None

# Check sampling rates from first available subject
sample_subj = wesad_subjects[0]
e4_path = WESAD_PATH / sample_subj / f'{sample_subj}_E4_Data'

wesad_sampling_rates = {}
for signal in e4_signals:
    sr = get_e4_sampling_rate(e4_path / signal)
    wesad_sampling_rates[signal.replace('.csv', '')] = sr

print("WESAD E4 Sampling Rates (Hz):")
for k, v in wesad_sampling_rates.items():
    print(f"  {k}: {v}")

WESAD E4 Sampling Rates (Hz):
  ACC: None
  BVP: 64.0
  EDA: 4.0
  HR: 1.0
  IBI: None
  TEMP: 4.0


In [4]:
# WESAD: Label mapping from pkl file
sample_pkl = WESAD_PATH / sample_subj / f'{sample_subj}.pkl'
with open(sample_pkl, 'rb') as f:
    wesad_sample = pickle.load(f, encoding='latin1')

print("WESAD pkl structure keys:", wesad_sample.keys())
print("\nLabel values:", np.unique(wesad_sample['label']))
print("Label mapping: 0=undefined, 1=baseline, 2=stress, 3=amusement, 4=meditation")

WESAD pkl structure keys: dict_keys(['signal', 'label', 'subject'])

Label values: [0 1 2 3 4 6 7]
Label mapping: 0=undefined, 1=baseline, 2=stress, 3=amusement, 4=meditation


## 1.1 EPM-E4 Structure Analysis

In [5]:
# EPM-E4: Subject mapping and signal availability
epm_subjects = [d for d in os.listdir(EPM_PATH) if os.path.isdir(EPM_PATH / d) and d != '.DS_Store']
epm_subjects = sorted(epm_subjects, key=lambda x: int(x))

epm_data = []
epm_signals = ['ACC.csv', 'BVP.csv', 'EDA.csv', 'HR.csv', 'IBI.csv', 'TEMP.csv']

for subj in epm_subjects:
    empatica_path = EPM_PATH / subj / 'empatica'
    
    row = {
        'subject_id': subj,
        'dataset': 'EPM-E4',
        'empatica_folder_exists': empatica_path.exists()
    }
    
    for signal in epm_signals:
        signal_path = empatica_path / signal
        row[signal.replace('.csv', '')] = signal_path.exists() if empatica_path.exists() else False
    
    epm_data.append(row)

epm_df = pd.DataFrame(epm_data)
print(f"EPM-E4: {len(epm_subjects)} subjects")
print(f"Subject IDs: {epm_subjects}")
epm_df

EPM-E4: 47 subjects
Subject IDs: ['1', '2', '4', '12', '14', '15', '16', '24', '25', '34', '35', '36', '39', '40', '45', '47', '49', '55', '56', '61', '66', '68', '72', '76', '77', '78', '79', '81', '86', '91', '92', '103', '106', '107', '117', '121', '122', '124', '126', '128', '129', '131', '137', '138', '141', '159', '160']


Unnamed: 0,subject_id,dataset,empatica_folder_exists,ACC,BVP,EDA,HR,IBI,TEMP
0,1,EPM-E4,True,True,True,True,True,True,True
1,2,EPM-E4,True,True,True,True,True,True,True
2,4,EPM-E4,True,True,True,True,True,True,True
3,12,EPM-E4,True,True,True,True,True,True,True
4,14,EPM-E4,True,True,True,True,True,True,True
5,15,EPM-E4,True,True,True,True,True,True,True
6,16,EPM-E4,True,True,True,True,True,True,True
7,24,EPM-E4,True,True,True,True,True,True,True
8,25,EPM-E4,True,True,True,True,True,True,True
9,34,EPM-E4,True,True,True,True,True,True,True


In [6]:
# EPM-E4: Sampling rates verification
sample_subj_epm = epm_subjects[0]
empatica_path = EPM_PATH / sample_subj_epm / 'empatica'

epm_sampling_rates = {}
for signal in epm_signals:
    sr = get_e4_sampling_rate(empatica_path / signal)
    epm_sampling_rates[signal.replace('.csv', '')] = sr

print("EPM-E4 Sampling Rates (Hz):")
for k, v in epm_sampling_rates.items():
    print(f"  {k}: {v}")

EPM-E4 Sampling Rates (Hz):
  ACC: None
  BVP: 64.0
  EDA: 4.0
  HR: 1.0
  IBI: None
  TEMP: 4.0


In [7]:
# EPM-E4: Key moments (emotion labels)
key_moments_path = BASE_PATH / 'EPM-E4' / 'key_moments'
emotions = ['ANGER', 'FEAR', 'HAPPINESS', 'SADNESS']

print("EPM-E4 Key Moments:")
for emotion in emotions:
    km_file = key_moments_path / f'{emotion}.csv'
    if km_file.exists():
        km_df = pd.read_csv(km_file)
        print(f"  {emotion}: {len(km_df)} entries")

print("\nEPM-E4 Classes: Baseline, Anger, Sadness, Happiness, Fear")

EPM-E4 Key Moments:
  ANGER: 5 entries
  FEAR: 9 entries
  HAPPINESS: 5 entries
  SADNESS: 5 entries

EPM-E4 Classes: Baseline, Anger, Sadness, Happiness, Fear


## 1.1 PhysioNet Structure Analysis

In [8]:
# PhysioNet: Subject mapping per protocol
protocols = ['STRESS', 'AEROBIC', 'ANAEROBIC']
physionet_signals = ['ACC.csv', 'BVP.csv', 'EDA.csv', 'HR.csv', 'IBI.csv', 'TEMP.csv', 'tags.csv']

physionet_data = []

for protocol in protocols:
    protocol_path = PHYSIONET_PATH / protocol
    subjects = [d for d in os.listdir(protocol_path) if os.path.isdir(protocol_path / d)]
    subjects = sorted(subjects)
    
    for subj in subjects:
        subj_path = protocol_path / subj
        
        row = {
            'subject_id': subj,
            'dataset': 'PhysioNet',
            'protocol': protocol
        }
        
        for signal in physionet_signals:
            signal_path = subj_path / signal
            row[signal.replace('.csv', '')] = signal_path.exists()
        
        physionet_data.append(row)

physionet_df = pd.DataFrame(physionet_data)

print("PhysioNet subjects per protocol:")
for protocol in protocols:
    count = len(physionet_df[physionet_df['protocol'] == protocol])
    subjects = physionet_df[physionet_df['protocol'] == protocol]['subject_id'].tolist()
    print(f"  {protocol}: {count} subjects")
    print(f"    IDs: {subjects}")

PhysioNet subjects per protocol:
  STRESS: 37 subjects
    IDs: ['S01', 'S02', 'S03', 'S04', 'S05', 'S06', 'S07', 'S08', 'S09', 'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'f01', 'f02', 'f03', 'f04', 'f05', 'f06', 'f07', 'f08', 'f09', 'f10', 'f11', 'f12', 'f13', 'f14_a', 'f14_b', 'f15', 'f16', 'f17', 'f18']
  AEROBIC: 31 subjects
    IDs: ['S01', 'S02', 'S03', 'S04', 'S05', 'S06', 'S07', 'S08', 'S09', 'S10', 'S11_a', 'S11_b', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'f01', 'f02', 'f03', 'f04', 'f05', 'f06', 'f07', 'f08', 'f09', 'f10', 'f11', 'f12', 'f13']
  ANAEROBIC: 32 subjects
    IDs: ['S01', 'S02', 'S03', 'S04', 'S05', 'S06', 'S07', 'S08', 'S09', 'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16_a', 'S16_b', 'S17', 'S18', 'f01', 'f02', 'f03', 'f04', 'f05', 'f06', 'f07', 'f08', 'f09', 'f10', 'f11', 'f12', 'f13']


In [9]:
# PhysioNet: Sampling rates verification
sample_protocol = 'STRESS'
sample_subj_pn = 'S01'
sample_path = PHYSIONET_PATH / sample_protocol / sample_subj_pn

physionet_sampling_rates = {}
for signal in physionet_signals[:-1]:  # Exclude tags.csv
    sr = get_e4_sampling_rate(sample_path / signal)
    physionet_sampling_rates[signal.replace('.csv', '')] = sr

print("PhysioNet Sampling Rates (Hz):")
for k, v in physionet_sampling_rates.items():
    print(f"  {k}: {v}")

PhysioNet Sampling Rates (Hz):
  ACC: None
  BVP: 64.0
  EDA: 4.0
  HR: 1.0
  IBI: None
  TEMP: 4.0


In [11]:
# PhysioNet: Data constraints (known issues)
constraints_path = BASE_PATH / 'wearable-device-dataset' / 'wearable-device-dataset-from-induced-stress-and-structured-exercise-sessions-1.0.1' / 'data_constraints.txt'

with open(constraints_path, 'r', encoding='latin-1') as f:
    constraints = f.read()

print("PhysioNet Data Constraints:")
print(constraints)

# Parse constraints into structured format
constraints_data = [
    {'subject_id': 'S02', 'protocol': 'STRESS', 'issue': 'Duplicated signals'},
    {'subject_id': 'f07', 'protocol': 'STRESS', 'issue': 'PPG/TEMP sensors covered - only EDA/ACC valid'},
    {'subject_id': 'f14', 'protocol': 'STRESS', 'issue': 'Connection loss - split files (f14_a, f14_b)'},
    {'subject_id': 'S03', 'protocol': 'AEROBIC', 'issue': 'Protocol ended early (90 rpm)'},
    {'subject_id': 'S07', 'protocol': 'AEROBIC', 'issue': 'Protocol ended early (95 rpm)'},
    {'subject_id': 'S11', 'protocol': 'AEROBIC', 'issue': 'Connection loss - split files (S11_a, S11_b)'},
    {'subject_id': 'S12', 'protocol': 'AEROBIC', 'issue': 'No data - protocol not performed'},
    {'subject_id': 'S01', 'protocol': 'ANAEROBIC', 'issue': 'Empty IBI file'},
    {'subject_id': 'S06', 'protocol': 'ANAEROBIC', 'issue': 'Protocol ended early (missing last sprint)'},
    {'subject_id': 'S16', 'protocol': 'ANAEROBIC', 'issue': 'Connection loss - split files (S16_a, S16_b)'}
]

constraints_df = pd.DataFrame(constraints_data)
constraints_df

PhysioNet Data Constraints:
STRESS
	S02: Files downloaded from E4 Connect have duplicated signals. Duplicated raw values start in: ACC.csv: row 49,545; BVP.csv: row 99,091; EDA.csv and TEMP.csv: row 6,195. As IBI and HR files are obtained from the BVP signal through Empatica's algorithm, it is not evident where the duplicated data start.
	f07: The protection dock was never removed from the wristband, covering the PPG and TEMPERATURE sensors. Thus, only the ELECTRODERMAL ACTIVITY AND ACCELEROMETER DATA measurements are valid.
	f14: Bluetooth connection was lost during registration. The baseline is in file f14_a, and the rest of the protocol is in file f14_b.

AEROBIC
	S03: Ended the protocol early, only reaching 90 rpm.
	S07: Ended the protocol early, only reaching 95 rpm.
	S11: Bluetooth connection was lost during registration. File S11_a includes data up to 105 rpm, and S11_b includes the last 110 rpm block.
	S12: Did not perform the aerobic protocol, so there are no data.

ANA

Unnamed: 0,subject_id,protocol,issue
0,S02,STRESS,Duplicated signals
1,f07,STRESS,PPG/TEMP sensors covered - only EDA/ACC valid
2,f14,STRESS,"Connection loss - split files (f14_a, f14_b)"
3,S03,AEROBIC,Protocol ended early (90 rpm)
4,S07,AEROBIC,Protocol ended early (95 rpm)
5,S11,AEROBIC,"Connection loss - split files (S11_a, S11_b)"
6,S12,AEROBIC,No data - protocol not performed
7,S01,ANAEROBIC,Empty IBI file
8,S06,ANAEROBIC,Protocol ended early (missing last sprint)
9,S16,ANAEROBIC,"Connection loss - split files (S16_a, S16_b)"


## 1.2 Data Location Mapping

In [12]:
# Combined subject inventory
all_subjects = []

# WESAD subjects
for _, row in wesad_df.iterrows():
    all_subjects.append({
        'subject_id': row['subject_id'],
        'dataset': 'WESAD',
        'protocol': 'TSST',
        'classes': 'Baseline, Stress, Amusement',
        'e4_data': row['e4_folder_exists'],
        'data_path': str(WESAD_PATH / row['subject_id'] / f"{row['subject_id']}_E4_Data")
    })

# EPM-E4 subjects
for _, row in epm_df.iterrows():
    all_subjects.append({
        'subject_id': row['subject_id'],
        'dataset': 'EPM-E4',
        'protocol': 'Film_Elicitation',
        'classes': 'Baseline, Anger, Sadness, Happiness, Fear',
        'e4_data': row['empatica_folder_exists'],
        'data_path': str(EPM_PATH / row['subject_id'] / 'empatica')
    })

# PhysioNet subjects
for _, row in physionet_df.iterrows():
    all_subjects.append({
        'subject_id': row['subject_id'],
        'dataset': 'PhysioNet',
        'protocol': row['protocol'],
        'classes': row['protocol'],
        'e4_data': row['BVP'] and row['EDA'] and row['TEMP'] and row['ACC'],
        'data_path': str(PHYSIONET_PATH / row['protocol'] / row['subject_id'])
    })

all_subjects_df = pd.DataFrame(all_subjects)
print(f"Total subjects across all datasets: {len(all_subjects_df)}")
print(f"\nBreakdown:")
print(all_subjects_df.groupby('dataset').size())

Total subjects across all datasets: 162

Breakdown:
dataset
EPM-E4        47
PhysioNet    100
WESAD         15
dtype: int64


## 1.3 Output Generation

In [14]:
# Sampling Rate Table (ACC has multi-column format)
sampling_rates_table = pd.DataFrame({
    'Signal': ['BVP', 'EDA', 'TEMP', 'ACC', 'HR', 'IBI'],
    'WESAD (Hz)': [64.0, 4.0, 4.0, 32.0, 1.0, 'Variable'],
    'EPM-E4 (Hz)': [64.0, 4.0, 4.0, 32.0, 1.0, 'Variable'],
    'PhysioNet (Hz)': [64.0, 4.0, 4.0, 32.0, 1.0, 'Variable']
})

print("Sampling Rate Table (All Datasets):")
sampling_rates_table

Sampling Rate Table (All Datasets):


Unnamed: 0,Signal,WESAD (Hz),EPM-E4 (Hz),PhysioNet (Hz)
0,BVP,64.0,64.0,64.0
1,EDA,4.0,4.0,4.0
2,TEMP,4.0,4.0,4.0
3,ACC,32.0,32.0,32.0
4,HR,1.0,1.0,1.0
5,IBI,Variable,Variable,Variable


In [15]:
# Dataset Structure Summary
structure_summary = pd.DataFrame({
    'Dataset': ['WESAD', 'EPM-E4', 'PhysioNet'],
    'Subjects': [len(wesad_subjects), len(epm_subjects), len(physionet_df['subject_id'].unique())],
    'File_Format': ['pkl + CSV', 'CSV', 'CSV'],
    'E4_Signals': ['BVP, EDA, TEMP, ACC, HR, IBI', 'BVP, EDA, TEMP, ACC, HR, IBI', 'BVP, EDA, TEMP, ACC, HR, IBI'],
    'Classes': ['Baseline, Stress, Amusement', 'Baseline, Anger, Sadness, Happiness, Fear', 'Stress, Aerobic, Anaerobic'],
    'Label_Source': ['pkl file (label array)', 'key_moments CSV + tags', 'Protocol folder + tags']
})

print("Dataset Structure Summary:")
structure_summary

Dataset Structure Summary:


Unnamed: 0,Dataset,Subjects,File_Format,E4_Signals,Classes,Label_Source
0,WESAD,15,pkl + CSV,"BVP, EDA, TEMP, ACC, HR, IBI","Baseline, Stress, Amusement",pkl file (label array)
1,EPM-E4,47,CSV,"BVP, EDA, TEMP, ACC, HR, IBI","Baseline, Anger, Sadness, Happiness, Fear",key_moments CSV + tags
2,PhysioNet,41,CSV,"BVP, EDA, TEMP, ACC, HR, IBI","Stress, Aerobic, Anaerobic",Protocol folder + tags


In [16]:
# Missing Data Log
missing_data = []

# WESAD missing
for _, row in wesad_df.iterrows():
    if not row['e4_folder_exists']:
        missing_data.append({'dataset': 'WESAD', 'subject_id': row['subject_id'], 'issue': 'E4 folder missing'})
    for signal in ['BVP', 'EDA', 'TEMP', 'ACC']:
        if not row.get(signal, True):
            missing_data.append({'dataset': 'WESAD', 'subject_id': row['subject_id'], 'issue': f'{signal} missing'})

# EPM-E4 missing
for _, row in epm_df.iterrows():
    if not row['empatica_folder_exists']:
        missing_data.append({'dataset': 'EPM-E4', 'subject_id': row['subject_id'], 'issue': 'Empatica folder missing'})
    for signal in ['BVP', 'EDA', 'TEMP', 'ACC']:
        if not row.get(signal, True):
            missing_data.append({'dataset': 'EPM-E4', 'subject_id': row['subject_id'], 'issue': f'{signal} missing'})

# PhysioNet - add constraints as missing/issues
for _, row in constraints_df.iterrows():
    missing_data.append({'dataset': 'PhysioNet', 'subject_id': row['subject_id'], 'issue': row['issue']})

missing_df = pd.DataFrame(missing_data)
print(f"Missing/Problematic Data Entries: {len(missing_df)}")
missing_df

Missing/Problematic Data Entries: 10


Unnamed: 0,dataset,subject_id,issue
0,PhysioNet,S02,Duplicated signals
1,PhysioNet,f07,PPG/TEMP sensors covered - only EDA/ACC valid
2,PhysioNet,f14,"Connection loss - split files (f14_a, f14_b)"
3,PhysioNet,S03,Protocol ended early (90 rpm)
4,PhysioNet,S07,Protocol ended early (95 rpm)
5,PhysioNet,S11,"Connection loss - split files (S11_a, S11_b)"
6,PhysioNet,S12,No data - protocol not performed
7,PhysioNet,S01,Empty IBI file
8,PhysioNet,S06,Protocol ended early (missing last sprint)
9,PhysioNet,S16,"Connection loss - split files (S16_a, S16_b)"


In [17]:
# Save outputs
structure_summary.to_csv(OUTPUT_PATH / 'dataset_structure_summary.csv', index=False)
sampling_rates_table.to_csv(OUTPUT_PATH / 'sampling_rates.csv', index=False)
all_subjects_df.to_csv(OUTPUT_PATH / 'all_subjects_inventory.csv', index=False)
missing_df.to_csv(OUTPUT_PATH / 'missing_data_log.csv', index=False)
constraints_df.to_csv(OUTPUT_PATH / 'physionet_constraints.csv', index=False)

# Individual dataset inventories
wesad_df.to_csv(OUTPUT_PATH / 'wesad_subjects.csv', index=False)
epm_df.to_csv(OUTPUT_PATH / 'epm_subjects.csv', index=False)
physionet_df.to_csv(OUTPUT_PATH / 'physionet_subjects.csv', index=False)

print("Outputs saved to:", OUTPUT_PATH)
print("Files created:")
for f in os.listdir(OUTPUT_PATH):
    print(f"  - {f}")

Outputs saved to: /home/alvaro-ibarra/smartwatch-stress-detection/outputs/tables
Files created:
  - all_subjects_inventory.csv
  - dataset_structure_summary.csv
  - missing_data_log.csv
  - epm_subjects.csv
  - physionet_subjects.csv
  - sampling_rates.csv
  - physionet_constraints.csv
  - wesad_subjects.csv


In [18]:
# Final Summary
print("="*60)
print("STEP 1 COMPLETE: Dataset Inspection Summary")
print("="*60)
print(f"\nWESAD: {len(wesad_subjects)} subjects | Classes: Baseline, Stress, Amusement")
print(f"EPM-E4: {len(epm_subjects)} subjects | Classes: Baseline, Anger, Sadness, Happiness, Fear")
print(f"PhysioNet: {len(physionet_df['subject_id'].unique())} unique subjects | Classes: Stress, Aerobic, Anaerobic")
print(f"\nTotal unique subject-protocol combinations: {len(all_subjects_df)}")
print(f"\nCommon E4 Signals: BVP (64Hz), EDA (4Hz), TEMP (4Hz), ACC (32Hz)")
print(f"\nData issues identified: {len(missing_df)}")
print("="*60)

STEP 1 COMPLETE: Dataset Inspection Summary

WESAD: 15 subjects | Classes: Baseline, Stress, Amusement
EPM-E4: 47 subjects | Classes: Baseline, Anger, Sadness, Happiness, Fear
PhysioNet: 41 unique subjects | Classes: Stress, Aerobic, Anaerobic

Total unique subject-protocol combinations: 162

Common E4 Signals: BVP (64Hz), EDA (4Hz), TEMP (4Hz), ACC (32Hz)

Data issues identified: 10
