# Step 2: Data Preprocessing
## 2.1 Subject Profile Inspection | 2.2 Signal Quality Check

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')

BASE_PATH = Path('/home/alvaro-ibarra/smartwatch-stress-detection')
WESAD_PATH = BASE_PATH / 'WESAD'
EPM_PATH = BASE_PATH / 'EPM-E4' / 'empatica_wearable_data' / 'raw'
PHYSIONET_PATH = BASE_PATH / 'wearable-device-dataset' / 'wearable-device-dataset-from-induced-stress-and-structured-exercise-sessions-1.0.1' / 'Wearable_Dataset'
PHYSIONET_META = BASE_PATH / 'wearable-device-dataset' / 'wearable-device-dataset-from-induced-stress-and-structured-exercise-sessions-1.0.1'
OUTPUT_PATH = BASE_PATH / 'outputs' / 'tables'

## 2.1 Subject Profile Inspection

In [2]:
# WESAD Subject Demographics
wesad_subjects = [d for d in os.listdir(WESAD_PATH) if d.startswith('S') and os.path.isdir(WESAD_PATH / d)]
wesad_subjects = sorted(wesad_subjects, key=lambda x: int(x[1:]))

wesad_profiles = []
for subj in wesad_subjects:
    readme_path = WESAD_PATH / subj / f'{subj}_readme.txt'
    profile = {'subject_id': subj, 'dataset': 'WESAD'}
    
    if readme_path.exists():
        with open(readme_path, 'r') as f:
            content = f.read()
        
        for line in content.split('\n'):
            if 'Age:' in line:
                profile['age'] = int(line.split(':')[1].strip())
            elif 'Height' in line:
                profile['height_cm'] = int(line.split(':')[1].strip())
            elif 'Weight' in line:
                profile['weight_kg'] = int(line.split(':')[1].strip())
            elif 'Gender:' in line:
                profile['gender'] = line.split(':')[1].strip()
    
    wesad_profiles.append(profile)

wesad_profiles_df = pd.DataFrame(wesad_profiles)
print(f"WESAD: {len(wesad_profiles_df)} subjects")
wesad_profiles_df

WESAD: 15 subjects


Unnamed: 0,subject_id,dataset,age,height_cm,weight_kg,gender
0,S2,WESAD,27,175,80,male
1,S3,WESAD,27,173,69,male
2,S4,WESAD,25,175,90,male
3,S5,WESAD,35,189,80,male
4,S6,WESAD,27,170,66,male
5,S7,WESAD,28,184,74,male
6,S8,WESAD,27,172,64,female
7,S9,WESAD,26,181,75,male
8,S10,WESAD,28,178,76,male
9,S11,WESAD,26,171,54,female


In [3]:
# PhysioNet Subject Demographics
subject_info_path = PHYSIONET_META / 'subject-info.csv'
physionet_info = pd.read_csv(subject_info_path)

# Clean column names
physionet_info.columns = physionet_info.columns.str.strip()
physionet_info = physionet_info.rename(columns={
    'Info': 'subject_id',
    'Gender': 'gender',
    'Age': 'age',
    'Height (cm)': 'height_cm',
    'Weight (kg)': 'weight_kg'
})

# Filter valid rows
physionet_info = physionet_info[physionet_info['subject_id'].notna()]
physionet_info = physionet_info[~physionet_info['subject_id'].str.contains('Reference|m:|f:|V1|V2|\*', na=False)]
physionet_info['dataset'] = 'PhysioNet'

# Map gender
physionet_info['gender'] = physionet_info['gender'].map({'m': 'male', 'f': 'female'})

# Convert numeric columns
for col in ['age', 'height_cm', 'weight_kg']:
    physionet_info[col] = pd.to_numeric(physionet_info[col], errors='coerce')

physionet_profiles_df = physionet_info[['subject_id', 'dataset', 'age', 'gender', 'height_cm', 'weight_kg']].copy()
print(f"PhysioNet: {len(physionet_profiles_df)} subjects")
physionet_profiles_df

PhysioNet: 36 subjects


Unnamed: 0,subject_id,dataset,age,gender,height_cm,weight_kg
0,S01,PhysioNet,21.0,male,192.0,84.0
1,S02,PhysioNet,20.0,male,185.0,95.0
2,S03,PhysioNet,20.0,male,175.0,62.0
3,S04,PhysioNet,21.0,male,174.0,70.0
4,S05,PhysioNet,21.0,male,173.0,72.0
5,S06,PhysioNet,21.0,male,172.0,70.0
6,S07,PhysioNet,19.0,male,184.0,88.0
7,S08,PhysioNet,20.0,male,174.0,67.0
8,S09,PhysioNet,19.0,male,174.0,63.0
9,S10,PhysioNet,21.0,male,180.0,80.0


In [4]:
# EPM-E4 Subject Demographics (limited info available)
epm_subjects = [d for d in os.listdir(EPM_PATH) if os.path.isdir(EPM_PATH / d) and d != '.DS_Store']
epm_subjects = sorted(epm_subjects, key=lambda x: int(x))

epm_profiles = []
for subj in epm_subjects:
    epm_profiles.append({
        'subject_id': subj,
        'dataset': 'EPM-E4',
        'age': np.nan,
        'gender': np.nan,
        'height_cm': np.nan,
        'weight_kg': np.nan
    })

epm_profiles_df = pd.DataFrame(epm_profiles)
print(f"EPM-E4: {len(epm_profiles_df)} subjects (demographics not available in raw data)")
epm_profiles_df.head()

EPM-E4: 47 subjects (demographics not available in raw data)


Unnamed: 0,subject_id,dataset,age,gender,height_cm,weight_kg
0,1,EPM-E4,,,,
1,2,EPM-E4,,,,
2,4,EPM-E4,,,,
3,12,EPM-E4,,,,
4,14,EPM-E4,,,,


In [5]:
# Combined Subject Profiles
all_profiles = pd.concat([wesad_profiles_df, epm_profiles_df, physionet_profiles_df], ignore_index=True)
print(f"Total subjects: {len(all_profiles)}")
print(f"\nDemographics Summary (where available):")
print(all_profiles.groupby('dataset')[['age', 'height_cm', 'weight_kg']].describe())

Total subjects: 98

Demographics Summary (where available):
            age                                                    height_cm  \
          count       mean       std   min   25%   50%   75%   max     count   
dataset                                                                        
EPM-E4      0.0        NaN       NaN   NaN   NaN   NaN   NaN   NaN       0.0   
PhysioNet  31.0  22.322581  3.590609  18.0  20.5  21.0  22.0  31.0      31.0   
WESAD      15.0  27.466667  2.445599  24.0  26.5  27.0  28.0  35.0      15.0   

                       ...               weight_kg                        \
                 mean  ...    75%    max     count       mean        std   
dataset                ...                                                 
EPM-E4            NaN  ...    NaN    NaN       0.0        NaN        NaN   
PhysioNet  171.677419  ...  176.0  192.0      31.0  69.870968  13.076039   
WESAD      177.600000  ...  182.5  189.0      15.0  73.133333  10.273869   

  

In [6]:
# Load constraints and create exclusion list
constraints_df = pd.read_csv(OUTPUT_PATH / 'physionet_constraints.csv')

# Exclusion criteria
excluded_subjects = []

# PhysioNet exclusions based on constraints
for _, row in constraints_df.iterrows():
    if 'No data' in row['issue'] or 'sensors covered' in row['issue']:
        excluded_subjects.append({
            'subject_id': row['subject_id'],
            'dataset': 'PhysioNet',
            'protocol': row['protocol'],
            'reason': row['issue'],
            'action': 'EXCLUDE'
        })
    elif 'split files' in row['issue'] or 'Duplicated' in row['issue']:
        excluded_subjects.append({
            'subject_id': row['subject_id'],
            'dataset': 'PhysioNet',
            'protocol': row['protocol'],
            'reason': row['issue'],
            'action': 'HANDLE_SPECIAL'
        })
    else:
        excluded_subjects.append({
            'subject_id': row['subject_id'],
            'dataset': 'PhysioNet',
            'protocol': row['protocol'],
            'reason': row['issue'],
            'action': 'INCLUDE_WITH_NOTE'
        })

excluded_df = pd.DataFrame(excluded_subjects)
print("Exclusion/Special Handling List:")
excluded_df

Exclusion/Special Handling List:


Unnamed: 0,subject_id,dataset,protocol,reason,action
0,S02,PhysioNet,STRESS,Duplicated signals,HANDLE_SPECIAL
1,f07,PhysioNet,STRESS,PPG/TEMP sensors covered - only EDA/ACC valid,EXCLUDE
2,f14,PhysioNet,STRESS,"Connection loss - split files (f14_a, f14_b)",HANDLE_SPECIAL
3,S03,PhysioNet,AEROBIC,Protocol ended early (90 rpm),INCLUDE_WITH_NOTE
4,S07,PhysioNet,AEROBIC,Protocol ended early (95 rpm),INCLUDE_WITH_NOTE
5,S11,PhysioNet,AEROBIC,"Connection loss - split files (S11_a, S11_b)",HANDLE_SPECIAL
6,S12,PhysioNet,AEROBIC,No data - protocol not performed,EXCLUDE
7,S01,PhysioNet,ANAEROBIC,Empty IBI file,INCLUDE_WITH_NOTE
8,S06,PhysioNet,ANAEROBIC,Protocol ended early (missing last sprint),INCLUDE_WITH_NOTE
9,S16,PhysioNet,ANAEROBIC,"Connection loss - split files (S16_a, S16_b)",HANDLE_SPECIAL


## 2.2 Signal Quality Check

In [7]:
def load_e4_signal(filepath):
    """Load E4 CSV signal file, return data and sampling rate."""
    try:
        with open(filepath, 'r') as f:
            lines = f.readlines()
        
        # First line: timestamp, Second line: sampling rate
        timestamp = lines[0].strip()
        sr_line = lines[1].strip()
        
        # Handle multi-column files (ACC)
        if ',' in sr_line:
            sr = float(sr_line.split(',')[0])
        else:
            sr = float(sr_line)
        
        # Load data
        data = pd.read_csv(filepath, skiprows=2, header=None)
        return data, sr, timestamp
    except Exception as e:
        return None, None, None

def check_signal_quality(data, signal_name):
    """Check signal quality metrics."""
    if data is None:
        return {'valid': False, 'reason': 'Load failed'}
    
    quality = {
        'valid': True,
        'n_samples': len(data),
        'n_nan': data.isna().sum().sum(),
        'n_inf': np.isinf(data.select_dtypes(include=[np.number])).sum().sum(),
        'pct_nan': (data.isna().sum().sum() / data.size) * 100,
    }
    
    # Check for flat signal (stuck sensor)
    if data.shape[1] == 1:
        std = data.iloc[:, 0].std()
        quality['std'] = std
        quality['flat_signal'] = std < 1e-6
    else:
        quality['std'] = data.std().mean()
        quality['flat_signal'] = data.std().mean() < 1e-6
    
    return quality

In [8]:
# Signal Quality Check - WESAD
signals_to_check = ['BVP', 'EDA', 'TEMP', 'ACC']
wesad_quality = []

for subj in wesad_subjects:
    e4_path = WESAD_PATH / subj / f'{subj}_E4_Data'
    
    for signal in signals_to_check:
        filepath = e4_path / f'{signal}.csv'
        data, sr, ts = load_e4_signal(filepath)
        quality = check_signal_quality(data, signal)
        
        wesad_quality.append({
            'subject_id': subj,
            'dataset': 'WESAD',
            'signal': signal,
            'sampling_rate': sr,
            **quality
        })

wesad_quality_df = pd.DataFrame(wesad_quality)
print("WESAD Signal Quality Summary:")
print(wesad_quality_df.groupby('signal')[['n_samples', 'n_nan', 'pct_nan', 'flat_signal']].describe())

WESAD Signal Quality Summary:
       n_samples                                                             \
           count           mean           std       min       25%       50%   
signal                                                                        
ACC         15.0  228152.000000  20246.040967  199314.0  212499.0  223560.0   
BVP         15.0  456304.200000  40493.744507  398629.0  425001.5  447128.0   
EDA         15.0   28517.600000   2531.750631   24912.0   26559.0   27942.0   
TEMP        15.0   28515.733333   2529.794782   24912.0   26560.0   27944.0   

                           n_nan       ...           pct_nan                 \
             75%       max count mean  ...  75%  max   count mean  std  min   
signal                                 ...                                    
ACC     244386.0  266352.0  15.0  0.0  ...  0.0  0.0    15.0  0.0  0.0  0.0   
BVP     488774.0  532708.0  15.0  0.0  ...  0.0  0.0    15.0  0.0  0.0  0.0   
EDA      30549.0   33

In [9]:
# Signal Quality Check - EPM-E4
epm_quality = []

for subj in epm_subjects:
    empatica_path = EPM_PATH / subj / 'empatica'
    
    for signal in signals_to_check:
        filepath = empatica_path / f'{signal}.csv'
        data, sr, ts = load_e4_signal(filepath)
        quality = check_signal_quality(data, signal)
        
        epm_quality.append({
            'subject_id': subj,
            'dataset': 'EPM-E4',
            'signal': signal,
            'sampling_rate': sr,
            **quality
        })

epm_quality_df = pd.DataFrame(epm_quality)
print("EPM-E4 Signal Quality Summary:")
print(epm_quality_df.groupby('signal')[['n_samples', 'n_nan', 'pct_nan', 'flat_signal']].describe())

EPM-E4 Signal Quality Summary:
       n_samples                                                           \
           count           mean           std      min      25%       50%   
signal                                                                      
ACC         47.0   52742.042553   6380.166135  44640.0  48396.0   51030.0   
BVP         47.0  105483.914894  12758.749000  89287.0  96789.0  102058.0   
EDA         47.0    6591.702128    797.831842   5580.0   6048.0    6378.0   
TEMP        47.0    6590.297872    797.748024   5576.0   6044.0    6376.0   

                           n_nan       ...           pct_nan                 \
             75%       max count mean  ...  75%  max   count mean  std  min   
signal                                 ...                                    
ACC      54222.0   70596.0  47.0  0.0  ...  0.0  0.0    47.0  0.0  0.0  0.0   
BVP     108443.5  141185.0  47.0  0.0  ...  0.0  0.0    47.0  0.0  0.0  0.0   
EDA       6777.0    8826.0  47.0  

In [10]:
# Signal Quality Check - PhysioNet (sample from each protocol)
protocols = ['STRESS', 'AEROBIC', 'ANAEROBIC']
physionet_quality = []

for protocol in protocols:
    protocol_path = PHYSIONET_PATH / protocol
    subjects = [d for d in os.listdir(protocol_path) if os.path.isdir(protocol_path / d)]
    
    for subj in subjects:
        subj_path = protocol_path / subj
        
        for signal in signals_to_check:
            filepath = subj_path / f'{signal}.csv'
            data, sr, ts = load_e4_signal(filepath)
            quality = check_signal_quality(data, signal)
            
            physionet_quality.append({
                'subject_id': subj,
                'dataset': 'PhysioNet',
                'protocol': protocol,
                'signal': signal,
                'sampling_rate': sr,
                **quality
            })

physionet_quality_df = pd.DataFrame(physionet_quality)
print("PhysioNet Signal Quality Summary:")
print(physionet_quality_df.groupby(['protocol', 'signal'])[['n_samples', 'n_nan', 'pct_nan']].mean())

PhysioNet Signal Quality Summary:
                      n_samples  n_nan  pct_nan
protocol  signal                               
AEROBIC   ACC      67970.129032    0.0      0.0
          BVP     135939.064516    0.0      0.0
          EDA       8493.290323    0.0      0.0
          TEMP      8495.483871    0.0      0.0
ANAEROBIC ACC      50049.000000    0.0      0.0
          BVP     100114.093750    0.0      0.0
          EDA       6254.437500    0.0      0.0
          TEMP      6255.500000    0.0      0.0
STRESS    ACC      80455.783784    0.0      0.0
          BVP     160927.918919    0.0      0.0
          EDA      10056.648649    0.0      0.0
          TEMP     10054.054054    0.0      0.0


In [11]:
# Combine all quality reports
all_quality = pd.concat([wesad_quality_df, epm_quality_df, physionet_quality_df], ignore_index=True)

# Identify problematic signals
problematic = all_quality[(all_quality['flat_signal'] == True) | 
                          (all_quality['pct_nan'] > 5) | 
                          (all_quality['valid'] == False)]

print(f"Total signal checks: {len(all_quality)}")
print(f"Problematic signals: {len(problematic)}")
if len(problematic) > 0:
    print("\nProblematic Signals:")
    print(problematic[['dataset', 'subject_id', 'signal', 'flat_signal', 'pct_nan', 'valid']])

Total signal checks: 648
Problematic signals: 2

Problematic Signals:
   dataset subject_id signal  flat_signal  pct_nan  valid
66  EPM-E4          2   TEMP         True      0.0   True
82  EPM-E4         15   TEMP         True      0.0   True


In [12]:
# Create inclusion criteria
inclusion_criteria = """
INCLUSION CRITERIA FOR SUBJECTS:
================================
1. Must have complete E4 data (BVP, EDA, TEMP, ACC)
2. No flat signals (std > 1e-6)
3. Less than 5% missing values per signal
4. Sampling rates match expected values:
   - BVP: 64 Hz
   - EDA: 4 Hz
   - TEMP: 4 Hz
   - ACC: 32 Hz

EXCLUSION CRITERIA:
==================
1. PhysioNet S12 (AEROBIC): No data collected
2. PhysioNet f07 (STRESS): PPG/TEMP sensors covered
3. Subjects with >50% corrupted/missing data

SPECIAL HANDLING:
================
1. Split files (f14, S11, S16): Concatenate parts
2. Duplicated signals (S02): Truncate at duplication point
"""

print(inclusion_criteria)


INCLUSION CRITERIA FOR SUBJECTS:
1. Must have complete E4 data (BVP, EDA, TEMP, ACC)
2. No flat signals (std > 1e-6)
3. Less than 5% missing values per signal
4. Sampling rates match expected values:
   - BVP: 64 Hz
   - EDA: 4 Hz
   - TEMP: 4 Hz
   - ACC: 32 Hz

EXCLUSION CRITERIA:
1. PhysioNet S12 (AEROBIC): No data collected
2. PhysioNet f07 (STRESS): PPG/TEMP sensors covered
3. Subjects with >50% corrupted/missing data

SPECIAL HANDLING:
1. Split files (f14, S11, S16): Concatenate parts
2. Duplicated signals (S02): Truncate at duplication point



In [13]:
# Save outputs
all_profiles.to_csv(OUTPUT_PATH / 'subject_profiles.csv', index=False)
excluded_df.to_csv(OUTPUT_PATH / 'excluded_subjects.csv', index=False)
all_quality.to_csv(OUTPUT_PATH / 'signal_quality_report.csv', index=False)

with open(OUTPUT_PATH / 'inclusion_criteria.txt', 'w') as f:
    f.write(inclusion_criteria)

print("Outputs saved:")
print("  - subject_profiles.csv")
print("  - excluded_subjects.csv")
print("  - signal_quality_report.csv")
print("  - inclusion_criteria.txt")

Outputs saved:
  - subject_profiles.csv
  - excluded_subjects.csv
  - signal_quality_report.csv
  - inclusion_criteria.txt


In [14]:
# Final Summary
print("="*60)
print("STEP 2.1-2.2 COMPLETE")
print("="*60)
print(f"\nSubject Profiles:")
print(f"  WESAD: {len(wesad_profiles_df)} subjects (full demographics)")
print(f"  EPM-E4: {len(epm_profiles_df)} subjects (no demographics)")
print(f"  PhysioNet: {len(physionet_profiles_df)} subjects (partial demographics)")
print(f"\nSignal Quality:")
print(f"  Total checks: {len(all_quality)}")
print(f"  Problematic: {len(problematic)}")
print(f"\nExclusions: {len(excluded_df[excluded_df['action'] == 'EXCLUDE'])} subjects")
print("="*60)

STEP 2.1-2.2 COMPLETE

Subject Profiles:
  WESAD: 15 subjects (full demographics)
  EPM-E4: 47 subjects (no demographics)
  PhysioNet: 36 subjects (partial demographics)

Signal Quality:
  Total checks: 648
  Problematic: 2

Exclusions: 2 subjects
