In [1]:
import pandas as pd
import numpy as np
import pickle
import os

# 1. Locate the pickle for Subject 2 (S2)
subject_id = 'S2'
# Change 'base_path' when the dataset lives in a differently named folder
# (for example just 'data' or 'WESAD').
base_path = 'data'
path_parts = (base_path, 'WESAD', subject_id, f'{subject_id}.pkl')
file_path = os.path.join(*path_parts)

print(f"--> Attempting to read file: {file_path}")

# 2. Unpickle the recording
# The file is opened in binary mode. encoding='latin1' is kept because,
# per the original notebook, the dataset was pickled under Python 2.
# NOTE: unpickling can run code stored in the file -- only load trusted files.
with open(file_path, mode='rb') as pkl_handle:
    data = pickle.load(pkl_handle, encoding='latin1')

print("Success! File loaded into memory.")

# 3. First look at the top-level structure
print("\nWhat is inside this 'box'? (Dictionary Keys):")
print(data.keys())

--> Attempting to read file: data\WESAD\S2\S2.pkl
Success! File loaded into memory.

What is inside this 'box'? (Dictionary Keys):
dict_keys(['signal', 'label', 'subject'])


In [2]:
# --- STEP 2: INSPECT THE SIGNALS ---

# Visual separator reused between the three printouts below
divider = "\n" + "-"*30 + "\n"

print("1. What lies inside 'data[signal]'?")
signal_groups = data['signal']
print(signal_groups.keys())

print(divider)

print("2. Now, let's look specifically inside 'wrist':")
# 'wrist' is itself a dictionary nested under 'signal'
wrist_data = signal_groups['wrist']
print(wrist_data.keys())

print(divider)

# The shape of a single channel (BVP = Blood Volume Pulse) shows how many
# samples were recorded for it.
bvp_shape = wrist_data['BVP'].shape
print(f"3. Shape of BVP data (Heart Rate sensor): {bvp_shape}")

1. What lies inside 'data[signal]'?
dict_keys(['chest', 'wrist'])

------------------------------

2. Now, let's look specifically inside 'wrist':
dict_keys(['ACC', 'BVP', 'EDA', 'TEMP'])

------------------------------

3. Shape of BVP data (Heart Rate sensor): (389056, 1)


In [3]:
# --- STEP 3: CHECK DATA SIZES ---

# Hypothesis: the wrist channels are sampled at different rates, so their
# arrays should differ in length. Print every shape to check.
for caption, channel in (
    ("BVP shape (Heart): ", 'BVP'),
    ("ACC shape (Move):  ", 'ACC'),
    ("EDA shape (Sweat): ", 'EDA'),
    ("TEMP shape (Heat): ", 'TEMP'),
):
    print(f"{caption}{wrist_data[channel].shape}")

# The label vector (the ground-truth answers), for comparison
print(f"Label shape:       {data['label'].shape}")

BVP shape (Heart): (389056, 1)
ACC shape (Move):  (194528, 3)
EDA shape (Sweat): (24316, 1)
TEMP shape (Heat): (24316, 1)
Label shape:       (4255300,)


In [4]:
# --- VISUALIZE RAW DATA AS DATAFRAMES ---

# The channels have different lengths, so each one gets its own DataFrame.

# 1. BVP (heart signal) - the high-frequency channel (64Hz)
df_bvp_raw = pd.DataFrame(wrist_data['BVP'], columns=['BVP_Signal'])
# 2. EDA (sweat signal) - a low-frequency channel (4Hz)
df_eda_raw = pd.DataFrame(wrist_data['EDA'], columns=['EDA_Signal'])
# 3. ACC (accelerometer) - one column per axis
df_acc_raw = pd.DataFrame(wrist_data['ACC'], columns=['x', 'y', 'z'])

# Show the first rows of each frame under its own heading
for heading, raw_frame in (
    ("--- BVP HEAD (Raw Heart Signal) ---", df_bvp_raw),
    ("\n--- EDA HEAD (Raw Sweat Signal) ---", df_eda_raw),
    ("\n--- ACC HEAD (Raw Movement) ---", df_acc_raw),
):
    print(heading)
    display(raw_frame.head())

--- BVP HEAD (Raw Heart Signal) ---


Unnamed: 0,BVP_Signal
0,-59.37
1,-53.42
2,-44.4
3,-33.17
4,-20.79



--- EDA HEAD (Raw Sweat Signal) ---


Unnamed: 0,EDA_Signal
0,1.138257
1,1.125444
2,1.011405
3,1.033188
4,0.935807



--- ACC HEAD (Raw Movement) ---


Unnamed: 0,x,y,z
0,62.0,-21.0,107.0
1,66.0,13.0,53.0
2,41.0,9.0,15.0
3,52.0,16.0,24.0
4,54.0,15.0,34.0


In [5]:
# --- STEP 4: CREATE A COMMON TIME INDEX (SYNCHRONIZATION) ---

# Samples per second of the BVP channel
fs_bvp = 64

# 1-2. Rebuild the BVP frame and tag every sample with the second it falls in.
# Integer division ('//') of the row number by the rate does the bucketing:
#   rows 0..63 -> 0, rows 64..127 -> 1, and so on.
df_bvp = pd.DataFrame(wrist_data['BVP'], columns=['BVP_Signal'])
df_bvp = df_bvp.assign(Time_Group=df_bvp.index // fs_bvp)

print("--- BEFORE AGGREGATION (Raw Data) ---")
display(df_bvp.head(10))  # the first rows all carry Time_Group 0

# 3. Collapse each second (64 rows) into a single row holding the mean and
# the standard deviation; named aggregation sets the column names directly.
df_bvp_sec = (
    df_bvp
    .groupby('Time_Group')['BVP_Signal']
    .agg(BVP_Mean='mean', BVP_Std='std')
)

print("\n--- AFTER AGGREGATION (One row per second) ---")
display(df_bvp_sec.head())

print(f"\nOriginal Shape: {df_bvp.shape}")
print(f"New Shape:      {df_bvp_sec.shape}")

--- BEFORE AGGREGATION (Raw Data) ---


Unnamed: 0,BVP_Signal,Time_Group
0,-59.37,0
1,-53.42,0
2,-44.4,0
3,-33.17,0
4,-20.79,0
5,-8.33,0
6,3.4,0
7,13.85,0
8,22.76,0
9,30.04,0



--- AFTER AGGREGATION (One row per second) ---


Unnamed: 0_level_0,BVP_Mean,BVP_Std
Time_Group,Unnamed: 1_level_1,Unnamed: 2_level_1
0,32.057187,32.897052
1,-19.376719,203.102094
2,-20.59625,178.915959
3,23.686094,49.465675
4,-11.179531,36.916528



Original Shape: (389056, 2)
New Shape:      (6079, 2)


In [6]:
import scipy.stats

# --- STEP 5: PROCESS AND MERGE ALL SENSORS (SUBJECT 2) ---

print("--> Processing all sensors for S2...")

# 1. SAMPLING RATES (samples per second) used to bucket rows into seconds
fs_dict = dict(ACC=32, BVP=64, EDA=4, TEMP=4, label=700)

# 2. HELPER FUNCTION (To avoid repeating code)
def process_sensor(sensor_name, signal_data):
    """Summarise one wrist signal as per-second mean and standard deviation.

    Parameters
    ----------
    sensor_name : str
        Key into the notebook-level ``fs_dict`` (samples per second).
        'ACC' is special-cased: its three axes are reduced to a magnitude
        column named 'Mag' before aggregating.
    signal_data : array-like
        Raw samples; three columns for 'ACC', a single column otherwise.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Time_Group' (row number // sampling rate), with the
        columns '<feature>_mean' and '<feature>_std'.
    """
    samples_per_second = fs_dict[sensor_name]

    if sensor_name != 'ACC':
        feature_cols = [sensor_name]
        frame = pd.DataFrame(signal_data, columns=[sensor_name])
    else:
        frame = pd.DataFrame(signal_data, columns=['x', 'y', 'z'])
        # Euclidean norm of the three axes = overall movement
        frame['Mag'] = (frame['x']**2 + frame['y']**2 + frame['z']**2)**0.5
        feature_cols = ['Mag']  # only the magnitude is aggregated

    # Bucket consecutive samples into whole seconds
    frame['Time_Group'] = frame.index // samples_per_second

    per_second = frame.groupby('Time_Group')[feature_cols].agg(['mean', 'std'])

    # Collapse the (feature, statistic) column pairs into names like "BVP_mean"
    per_second.columns = [
        '_'.join(level_pair).strip() for level_pair in per_second.columns.values
    ]

    return per_second

# 3. APPLY TO ALL
# One per-second summary frame per wrist channel
df_bvp_final, df_eda_final, df_temp_final, df_acc_final = (
    process_sensor(channel, wrist_data[channel])
    for channel in ('BVP', 'EDA', 'TEMP', 'ACC')
)

print("Sensors processed! Now merging...")

# 4. MERGE (JOIN) EVERYTHING
# Every frame is indexed by Time_Group (0, 1, 2...), so a column-wise
# concat lines the channels up second by second.
per_channel_frames = [df_acc_final, df_bvp_final, df_eda_final, df_temp_final]
df_main = pd.concat(per_channel_frames, axis=1)

# 5. PROCESS LABELS (Special Case: Mode)
# Each second must end up with its most common label, so the label samples
# are bucketed into seconds the same way as the sensor samples.
labels = data['label']
df_label = pd.DataFrame(labels, columns=['label'])
df_label = df_label.assign(Time_Group=df_label.index // fs_dict['label'])

# Function to find the Mode (Most frequent value)
def get_mode(x):
    """Return the most frequent value of ``x`` as a plain scalar.

    The previous ``scipy.stats.mode(x)[0]`` returns a length-1 array on older
    SciPy releases and a scalar on newer ones, so the contents of the label
    column depended on the installed version. Counting with ``np.unique``
    always yields a scalar.

    Parameters
    ----------
    x : array-like
        One-dimensional collection of values, e.g. the label samples of one
        'Time_Group' handed over by ``groupby(...).apply``.

    Returns
    -------
    scalar
        The most common value; when several values tie, the smallest one
        (``np.unique`` sorts ascending and ``argmax`` keeps the first maximum).
        ``np.nan`` if ``x`` is empty.
    """
    values, counts = np.unique(np.asarray(x), return_counts=True)
    if values.size == 0:
        return np.nan
    return values[counts.argmax()]

# One label per second: the most frequent label among that second's samples
df_label_sec = df_label.groupby('Time_Group')['label'].apply(get_mode)
# Align on the Time_Group index and attach as a new column
df_main = df_main.assign(label=df_label_sec)

# Discard seconds that are missing a value in any column
df_main = df_main.dropna()

print("-" * 30)
print("FINAL RESULT (One row per second):")
print(df_main.shape)
display(df_main.head())

--> Processing all sensors for S2...
Sensors processed! Now merging...
------------------------------
FINAL RESULT (One row per second):
(6079, 9)


Unnamed: 0_level_0,Mag_mean,Mag_std,BVP_mean,BVP_std,EDA_mean,EDA_std,TEMP_mean,TEMP_std,label
Time_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,69.272521,18.823603,32.057187,32.897052,1.077074,0.064087,35.41,0.0,0
1,68.211627,15.8517,-19.376719,203.102094,0.942139,0.013666,35.41,0.0,0
2,63.050699,2.859614,-20.59625,178.915959,1.116375,0.113237,35.41,0.0,0
3,63.32406,0.861913,23.686094,49.465675,1.288394,0.017518,35.41,0.0,0
4,63.433868,0.854197,-11.179531,36.916528,1.261486,0.025377,35.43,0.0,0


In [9]:
# --- STEP 6: FILTER AND BINARIZE LABELS ---

rule = "-" * 30

print("Before Cleaning (Label Counts):")
# Lists every label code that occurs in the per-second table
print(df_main['label'].value_counts())

# 2. MAP: binary target used from here on
#    1 (Baseline) -> 0 (our "Normal")
#    2 (Stress)   -> 1 (our "Target")
mapping = {1: 0, 2: 1}

# 1. FILTER: keep only Baseline (1) and Stress (2) rows.
# .copy() gives an independent frame, so the relabelling below does not
# touch df_main.
keep_rows = df_main['label'].isin([1, 2])
df_s2_clean = df_main.loc[keep_rows].copy()
df_s2_clean['label'] = df_s2_clean['label'].map(mapping)

print(rule)
print("After Cleaning (Only 0 and 1):")
print(df_s2_clean['label'].value_counts())

print(rule)
print(f"Original Rows: {len(df_main)}")
print(f"Final Rows:    {len(df_s2_clean)}")
print("We lost some rows (Amusement/Meditation), but kept the essential data.")

Before Cleaning (Label Counts):
label
0    3061
1    1144
4     768
2     615
3     362
6      65
7      64
Name: count, dtype: int64
------------------------------
After Cleaning (Only 0 and 1):
label
0    1144
1     615
Name: count, dtype: int64
------------------------------
Original Rows: 6079
Final Rows:    1759
We lost some rows (Amusement/Meditation), but kept the essential data.


In [10]:
import pandas as pd
import numpy as np
import pickle
import os
import scipy.stats

# --- STEP 7: FINAL EXECUTION (ALL SUBJECTS) ---
# Goal: Process all subjects with enhanced features and save to CSV.

# Subject folders S2..S17; there is no S12 in the list.
subjects = [f'S{number}' for number in range(2, 18) if number != 12]
# Samples per second for each wrist channel and for the label vector
fs_dict = dict(ACC=32, BVP=64, EDA=4, TEMP=4, label=700)

# 1. DEFINE HELPER FUNCTIONS (With EXTRA Features!)
def amp(x):
    """Peak-to-peak amplitude: largest value of ``x`` minus its smallest value."""
    lowest = x.min()
    highest = x.max()
    return highest - lowest

def process_sensor_features(sensor_name, signal_data):
    """Compute five per-second statistics for one wrist signal.

    Parameters
    ----------
    sensor_name : str
        Key into the notebook-level ``fs_dict`` (samples per second).
        For 'ACC' the three axes are first combined into a magnitude
        column 'Mag', and only that column is aggregated.
    signal_data : array-like
        Raw samples; three columns for 'ACC', a single column otherwise.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Time_Group' (row number // sampling rate) with columns
        '<feature>_mean', '_std', '_min', '_max' and '_amp'.
    """
    rate = fs_dict[sensor_name]
    is_accelerometer = sensor_name == 'ACC'

    # Build the sample table; the accelerometer needs its derived magnitude
    column_names = ['x', 'y', 'z'] if is_accelerometer else [sensor_name]
    samples = pd.DataFrame(signal_data, columns=column_names)
    if is_accelerometer:
        samples['Mag'] = (samples['x']**2 + samples['y']**2 + samples['z']**2)**0.5
    selected = ['Mag'] if is_accelerometer else [sensor_name]

    # Synchronization: every row is assigned to the second it falls in
    samples['Time_Group'] = samples.index // rate

    # --- ENHANCED FEATURE ENGINEERING ---
    # Five statistics per second; 'amp' is the custom max-minus-min helper
    statistics = ['mean', 'std', 'min', 'max', amp]
    summary = samples.groupby('Time_Group')[selected].agg(statistics)

    # Flatten the two-level column labels (e.g. "EDA_amp", "BVP_max")
    summary.columns = [
        '_'.join(label_pair).strip() for label_pair in summary.columns.values
    ]

    return summary

def get_mode(x):
    """Return the most frequent value of ``x`` as a plain scalar.

    The previous ``scipy.stats.mode(x)[0]`` returns a length-1 array on older
    SciPy releases and a scalar on newer ones, so the contents of the label
    column depended on the installed version. Counting with ``np.unique``
    always yields a scalar.

    Parameters
    ----------
    x : array-like
        One-dimensional collection of values, e.g. the label samples of one
        'Time_Group' handed over by ``groupby(...).apply``.

    Returns
    -------
    scalar
        The most common value; when several values tie, the smallest one
        (``np.unique`` sorts ascending and ``argmax`` keeps the first maximum).
        ``np.nan`` if ``x`` is empty.
    """
    values, counts = np.unique(np.asarray(x), return_counts=True)
    if values.size == 0:
        return np.nan
    return values[counts.argmax()]

# 2. MAIN PROCESSING LOOP
# For every subject: load the pickle, summarise each wrist channel per
# second, attach the per-second label, keep only label codes 1 and 2 and
# remap them to 0/1, then collect the result.
all_data = []
failed_subjects = []  # subjects skipped because of an error; reported below

print(f"--> Starting Batch Processing for {len(subjects)} subjects...")

for sub in subjects:
    try:
        # Load File
        # NOTE: unpickling can run code stored in the file -- only load
        # trusted data.
        path = os.path.join('data', 'WESAD', sub, f'{sub}.pkl')
        with open(path, 'rb') as file:
            data = pickle.load(file, encoding='latin1')

        wrist = data['signal']['wrist']

        # Process Sensors (Extract 5 features each)
        df_bvp = process_sensor_features('BVP', wrist['BVP'])
        df_eda = process_sensor_features('EDA', wrist['EDA'])
        df_temp = process_sensor_features('TEMP', wrist['TEMP'])
        df_acc = process_sensor_features('ACC', wrist['ACC'])

        # Process Labels: bucket the label samples into seconds using the
        # rate from fs_dict (instead of a second hard-coded 700) and keep
        # the most frequent label of each second.
        df_lbl = pd.DataFrame(data['label'], columns=['label'])
        df_lbl['Time_Group'] = df_lbl.index // fs_dict['label']
        df_lbl_sec = df_lbl.groupby('Time_Group')['label'].apply(get_mode)

        # Merge: all frames share the Time_Group index
        df_main = pd.concat([df_acc, df_bvp, df_eda, df_temp], axis=1)
        df_main['label'] = df_lbl_sec
        df_main['subject'] = sub  # Track who is who!

        # Clean & Binarize: drop incomplete seconds, keep label codes 1 and 2,
        # then remap 1 -> 0 and 2 -> 1.
        df_main = df_main.dropna()
        df_main = df_main[df_main['label'].isin([1, 2])].copy()
        df_main['label'] = df_main['label'].map({1: 0, 2: 1})

        # Store
        all_data.append(df_main)
        print(f"    {sub}: Success! Shape={df_main.shape}")

    except Exception as e:
        # Best-effort batch: one broken subject must not abort the others,
        # but remember it so the omission is visible in the summary.
        failed_subjects.append(sub)
        print(f"    {sub}: Failed ({e})")

# 3. SAVE FINAL CSV
print("-" * 30)
if failed_subjects:
    print(f"WARNING: {len(failed_subjects)} subject(s) were skipped: {failed_subjects}")

if all_data:
    df_final_wrist = pd.concat(all_data)

    # Save with a specific name so we don't mix up with chest data.
    # index=False drops the Time_Group index, so the CSV keeps the rows in
    # time order per subject but carries no explicit time column.
    output_file = 'features_wrist.csv'
    df_final_wrist.to_csv(output_file, index=False)

    print("DONE! Wrist processing complete.")
    print(f"Total Database Size: {df_final_wrist.shape}")
    print(f"Saved to: {output_file}")

    # Show the first rows to confirm the new columns exist
    display(df_final_wrist.head())
else:
    # Without this message an all-failed run looked like a silent success,
    # and later cells would read a stale CSV from a previous run.
    print("No subject could be processed; nothing was saved.")

--> Starting Batch Processing for 15 subjects...
    S2: Success! Shape=(1759, 22)
    S3: Success! Shape=(1780, 22)
    S4: Success! Shape=(1793, 22)
    S5: Success! Shape=(1843, 22)
    S6: Success! Shape=(1830, 22)
    S7: Success! Shape=(1826, 22)
    S8: Success! Shape=(1839, 22)
    S9: Success! Shape=(1825, 22)
    S10: Success! Shape=(1905, 22)
    S11: Success! Shape=(1860, 22)
    S13: Success! Shape=(1844, 22)
    S14: Success! Shape=(1855, 22)
    S15: Success! Shape=(1861, 22)
    S16: Success! Shape=(1853, 22)
    S17: Success! Shape=(1904, 22)
------------------------------
DONE! Wrist processing complete.
Total Database Size: (27577, 22)
Saved to: features_wrist.csv


Unnamed: 0_level_0,Mag_mean,Mag_std,Mag_min,Mag_max,Mag_amp,BVP_mean,BVP_std,BVP_min,BVP_max,BVP_amp,...,EDA_min,EDA_max,EDA_amp,TEMP_mean,TEMP_std,TEMP_min,TEMP_max,TEMP_amp,label,subject
Time_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
307,65.80049,7.477827,53.31979,90.476516,37.156726,-18.120156,108.425814,-309.08,95.35,404.43,...,1.558534,1.634132,0.075598,35.81,0.0,35.81,35.81,0.0,0,S2
308,62.567415,1.13868,59.084685,64.699304,5.614619,51.961562,90.008943,-152.79,170.7,323.49,...,1.53547,1.616194,0.080724,35.81,0.0,35.81,35.81,0.0,0,S2
309,62.74122,0.370601,62.016127,63.34035,1.324224,-48.005156,71.973877,-218.6,82.76,301.36,...,1.550846,1.57391,0.023064,35.83,0.0,35.83,35.83,0.0,0,S2
310,62.619851,0.262577,62.016127,63.34035,1.324224,14.090938,51.328673,-68.71,97.34,166.05,...,1.527782,1.547002,0.01922,35.83,0.0,35.83,35.83,0.0,0,S2
311,62.744316,0.301973,62.016127,63.356136,1.340009,6.260469,38.991434,-65.63,79.78,145.41,...,1.522656,1.532907,0.010251,35.83,0.0,35.83,35.83,0.0,0,S2


In [11]:
# Summarise the combined wrist feature table: index range, column names,
# non-null counts and dtypes.
df_final_wrist.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27577 entries, 307 to 4234
Data columns (total 22 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Mag_mean   27577 non-null  float64
 1   Mag_std    27577 non-null  float64
 2   Mag_min    27577 non-null  float64
 3   Mag_max    27577 non-null  float64
 4   Mag_amp    27577 non-null  float64
 5   BVP_mean   27577 non-null  float64
 6   BVP_std    27577 non-null  float64
 7   BVP_min    27577 non-null  float64
 8   BVP_max    27577 non-null  float64
 9   BVP_amp    27577 non-null  float64
 10  EDA_mean   27577 non-null  float64
 11  EDA_std    27577 non-null  float64
 12  EDA_min    27577 non-null  float64
 13  EDA_max    27577 non-null  float64
 14  EDA_amp    27577 non-null  float64
 15  TEMP_mean  27577 non-null  float64
 16  TEMP_std   27577 non-null  float64
 17  TEMP_min   27577 non-null  float64
 18  TEMP_max   27577 non-null  float64
 19  TEMP_amp   27577 non-null  float64
 20  label     

In [12]:

# --- STEP 8: NEW FEATURE ENGINEERING (TRENDS & LAGS) ---
# For selected signals, add the difference between the current row and the
# row 30 / 60 positions earlier within the same subject.
# Rationale: an EDA level that has risen markedly over the recent past is a
# strong indicator of stress.

print("--> Loading existing features...")
df = pd.read_csv('features_wrist.csv')

# Group the rows by subject with a STABLE sort. The default quicksort does
# not preserve the relative order of equal keys, which could shuffle the rows
# inside each subject; the lagged differences below depend on that order.
df = df.sort_values(by=['subject'], kind='mergesort')

# We will calculate the difference (trend) for the most important sensors
sensors_to_trend = ['EDA_mean', 'BVP_mean', 'TEMP_mean']
trend_lags = (30, 60)  # lag sizes, in rows (one row = one second of data)

print("--> Calculating trends for 30s and 60s windows...")

# NOTE(review): the CSV holds only the baseline/stress rows and no time
# column, so diff(periods=N) means "N rows earlier", not necessarily
# "N seconds earlier" -- the first rows after a removed segment are compared
# with rows from before it. Confirm this is acceptable for the model.
for col in sensors_to_trend:
    for lag in trend_lags:
        # Grouping by subject prevents differences across different people
        df[f'{col}_trend_{lag}s'] = df.groupby('subject')[col].diff(periods=lag)

# 3. Clean up
# The first 60 rows of each subject have no history for the 60-row lag and
# therefore contain NaN; drop them to keep the dataset clean for modelling.
df_enriched = df.dropna()

# Save the new version
output_file = 'features_wrist_enriched.csv'
df_enriched.to_csv(output_file, index=False)

print("-" * 40)
print(f"✅ Enriched dataset saved: {output_file}")
print(f"New columns added: {[c for c in df_enriched.columns if 'trend' in c]}")
print(f"Total Features: {len(df_enriched.columns) - 2}") # -2 for label and subject

--> Loading existing features...
--> Calculating trends for 30s and 60s windows...
----------------------------------------
✅ Enriched dataset saved: features_wrist_enriched.csv
New columns added: ['EDA_mean_trend_30s', 'EDA_mean_trend_60s', 'BVP_mean_trend_30s', 'BVP_mean_trend_60s', 'TEMP_mean_trend_30s', 'TEMP_mean_trend_60s']
Total Features: 26


In [13]:


# --- STEP 9: MORE FEATURE ENGINEERING (ACC VARIANCE & CROSS-FEATURES) ---

print("--> Loading enriched features...")
df = pd.read_csv('features_wrist_enriched.csv')

# Group the rows by subject with a STABLE sort. The default quicksort does
# not preserve the relative order of equal keys, which could shuffle the rows
# inside each subject; the rolling window below depends on that order.
df = df.sort_values(by=['subject'], kind='mergesort')

print("--> Adding Movement Instability and Cross-Features...")

# 1. ACC Magnitude Variance (rolling window of 30 rows, per subject)
# Measures how "unstable" the movement has been over the last 30 rows
# (one row = one second of data).
# NOTE(review): the window counts rows, not elapsed time; where a segment was
# filtered out upstream, a window can span rows that are not adjacent in
# time. Confirm this is acceptable.
df['Mag_var_30s'] = df.groupby('subject')['Mag_mean'].transform(lambda x: x.rolling(window=30).var())

# 2. Cross-Feature: EDA * ACC Magnitude
# Helps distinguish between emotional stress (High EDA, Low ACC)
# and physical activity (High EDA, High ACC)
df['EDA_ACC_interaction'] = df['EDA_mean'] * df['Mag_mean']

# 3. Clean up
# The first 29 rows of each subject have an incomplete window and therefore
# a NaN variance; drop them.
df_final = df.dropna()

# Save the final version
output_file = 'features_wrist_final.csv'
df_final.to_csv(output_file, index=False)

print("-" * 40)
print(f"✅ Final dataset ready: {output_file}")
print(f"New features added: Mag_var_30s, EDA_ACC_interaction")
print(f"Total features now: {len(df_final.columns) - 2}")

--> Loading enriched features...
--> Adding Movement Instability and Cross-Features...
----------------------------------------
✅ Final dataset ready: features_wrist_final.csv
New features added: Mag_var_30s, EDA_ACC_interaction
Total features now: 28
