# Feature Engineering Notebook

This notebook demonstrates feature extraction from physiological signals for stress detection.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src to path
sys.path.append('../src')

# Import our modules
from preprocessing import preprocess_ecg, preprocess_eda, segment_signals
from features import extract_time_domain_features, extract_frequency_domain_features, \
                     extract_eda_features, extract_accelerometer_features, extract_all_features
from utils import load_dataset, save_dataset

%matplotlib inline

## Load and Prepare Data

In [None]:
# For demonstration, we'll create sample data
# In practice, you would load your actual dataset
sample_rate = 256  # Hz
duration = 60  # seconds
n_samples = duration * sample_rate

# Sample k sits at exactly k / sample_rate seconds. np.linspace with its
# default endpoint=True would spread n_samples points over [0, duration],
# giving a spacing of duration / (n_samples - 1) instead of 1 / sample_rate.
time_points = np.arange(n_samples) / sample_rate

# Seeded generator so that Restart & Run All reproduces the same signals
rng = np.random.default_rng(42)

# First half of every signal is "baseline", second half is "stress"
split_point = n_samples // 2


def _splice(baseline, stress, split):
    """Return ``baseline[:split]`` followed by ``stress[split:]``."""
    return np.concatenate([baseline[:split], stress[split:]])


# Simulate ECG signal with stress-related changes
# Baseline ECG: 1.2 Hz sinusoid plus Gaussian noise
ecg_baseline = np.sin(2 * np.pi * 1.2 * time_points) + 0.5 * rng.normal(size=n_samples)
# Stress ECG (higher frequency, more variability)
ecg_stress = np.sin(2 * np.pi * 1.8 * time_points) + 0.8 * rng.normal(size=n_samples)
ecg = _splice(ecg_baseline, ecg_stress, split_point)

# Simulate EDA signal: slow oscillation around a level that is higher under stress
eda_baseline = 5 + 1 * np.sin(2 * np.pi * 0.05 * time_points) + rng.normal(size=n_samples)
eda_stress = 7 + 2 * np.sin(2 * np.pi * 0.1 * time_points) + rng.normal(size=n_samples)
eda = _splice(eda_baseline, eda_stress, split_point)

# Simulate accelerometer data: zero-mean noise on x/y, noise around 9.8 on z,
# with a larger standard deviation in the stress half
acc_x_baseline = rng.normal(0, 0.1, size=n_samples)
acc_x_stress = rng.normal(0, 0.3, size=n_samples)
acc_x = _splice(acc_x_baseline, acc_x_stress, split_point)

acc_y_baseline = rng.normal(0, 0.1, size=n_samples)
acc_y_stress = rng.normal(0, 0.3, size=n_samples)
acc_y = _splice(acc_y_baseline, acc_y_stress, split_point)

acc_z_baseline = 9.8 + rng.normal(0, 0.1, size=n_samples)
acc_z_stress = 9.8 + rng.normal(0, 0.3, size=n_samples)
acc_z = _splice(acc_z_baseline, acc_z_stress, split_point)

# Create DataFrame: one row per sample, one column per signal
data = pd.DataFrame({
    'timestamp': time_points,
    'ecg': ecg,
    'eda': eda,
    'acc_x': acc_x,
    'acc_y': acc_y,
    'acc_z': acc_z
})

print("Sample data created successfully!")
print(f"Data shape: {data.shape}")

## Preprocess Signals

In [None]:
# Run the project's ECG preprocessing on the raw ECG column.
ecg_processed = preprocess_ecg(data['ecg'].values, sample_rate)

# A falsy result is reported as a failure; otherwise the number of entries
# stored under 'r_peaks' is printed.
if not ecg_processed:
    print("ECG preprocessing failed.")
else:
    print("ECG preprocessing completed.")
    r_peak_count = len(ecg_processed['r_peaks'])
    print(f"Number of R-peaks detected: {r_peak_count}")

In [None]:
# Run the project's EDA preprocessing on the raw EDA column.
eda_processed = preprocess_eda(data['eda'].values, sample_rate)

# Report the outcome; a falsy result is treated as a failure.
eda_status = "completed" if eda_processed else "failed"
print(f"EDA preprocessing {eda_status}.")

## Segment Signals into Windows

In [None]:
# Windowing parameters handed to segment_signals
window_seconds = 30
window_size = sample_rate * window_seconds  # window length in samples
overlap = 0.5  # 50% overlap

# Split the full recording into windows and report how many were produced
windows = segment_signals(data, window_size, overlap)
n_windows = len(windows)
print(f"Segmented data into {n_windows} windows of {window_size} samples each.")

## Extract Features from Windows

In [None]:
# Extract features from each window
# Only the first 5 windows are processed to keep the demo fast
demo_windows = windows[:5]
n_demo = len(demo_windows)

# Windows at or past this index receive the artificial "stress" label.
# Computed once here instead of on every loop iteration.
first_stress_idx = n_demo // 2

feature_list = []

for i, window in enumerate(demo_windows):
    # Progress is reported against the number of windows actually processed,
    # not the total number of windows available.
    print(f"Processing window {i+1}/{n_demo}")

    # Extract all features
    features = extract_all_features(window)
    if features is None:
        # NOTE(review): assumes extract_all_features signals failure by
        # returning None -- confirm against src/features.py.
        print(f"Feature extraction returned nothing for window {i+1}; skipping.")
        continue

    # Add window information
    features['window_id'] = i

    # Add artificial label for demonstration
    # In practice, you would have actual labels
    features['label'] = 1 if i >= first_stress_idx else 0  # Simulate stress in second half

    feature_list.append(features)

if feature_list:
    # Combine all features into a single DataFrame
    features_df = pd.concat(feature_list, ignore_index=True)
    print(f"\nExtracted features for {len(features_df)} windows.")
    print(f"Features shape: {features_df.shape}")
    print("\nFeature columns:")
    print(features_df.columns.tolist())

    # Display first few rows
    print("\nFirst few rows of extracted features:")
    print(features_df.head())
else:
    # pd.concat raises on an empty list, so fall back to an empty frame.
    # This keeps features_df defined for the cells that follow.
    features_df = pd.DataFrame()
    print("\nNo features were extracted.")

## Analyze Feature Distributions

In [None]:
# Plot distribution of some key features
# Each entry: (feature column, panel title, x-axis label).
# The micro sign is written as an escape so the label survives encoding
# round-trips of the notebook file (it previously rendered as mojibake).
hist_specs = [
    ('mean_eda', 'Distribution of Mean EDA', 'Mean EDA (\u00b5S)'),
    ('rmssd', 'Distribution of RMSSD', 'RMSSD (ms)'),
]

# features_df is only created when the extraction cell produced features,
# so check that it exists before touching its columns.
features_available = 'features_df' in globals()

if features_available and all(spec[0] in features_df.columns for spec in hist_specs):
    fig, axes = plt.subplots(1, len(hist_specs), figsize=(12, 4))

    # One histogram panel per configured feature
    for ax, (column, title, xlabel) in zip(axes, hist_specs):
        ax.hist(features_df[column], bins=10, alpha=0.7)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()
else:
    print("Skipping feature distribution plots: required feature columns are not available.")

## Save Processed Features

In [None]:
# Save features to processed data directory
processed_dir = '../data/processed'

# exist_ok=True creates the directory (and any missing parents) if needed and
# is a no-op when it already exists. This avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(processed_dir, exist_ok=True)

# Only save when the extraction cell actually defined features_df
if 'features_df' in locals():
    save_path = os.path.join(processed_dir, 'extracted_features.csv')
    save_dataset(features_df, save_path)
    print(f"Features saved to {save_path}")

## Next Steps

After feature engineering, the next steps would be:
1. Feature selection to identify the most relevant features
2. Training and evaluating machine learning models
3. Model optimization and validation

Continue to the next notebook: `03-modeling.ipynb`