# Data Exploration Notebook

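This notebook explores physiological signals (ECG, EDA, and accelerometer) for stress detection. By default it runs on synthetic sample data; replace the loading cell with a real dataset (for example, one in WESAD format) to explore actual recordings.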

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Import our modules
import sys
sys.path.append('../src')

from utils import load_dataset, handle_missing_values

%matplotlib inline

## Load Dataset

In [None]:
# Show which files are present in the raw-data folder, or tell the user where
# to put a dataset when the folder does not exist yet.
data_dir = '../data/raw'
if not os.path.exists(data_dir):
    print("Data directory not found. Please download a dataset and place it in data/raw/")
else:
    files = os.listdir(data_dir)
    # Single print call: the header line, then one "- name" line per entry.
    print("Available datasets:", *(f"- {name}" for name in files), sep="\n")

In [None]:
# Load dataset (example with WESAD-like structure).
# Replace the synthetic block below with a real loader call, for example:
#     data = load_dataset('../data/raw/Sxx.pkl')  # WESAD format
#
# Until a real dataset is available, a synthetic recording is generated so the
# remaining cells can run end to end. The result is a DataFrame named `data`
# with columns: timestamp, ecg, eda, acc_x, acc_y, acc_z.
try:
    sample_rate = 256  # Hz
    duration = 60  # seconds
    n_samples = duration * sample_rate

    # Sample k is taken at k / sample_rate seconds. np.linspace(0, duration, n)
    # would include the endpoint and stretch the spacing to
    # duration / (n - 1), which no longer matches the stated sample rate.
    time_points = np.arange(n_samples) / sample_rate

    # Fixed seed so that re-running the notebook reproduces the same figures
    # and statistics.
    rng = np.random.default_rng(42)

    # Simulate ECG signal (simplified): 1.2 Hz sinusoid plus Gaussian noise
    ecg = np.sin(2 * np.pi * 1.2 * time_points) + 0.5 * rng.normal(size=n_samples)

    # Simulate EDA signal: slow 0.1 Hz oscillation around a baseline of 5
    eda = 5 + 2 * np.sin(2 * np.pi * 0.1 * time_points) + rng.normal(size=n_samples)

    # Simulate accelerometer data: noise on every axis, with a constant
    # 9.8 offset on z
    acc_x = rng.normal(0, 0.1, size=n_samples)
    acc_y = rng.normal(0, 0.1, size=n_samples)
    acc_z = 9.8 + rng.normal(0, 0.1, size=n_samples)

    # Create DataFrame
    data = pd.DataFrame({
        'timestamp': time_points,
        'ecg': ecg,
        'eda': eda,
        'acc_x': acc_x,
        'acc_y': acc_y,
        'acc_z': acc_z,
    })

    print("Sample data loaded successfully!")
    print(f"Data shape: {data.shape}")
    print(data.head())
except Exception as e:
    print(f"Error loading data: {e}")
    # Every later cell needs `data`; stop here with the real cause instead of
    # letting the next cell fail with an unrelated NameError.
    raise

## Data Exploration

In [None]:
# Basic statistics
# Calls pandas' describe() on the `data` DataFrame built in the loading cell.
# As the last expression in the cell, its result is shown as the cell output.
data.describe()

In [None]:
# Count the null entries in each column of `data`.
# `missing_values` is kept under this name because the next cell reads it.
missing_values = data.isna().sum()
print("Missing values per column:", missing_values, sep="\n")

In [None]:
# Fill gaps only when the per-column counts show that at least one value is
# missing; otherwise `data` is left untouched and nothing is printed.
total_missing = missing_values.sum()
if total_missing > 0:
    data = handle_missing_values(data, method='interpolate')
    print("Missing values handled.")

## Visualize Signals

In [None]:
# ECG preview: the first 1000 samples plotted against their timestamps.
window = slice(1000)
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(data['timestamp'][window], data['ecg'][window])
ax.set_title('ECG Signal (First 1000 samples)')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
ax.grid(True)
plt.show()

In [None]:
# Plot EDA signal: the first 1000 samples plotted against their timestamps.
plt.figure(figsize=(12, 4))
plt.plot(data['timestamp'][:1000], data['eda'][:1000])
plt.title('EDA Signal (First 1000 samples)')
plt.xlabel('Time (s)')
# The unit label is microsiemens. The micro sign is written as an escape so
# the label cannot be garbled again (the previous text read "ÂµS") if the file
# is re-saved with a different encoding.
plt.ylabel('Conductance (\u00b5S)')
plt.grid(True)
plt.show()

In [None]:
# Accelerometer overview: one stacked panel per axis, all sharing the x axis.
# NOTE(review): the y label says "g", but the synthetic acc_z in the loading
# cell is centred on 9.8 — confirm the intended unit.
fig, axes = plt.subplots(3, 1, figsize=(12, 8), sharex=True)

window = slice(1000)  # first 1000 samples, as in the ECG and EDA plots
time_axis = data['timestamp'][window]

for panel, axis_name in zip(axes, 'xyz'):
    panel.plot(time_axis, data[f'acc_{axis_name}'][window])
    panel.set_title(f'Accelerometer {axis_name.upper()}-axis')
    panel.set_ylabel('Acceleration (g)')
    panel.grid(True)

# Only the bottom panel carries the x label, since the x axis is shared.
axes[-1].set_xlabel('Time (s)')

plt.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
# Pairwise correlation between the signal channels (timestamp excluded),
# drawn as an annotated heatmap with the colour scale centred on zero.
signal_columns = ['ecg', 'eda', 'acc_x', 'acc_y', 'acc_z']
correlation_matrix = data[signal_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Physiological Signals')
plt.show()

## Next Steps

After exploring the data, the next steps would be:
1. Preprocessing the signals (filtering, artifact removal)
2. Segmenting the data into windows
3. Extracting features from each window
4. Training and evaluating models

Continue to the next notebook: `02-feature-engineering.ipynb`