# Human Activity Recognition - Data Exploration

This notebook explores the UCI HAR Dataset used for training our deep learning model.

## Dataset Overview
- **Source**: UCI Machine Learning Repository
- **Activities**: 6 classes (WALKING, WALKING_UPSTAIRS, WALKING_DOWNSTAIRS, SITTING, STANDING, LAYING)
- **Subjects**: 30 volunteers aged 19-48 years
- **Sensors**: Accelerometer and Gyroscope (3-axial data)
- **Sampling Rate**: 50 Hz
- **Window Size**: 2.56 seconds (128 readings)


In [None]:
# Import necessary libraries
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_loader import get_data_loaders
import torch

# Seed the global RNGs so that any shuffling performed while iterating the
# DataLoaders (and therefore the sample batch and the per-activity example
# windows picked later in this notebook) is repeatable across
# "Restart Kernel -> Run All".
# NOTE(review): this only has an effect if get_data_loaders relies on the
# global torch / numpy RNG state -- confirm against src.data_loader.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set style for better visualizations
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (15, 6)  # default size; individual cells override it

print("Libraries imported successfully!")


## 1. Load the Dataset


In [None]:
# Build the train / test DataLoaders with the project helper
train_loader, test_loader = get_data_loaders(batch_size=64)

# Split sizes: sample counts come from the wrapped dataset,
# batch counts from the loader itself
n_train_samples = len(train_loader.dataset)
n_test_samples = len(test_loader.dataset)
print(f"Training samples: {n_train_samples}")
print(f"Test samples: {n_test_samples}")

n_train_batches = len(train_loader)
n_test_batches = len(test_loader)
print(f"Number of batches (train): {n_train_batches}")
print(f"Number of batches (test): {n_test_batches}")

# Pull the first training batch to inspect its layout
first_batch = next(iter(train_loader))
sample_signals, sample_labels = first_batch
print(f"\nBatch shape: {sample_signals.shape}")
print(f"Labels shape: {sample_labels.shape}")
print(f"Data type: {sample_signals.dtype}")


## 2. Activity Distribution


In [None]:
# Class names, indexed by the integer label id.
# NOTE(review): assumes the loaders emit integer labels 0..5 in this order --
# confirm against src.data_loader.
activity_names = ["WALKING", "WALKING_UPSTAIRS", "WALKING_DOWNSTAIRS",
                  "SITTING", "STANDING", "LAYING"]


def collect_labels(loader):
    """Return every label yielded by ``loader`` as one NumPy array.

    Iterates the loader once; the signals of each batch are discarded.
    """
    collected = []
    for _, labels in loader:
        collected.extend(labels.numpy())
    return np.array(collected)


def count_per_activity(labels, n_classes):
    """Count samples per class id, always returning ``n_classes`` entries.

    ``minlength`` keeps a zero entry for classes that never occur; without it
    ``np.bincount`` stops at the largest label seen and the result no longer
    lines up with ``activity_names``.

    Raises:
        ValueError: if a label id >= ``n_classes`` is present.
    """
    counts = np.bincount(labels, minlength=n_classes)
    if len(counts) != n_classes:
        raise ValueError(
            f"Found label id {len(counts) - 1}, but only {n_classes} "
            f"activity names are defined"
        )
    return counts


def plot_activity_counts(ax, counts, title, color, label_offset):
    """Draw one bar per activity on ``ax`` and write the count above each bar.

    ``label_offset`` is the vertical gap (in count units) between the top of
    a bar and its text annotation.
    """
    ax.bar(activity_names, counts, color=color, alpha=0.8)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Activity')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)
    for i, v in enumerate(counts):
        ax.text(i, v + label_offset, str(v), ha='center', va='bottom', fontweight='bold')


# Collect all labels from train and test sets
train_labels = collect_labels(train_loader)
test_labels = collect_labels(test_loader)

train_counts = count_per_activity(train_labels, len(activity_names))
test_counts = count_per_activity(test_labels, len(activity_names))

# Plot distribution: training set on the left, test set on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
plot_activity_counts(ax1, train_counts, 'Training Set Activity Distribution',
                     color='steelblue', label_offset=20)
plot_activity_counts(ax2, test_counts, 'Test Set Activity Distribution',
                     color='coral', label_offset=10)

plt.tight_layout()
plt.show()

# Same numbers as a plain-text table
print("Activity distribution:")
print(f"{'Activity':<20} {'Train':<10} {'Test':<10}")
print("-" * 40)
for i, name in enumerate(activity_names):
    print(f"{name:<20} {train_counts[i]:<10} {test_counts[i]:<10}")


## 3. Visualize Sample Signals


In [None]:
# Get one sample of each activity: keep the first window seen for every label id.
n_activities = len(activity_names)
activity_samples = {}

# Iterate with a plain for-loop so an exhausted loader ends the search cleanly
# instead of surfacing a bare StopIteration from next().
for signals, labels in train_loader:
    for i, label in enumerate(labels):
        label_val = label.item()
        if label_val not in activity_samples:
            activity_samples[label_val] = signals[i].numpy()
    if len(activity_samples) >= n_activities:
        break

# Fail with a readable message if some activity never showed up
missing = [activity_names[idx] for idx in range(n_activities)
           if idx not in activity_samples]
if missing:
    raise ValueError(f"No training sample found for: {', '.join(missing)}")

# NOTE(review): the plots below assume each sample is laid out as
# (channels, 128) with rows 0-2 = accelerometer X/Y/Z and rows 3-5 =
# gyroscope X/Y/Z -- confirm against src.data_loader.
signal_names = ['Acc X', 'Acc Y', 'Acc Z', 'Gyro X', 'Gyro Y', 'Gyro Z']
time_steps = np.arange(128) / 50  # 128 readings at a 50 Hz sampling rate -> seconds

# One entry per subplot column: (column, sensor name, y-axis label, channel rows)
sensor_panels = [
    (0, 'Accelerometer', 'Acceleration', range(0, 3)),
    (1, 'Gyroscope', 'Angular Velocity', range(3, 6)),
]

# Plot signals for each activity: one row per activity, one column per sensor
fig, axes = plt.subplots(n_activities, 2, figsize=(16, 18))

for activity_idx in range(n_activities):
    signal = activity_samples[activity_idx]

    for col, sensor, ylabel, channels in sensor_panels:
        ax = axes[activity_idx, col]
        for ch in channels:
            ax.plot(time_steps, signal[ch], label=signal_names[ch], linewidth=1.5)
        ax.set_title(f'{activity_names[activity_idx]} - {sensor}',
                     fontsize=12, fontweight='bold')
        ax.set_xlabel('Time (seconds)')
        ax.set_ylabel(ylabel)
        ax.legend(loc='upper right')
        ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 4. Statistical Analysis


In [None]:
def summarize_channel(activity_label, channel_label, values):
    """Return one table row with mean / std / min / max of ``values``."""
    return {
        'Activity': activity_label,
        'Signal': channel_label,
        'Mean': np.mean(values),
        'Std': np.std(values),
        'Min': np.min(values),
        'Max': np.max(values)
    }


# One row per (activity, channel) pair, computed on the single example
# window stored for each activity in activity_samples
stats_data = [
    summarize_channel(activity_names[act], name, activity_samples[act][ch])
    for act in range(6)
    for ch, name in enumerate(signal_names)
]

stats_df = pd.DataFrame(stats_data)
print("Signal Statistics by Activity:")
print(stats_df.to_string(index=False))


## 5. Signal Characteristics Heatmap


In [None]:
# Mean of every channel for the one example window kept per activity,
# arranged as a 6 x 6 grid (rows = activities, columns = signals)
mean_values = np.array(
    [[np.mean(activity_samples[row][col]) for col in range(6)]
     for row in range(6)],
    dtype=float,
)

# Draw the annotated heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(mean_values, annot=True, fmt='.3f', cmap='coolwarm',
            xticklabels=signal_names, yticklabels=activity_names,
            cbar_kws={'label': 'Mean Value'}, ax=ax)
ax.set_title('Mean Signal Values by Activity', fontsize=14, fontweight='bold')
ax.set_xlabel('Signal Channel')
ax.set_ylabel('Activity')
fig.tight_layout()
plt.show()


## 6. Key Observations

1. **Dataset Balance**: The dataset appears to be relatively balanced across all 6 activity classes
2. **Signal Patterns**: Different activities show distinct signal patterns:
   - Walking activities show periodic patterns in accelerometer data
   - Stationary activities (sitting, standing, laying) show more stable signals
3. **Sensor Characteristics**:
   - Accelerometer captures body motion and orientation
   - Gyroscope captures rotational movements
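4. **Temporal Dependencies**: In the example windows plotted above (one per activity), the 128-timestep (2.56 s) signals already look visibly different across activities, which suggests that a single window carries enough information to distinguish them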

This data is suitable for sequence modeling using a CNN-LSTM architecture!
