# Feature Extraction for HMM Activity Recognition
## Step 2: Extract Time-Domain and Frequency-Domain Features

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.fft import fft
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [None]:
# Read the per-activity recordings (one CSV per activity)
activity_files = (
    'Jumping_all.csv',
    'Standing_all.csv',
    'Still_all.csv',
    'Walking_all.csv',
)
jumping, standing, still, walking = (pd.read_csv(path) for path in activity_files)

# Stack the four frames into one table with a fresh 0..N-1 index
frames = [jumping, standing, still, walking]
data = pd.concat(frames, ignore_index=True)

# Quick sanity report: total row count and rows per activity label
print(f"Total samples: {len(data)}")
print(f"\nActivity distribution:\n{data['activity'].value_counts()}")
data.head()

## Feature Extraction Functions

In [None]:
def _safe_corr(a, b):
    """Return the Pearson correlation of *a* and *b*, or 0.0 when it is undefined.

    ``np.corrcoef`` normalises by each input's standard deviation, so a
    constant signal produces NaN (plus a RuntimeWarning). A constant axis
    carries no linear relationship with anything, so 0.0 is reported instead
    to keep the feature table free of NaNs.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    # max == min is an exact constant-signal test; comparing std to 0 can miss
    # constants because of floating-point rounding in the mean.
    if a.max() == a.min() or b.max() == b.min():
        return 0.0
    return np.corrcoef(a, b)[0, 1]


def extract_features(window):
    """Extract time-domain and frequency-domain features from a window of sensor data.

    Parameters
    ----------
    window : DataFrame-like
        Must provide the columns ``acc_x``, ``acc_y``, ``acc_z``, ``gyro_x``,
        ``gyro_y`` and ``gyro_z``; each row is one sample of the window.

    Returns
    -------
    dict
        Feature name -> scalar value, in this insertion order:

        * per axis: ``mean``, ``std``, ``var``, ``min``, ``max``, ``range``
        * ``acc_sma`` / ``gyro_sma``: sum of absolute values over the three
          axes divided by the number of rows
        * pairwise axis correlations (``*_xy_corr``, ``*_xz_corr``,
          ``*_yz_corr``); 0.0 when either axis is constant in the window
        * per axis: ``spectral_energy``, ``dominant_freq_idx``,
          ``spectral_entropy`` computed from the first half of the FFT
          magnitude spectrum
    """
    features = {}

    # Sensor columns
    acc_cols = ['acc_x', 'acc_y', 'acc_z']
    gyro_cols = ['gyro_x', 'gyro_y', 'gyro_z']
    sensor_cols = acc_cols + gyro_cols

    # TIME-DOMAIN FEATURES
    for col in sensor_cols:
        signal = window[col].values
        lo = np.min(signal)
        hi = np.max(signal)
        features[f'{col}_mean'] = np.mean(signal)
        features[f'{col}_std'] = np.std(signal)
        features[f'{col}_var'] = np.var(signal)
        features[f'{col}_min'] = lo
        features[f'{col}_max'] = hi
        features[f'{col}_range'] = hi - lo

    # Signal Magnitude Area (SMA)
    features['acc_sma'] = np.sum(np.abs(window[acc_cols].values)) / len(window)
    features['gyro_sma'] = np.sum(np.abs(window[gyro_cols].values)) / len(window)

    # Correlation between axes (NaN-safe for constant signals)
    for sensor in ('acc', 'gyro'):
        for first, second in (('x', 'y'), ('x', 'z'), ('y', 'z')):
            features[f'{sensor}_{first}{second}_corr'] = _safe_corr(
                window[f'{sensor}_{first}'].values,
                window[f'{sensor}_{second}'].values,
            )

    # FREQUENCY-DOMAIN FEATURES
    for col in sensor_cols:
        signal = window[col].values
        fft_vals = np.abs(fft(signal))
        fft_vals = fft_vals[:len(fft_vals)//2]  # Take positive frequencies

        # NOTE(review): bin 0 (the DC term) is kept, so a signal with a large
        # mean will usually report dominant_freq_idx == 0 - confirm intended.
        features[f'{col}_spectral_energy'] = np.sum(fft_vals**2)
        features[f'{col}_dominant_freq_idx'] = np.argmax(fft_vals)
        # The small offset keeps the distribution well-defined when every
        # magnitude is zero (stats.entropy normalises its input to sum to 1).
        features[f'{col}_spectral_entropy'] = stats.entropy(fft_vals + 1e-10)

    return features

## Apply Feature Extraction with Sliding Window

In [None]:
# Window parameters
WINDOW_SIZE = 50  # Number of samples per window
OVERLAP = 25      # Number of samples shared by consecutive windows

# Hop between consecutive window starts. It must be positive: range() raises
# on a zero step and silently yields no windows on a negative one.
step = WINDOW_SIZE - OVERLAP
if step <= 0:
    raise ValueError("OVERLAP must be smaller than WINDOW_SIZE")

feature_list = []

# Process each recording separately so no window spans two recordings.
# Grouping on (activity, session) rather than session alone keeps windows
# correctly labelled even if the per-activity CSV files reuse session ids;
# sort=False preserves first-appearance order, so the result is unchanged
# when session ids are already unique across activities.
for (activity, session), group in data.groupby(['activity', 'session'], sort=False):
    session_data = group.reset_index(drop=True)

    # Sliding window: only full-length windows are emitted, so recordings
    # shorter than WINDOW_SIZE contribute nothing.
    for start in range(0, len(session_data) - WINDOW_SIZE + 1, step):
        window = session_data.iloc[start:start + WINDOW_SIZE]
        features = extract_features(window)
        features['activity'] = activity
        features['session'] = session
        feature_list.append(features)

# Create feature dataframe (one row per window)
features_df = pd.DataFrame(feature_list)
print(f"\nExtracted features shape: {features_df.shape}")
print(f"\nFeatures per activity:\n{features_df['activity'].value_counts()}")
features_df.head()

## Save Extracted Features

In [None]:
# Persist the per-window feature table without the row index column
output_path = 'extracted_features.csv'
features_df.to_csv(output_path, index=False)
print(f"Features saved to '{output_path}'")

## Visualize Feature Distributions

In [None]:
# Features to plot: one histogram panel each, laid out on a 2 x 4 grid
key_features = [
    'acc_x_mean', 'acc_y_mean', 'acc_z_mean', 'acc_sma',
    'gyro_x_std', 'gyro_y_std', 'gyro_z_std', 'gyro_sma',
]

fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.flatten()

# Activity labels in order of first appearance; one overlaid histogram each
activity_labels = features_df['activity'].unique()

for panel, feature in zip(axes, key_features):
    for activity in activity_labels:
        rows = features_df['activity'] == activity
        values = features_df[rows][feature]
        panel.hist(values, alpha=0.5, label=activity, bins=20)
    panel.set_xlabel('Value')
    panel.set_ylabel('Frequency')
    panel.set_title(feature)
    panel.legend()

# Write the figure to disk before showing it
plt.tight_layout()
plt.savefig('feature_distributions.png', dpi=150)
plt.show()

## Feature Summary Statistics

In [None]:
# Per-activity mean of every numeric column in the feature table
numeric_frame = features_df.select_dtypes(include=[np.number])
numeric_cols = numeric_frame.columns
grouped = features_df.groupby('activity')
summary = grouped[numeric_cols].mean()
print("\nMean features by activity:")
summary