In [None]:
# --- Load and combine all CSV files ---
import os
import pandas as pd

data_folder = "../merged"

# List all CSV files. os.listdir() returns entries in arbitrary, OS-dependent
# order, so sort them to make the row order of `data` (and of everything
# derived from it) reproducible across machines and re-runs.
csv_files = sorted(f for f in os.listdir(data_folder) if f.endswith(".csv"))
if not csv_files:
    # Fail here with a clear message instead of pd.concat's generic
    # "No objects to concatenate" a few lines further down.
    raise FileNotFoundError(f"No .csv files found in {data_folder!r}")

# Read every recording and tag its rows with the file they came from
all_data = []
for file in csv_files:
    df = pd.read_csv(os.path.join(data_folder, file))
    df["source_file"] = file  # keep track of which file this data came from
    all_data.append(df)

# One long frame with a fresh 0..N-1 index across all recordings
data = pd.concat(all_data, ignore_index=True)

# Check structure
print("Data shape:", data.shape)
print("Columns:", data.columns.tolist())
print("Activities:", data["activity"].unique())
data.head()


Data shape: (41267, 9)
Columns: ['timestamp', 'acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'activity', 'source_file']
Activities: ['jump' 'standing' 'still' 'walk']


Unnamed: 0,timestamp,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,activity,source_file
0,0.119341,0.318755,-0.55975,0.184729,0.208629,-0.565315,-0.338257,jump,jump_01_merged.csv
1,0.129311,0.422586,-0.373497,0.384686,0.233211,-0.661429,-0.369003,jump,jump_01_merged.csv
2,0.13928,0.504881,-0.216885,0.659927,0.249413,-0.733556,-0.361532,jump,jump_01_merged.csv
3,0.14925,0.575756,-0.138521,0.548277,0.243897,-0.71584,-0.332538,jump,jump_01_merged.csv
4,0.159219,0.627653,-0.130732,0.586392,0.19136,-0.605196,-0.27319,jump,jump_01_merged.csv


In [None]:

import numpy as np
import pandas as pd
from scipy import signal
from scipy.fft import rfft, rfftfreq

# Windowing / sampling configuration. Set/confirm these against the data
# collection settings before running feature extraction.
# NOTE(review): the timestamps previewed by data.head() advance by ~0.00997 per
# row; if that column is in seconds the recordings are ~100 Hz rather than
# 50 Hz. sampling_rate only scales the *_dom_freq features - TODO confirm unit.
window_size = 100   # samples per window
step_size   = 50    # hop size in samples (half a window, so consecutive windows overlap by 50%)
sampling_rate = 50  # Hz; passed as `fs` to freq_domain_features

def sliding_windows(df: pd.DataFrame, window_size: int, step_size: int):
    """Split one recording into fixed-length, possibly overlapping windows.

    Parameters
    ----------
    df : pd.DataFrame
        Rows of a single recording, in sample order.
    window_size : int
        Number of rows per window; must be positive.
    step_size : int
        Number of rows between consecutive window starts; must be positive.

    Returns
    -------
    list of pd.DataFrame
        Windows in order, each with exactly ``window_size`` rows and a fresh
        0-based index. Trailing rows that do not fill a whole window are
        dropped, so a recording shorter than ``window_size`` yields ``[]``.

    Raises
    ------
    ValueError
        If ``window_size`` or ``step_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    if step_size <= 0:
        # A non-positive hop would never advance the window start (endless loop).
        raise ValueError(f"step_size must be positive, got {step_size}")

    # Largest start index that still leaves room for a full window;
    # negative when the recording is too short, which makes the range empty.
    last_start = len(df) - window_size
    return [
        df.iloc[start:start + window_size].reset_index(drop=True)
        for start in range(0, last_start + 1, step_size)
    ]

def time_domain_features(win: pd.DataFrame):
    """Compute time-domain statistics for one window.

    Parameters
    ----------
    win : pd.DataFrame
        One window; must contain the numeric columns acc_x, acc_y, acc_z,
        gyro_x, gyro_y and gyro_z. A missing column raises ``KeyError``.

    Returns
    -------
    dict
        Plain Python floats keyed by feature name, in this order:
        ``<axis>_mean/_std/_var/_min/_max/_median`` for each of the six axes
        (std and var use NumPy's defaults), then ``acc_SMA`` (mean absolute
        value over all samples of the three accelerometer axes), then the
        pairwise accelerometer correlations ``acc_xy_corr``, ``acc_xz_corr``
        and ``acc_yz_corr``.
    """
    axes = ['acc_x','acc_y','acc_z','gyro_x','gyro_y','gyro_z']
    acc_axes = ['acc_x','acc_y','acc_z']

    def safe_corr(a, b):
        """Pearson correlation of a and b, or 0.0 if either one is constant."""
        # corrcoef divides by the standard deviations, so a constant signal
        # would produce NaN (and a RuntimeWarning) instead of a usable value.
        if np.std(a) == 0 or np.std(b) == 0:
            return 0.0
        return float(np.corrcoef(a, b)[0,1])

    feats = {}
    for axis in axes:
        # Named `samples` rather than `data` so it cannot be confused with the
        # notebook-level `data` DataFrame.
        samples = win[axis].values
        feats[f'{axis}_mean']   = float(np.mean(samples))
        feats[f'{axis}_std']    = float(np.std(samples))
        feats[f'{axis}_var']    = float(np.var(samples))
        feats[f'{axis}_min']    = float(np.min(samples))
        feats[f'{axis}_max']    = float(np.max(samples))
        feats[f'{axis}_median'] = float(np.median(samples))

    # Signal Magnitude Area (accelerometer). Computed directly: the columns were
    # already read in the loop above, so a blanket try/except here could only
    # hide a genuine error behind a silent 0.0.
    feats['acc_SMA'] = float(np.mean(np.abs(win[acc_axes].values)))

    # Correlations between accelerometer axes (guarded against constant signals)
    feats['acc_xy_corr'] = safe_corr(win['acc_x'].values, win['acc_y'].values)
    feats['acc_xz_corr'] = safe_corr(win['acc_x'].values, win['acc_z'].values)
    feats['acc_yz_corr'] = safe_corr(win['acc_y'].values, win['acc_z'].values)
    return feats

def freq_domain_features(win: pd.DataFrame, fs: int = sampling_rate):
    """Spectral descriptors for every sensor axis of one window.

    Each of the six acc/gyro columns is detrended with ``signal.detrend`` and
    passed through a real FFT. Two floats are stored per axis:
    ``<axis>_dom_freq``, the frequency of the bin with the largest magnitude
    (on the grid given by ``rfftfreq`` for sampling rate ``fs``), and
    ``<axis>_spec_energy``, the sum of the squared bin magnitudes.
    Windows with at most one row get 0.0 for both values.
    """
    n_samples = len(win)
    spectral = {}
    for axis in ('acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z'):
        # Read the column first so a missing axis fails even for tiny windows
        raw = win[axis].values

        if n_samples > 1:
            magnitudes = np.abs(rfft(signal.detrend(raw)))
            freq_bins = rfftfreq(n_samples, 1.0/fs)
            peak_bin = int(np.argmax(magnitudes))
            if len(freq_bins):
                dominant = float(freq_bins[peak_bin])
            else:
                dominant = 0.0
            energy = float(np.sum(magnitudes**2))
        else:
            # Too short for a spectrum
            dominant = 0.0
            energy = 0.0

        spectral[f'{axis}_dom_freq']    = dominant
        spectral[f'{axis}_spec_energy'] = energy
    return spectral

def extract_features_for_record(df_one_record: pd.DataFrame) -> pd.DataFrame:
    """
    Window a single recording and compute one feature row per window.

    Assumes columns: timestamp, acc_x, acc_y, acc_z, gyro_x, gyro_y, gyro_z, activity, source_file
    Windowing and the FFT use the notebook-level ``window_size``, ``step_size``
    and ``sampling_rate``.

    Returns a DataFrame whose columns are the time-domain features, then the
    frequency-domain features, then ``activity`` and ``source_file``. Every row
    carries the same ``activity`` (the most frequent value in the recording, or
    None if the column is missing or has no non-null values) and the same
    ``source_file`` (its first value, or 'unknown' if unavailable). A recording
    shorter than one window yields an empty DataFrame.
    """
    # Label for the whole recording = most frequent 'activity' value.
    # mode() skips NaN and returns an empty Series when nothing is left, so
    # look at the result before indexing into it.
    activity_label = None
    if 'activity' in df_one_record.columns:
        modes = df_one_record['activity'].mode()
        if not modes.empty:
            activity_label = modes.iat[0]

    # The first row names the recording; an empty frame has no first row.
    src = 'unknown'
    if 'source_file' in df_one_record.columns and len(df_one_record) > 0:
        src = df_one_record['source_file'].iat[0]

    rows = []
    for w in sliding_windows(df_one_record, window_size, step_size):
        tfeats = time_domain_features(w)
        ffeats = freq_domain_features(w, fs=sampling_rate)
        feats = {**tfeats, **ffeats}
        feats['activity'] = activity_label
        feats['source_file'] = src
        rows.append(feats)

    return pd.DataFrame(rows)


In [3]:
# --- Re-run feature extraction over ALL recordings ---
# Every distinct source_file value is one recording: pull its rows out of
# `data`, turn them into per-window features, and stamp both frames with the
# file name so the column is guaranteed to be present and consistent.
feature_dfs = []
recording_names = data["source_file"].unique()

for src in recording_names:
    belongs_to_recording = data["source_file"] == src
    df_subset = (
        data.loc[belongs_to_recording]
        .reset_index(drop=True)
        .assign(source_file=src)
    )
    feats_df = extract_features_for_record(df_subset).assign(source_file=src)
    feature_dfs.append(feats_df)

# Stack the per-recording feature tables under one continuous index
features = pd.concat(feature_dfs, ignore_index=True)

print("Features extracted successfully!")
print("Feature dataset shape:", features.shape)
print("Unique activities:", features["activity"].unique())
print("Unique recordings processed:", features["source_file"].nunique())
features.head()


Features extracted successfully!
Feature dataset shape: (749, 54)
Unique activities: ['jump' 'standing' 'still' 'walk']
Unique recordings processed: 50


Unnamed: 0,acc_x_mean,acc_x_std,acc_x_var,acc_x_min,acc_x_max,acc_x_median,acc_y_mean,acc_y_std,acc_y_var,acc_y_min,...,acc_z_dom_freq,acc_z_spec_energy,gyro_x_dom_freq,gyro_x_spec_energy,gyro_y_dom_freq,gyro_y_spec_energy,gyro_z_dom_freq,gyro_z_spec_energy,activity,source_file
0,-0.021524,0.667775,0.445924,-1.849218,0.855231,0.140855,0.132795,0.539499,0.29106,-1.061096,...,1.0,40979.881514,0.5,842.854959,1.5,958.867761,2.5,127.40751,jump,jump_01_merged.csv
1,0.020366,0.927388,0.860049,-1.849218,1.821961,-0.24811,-0.162298,1.2211,1.491085,-2.501738,...,1.0,136381.085362,1.0,2812.576757,1.5,5819.880819,2.5,405.852354,jump,jump_01_merged.csv
2,0.318321,0.98055,0.961478,-2.890957,1.821961,0.547057,-0.040576,2.428427,5.897259,-6.293818,...,1.0,160602.088884,1.0,5133.171877,1.0,5757.219448,2.5,866.949533,jump,jump_01_merged.csv
3,0.148123,1.552531,2.410352,-5.404576,2.651305,0.485997,-0.014619,2.769314,7.669098,-8.253015,...,1.0,173011.721633,1.0,9763.992689,4.0,3286.689986,4.5,1790.809594,jump,jump_01_merged.csv
4,0.428056,1.757772,3.089764,-5.404576,3.503008,0.811033,-0.556027,2.507469,6.287402,-8.253015,...,1.0,295681.735834,3.5,10148.071948,4.5,6786.838483,4.5,3202.846257,jump,jump_01_merged.csv


In [6]:
import joblib
import os

# Output locations for the extracted feature table
processed_dir = "../processed_data"
csv_path = "../processed_data/extracted_features.csv"
pkl_path = "../processed_data/extracted_features.pkl"

# Make sure the target folder is there (no error if it already exists)
os.makedirs(processed_dir, exist_ok=True)

# Human-readable copy, written without the index column
features.to_csv(csv_path, index=False)
print("Saved feature dataset as CSV at '../processed_data/extracted_features.csv'")

# Binary copy of the same DataFrame via joblib
joblib.dump(features, pkl_path)
print("Saved feature dataset as joblib pickle at '../processed_data/extracted_features.pkl'")


Saved feature dataset as CSV at '../processed_data/extracted_features.csv'
Saved feature dataset as joblib pickle at '../processed_data/extracted_features.pkl'
