In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path
from fourier_transform import extract_fft_features

In [None]:
# Configuration
list_files = ['Accelerometer.csv', 'Gyroscope.csv', 'Location.csv']
base_data_dir = Path('test_data')
resample_interval = "1s"
anchor_time = pd.Timestamp("2025-01-01 00:00")
labels_to_include = ['train', 'biking', 'running']

# FFT settings passed to extract_fft_features (previously inline magic numbers).
# NOTE(review): the sampling rate is presumably in Hz and assumes every sensor
# was recorded at that rate -- confirm against the recordings.
fft_window_size = 100
fft_sampling_rate = 100

# Store processed labeled dataframes (one per recording folder)
dataframes = []

# Loop through all relevant subdirectories.
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order; sorting
# makes the row order of the final dataset reproducible across machines and runs.
for folder in sorted(base_data_dir.iterdir()):
    if not folder.is_dir():
        continue

    # The label is the first configured prefix that the folder name starts with.
    folder_name = folder.name.lower()
    matched_label = next((label for label in labels_to_include if folder_name.startswith(label)), None)
    if matched_label is None:
        continue

    print(f"Processing folder: {folder.name} (label = '{matched_label}')")
    folder_data = {}

    for file_name in list_files:
        file_path = folder / file_name
        if not file_path.exists():
            print(f"  Missing {file_name} in {folder.name}")
            continue

        key = Path(file_name).stem
        df = pd.read_csv(file_path)

        if 'Time (s)' not in df.columns:
            print(f"Skipping {file_name} in {folder.name}: No 'Time (s)' column.")
            continue

        # Convert the relative time in seconds to an absolute timestamp index.
        # Every recording is anchored at the same instant, so timestamps of
        # different folders overlap by construction.
        df["timestamp"] = anchor_time + pd.to_timedelta(df["Time (s)"], unit="s")
        df = df.set_index("timestamp")

        # Numeric columns present before any FFT feature is added.
        # NOTE(review): this list still contains 'Time (s)', so the FFT step below
        # also runs on the time axis -- confirm this matches the training pipeline.
        num_cols = df.select_dtypes(include="number").columns.tolist()

        # Fill gaps linearly, including leading and trailing NaNs
        df[num_cols] = df[num_cols].interpolate(method='linear', limit_direction='both')

        # Apply FFT to columns; the helper returns the frame it was given,
        # presumably extended with feature columns (defined in fourier_transform).
        for col in num_cols:
            df = extract_fft_features(
                df, col, window_size=fft_window_size, sampling_rate=fft_sampling_rate
            )

        # Resample raw + FFT numeric columns to the common interval and fill
        # empty bins from their neighbours.
        df_resampled = (
            df.select_dtypes(include="number")
            .resample(resample_interval)
            .mean()
            .ffill()
            .bfill()
        )

        # Remove first and last row (the edge bins of the recording)
        df_resampled = df_resampled.iloc[1:-1]

        folder_data[key] = df_resampled
        print(f"  Loaded and resampled {file_name}")

    # Combine the sensors of this folder side by side and attach the label.
    # NOTE(review): every sensor frame still carries its own 'Time (s)' column,
    # so the combined frame has duplicate column names at this point.
    if folder_data:
        df_combined = pd.concat(folder_data.values(), axis=1)
        df_combined['label'] = matched_label
        df_combined = df_combined.reset_index()
        dataframes.append(df_combined)

# Final merged dataset
# pd.concat raises an opaque "No objects to concatenate" on an empty list; fail
# with a message that says what to check instead (same exception type).
if not dataframes:
    raise ValueError(
        f"No labeled recordings were loaded from '{base_data_dir}': expected "
        f"sub-folders whose names start with one of {labels_to_include}."
    )
final_df = pd.concat(dataframes, ignore_index=True)

# Drop all rows with any NaN values
final_df = final_df.dropna()

# Remove every 'Time (s)' column; the boolean mask also handles the case where
# the label occurs more than once or not at all.
final_df = final_df.loc[:, final_df.columns != 'Time (s)']
print(f"Final dataset shape after removing NaNs: {final_df.shape}")
print("Labels present:", final_df['label'].value_counts())

# Save to CSV in the data folder (derived from base_data_dir rather than
# repeating the 'test_data' literal, so the two cannot drift apart)
output_path = base_data_dir / "final_test_data.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False)
print(f"Saved final_test_data to {output_path.resolve()}")

Adding pattern features to the test data (only the patterns that are present in the training set)

In [None]:
TRAIN = Path("data/final_data_with_patterns.csv")
TEST_SRC = Path("test_data/final_test_data.csv")
TEST_DST = Path("test_data/final_test_data_with_patterns.csv")

# Number of consecutive rows a condition must hold for a pattern to fire.
PATTERN_WINDOW = 10


def _sustained(condition, window=PATTERN_WINDOW):
    """Return a boolean Series that is True where `condition` was True on each
    of the last `window` rows (current row included).

    Rows with fewer than `window` predecessors yield False, because the rolling
    sum is NaN there and NaN never compares equal to `window`.
    """
    return condition.rolling(window=window).sum() == window


# Pattern definitions (same as training). This dict is the single source of
# truth for the pattern names; each entry maps a name to a function of the frame.
# NOTE(review): the rolling window runs over the rows in file order, so it can
# span the boundary between two recordings -- confirm this matches training.
PATTERN_FUNCTIONS = {
    'sustained_low_speed': lambda df: _sustained(df['Velocity (m/s)'] < 0.5),
    'sustained_medium_speed': lambda df: _sustained(df['Velocity (m/s)'].between(1.5, 3.0)),
    'sustained_high_speed': lambda df: _sustained(df['Velocity (m/s)'] > 3.0),
    'low_velocity_high_gyro': lambda df: _sustained(
        (df['Velocity (m/s)'] < 0.5) & (df['gyro'].abs() > 1.0)
    ),
    'high_velocity_low_gyro': lambda df: _sustained(
        (df['Velocity (m/s)'] > 3.0) & (df['gyro'].abs() < 0.5)
    ),
}

# Patterns present in the training data, kept in the training column order.
train_df = pd.read_csv(TRAIN, parse_dates=['timestamp'], index_col='timestamp')
used_patterns = [col for col in train_df.columns if col in PATTERN_FUNCTIONS]

# Load test data
df = pd.read_csv(TEST_SRC, parse_dates=['timestamp'], index_col='timestamp')

# Gyroscope magnitude (Euclidean norm of the three axes)
df['gyro'] = np.sqrt(df['X (rad/s)']**2 + df['Y (rad/s)']**2 + df['Z (rad/s)']**2)

# Drop the raw gyroscope axes and the GPS-related columns in one pass
gyro_axis_cols = ['X (rad/s)', 'Y (rad/s)', 'Z (rad/s)']
gps_cols = ["Latitude (°)", "Longitude (°)", "Height (m)", "Horizontal Accuracy (m)", "Vertical Accuracy (°)"]
df = df.drop(columns=gyro_axis_cols + gps_cols)

# Apply only the patterns used in train. The comparison inside _sustained
# already yields a NaN-free boolean Series, so no fill step is needed.
for name in used_patterns:
    df[name] = PATTERN_FUNCTIONS[name](df)

# Convert boolean columns to int for compatibility
bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)

# Save
df.to_csv(TEST_DST)
print(f"Test data with consistent pattern features saved to {TEST_DST}")