In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path


In [20]:
# ---------------------------------------------------------------------------
# Configuration: everything the processing loop reads is defined here.
# ---------------------------------------------------------------------------

# Root directory whose sub-folders are scanned for recordings
base_data_dir = Path('data')

# Sub-folders are kept when their lower-cased name starts with one of these labels
labels_to_include = [
    'train',
    'walking',
    'car',
]

# CSV files looked up inside every matching sub-folder
list_files = [
    'Accelerometer.csv',
    'Gyroscope.csv',
    'Location.csv',
]

# The 'Time (s)' offsets are added to this timestamp to build a datetime index
anchor_time = pd.Timestamp("2025-01-01 00:00")

# Bucket width handed to DataFrame.resample()
resample_interval = "1s"

# Collects one labeled dataframe per processed folder
dataframes = list()

def _resample_sensor_csv(csv_path, anchor, interval, time_col="Time (s)"):
    """Load one sensor CSV and return its numeric readings on a regular time grid.

    Parameters
    ----------
    csv_path : path-like
        CSV file to read with ``pd.read_csv``.
    anchor : pd.Timestamp
        Timestamp that the elapsed-seconds values in ``time_col`` are added to.
    interval : str
        Resampling rule passed to ``DataFrame.resample`` (e.g. ``"1s"``).
    time_col : str
        Name of the elapsed-seconds column.

    Returns
    -------
    pd.DataFrame or None
        Frame indexed by ``timestamp`` holding the per-interval mean of every
        numeric column except ``time_col``, with the first and last interval
        removed. ``None`` when the file has no ``time_col`` column.
    """
    raw = pd.read_csv(csv_path)
    if time_col not in raw.columns:
        return None

    # Convert elapsed seconds to absolute timestamps and index by them
    stamped = raw.assign(
        timestamp=anchor + pd.to_timedelta(raw[time_col], unit="s")
    ).set_index("timestamp")

    # The elapsed-time column is numeric too. Carrying it along would give every
    # sensor frame an identically named column; those duplicated labels make the
    # later row-wise pd.concat fail as soon as folders do not all contain the
    # same set of files, so it is excluded here.
    value_cols = [
        col for col in stamped.select_dtypes(include="number").columns
        if col != time_col
    ]

    # Fill gaps in the raw samples before aggregating
    values = stamped[value_cols].interpolate(method='linear', limit_direction='both')

    # Average per interval, then fill intervals that received no samples
    resampled = values.resample(interval).mean().ffill().bfill()

    # Remove first and last row
    return resampled.iloc[1:-1]


# Loop through all relevant subdirectories.
# sorted() makes the processing order (and thus the row order of the final
# dataset) independent of the order the file system happens to list entries in.
for folder in sorted(base_data_dir.iterdir()):
    if not folder.is_dir():
        continue

    folder_name = folder.name.lower()
    matched_label = next((label for label in labels_to_include if folder_name.startswith(label)), None)
    if not matched_label:
        continue

    print(f"Processing folder: {folder.name} (label = '{matched_label}')")
    folder_data = {}

    for file_name in list_files:
        file_path = folder / file_name
        if not file_path.exists():
            print(f"  Missing {file_name} in {folder.name}")
            continue

        df_resampled = _resample_sensor_csv(file_path, anchor_time, resample_interval)
        if df_resampled is None:
            print(f"Skipping {file_name} in {folder.name}: No 'Time (s)' column.")
            continue

        folder_data[file_name.replace('.csv', '')] = df_resampled
        print(f"  Loaded and resampled {file_name}")

    # Combine the sensors side by side (aligned on timestamp) and label the rows
    if folder_data:
        df_combined = pd.concat(folder_data.values(), axis=1)
        df_combined['label'] = matched_label
        df_combined = df_combined.reset_index()
        dataframes.append(df_combined)

# Final merged dataset.
# Fail with an explicit message when nothing was collected; pd.concat would
# otherwise raise a bare "No objects to concatenate".
if not dataframes:
    raise ValueError(
        f"No labeled recordings were loaded from {base_data_dir.resolve()} "
        f"(labels searched: {labels_to_include})"
    )
final_df = pd.concat(dataframes, ignore_index=True)

# Drop all rows with any NaN values
final_df = final_df.dropna()

# Remove every elapsed-time column (the label may occur more than once);
# errors='ignore' keeps this a no-op when no such column is present.
final_df = final_df.drop(columns='Time (s)', errors='ignore')
print(f"Final dataset shape after removing NaNs: {final_df.shape}")
print("Labels present:", final_df['label'].value_counts())

# Save to CSV inside the configured data folder so the output location
# follows base_data_dir instead of a second hard-coded "data" path.
output_path = base_data_dir / "final_data.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False)
print(f"Saved final_df to {output_path.resolve()}")


Processing folder: car (label = 'car')
  Loaded and resampled Accelerometer.csv
  Loaded and resampled Gyroscope.csv
  Loaded and resampled Location.csv
Processing folder: train (label = 'train')
  Loaded and resampled Accelerometer.csv
  Loaded and resampled Gyroscope.csv
  Loaded and resampled Location.csv
Processing folder: train2 (label = 'train')
  Loaded and resampled Accelerometer.csv
  Loaded and resampled Gyroscope.csv
  Loaded and resampled Location.csv
Processing folder: walking (label = 'walking')
  Loaded and resampled Accelerometer.csv
  Loaded and resampled Gyroscope.csv
  Loaded and resampled Location.csv
Processing folder: walking2 (label = 'walking')
  Loaded and resampled Accelerometer.csv
  Loaded and resampled Gyroscope.csv
  Loaded and resampled Location.csv
Final dataset shape after removing NaNs: (3255, 15)
Labels present: label
train      1196
walking    1187
car         872
Name: count, dtype: int64
Saved final_df to C:\Users\iChin\School\MLQS\MLQS_2025\data\f

In [22]:
# Discard every row that still holds at least one missing value.
# Author's note: NaNs are expected only in the first or last row of each
# recording's dataframe.
# NOTE(review): the processing cell already calls dropna() on final_df, so this
# looks redundant when cells run top to bottom — confirm before removing.
final_df = final_df.dropna(axis="index", how="any")

In [30]:
# Display the column labels of the final dataset
final_df.columns

Index(['timestamp', 'X (m/s^2)', 'Y (m/s^2)', 'Z (m/s^2)', 'X (rad/s)',
       'Y (rad/s)', 'Z (rad/s)', 'Latitude (°)', 'Longitude (°)', 'Height (m)',
       'Velocity (m/s)', 'Direction (°)', 'Horizontal Accuracy (m)',
       'Vertical Accuracy (°)', 'label'],
      dtype='object')