In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from datetime import datetime

In [2]:
# Load the location recording of the first train session for a quick manual inspection.
df = pd.read_csv(filepath_or_buffer='Data/train/Location.csv')

In [3]:
# Show the final 40 samples of the recording (fewer if the frame is shorter).
df.iloc[-40:]

Unnamed: 0,Time (s),Latitude (°),Longitude (°),Height (m),Velocity (m/s),Direction (°),Horizontal Accuracy (m),Vertical Accuracy (°)
556,574.60073,52.258459,4.644898,-0.863315,8.650789,43.064281,21.478625,3.0
557,575.600732,52.258486,4.644945,-0.652678,8.175989,43.249389,19.804993,3.0
558,576.600736,52.258541,4.644977,-1.105063,7.798941,41.531317,18.544494,3.0
559,577.600739,52.258608,4.645058,-1.129212,7.83572,41.355644,18.643429,3.0
560,578.600742,52.258601,4.645147,-1.116615,6.826054,43.110097,14.843298,3.0
561,579.600746,52.258619,4.645196,-0.966655,6.269422,43.755491,14.103793,3.0
562,580.600749,52.258639,4.645211,-1.109023,5.632893,43.125681,13.092309,3.0
563,581.600752,52.258635,4.64519,-0.945866,5.049017,42.521463,11.819572,3.0
564,582.600756,52.258563,4.645406,-0.649473,4.662933,41.828556,17.632555,3.0
565,583.600759,52.25858,4.645431,-0.400138,4.408484,41.828556,19.303499,3.0


In [4]:
# A row is counted as incomplete as soon as any one of its columns is missing.
rows_missing_mask = df.isna().any(axis="columns")
num_rows_with_nan = rows_missing_mask.sum()
print(f"Rows with at least one NaN: {num_rows_with_nan}")

Rows with at least one NaN: 48


In [5]:
import pandas as pd
import os
import numpy as np
from datetime import datetime
from pathlib import Path

# Configuration
list_files = ['Accelerometer.csv', 'Gyroscope.csv', 'Location.csv']
base_data_dir = Path('Data')
resample_interval = "1s"
anchor_time = pd.Timestamp("2025-01-01 00:00")

# Folder-name prefixes that identify a recording's activity label
labels_to_include = ['train', 'walking', 'car']


def _load_resampled_sensor(file_path, anchor, interval):
    """Read one sensor CSV and return its numeric columns on a regular time grid.

    Kept in a function so that its working frame stays local and does not
    overwrite any notebook-level variable (such as an earlier ``df``).

    Parameters
    ----------
    file_path : Path
        CSV file expected to contain a 'Time (s)' column.
    anchor : pd.Timestamp
        Timestamp that the 'Time (s)' offsets are added to.
    interval : str
        Resampling frequency passed to ``DataFrame.resample``.

    Returns
    -------
    pd.DataFrame or None
        Per-interval means of the numeric columns, indexed by timestamp, with
        the first and last rows removed; ``None`` when the file has no
        'Time (s)' column.
    """
    sensor_df = pd.read_csv(file_path)
    if 'Time (s)' not in sensor_df.columns:
        return None

    # Turn the relative seconds into absolute timestamps so the frame can be resampled.
    sensor_df["timestamp"] = anchor + pd.to_timedelta(sensor_df["Time (s)"], unit="s")
    sensor_df = sensor_df.set_index("timestamp")

    # Only numeric columns can be interpolated and averaged.
    numeric = sensor_df.select_dtypes(include="number")
    numeric = numeric.interpolate(method='linear', limit_direction='both')

    # Average per interval; intervals without samples are filled from their neighbours.
    resampled = numeric.resample(interval).mean().ffill().bfill()

    # Drop the first and last rows (presumably because those bins are only
    # partially covered by the recording -- TODO confirm).
    return resampled.iloc[1:-1]


# Store processed labeled dataframes
dataframes = []

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order;
# sorting keeps the row order of final_df reproducible across machines.
for folder in sorted(base_data_dir.iterdir()):
    if not folder.is_dir():
        continue

    folder_name = folder.name.lower()
    matched_label = next((label for label in labels_to_include if folder_name.startswith(label)), None)
    if not matched_label:
        continue

    print(f"Processing folder: {folder.name} (label = '{matched_label}')")
    folder_data = {}

    # Load and process each relevant file
    for file_name in list_files:
        file_path = folder / file_name
        if not file_path.exists():
            print(f"  Missing {file_name} in {folder.name}")
            continue

        resampled = _load_resampled_sensor(file_path, anchor_time, resample_interval)
        if resampled is None:
            print(f"Skipping {file_name} in {folder.name}: No 'Time (s)' column.")
            continue

        folder_data[file_name.replace('.csv', '')] = resampled
        print(f"  Loaded and resampled {file_name}")

    # Combine and label
    if folder_data:
        # Column-wise concat aligns the sensors on their timestamp index; seconds
        # that a sensor does not cover become NaN in that sensor's columns.
        # NOTE(review): every sensor frame keeps its own numeric 'Time (s)' column,
        # so the combined frame carries that column name more than once --
        # consider dropping or prefixing it before selecting columns by name.
        df_combined = pd.concat(list(folder_data.values()), axis=1)
        df_combined['label'] = matched_label
        df_combined = df_combined.reset_index()
        dataframes.append(df_combined)

# Fail with an actionable message instead of pandas' "No objects to concatenate".
if not dataframes:
    raise ValueError(
        f"No recordings found under '{base_data_dir}': expected sub-folders starting with "
        f"one of {labels_to_include} that contain at least one of {list_files}."
    )

# Final merged dataset
final_df = pd.concat(dataframes, ignore_index=True)

# Show summary
print("Final dataset shape:", final_df.shape)
print("Labels present:", final_df['label'].value_counts())


Processing folder: car (label = 'car')
  Loaded and resampled Accelerometer.csv
  Loaded and resampled Gyroscope.csv
  Loaded and resampled Location.csv
Processing folder: train (label = 'train')
  Loaded and resampled Accelerometer.csv
  Loaded and resampled Gyroscope.csv
  Loaded and resampled Location.csv
Processing folder: train2 (label = 'train')
  Loaded and resampled Accelerometer.csv
  Loaded and resampled Gyroscope.csv
  Loaded and resampled Location.csv
Processing folder: walking (label = 'walking')
  Loaded and resampled Accelerometer.csv
  Loaded and resampled Gyroscope.csv
  Loaded and resampled Location.csv
Processing folder: walking2 (label = 'walking')
  Loaded and resampled Accelerometer.csv
  Loaded and resampled Gyroscope.csv
  Loaded and resampled Location.csv
Final dataset shape: (3262, 18)
Labels present: label
train      1198
walking    1190
car         874
Name: count, dtype: int64


In [20]:
# Discard every row that still has a missing value. The column-wise concat of the
# sensors aligns on timestamp, so seconds not covered by all sensors hold NaN;
# per the original author's note these are always the first or last rows of a
# recording (not verified here).
final_df = final_df.dropna(axis=0, how="any")