# In this file we collect the data used in this project

In [116]:
import pandas as pd
import numpy as np
import json
import os

### Creating a function to merge all the data from the sensors

In [117]:
def merge_sensor_csvs(folder_path, sensor_files):
    """
    Load several sensor CSVs from one folder and join them into one table.

    Every CSV is expected to provide 'time' and 'seconds_elapsed' columns, which
    serve as the join keys. All other columns get the sensor name appended as a
    suffix (e.g. 'x' -> 'x_acc') so columns of different sensors do not clash.

    Parameters:
        folder_path (str): Directory where the CSVs are stored.
        sensor_files (dict): Maps each filename to the sensor name used as suffix,
                             e.g., {"accelerometer.csv": "acc", "gyroscope.csv": "gyro"}

    Returns:
        pd.DataFrame: Outer join of all sensor tables on the key columns, or
                      None when sensor_files is empty.
    """
    key_columns = ["time", "seconds_elapsed"]
    combined = None

    for csv_name, suffix in sensor_files.items():
        frame = pd.read_csv(os.path.join(folder_path, csv_name))

        # Suffix every non-key column with the sensor name
        renames = {}
        for column in frame.columns:
            if column in key_columns:
                continue
            renames[column] = f"{column}_{suffix}"
        frame = frame.rename(columns=renames)

        # The first table seeds the result; later ones are outer-joined onto it
        if combined is None:
            combined = frame
        else:
            combined = pd.merge(combined, frame, on=key_columns, how="outer")

    return combined


### Creating a function to merge all the files related to a user

In [118]:
def merge_all_sensor_recordings(parent_folder, sensor_files, session_column=None):
    """
    Merges multiple sensor data folders into a single DataFrame.

    Each direct subfolder of parent_folder is treated as one recording. Within a
    recording, the sensor CSVs that exist are outer-joined on 'seconds_elapsed'
    (the 'time' column is dropped first); the recordings are then stacked on top
    of each other in sorted folder-name order.

    Parameters:
        parent_folder (str): Path to the folder containing subfolders of recordings.
        sensor_files (dict): Filenames and their corresponding sensor name suffixes.
                             e.g., {"accelerometer.csv": "acc", "gyroscope.csv": "gyro"}
        session_column (str, optional): When given, a column with this name is
                             added holding the subfolder name each row came from,
                             so rows of different recordings can be told apart
                             after concatenation. Defaults to None (no column),
                             which keeps the original output layout.

    Returns:
        pd.DataFrame: Combined DataFrame of all recordings, or an empty DataFrame
                      when no subfolder contains any of the sensor files.
    """

    def _merge_recording(folder_path):
        """Outer-join the sensor CSVs found in one recording folder (None if none exist)."""
        merged_df = None
        for filename, sensor_name in sensor_files.items():
            file_path = os.path.join(folder_path, filename)
            # Not every recording has to contain every sensor file
            if not os.path.exists(file_path):
                continue
            df = pd.read_csv(file_path)
            # Remove time column, so it does not leak the target.
            # errors="ignore" keeps CSVs that never had the column usable.
            df = df.drop(columns=["time"], errors="ignore")

            df_renamed = df.rename(
                columns={
                    col: f"{col}_{sensor_name}"
                    for col in df.columns
                    if col != "seconds_elapsed"
                }
            )
            if merged_df is None:
                merged_df = df_renamed
            else:
                merged_df = pd.merge(
                    merged_df, df_renamed, on=["seconds_elapsed"], how="outer"
                )
        return merged_df

    all_dfs = []

    # os.listdir returns entries in arbitrary order; sort for reproducible output
    for folder_name in sorted(os.listdir(parent_folder)):
        folder_path = os.path.join(parent_folder, folder_name)
        if not os.path.isdir(folder_path):
            continue
        merged = _merge_recording(folder_path)
        if merged is None:
            continue
        if session_column is not None:
            # Tag each row with the recording (subfolder) it came from
            merged[session_column] = folder_name
        all_dfs.append(merged)

    # return empty DataFrame if no data is found
    if not all_dfs:
        return pd.DataFrame()

    # Concatenate all recordings into one DataFrame
    return pd.concat(all_dfs, ignore_index=True)


### Creating a dataframe for each user (we will merge them later)

In [119]:
# Sensor CSV filename -> suffix appended to that sensor's column names
sensor_files = {
    "Accelerometer.csv": "accelerometer",
    "Barometer.csv": "barometer",
    "Compass.csv": "compass",
    "Gravity.csv": "gravity",
    "Gyroscope.csv": "gyro",
    "WristMotion.csv": "wristMotion",
    "Magnetometer.csv": "magnetometer",
}

# One DataFrame per participant, each tagged with a numeric user id
df_natcha = merge_all_sensor_recordings(
    "data/Gait_Natcha", sensor_files
).assign(user=1)
df_eliandro = merge_all_sensor_recordings(
    "data/Gait_Eliandro", sensor_files
).assign(user=2)
df_houcen = merge_all_sensor_recordings(
    "data/Gait_Houcen", sensor_files
).assign(user=3)


#### Let's aggregate the data into 0.05 s time bins, taking the mean of each variable within a bin.

In [120]:
def aggregate_by_time(df, interval=0.05, group_cols=("user",)):
    """
    Aggregates the sensor data into fixed-width time bins.

    'seconds_elapsed' is snapped to the nearest multiple of interval, and every
    numeric column is averaged within each (group_cols..., time bin) group.

    Parameters:
        df (pd.DataFrame): Merged DataFrame with a 'seconds_elapsed' column and
                           the columns named in group_cols. It is not modified.
        interval (float): Time bin size in seconds. Must be positive.
        group_cols (str or sequence of str): Columns whose values must not be
                           mixed when averaging. Defaults to ("user",).

    Returns:
        pd.DataFrame: One row per group and time bin. 'seconds_elapsed' holds the
                      bin value; non-numeric columns are dropped.

    Raises:
        ValueError: If interval is not positive.
    """
    if interval <= 0:
        raise ValueError(f"interval must be positive, got {interval}")
    if isinstance(group_cols, str):
        group_cols = [group_cols]

    # Replace seconds_elapsed by its bin on a new frame. Keeping the raw column
    # next to a separate bin column would leave two 'seconds_elapsed' columns
    # once the bin column is renamed after averaging.
    binned = df.assign(
        seconds_elapsed=(df["seconds_elapsed"] / interval).round() * interval
    )

    # Average all numeric values per group and time bin
    keys = [*group_cols, "seconds_elapsed"]
    return binned.groupby(keys).mean(numeric_only=True).reset_index()


In [121]:
# Display the merged recordings of user 1 for a visual check
df_natcha

Unnamed: 0,seconds_elapsed,z_accelerometer,y_accelerometer,x_accelerometer,relativeAltitude_barometer,pressure_barometer,magneticBearing_compass,z_gravity,y_gravity,x_gravity,...,accelerationY_wristMotion,accelerationZ_wristMotion,quaternionW_wristMotion,quaternionX_wristMotion,quaternionY_wristMotion,quaternionZ_wristMotion,z_magnetometer,y_magnetometer,x_magnetometer,user
0,0.047745,2.575090,0.103548,0.858687,,,202.450166,-8.580944,-4.746952,-0.065009,...,,,,,,,-34.319763,-18.035522,7.452187,1
1,0.057738,2.746248,-0.088969,0.859846,,,201.835504,-8.576129,-4.755847,-0.048062,...,,,,,,,-33.866211,-18.232803,7.305710,1
2,0.067731,2.754037,-0.275779,0.818044,,,201.789311,-8.576884,-4.754588,-0.036486,...,,,,,,,-33.934814,-18.267181,7.302399,1
3,0.077724,2.613906,-0.333490,0.706105,,,201.722670,-8.582501,-4.744462,-0.033783,...,,,,,,,-34.055695,-18.421402,7.339211,1
4,0.087717,2.278119,-0.318283,0.553217,,,202.541786,-8.591330,-4.728395,-0.041457,...,,,,,,,-33.790573,-18.529175,7.690872,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422179,2906.188554,,,,,,,,,,...,0.244525,-0.210839,-0.770986,0.186314,0.367319,0.485740,,,,1
422180,2906.198618,,,,,,,,,,...,0.218662,-0.105817,-0.758795,0.186659,0.373277,0.500053,,,,1
422181,2906.208682,,,,,,,,,,...,0.165040,-0.028743,-0.746738,0.186206,0.379189,0.513737,,,,1
422182,2906.218746,,,,,,,,,,...,0.121682,0.013318,-0.734790,0.185107,0.385247,0.526691,,,,1


In [122]:
# Count the missing values in each column of user 1's data
df_natcha.isna().sum()

seconds_elapsed                    0
z_accelerometer               229128
y_accelerometer               229128
x_accelerometer               229128
relativeAltitude_barometer    420399
pressure_barometer            420399
magneticBearing_compass       229128
z_gravity                     229128
y_gravity                     229128
x_gravity                     229128
z_gyro                        229128
y_gyro                        229128
x_gyro                        229128
rotationRateX_wristMotion     194841
rotationRateY_wristMotion     194841
rotationRateZ_wristMotion     194841
gravityX_wristMotion          194841
gravityY_wristMotion          194841
gravityZ_wristMotion          194841
accelerationX_wristMotion     194841
accelerationY_wristMotion     194841
accelerationZ_wristMotion     194841
quaternionW_wristMotion       194841
quaternionX_wristMotion       194841
quaternionY_wristMotion       194841
quaternionZ_wristMotion       194841
z_magnetometer                229128
y

In [123]:
# Stack the three users' data, then order all rows by elapsed time
df = pd.concat(
    [df_houcen, df_natcha, df_eliandro],
    ignore_index=True,
).sort_values(by="seconds_elapsed", ignore_index=True)
df

Unnamed: 0,seconds_elapsed,z_accelerometer,y_accelerometer,x_accelerometer,z_gravity,y_gravity,x_gravity,z_gyro,y_gyro,x_gyro,...,quaternionX_wristMotion,quaternionY_wristMotion,quaternionZ_wristMotion,z_magnetometer,y_magnetometer,x_magnetometer,user,relativeAltitude_barometer,pressure_barometer,magneticBearing_compass
0,-0.570446,,,,,,,,,,...,,,,,,,2,0.0,1014.732361,
1,-0.541341,,,,,,,,,,...,,,,,,,1,0.0,1016.061707,
2,-0.523916,,,,,,,,,,...,,,,,,,1,0.0,1015.169830,
3,-0.522268,,,,,,,,,,...,,,,,,,2,0.0,1014.458237,
4,-0.521896,,,,,,,,,,...,,,,,,,1,0.0,1015.913239,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170837,2906.188554,,,,,,,,,,...,0.186314,0.367319,0.485740,,,,1,,,
1170838,2906.198618,,,,,,,,,,...,0.186659,0.373277,0.500053,,,,1,,,
1170839,2906.208682,,,,,,,,,,...,0.186206,0.379189,0.513737,,,,1,,,
1170840,2906.218746,,,,,,,,,,...,0.185107,0.385247,0.526691,,,,1,,,


In [124]:
# Fraction of missing values per column in the combined data
df.isna().sum() / len(df)

seconds_elapsed               0.000000
z_accelerometer               0.465247
y_accelerometer               0.465247
x_accelerometer               0.465247
z_gravity                     0.465247
y_gravity                     0.465247
x_gravity                     0.465247
z_gyro                        0.465247
y_gyro                        0.465247
x_gyro                        0.465247
rotationRateX_wristMotion     0.538032
rotationRateY_wristMotion     0.538032
rotationRateZ_wristMotion     0.538032
gravityX_wristMotion          0.538032
gravityY_wristMotion          0.538032
gravityZ_wristMotion          0.538032
accelerationX_wristMotion     0.538032
accelerationY_wristMotion     0.538032
accelerationZ_wristMotion     0.538032
quaternionW_wristMotion       0.538032
quaternionX_wristMotion       0.538032
quaternionY_wristMotion       0.538032
quaternionZ_wristMotion       0.538032
z_magnetometer                0.465247
y_magnetometer                0.465247
x_magnetometer           

### Saving the data

In [125]:
# Write the combined data of all users to one CSV, without the index column
df.to_csv('data/dataFile.csv', index=False)