# In this file we collect the data used in this project

In [333]:
import pandas as pd
import numpy as np
import json
import os

### Creating a function to merge all the data from the sensors

In [334]:
def merge_sensor_csvs(folder_path, sensor_files):
    """
    Load the sensor CSVs of one folder and join them into a single wide table.

    Every file listed in ``sensor_files`` is read from ``folder_path``. All of
    its columns except 'time' and 'seconds_elapsed' get ``_<sensor name>``
    appended, and the frames are then outer-joined on those two columns, in
    the iteration order of ``sensor_files``.

    Parameters:
        folder_path (str): Directory where the CSVs are stored.
        sensor_files (dict): Maps each filename to the sensor name used as suffix.
                             e.g., {"accelerometer.csv": "acc", "gyroscope.csv": "gyro"}

    Returns:
        pd.DataFrame: Merged dataframe containing all sensor data, or None
        when ``sensor_files`` is empty.
    """
    join_keys = ["time", "seconds_elapsed"]
    combined = None

    for csv_name, suffix in sensor_files.items():
        frame = pd.read_csv(os.path.join(folder_path, csv_name))

        # Tag every sensor-specific column; the join keys keep their names
        new_names = {}
        for column in frame.columns:
            if column in join_keys:
                continue
            new_names[column] = f"{column}_{suffix}"
        frame = frame.rename(columns=new_names)

        # The first frame seeds the result, later ones are joined onto it
        if combined is None:
            combined = frame
            continue

        combined = combined.merge(frame, on=join_keys, how="outer")

    return combined


### Creating a function to merge all the files related to a user

In [335]:
def merge_all_sensor_recordings(parent_folder, sensor_files):
    """
    Merges multiple sensor data folders into a single DataFrame.

    Every sub-directory of ``parent_folder`` is treated as one recording. For
    each recording the sensor CSVs that exist are read, their 'time' column is
    dropped, every column except 'seconds_elapsed' gets ``_<sensor name>``
    appended, and the frames are outer-joined on 'seconds_elapsed'. The
    per-recording frames are then stacked row-wise.

    Recordings are processed in sorted folder-name order so that the row order
    of the result is reproducible (``os.listdir`` returns entries in arbitrary
    order).

    Parameters:
        parent_folder (str): Path to the folder containing subfolders of recordings.
        sensor_files (dict): Filenames and their corresponding sensor name suffixes.
                             e.g., {"accelerometer.csv": "acc", "gyroscope.csv": "gyro"}

    Returns:
        pd.DataFrame: Combined DataFrame of all recordings, with a fresh
        0..n-1 index. Empty if no recording contained any of the listed files.
    """

    def _merge_recording(folder_path):
        """Join the available sensor CSVs of one recording; None if none exist."""
        merged_df = None
        for filename, sensor_name in sensor_files.items():
            file_path = os.path.join(folder_path, filename)
            # A recording may lack some sensors; skip the files that are absent
            if not os.path.exists(file_path):
                continue
            df = pd.read_csv(file_path)
            # Remove time column, so it does not leak the target.
            # errors="ignore": a file without that column needs no change.
            df = df.drop(columns=["time"], errors="ignore")

            df_renamed = df.rename(
                columns={
                    col: f"{col}_{sensor_name}"
                    for col in df.columns
                    if col != "seconds_elapsed"
                }
            )
            if merged_df is None:
                merged_df = df_renamed
            else:
                merged_df = pd.merge(
                    merged_df, df_renamed, on=["seconds_elapsed"], how="outer"
                )
        return merged_df

    all_dfs = []

    # sorted(): deterministic processing order across platforms/filesystems
    for folder_name in sorted(os.listdir(parent_folder)):
        folder_path = os.path.join(parent_folder, folder_name)
        # Plain files sitting next to the recording folders are ignored
        if not os.path.isdir(folder_path):
            continue
        merged = _merge_recording(folder_path)
        if merged is not None:
            all_dfs.append(merged)

    # return empty DataFrame if no data is found
    if not all_dfs:
        return pd.DataFrame()

    # Concatenate all recordings into one DataFrame
    return pd.concat(all_dfs, ignore_index=True)


In [336]:
def aggregate_by_time(df, interval=0.05):
    """
    Aggregates the sensor data into fixed-width time bins.

    Each row is assigned to the multiple of ``interval`` nearest to its
    'seconds_elapsed' value, and all numeric columns (including
    'seconds_elapsed' itself) are averaged within each bin. Binning uses
    'seconds_elapsed' only, so rows from different recordings that fall into
    the same bin are averaged together. Non-numeric columns are dropped and
    the input frame is not modified.

    Parameters:
        df (pd.DataFrame): Merged DataFrame with a 'seconds_elapsed' column.
            An empty frame without that column (e.g. the result of loading a
            folder that held no recordings) is returned as an empty copy.
        interval (float): Time bin size in seconds.

    Returns:
        pd.DataFrame: One row per non-empty time bin, ordered by bin.
        'seconds_elapsed' holds the mean elapsed time of the rows in the bin.
    """
    # Nothing to aggregate: avoid a KeyError on the missing column
    if df.empty and "seconds_elapsed" not in df.columns:
        return df.copy()

    # Work on a copy so the helper column never shows up in the caller's frame
    df = df.copy()

    # Round seconds_elapsed to the nearest multiple of interval
    df["time_bin"] = (df["seconds_elapsed"] / interval).round() * interval

    # Group by time_bin, then average all numeric values
    grouped = df.groupby(["time_bin"]).mean(numeric_only=True).reset_index()

    # The bin label was only a grouping key; the averaged seconds_elapsed stays
    grouped = grouped.drop(columns=["time_bin"])

    return grouped


### Creating a dataframe for each user (we will merge them later)

In [337]:
# CSV file name -> suffix appended to that sensor's column names
sensor_files = {
    "Accelerometer.csv": "accelerometer",
    "Barometer.csv": "barometer",
    "Compass.csv": "compass",
    "Gravity.csv": "gravity",
    "Gyroscope.csv": "gyroscope",
    "WristMotion.csv": "wristMotion",
    "Magnetometer.csv": "magnetometer"
}

# One recordings folder per user; the position in this tuple (starting at 1)
# is the numeric label written to the 'user' column.
recording_roots = ("data/Gait_Natcha", "data/Gait_Eliandro", "data/Gait_Houcen")

per_user = []
for user_id, root in enumerate(recording_roots, start=1):
    frame = merge_all_sensor_recordings(root, sensor_files)
    frame = aggregate_by_time(frame, interval=0.01)
    frame["user"] = user_id
    per_user.append(frame)

df_natcha, df_eliandro, df_houcen = per_user


In [338]:
# Row count of each per-user frame, one number per line
for user_frame in (df_houcen, df_eliandro, df_natcha):
    print(len(user_frame))

81429
117136
88250


In [339]:
# Stack the per-user frames, then order the rows by elapsed time
user_frames = [df_houcen, df_natcha, df_eliandro]
df = pd.concat(user_frames, ignore_index=True).sort_values(
    by=["seconds_elapsed"], ignore_index=True
)
df

Unnamed: 0,seconds_elapsed,z_accelerometer,y_accelerometer,x_accelerometer,z_gravity,y_gravity,x_gravity,z_gyroscope,y_gyroscope,x_gyroscope,...,quaternionX_wristMotion,quaternionY_wristMotion,quaternionZ_wristMotion,z_magnetometer,y_magnetometer,x_magnetometer,user,relativeAltitude_barometer,pressure_barometer,magneticBearing_compass
0,-0.570446,,,,,,,,,,...,,,,,,,2,0.0,1014.732361,
1,-0.541341,,,,,,,,,,...,,,,,,,1,0.0,1016.061707,
2,-0.522906,,,,,,,,,,...,,,,,,,1,0.0,1015.541534,
3,-0.522268,,,,,,,,,,...,,,,,,,2,0.0,1014.458237,
4,0.029490,0.620725,-0.024275,0.340136,-7.344242,-4.756945,-4.427637,0.015129,-0.497939,-0.129197,...,,,,-31.496674,-60.121986,5.366821,2,,,185.101018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286810,2906.188554,,,,,,,,,,...,0.186314,0.367319,0.485740,,,,1,,,
286811,2906.198618,,,,,,,,,,...,0.186659,0.373277,0.500053,,,,1,,,
286812,2906.208682,,,,,,,,,,...,0.186206,0.379189,0.513737,,,,1,,,
286813,2906.218746,,,,,,,,,,...,0.185107,0.385247,0.526691,,,,1,,,


### Remove the instances where seconds_elapsed < 0

In [340]:
# Keep only rows whose elapsed time is zero or positive (index is not reset)
is_non_negative = df["seconds_elapsed"] >= 0
df = df[is_non_negative]
df

Unnamed: 0,seconds_elapsed,z_accelerometer,y_accelerometer,x_accelerometer,z_gravity,y_gravity,x_gravity,z_gyroscope,y_gyroscope,x_gyroscope,...,quaternionX_wristMotion,quaternionY_wristMotion,quaternionZ_wristMotion,z_magnetometer,y_magnetometer,x_magnetometer,user,relativeAltitude_barometer,pressure_barometer,magneticBearing_compass
4,0.029490,0.620725,-0.024275,0.340136,-7.344242,-4.756945,-4.427637,0.015129,-0.497939,-0.129197,...,,,,-31.496674,-60.121986,5.366821,2,,,185.101018
5,0.040027,-0.743177,-0.262221,-0.391995,-9.218947,-3.114202,-0.276137,0.203035,0.026631,-0.031968,...,,,,-41.762733,-2.015360,-14.336876,3,,,
6,0.040460,0.600845,0.465898,-1.121842,-7.730180,-6.002190,-0.623231,-0.556805,-0.030059,-0.312905,...,,,,-34.130707,-43.246155,19.207825,1,,,203.948450
7,0.041894,1.271829,-0.032704,-0.037981,-7.952026,-4.744402,-2.040784,-0.022566,-0.200779,-0.155414,...,,,,-35.730331,-41.014557,-6.380966,2,,,162.748917
8,0.049252,1.260347,-0.319836,-0.342097,-8.211313,-5.308533,-0.365590,-0.110204,0.161805,-0.306240,...,,,,-29.229975,-30.687719,15.079638,1,,,205.873741
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286810,2906.188554,,,,,,,,,,...,0.186314,0.367319,0.485740,,,,1,,,
286811,2906.198618,,,,,,,,,,...,0.186659,0.373277,0.500053,,,,1,,,
286812,2906.208682,,,,,,,,,,...,0.186206,0.379189,0.513737,,,,1,,,
286813,2906.218746,,,,,,,,,,...,0.185107,0.385247,0.526691,,,,1,,,


In [341]:
# Fraction of missing values per column in the combined frame
missing_counts = df.isna().sum()
missing_counts / len(df)

seconds_elapsed               0.000000
z_accelerometer               0.037342
y_accelerometer               0.037342
x_accelerometer               0.037342
z_gravity                     0.037342
y_gravity                     0.037342
x_gravity                     0.037342
z_gyroscope                   0.037342
y_gyroscope                   0.037342
x_gyroscope                   0.037342
rotationRateX_wristMotion     0.051849
rotationRateY_wristMotion     0.051849
rotationRateZ_wristMotion     0.051849
gravityX_wristMotion          0.051849
gravityY_wristMotion          0.051849
gravityZ_wristMotion          0.051849
accelerationX_wristMotion     0.051849
accelerationY_wristMotion     0.051849
accelerationZ_wristMotion     0.051849
quaternionW_wristMotion       0.051849
quaternionX_wristMotion       0.051849
quaternionY_wristMotion       0.051849
quaternionZ_wristMotion       0.051849
z_magnetometer                0.037342
y_magnetometer                0.037342
x_magnetometer           

In [342]:
# Fraction of missing values per column in df_eliandro
eliandro_missing = df_eliandro.isna().sum()
eliandro_missing / len(df_eliandro)

seconds_elapsed               0.000000
z_accelerometer               0.000017
y_accelerometer               0.000017
x_accelerometer               0.000017
relativeAltitude_barometer    0.982465
pressure_barometer            0.982465
magneticBearing_compass       0.000017
z_gravity                     0.000017
y_gravity                     0.000017
x_gravity                     0.000017
z_gyroscope                   0.000017
y_gyroscope                   0.000017
x_gyroscope                   0.000017
rotationRateX_wristMotion     0.125896
rotationRateY_wristMotion     0.125896
rotationRateZ_wristMotion     0.125896
gravityX_wristMotion          0.125896
gravityY_wristMotion          0.125896
gravityZ_wristMotion          0.125896
accelerationX_wristMotion     0.125896
accelerationY_wristMotion     0.125896
accelerationZ_wristMotion     0.125896
quaternionW_wristMotion       0.125896
quaternionX_wristMotion       0.125896
quaternionY_wristMotion       0.125896
quaternionZ_wristMotion  

In [343]:
# Fraction of missing values per column in df_houcen
houcen_missing = df_houcen.isna().sum()
houcen_missing / len(df_houcen)

seconds_elapsed              0.000000
z_accelerometer              0.000172
y_accelerometer              0.000172
x_accelerometer              0.000172
z_gravity                    0.000172
y_gravity                    0.000172
x_gravity                    0.000172
z_gyroscope                  0.000172
y_gyroscope                  0.000172
x_gyroscope                  0.000172
rotationRateX_wristMotion    0.000970
rotationRateY_wristMotion    0.000970
rotationRateZ_wristMotion    0.000970
gravityX_wristMotion         0.000970
gravityY_wristMotion         0.000970
gravityZ_wristMotion         0.000970
accelerationX_wristMotion    0.000970
accelerationY_wristMotion    0.000970
accelerationZ_wristMotion    0.000970
quaternionW_wristMotion      0.000970
quaternionX_wristMotion      0.000970
quaternionY_wristMotion      0.000970
quaternionZ_wristMotion      0.000970
z_magnetometer               0.000172
y_magnetometer               0.000172
x_magnetometer               0.000172
user        

In [344]:
# Fraction of missing values per column in df_natcha
natcha_missing = df_natcha.isna().sum()
natcha_missing / len(df_natcha)

seconds_elapsed               0.000000
z_accelerometer               0.121224
y_accelerometer               0.121224
x_accelerometer               0.121224
relativeAltitude_barometer    0.979932
pressure_barometer            0.979932
magneticBearing_compass       0.121224
z_gravity                     0.121224
y_gravity                     0.121224
x_gravity                     0.121224
z_gyroscope                   0.121224
y_gyroscope                   0.121224
x_gyroscope                   0.121224
rotationRateX_wristMotion     0.000555
rotationRateY_wristMotion     0.000555
rotationRateZ_wristMotion     0.000555
gravityX_wristMotion          0.000555
gravityY_wristMotion          0.000555
gravityZ_wristMotion          0.000555
accelerationX_wristMotion     0.000555
accelerationY_wristMotion     0.000555
accelerationZ_wristMotion     0.000555
quaternionW_wristMotion       0.000555
quaternionX_wristMotion       0.000555
quaternionY_wristMotion       0.000555
quaternionZ_wristMotion  

### Drop columns with too many missing values or that are missing in a user's dataset

In [345]:
# Barometer and compass columns are removed, then the missing ratio is re-checked
columns_to_drop = ["relativeAltitude_barometer", "pressure_barometer", "magneticBearing_compass"]
df = df.drop(columns=columns_to_drop)
remaining_missing = df.isna().sum()
remaining_missing / len(df)


seconds_elapsed              0.000000
z_accelerometer              0.037342
y_accelerometer              0.037342
x_accelerometer              0.037342
z_gravity                    0.037342
y_gravity                    0.037342
x_gravity                    0.037342
z_gyroscope                  0.037342
y_gyroscope                  0.037342
x_gyroscope                  0.037342
rotationRateX_wristMotion    0.051849
rotationRateY_wristMotion    0.051849
rotationRateZ_wristMotion    0.051849
gravityX_wristMotion         0.051849
gravityY_wristMotion         0.051849
gravityZ_wristMotion         0.051849
accelerationX_wristMotion    0.051849
accelerationY_wristMotion    0.051849
accelerationZ_wristMotion    0.051849
quaternionW_wristMotion      0.051849
quaternionX_wristMotion      0.051849
quaternionY_wristMotion      0.051849
quaternionZ_wristMotion      0.051849
z_magnetometer               0.037342
y_magnetometer               0.037342
x_magnetometer               0.037342
user        

#### Removing the gravity, wristMotion and magnetometer columns, which contribute too much to the task

In [None]:
# Drop every column whose name contains 'gravity' or ends with one of the
# listed sensor suffixes; all other columns keep their order.
excluded_suffixes = ('wristMotion', 'magnetometer')
kept_columns = [
    name
    for name in df.columns.to_list()
    if 'gravity' not in name and not name.endswith(excluded_suffixes)
]
df = df[kept_columns]
df

### Saving the data

In [347]:
# Write the final table to disk without the index column
output_path = 'data/dataFile.csv'
df.to_csv(output_path, index=False)