# IMU Data Classification

In [56]:
# Visualisation
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns

# Data processing
from sklearn.impute import KNNImputer

# Helper functions
from helper_filter import *

## Filter data
### Extract data tables

In [57]:
# Load the recordings of every target action. One load_data call is made per
# action folder, in the order listed, and each result is bound to its own name.
action_dirs = (
    "./IMU_Data/LGW",
    "./IMU_Data/Ramp_ascend",
    "./IMU_Data/Ramp_descend",
    "./IMU_Data/Sit_to_stand",
    "./IMU_Data/Stand_to_sit",
)
(lqw_raw,
 ramp_ascend_raw,
 ramp_descend_raw,
 sit_to_stand_raw,
 stand_to_sit_raw) = map(load_data, action_dirs)

In [58]:
# Preview the first rows of the first LGW recording to check the column layout
# (sensor name, measurement type and axis are encoded in the column names).
lqw_raw[0].data.head()

Unnamed: 0,Thigh_R_Timestamp,Thigh_R_Gyroscope_X,Thigh_R_Gyroscope_Y,Thigh_R_Gyroscope_Z,Thigh_R_Accelerometer_X,Thigh_R_Accelerometer_Y,Thigh_R_Accelerometer_Z,Thigh_R_Magnetometer_X,Thigh_R_Magnetometer_Y,Thigh_R_Magnetometer_Z,...,Pelvis_Gyroscope_Y,Pelvis_Gyroscope_Z,Pelvis_Accelerometer_X,Pelvis_Accelerometer_Y,Pelvis_Accelerometer_Z,Pelvis_Magnetometer_X,Pelvis_Magnetometer_Y,Pelvis_Magnetometer_Z,Annotation_Pulse,Annotation_Level
0,241787.1094,0.5345,1.222,-0.8404,-2.1075,9.1086,3.2014,-0.2819,-0.7146,-0.2299,...,0.56,0.0212,-0.1209,9.6397,-1.0211,-0.1688,-0.6494,-0.1636,0.0,0.0
1,241796.875,0.5345,1.222,-0.8404,-2.109,9.0322,3.0518,-0.2819,-0.7223,-0.2321,...,0.56,0.0212,-0.1209,9.7162,-1.0218,-0.1725,-0.6264,-0.1728,0.0,0.0
2,241806.6406,-0.2664,0.2175,-1.1036,-2.1872,9.185,3.2015,-0.28,-0.7049,-0.2148,...,0.56,0.0212,-0.1209,9.7926,-1.0226,-0.1781,-0.6533,-0.1751,0.0,0.0
3,241816.4062,-0.2664,0.2175,-1.1036,-2.1082,9.185,3.1255,-0.2876,-0.7049,-0.2169,...,1.0746,-0.0255,-0.2044,9.7123,-1.4106,-0.1744,-0.6513,-0.1728,0.0,0.0
4,241826.1719,-0.2664,0.2175,-1.1036,-2.0293,9.0322,3.051,-0.2743,-0.6893,-0.2169,...,1.0746,-0.0255,-0.2793,9.7934,-0.9448,-0.1725,-0.6284,-0.1728,0.0,0.0


In [59]:
# Plot histograms to visualize all data
#lqw_raw[0].data.hist(bins=50,figsize=(30,30))

### Remove unwanted columns
From the table above, we can see that multiple timestamps are used across the different files. It was decided to investigate further whether the timestamps are aligned and can be ignored. Under the LGW action, some files are missing the "Sync" and "Offset" timestamp columns, so it was decided to remove all columns that contain them to ensure consistency across the data. Additionally, the LWR from SV is missing the timestamp from the Right sensors and Thigh.

The total number of entries is shown as well. The majority of the data comes from the ground walking action, and less from the standing and sitting actions. This might result in a bias towards the ground walking action.

In [60]:
# Summarise every loaded file: number of rows, number of columns, and the
# columns that are not raw sensor channels.
SENSOR_KEYWORDS = ("accelerometer", "magnetometer", "gyroscope")


def _is_sensor_column(name):
    """Return True when the lower-cased column name contains a sensor keyword."""
    lowered = name.lower()
    return any(keyword in lowered for keyword in SENSOR_KEYWORDS)


head = ["Action","File name", "Row Nr", "Column Nr", "Non-standard columns names"]
folders = [lqw_raw, ramp_ascend_raw, ramp_descend_raw, sit_to_stand_raw, stand_to_sit_raw]
column_table = []

for folder in folders:
    for file in folder:
        # Everything that is not a sensor channel (timestamps, annotations, ...)
        filtered_columns = [col for col in file.data.columns if not _is_sensor_column(col)]
        row_count = len(file.data.index)
        column_count = len(file.data.columns)
        column_table.append(
            [file.folder_name, file.file_name, row_count, column_count, filtered_columns]
        )

print(tabulate(column_table, headers=head, tablefmt='grid'))

+--------------+---------------------------------------+----------+-------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Action       | File name                             |   Row Nr |   Column Nr | Non-standard columns names                                                                                                                                                                                                                                                                                                                                                 |
| LGW          | normal_walk_J_trial_02.dat            |     1626 |          72 | ['Thigh_R_Timestamp', 'S

In [61]:
# Remove, in place, every column whose lower-cased name mentions a sync or
# offset timestamp, so that all files share the same set of columns.
unwanted_keywords = ["sync", "offset"]

for folder in folders:
    for file in folder:
        columns_to_drop = []
        for col in file.data.columns:
            lowered = col.lower()
            if any(keyword in lowered for keyword in unwanted_keywords):
                columns_to_drop.append(col)
        file.data.drop(columns=columns_to_drop, inplace=True)

In [62]:
# For every LGW file, check whether the selected timestamp columns hold
# identical values row by row, i.e. whether the sensors share one time base.

# A column is selected only if its lower-cased name contains ALL of these
# keywords. Remove "thigh" to compare the timestamps of every sensor.
TIME_KEYWORDS = ("timestamp", "thigh")

# file name -> True when all selected timestamp columns are identical
timestamp_check = {}

for file in folders[0]:
    # The previous condition `'timestamp' and 'thigh' in col.lower()` only
    # tested for 'thigh' (a non-empty string literal is always truthy), so it
    # selected every thigh sensor channel rather than the timestamp columns.
    time_columns = [col for col in file.data.columns
                    if all(keyword in col.lower() for keyword in TIME_KEYWORDS)]

    # Some files lack the selected timestamp columns; nothing to compare then.
    if not time_columns:
        continue

    # Reference column for comparison
    ref_column = file.data[time_columns[0]]

    # Row-wise flag: True where every other time column equals the reference.
    # With a single time column the selection is empty and every row is True.
    identical_data = file.data[time_columns[1:]].eq(ref_column, axis=0).all(axis=1)

    are_time_columns_identical = bool(identical_data.all())
    timestamp_check[file.file_name] = are_time_columns_identical
    #print(f"Identical data? {are_time_columns_identical}")

### Check for NaNs
It can be observed that the only files that contain NaNs are normal_walk_lg_trial_01.dat and normal_walk_lg_trial_02.dat. Both files contain 1521 entries, with 17 or 56 NaN entries in individual columns. The NaN values constitute 1.12% and 3.68% of the total entries, respectively. A nearest-neighbors imputation strategy is used to replace the missing data in the set. Originally, a simple imputation with a "median" strategy was used but, after checking the data, all of the features that need imputation are Gaussian distributed (except the Pelvic magnetometer data, which has two peaks). It is better to replace the missing data with a Gaussian-distributed set of values than with a constant. k-Nearest Neighbors offers the advantage of tuning the missing values by using the neighboring entries.

In [63]:
# Report, for every file that contains missing values, the total number of
# NaNs and the per-column NaN counts. The affected dataframes are also kept
# in columns_to_visualize for optional plotting.
nan_table = []
head = ["Action","File name", "NaN total number", "NaN columns"]
folders = [lqw_raw, ramp_ascend_raw, ramp_descend_raw, sit_to_stand_raw, stand_to_sit_raw]
columns_to_visualize = []

for folder in folders:
    for file in folder:
        # NaN count of each column, computed once and reused below
        nan_per_column = file.data.isnull().sum()
        nan_number = nan_per_column.sum()

        # Files without NaN values are left out of the table
        if not nan_number > 0:
            continue

        columns_to_visualize.append(file.data)

        # One "column=count" line per column that has at least one NaN
        nan_columns = "".join(
            col + "=" + str(count) + "\n"
            for col, count in nan_per_column.items()
            if count > 0
        )

        nan_table.append([file.folder_name, file.file_name, nan_number, nan_columns])

print(tabulate(nan_table, headers=head, tablefmt='grid'))

+----------+-----------------------------+--------------------+----------------------------+
| Action   | File name                   |   NaN total number | NaN columns                |
| LGW      | normal_walk_lg_trial_01.dat |                657 | Shank_L_Timestamp=17       |
|          |                             |                    | Shank_L_Gyroscope_X=17     |
|          |                             |                    | Shank_L_Gyroscope_Y=17     |
|          |                             |                    | Shank_L_Gyroscope_Z=17     |
|          |                             |                    | Shank_L_Accelerometer_X=17 |
|          |                             |                    | Shank_L_Accelerometer_Y=17 |
|          |                             |                    | Shank_L_Accelerometer_Z=17 |
|          |                             |                    | Shank_L_Magnetometer_X=17  |
|          |                             |                    | Shank_

In [64]:
# Plot histograms to visualize all data
#for visualize in columns_to_visualize:
#    visualize.hist(bins=50,figsize=(30,30))

In [67]:
# Replace NaN values with the k-Nearest Neighbor estimate
for folder in folders:
    for file in folder:
        # Only files that actually contain missing values are imputed
        if file.data.isnull().sum().sum() > 0:
            imputer = KNNImputer(n_neighbors=5)
            # Store the result on file.data. Assigning to the loop variable
            # `file` (as was done before) only rebinds the name and leaves the
            # NaNs in the stored dataframe untouched.
            # fit_transform returns a plain array, so the original column
            # names and row index are re-attached explicitly.
            file.data = pd.DataFrame(imputer.fit_transform(file.data),
                                     columns=file.data.columns,
                                     index=file.data.index)

## Preprocess data
### Apply filtering

In [None]:
# TODO 

### Apply the sliding window technique

In [None]:
# TODO

## Train models
### ANN

In [None]:
# TODO

### SVM

In [None]:
# TODO

### CNN

In [None]:
# TODO

### Comparison

In [None]:
# TODO