# IMU Data Classification

In [19]:
# Visualisation
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns

# Data processing
import pandas as pd
from sklearn.impute import KNNImputer

# Helper functions
from helper_filter import *
from helper_preprocess import *

## Filter data
### Extract data tables and visualise

In [20]:
# Load the raw recordings of every target action; `folders` holds one entry per action
data_dirs = [
    "./IMU_Data/LGW",
    "./IMU_Data/Ramp_ascend",
    "./IMU_Data/Ramp_descend",
    "./IMU_Data/Sit_to_stand",
    "./IMU_Data/Stand_to_sit",
]
folders = [load_data(path) for path in data_dirs]

# Keep a named handle on each action's recordings (same order as data_dirs)
lqw_raw, ramp_ascend_raw, ramp_descend_raw, sit_to_stand_raw, stand_to_sit_raw = folders

In [21]:
# Preview some of the data to check format:
# summary statistics (count, mean, std, min, quartiles, max) of every column
# of the first recording loaded from the LGW folder.
lqw_raw[0].data.describe()

Unnamed: 0,Thigh_R_Timestamp,Thigh_R_Gyroscope_X,Thigh_R_Gyroscope_Y,Thigh_R_Gyroscope_Z,Thigh_R_Accelerometer_X,Thigh_R_Accelerometer_Y,Thigh_R_Accelerometer_Z,Thigh_R_Magnetometer_X,Thigh_R_Magnetometer_Y,Thigh_R_Magnetometer_Z,...,Pelvis_Gyroscope_Y,Pelvis_Gyroscope_Z,Pelvis_Accelerometer_X,Pelvis_Accelerometer_Y,Pelvis_Accelerometer_Z,Pelvis_Magnetometer_X,Pelvis_Magnetometer_Y,Pelvis_Magnetometer_Z,Annotation_Pulse,Annotation_Level
count,1626.0,1626.0,1626.0,1626.0,1626.0,1626.0,1626.0,1626.0,1626.0,1626.0,...,1626.0,1626.0,1626.0,1626.0,1626.0,1626.0,1626.0,1626.0,1626.0,1626.0
mean,249721.679675,3.016005,-20.459045,2.968838,-1.211503,9.496468,3.54788,-0.098999,-0.685384,-0.268764,...,-21.807592,-5.205306,-0.005444,9.747295,-0.915822,-0.016177,-0.644223,0.072678,0.0,0.0
std,4585.254729,37.556036,99.867926,77.125395,4.50186,2.824768,2.773884,0.350785,0.133166,0.212483,...,52.899,20.501709,1.588494,1.822342,1.491137,0.288767,0.080635,0.278026,0.0,0.0
min,241787.1094,-119.3965,-443.2692,-193.6658,-19.0531,0.5549,-10.5419,-0.7067,-1.0039,-0.6963,...,-229.5596,-66.0044,-5.9974,5.48,-7.5381,-0.4824,-0.8333,-0.3203,0.0,0.0
25%,245754.3945,-18.7162,-85.5971,-31.4424,-3.09185,8.1158,2.16785,-0.3848,-0.7631,-0.449,...,-26.2146,-17.276975,-1.0928,8.343075,-1.7018,-0.27595,-0.7031,-0.1774,0.0,0.0
50%,249721.6797,-0.3044,1.28695,20.2579,-1.64095,9.2613,3.50975,-0.1429,-0.6816,-0.2321,...,-5.9028,-2.9679,-0.0393,9.6382,-0.9433,-0.1521,-0.6398,-0.1106,0.0,0.0
75%,253688.9648,23.6513,43.3287,63.8239,0.294725,10.7888,4.82485,0.186225,-0.6058,-0.1236,...,7.363425,9.3366,1.070175,10.804825,0.1495,0.2783,-0.592,0.3318,0.0,0.0
max,257656.25,124.4816,296.7993,107.8979,22.2332,21.5573,17.2028,0.6286,-0.3553,0.3059,...,63.3268,39.4151,4.1634,15.5634,3.69,0.4137,-0.4138,0.6567,0.0,0.0


In [22]:
# Show the first rows of the first LGW recording to inspect the raw column layout
lqw_raw[0].data.head()

Unnamed: 0,Thigh_R_Timestamp,Thigh_R_Gyroscope_X,Thigh_R_Gyroscope_Y,Thigh_R_Gyroscope_Z,Thigh_R_Accelerometer_X,Thigh_R_Accelerometer_Y,Thigh_R_Accelerometer_Z,Thigh_R_Magnetometer_X,Thigh_R_Magnetometer_Y,Thigh_R_Magnetometer_Z,...,Pelvis_Gyroscope_Y,Pelvis_Gyroscope_Z,Pelvis_Accelerometer_X,Pelvis_Accelerometer_Y,Pelvis_Accelerometer_Z,Pelvis_Magnetometer_X,Pelvis_Magnetometer_Y,Pelvis_Magnetometer_Z,Annotation_Pulse,Annotation_Level
0,241787.1094,0.5345,1.222,-0.8404,-2.1075,9.1086,3.2014,-0.2819,-0.7146,-0.2299,...,0.56,0.0212,-0.1209,9.6397,-1.0211,-0.1688,-0.6494,-0.1636,0.0,0.0
1,241796.875,0.5345,1.222,-0.8404,-2.109,9.0322,3.0518,-0.2819,-0.7223,-0.2321,...,0.56,0.0212,-0.1209,9.7162,-1.0218,-0.1725,-0.6264,-0.1728,0.0,0.0
2,241806.6406,-0.2664,0.2175,-1.1036,-2.1872,9.185,3.2015,-0.28,-0.7049,-0.2148,...,0.56,0.0212,-0.1209,9.7926,-1.0226,-0.1781,-0.6533,-0.1751,0.0,0.0
3,241816.4062,-0.2664,0.2175,-1.1036,-2.1082,9.185,3.1255,-0.2876,-0.7049,-0.2169,...,1.0746,-0.0255,-0.2044,9.7123,-1.4106,-0.1744,-0.6513,-0.1728,0.0,0.0
4,241826.1719,-0.2664,0.2175,-1.1036,-2.0293,9.0322,3.051,-0.2743,-0.6893,-0.2169,...,1.0746,-0.0255,-0.2793,9.7934,-0.9448,-0.1725,-0.6284,-0.1728,0.0,0.0


In [23]:
# Plot histograms to visualize all data
#lqw_raw[0].data.hist(bins=50,figsize=(30,30))

In [24]:
# Check entries that are outside of the standard deviation band.
# For every column of every recording, count the entries lying more than
# `n_std` standard deviations above or below the column mean, and list only
# the columns that contain at least one such entry.
n_std = 5  # width of the band around the mean, in standard deviations
std_table = []
head = ["Action","File name", "Column name", "Mean", "Standard deviation", "#entries>5std", "#entries<5std"]

for folder in folders:
    for file in folder:
        for column in file.data:
            values = file.data[column]
            mean = values.mean()
            std = values.std()

            # Vectorised comparison instead of a Python-level loop over every entry;
            # NaN entries compare as False on both sides, exactly as in an element-wise check.
            count_below_std5 = int((values < mean - std*n_std).sum())
            count_above_std5 = int((values > mean + std*n_std).sum())

            if count_above_std5 > 0 or count_below_std5 > 0:
                # Only columns with at least one outlier are reported
                std_table.append([file.folder_name, file.file_name, column, format(mean, '.4f'), format(std, '.4f'), count_above_std5, count_below_std5])

print(tabulate(std_table, headers=head, tablefmt="grid"))

+--------------+---------------------------------------+-------------------------+----------+----------------------+-----------------+-----------------+
| Action       | File name                             | Column name             |     Mean |   Standard deviation |   #entries>5std |   #entries<5std |
| LGW          | normal_walk_J_trial_02.dat            | Thigh_R_Accelerometer_X |  -1.2115 |               4.5019 |               1 |               0 |
+--------------+---------------------------------------+-------------------------+----------+----------------------+-----------------+-----------------+
| LGW          | normal_walk_J_trial_02.dat            | Thigh_R_Accelerometer_Z |   3.5479 |               2.7739 |               0 |               1 |
+--------------+---------------------------------------+-------------------------+----------+----------------------+-----------------+-----------------+
| LGW          | normal_walk_J_trial_02.dat            | Shank_R_Gyroscope_Y     |

### Check for unwanted columns
From the table above, we can see that multiple timestamps have been used across different files. It was decided to investigate further whether the timestamps are aligned and can be ignored. Under the LGW action, some files are missing the "Sync" and "Offset" timestamp columns, so it was decided to remove all columns that contain them to ensure consistency across the data. Additionally, the LWR from SV is missing the timestamps from the Right sensors and Thigh. The timestamps that appear across all files are 'Shank_L_Timestamp', 'Foot_L_Timestamp' and 'Pelvis_Timestamp'; arguably, one of them should be used as the baseline time.

The total number of entries is plotted as well; it can be seen that the majority of the data comes from the ground walking action and less from the standing and sitting actions. This might result in a bias towards the former.

In [25]:
# Summarise every recording: action, file name, number of rows, number of
# columns, and the names of all columns that contain "timestamp"
head = ["Action","File name", "Row Nr", "Column Nr", "Non-standard columns names"]

column_table = [
    [
        file.folder_name,
        file.file_name,
        file.data.index.size,
        len(file.data.columns),
        [col for col in file.data.columns if "timestamp" in col.lower()],
    ]
    for folder in folders
    for file in folder
]

#print(tabulate(column_table, headers=head, tablefmt='grid'))

In [26]:
# Remove every column whose name mentions sync, offset or annotation
# (case-insensitive match) from the filtered copy of each recording
unwanted_keywords = ["sync", "offset", "annotation"]

for folder in folders:
    for file in folder:
        columns_to_drop = [
            col for col in file.data_filtered.columns
            if any(keyword in col.lower() for keyword in unwanted_keywords)
        ]
        # Dropped in place so the object referenced by file.data_filtered is the one modified
        file.data_filtered.drop(columns=columns_to_drop, inplace=True)

In [27]:
# Check whether all timestamp columns of a recording agree with each candidate
# reference clock: for every recording and every reference column, compute the
# mean difference of each timestamp column to the reference and record the
# cases where the largest mean difference exceeds the threshold.
reference_columns = ['Shank_L_Timestamp', 'Foot_L_Timestamp', 'Pelvis_Timestamp']

# Threshold in the units of the timestamp columns.
# NOTE(review): the original comment described this value as "3 seconds" — confirm the timestamp units.
max_mean_offset = 1000.

# One (file name, reference column, largest mean offset) tuple per exceeded threshold
timestamp_mismatches = []

for folder in folders:
    for file in folder:
        # The set of timestamp columns does not depend on the reference, so look it up once per file
        time_columns = [col for col in file.data_filtered.columns if 'timestamp' in col.lower()]

        for ref in reference_columns:
            # Reference column for comparison
            ref_column = file.data_filtered[ref]

            # Row-wise difference of every timestamp column to the reference.
            # pandas' mean()/max() skip NaN, so missing samples no longer turn the result into NaN.
            mean_offsets = file.data_filtered[time_columns].sub(ref_column, axis=0).mean()
            largest_offset = mean_offsets.max()

            if largest_offset > max_mean_offset:
                timestamp_mismatches.append((file.file_name, ref, largest_offset))
                #print(f"Using {ref} - Different timestamp in {file.file_name} with maximum value: {format(largest_offset, '.4f')}")

### Check for NaNs
It can be observed that the only files that contain NaNs are normal_walk_lg_trial_01.dat and normal_walk_lg_trial_02.dat. Both files contain 1521 entries, with 17 or 56 NaN entries in individual columns. The NaN values constitute 1.12% and 3.68% of the total entries, respectively. A nearest-neighbors imputation strategy is used to replace the missing data in the set. Originally, a simple imputation was used with a "median" strategy, but, after checking the data, all of the features that need imputation are Gaussian distributed (except the Pelvic magnetometer data, which has two peaks). It is better to replace the missing data with a Gaussian-distributed set of values than with a constant. k-Nearest Neighbors offers the advantage of tuning the missing values by using the neighboring entries.

In [28]:
# Report the missing values: for every recording that contains NaNs, list the
# total NaN count and the NaN count of each affected column
head = ["Action","File name", "NaN total number", "NaN columns"]
folders = [lqw_raw, ramp_ascend_raw, ramp_descend_raw, sit_to_stand_raw, stand_to_sit_raw]
nan_table = []
columns_to_visualize = []

for folder in folders:
    for file in folder:
        # NaN count per column, computed once and reused for the total and the details
        nan_per_column = file.data_filtered.isnull().sum()
        nan_number = nan_per_column.sum()

        # Recordings without missing values are left out of the report
        if nan_number == 0:
            continue

        columns_to_visualize.append(file.data_filtered)

        # One "column=count" line per column that has at least one NaN
        nan_columns = "".join(
            col + "=" + str(count) + "\n"
            for col, count in nan_per_column.items()
            if count > 0
        )

        nan_table.append([file.folder_name, file.file_name, nan_number, nan_columns])

print(tabulate(nan_table, headers=head, tablefmt='grid'))

+----------+-----------------------------+--------------------+----------------------------+
| Action   | File name                   |   NaN total number | NaN columns                |
| LGW      | normal_walk_lg_trial_01.dat |                657 | Shank_L_Timestamp=17       |
|          |                             |                    | Shank_L_Gyroscope_X=17     |
|          |                             |                    | Shank_L_Gyroscope_Y=17     |
|          |                             |                    | Shank_L_Gyroscope_Z=17     |
|          |                             |                    | Shank_L_Accelerometer_X=17 |
|          |                             |                    | Shank_L_Accelerometer_Y=17 |
|          |                             |                    | Shank_L_Accelerometer_Z=17 |
|          |                             |                    | Shank_L_Magnetometer_X=17  |
|          |                             |                    | Shank_

In [29]:
# Plot histograms to visualize all data
#for visualize in columns_to_visualize:
#    visualize.hist(bins=50,figsize=(30,30))

In [30]:
# Replace NaN values with the k-Nearest Neighbor imputation.
# Only recordings that actually contain missing values are touched.
for folder in folders:
    for file in folder:
        frame = file.data_filtered
        if frame.isnull().values.any():
            imputer = KNNImputer(n_neighbors=5)
            # fit_transform returns a bare array, so the column labels AND the
            # original row index are passed back explicitly; without `index=` the
            # rebuilt frame would silently get a fresh 0..n-1 index.
            # NOTE(review): KNNImputer discards columns that are entirely NaN, which
            # would make the column count mismatch here — confirm no such column exists.
            file.data_filtered = pd.DataFrame(
                imputer.fit_transform(frame),
                columns=frame.columns,
                index=frame.index,
            )

## Preprocess data
### Apply filtering

In [31]:
# TODO apply a filter to the data

### Apply the sliding window technique

In [32]:
# Sliding-window parameters handed to generate_features
# (presumably expressed in number of samples — TODO confirm against helper_preprocess)
tw = 350    # window size
dt = 50     # window step

# Build the feature table of every recording from its filtered data and store
# it on the recording as `data_processed`.
# NOTE(review): the computation itself lives in generate_features (imported
# helper, not visible here); confirm which filtering, if any, it applies.
for folder in folders:
    for file in folder:
        file.data_processed = generate_features(file.data_filtered, tw, dt)

In [33]:
# Check if data looks as expected.
# The first recording of the last action is addressed explicitly through
# `folders` instead of through a loop variable left over from an earlier cell;
# this is the same recording, but the cell no longer depends on hidden state.
# NOTE(review): the output stored with this notebook shows only NaN values in
# the first rows — verify the feature generation.
folders[-1][0].data_processed.head()

Unnamed: 0,Thigh_R_Timestamp_max,Thigh_R_Timestamp_min,Thigh_R_Timestamp_mean,Thigh_R_Timestamp_std,Thigh_R_Timestamp_rms,Thigh_R_Timestamp_maxgradient,Thigh_R_Timestamp_zero_crossings,Thigh_R_Gyroscope_X_max,Thigh_R_Gyroscope_X_min,Thigh_R_Gyroscope_X_mean,...,Pelvis_Magnetometer_Y_rms,Pelvis_Magnetometer_Y_maxgradient,Pelvis_Magnetometer_Y_zero_crossings,Pelvis_Magnetometer_Z_max,Pelvis_Magnetometer_Z_min,Pelvis_Magnetometer_Z_mean,Pelvis_Magnetometer_Z_std,Pelvis_Magnetometer_Z_rms,Pelvis_Magnetometer_Z_maxgradient,Pelvis_Magnetometer_Z_zero_crossings
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [34]:
# TODO Combine all five actions into one dataframe and set the target labels using one-hot encoding 

## Train models
### ANN

In [35]:
# TODO

### SVM

In [36]:
# TODO

### CNN

In [37]:
# TODO

### Comparison

In [38]:
# TODO