# Feature Engineering Notebook 1

## Goal
The goal of this notebook is to enhance the dataset by applying various techniques to extract valuable insights and improve the predictive power of the model. The primary objectives are as follows:

1. **Filter Subtle Noise:** Identify and filter out subtle noise from the dataset, distinguishing it from outliers, to ensure data integrity and quality.
    
2. **Feature Generation:** Create new features by leveraging numerical, temporal, frequency, and clustering techniques, enriching the dataset and capturing additional information that may be relevant for modeling.


In [2]:
DF_PATH = '../data/processed/4_cleaned_outliers_data.pkl'
FIG_DIR = '../reports/figures/'
EXPORT_PATH = '../data/processed/5a_feature_engineering_data.pkl'

In [3]:
import sys
sys.path.append('../')

In [4]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import os
import plotly.graph_objects as go 
from plotly.subplots import make_subplots
import plotly.express as px 
import plotly.figure_factory as ff 
from matplotlib import pyplot as plt 
import matplotlib as mpl
import seaborn as sns
from IPython.display import display
from cycler import cycler
from scipy.signal import butter, lfilter, filtfilt
import copy
from sklearn.decomposition import PCA
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.manifold import TSNE
from sklearn.pipeline import make_pipeline

In [5]:
from scripts.LowPassFilter import *
from scripts.PrincipalComponentAnalysis import *

In [27]:
# Ajust matplotlib style Function using (GPT-3.5)
class CustomMatplotlibStyle:
    def __init__(self, custom_colors=None, legend_fontsize=10):
        if custom_colors is None:
            custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
        self.set_colors(custom_colors)
        self.set_style()
        self.set_figure_size()
        self.set_grid()
        self.set_line_width()
        self.set_tick_colors()
        self.set_font_size()
        self.set_title_size()
        self.set_legend_fontsize(legend_fontsize)
        self.set_dpi()

    def set_colors(self, custom_colors):
        plt.rcParams["axes.prop_cycle"] = cycler(color=custom_colors)

    def set_style(self):
        plt.style.use("bmh")

    def set_figure_size(self):
        plt.rcParams["figure.figsize"] = (20, 10)

    def set_grid(self):
        plt.rcParams["axes.grid"] = True
        plt.rcParams["grid.color"] = "lightgray"

    def set_line_width(self):
        plt.rcParams["axes.linewidth"] = 1.5

    def set_tick_colors(self):
        plt.rcParams["xtick.color"] = "black"
        plt.rcParams["ytick.color"] = "black"

    def set_font_size(self):
        plt.rcParams["font.size"] = 15

    def set_title_size(self):
        plt.rcParams["figure.titlesize"] = 20

    def set_legend_fontsize(self, legend_fontsize):
        plt.rcParams["legend.fontsize"] = legend_fontsize

    def set_dpi(self):
        plt.rcParams["figure.dpi"] = 100

custom_style = CustomMatplotlibStyle(legend_fontsize=8)
# Define a color palette
colors = ['rgb(31, 119, 180)', 'rgb(255, 127, 14)', 'rgb(44, 160, 44)', 'rgb(214, 39, 40)', 
          'rgb(148, 103, 189)', 'rgb(140, 86, 75)', 'rgb(227, 119, 194)', 'rgb(127, 127, 127)', 
          'rgb(188, 189, 34)', 'rgb(23, 190, 207)']


------

# **Read Data**

In [7]:
df = pd.read_pickle(DF_PATH)
df

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-02-11 17:08:05.200,0.013500,0.977000,-0.071000,-1.8904,2.4392,0.9388,bench,heavy,B,64.0
2024-02-11 17:08:05.400,-0.001500,0.970500,-0.079500,-1.6826,-0.8904,2.1708,bench,heavy,B,64.0
2024-02-11 17:08:05.600,0.001333,0.971667,-0.064333,2.5608,-0.2560,-1.4146,bench,heavy,B,64.0
2024-02-11 17:08:05.800,-0.024000,0.957000,-0.073500,8.0610,-4.5244,-2.0730,bench,heavy,B,64.0
2024-02-11 17:08:06.000,-0.028000,0.957667,-0.115000,2.4390,-1.5486,-3.6098,bench,heavy,B,64.0
...,...,...,...,...,...,...,...,...,...,...
2024-02-20 19:33:27.000,-0.048000,-1.041500,-0.076500,1.4146,-5.6218,0.2926,row,medium,E,71.0
2024-02-20 19:33:27.200,-0.037000,-1.030333,-0.053333,-2.7684,-0.5854,2.2440,row,medium,E,71.0
2024-02-20 19:33:27.400,-0.060000,-1.031000,-0.082000,2.8416,-5.1342,-0.1220,row,medium,E,71.0
2024-02-20 19:33:27.600,-0.038667,-1.025667,-0.044667,-0.2318,0.2562,1.1220,row,medium,E,71.0


------

# **Dealing with Missing Values:**
- In this case i will imput missing values

In [8]:
predicator_columns = list(df.columns[0:6])
predicator_columns

['mean_xc', 'mean_yc', 'mean_zc', 'mean_xg', 'mean_yg', 'mean_zg']

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9009 entries, 2024-02-11 17:08:05.200000 to 2024-02-20 19:33:27.800000
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   mean_xc    8121 non-null   float64
 1   mean_yc    8109 non-null   float64
 2   mean_zc    8112 non-null   float64
 3   mean_xg    8108 non-null   float64
 4   mean_yg    8111 non-null   float64
 5   mean_zg    8114 non-null   float64
 6   posture    9009 non-null   object 
 7   types      9009 non-null   object 
 8   specimen   9009 non-null   object 
 9   set        9009 non-null   float64
dtypes: float64(7), object(3)
memory usage: 774.2+ KB


In [10]:
df.isnull().sum()

mean_xc      888
mean_yc      900
mean_zc      897
mean_xg      901
mean_yg      898
mean_zg      895
posture        0
types          0
specimen       0
set            0
dtype: int64

- Let's Explore more about this missing values

In [12]:
# Filter dataframe for set 1
set_1_df = df[df['set'] == 1]

# Create a scatter plot using Plotly
fig = go.Figure(go.Scatter(x=set_1_df.index, y=set_1_df['mean_xg'], mode='lines'))

# Update layout for better readability
fig.update_layout(
    title='Mean XG Plot for Set 1',
    xaxis_title='Index',
    yaxis_title='Mean XG',
    template='plotly_dark' 
)

# Show the plot
fig.show()


- Ok, as we can see there are gaps between graph that tell us there are something wrong aka (missing values)

- There are serveral ways to deal with this gaps in the data for eaxmple we can drop it but it's not effective in this case

- Imputing missing values is one of the best ways to deal with this gap like statistical imputation (mean - median - mode) but also this can interplate the data by trying to connect the points to fill the gaps 

------

## Interpolation Missing Values:

- Method is used to fill NaN values in the DataFrame or Series using various interpolation techniques to fill the missing values rather than hard-coding the value. 

In [13]:
for col in predicator_columns:
    df[col] = df[col].interpolate()

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9009 entries, 2024-02-11 17:08:05.200000 to 2024-02-20 19:33:27.800000
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   mean_xc    9009 non-null   float64
 1   mean_yc    9009 non-null   float64
 2   mean_zc    9009 non-null   float64
 3   mean_xg    9009 non-null   float64
 4   mean_yg    9009 non-null   float64
 5   mean_zg    9009 non-null   float64
 6   posture    9009 non-null   object 
 7   types      9009 non-null   object 
 8   specimen   9009 non-null   object 
 9   set        9009 non-null   float64
dtypes: float64(7), object(3)
memory usage: 774.2+ KB


In [16]:
# Filter dataframe for set 1
set_1_df = df[df['set'] == 1]

# Create a scatter plot using Plotly
fig = go.Figure(go.Scatter(x=set_1_df.index, y=set_1_df['mean_xg'], mode='lines'))

# Update layout for better readability
fig.update_layout(
    title='Mean XG Plot for Set 1',
    xaxis_title='Index',
    yaxis_title='Mean XG',
    template='plotly_dark' 
)

# Show the plot
fig.show()


- Now we can see that there are no gaps anymore 

-----

## Calculating set duration


In [18]:
unique_sets = df['set'].unique()
unique_sets

array([64., 72., 40., 86., 80., 10., 74., 52., 12., 33., 77., 53., 32.,
       79., 22., 17., 69., 63., 60., 92., 57.,  7., 16., 26., 67., 75.,
       89., 23., 11., 58., 25., 43., 24., 34.,  3., 81., 85., 84., 37.,
       55.,  5.,  2., 41., 38., 78., 91., 19., 73., 48., 87., 61.,  1.,
       30., 65., 59., 20., 29., 28., 51., 83., 13., 35., 42.,  8., 56.,
        4., 45., 82., 93., 14., 88., 21., 50.,  6., 47., 36., 31.,  9.,
       39., 90., 70., 62., 66., 46., 27., 18., 49., 54., 44., 15., 71.])

In [19]:
# for calculating the average duration 
for second in unique_sets:
    start_time = df[df.set == second].index[0]
    stop_time = df[df.set == second].index[-1]
    duration_time = stop_time - start_time
    df.loc[(df.set == second), 'duration_time'] = duration_time.seconds

df.sample(8)

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set,duration_time
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-02-11 17:46:06.800,-0.355333,0.875,-0.313667,-36.329,-5.695,32.1584,ohp,heavy,B,33.0,13.0
2024-02-14 15:32:17.800,-0.107333,0.865667,-0.068,15.4146,1.439,-14.7256,bench,heavy,C,23.0,14.0
2024-02-11 17:48:58.000,-0.222,0.717667,-0.22,8.378,-7.5854,37.3902,ohp,medium,B,77.0,21.0
2024-02-11 17:49:07.200,-0.312,0.715,-0.262667,-11.6586,0.5244,5.5366,ohp,medium,B,77.0,21.0
2024-02-15 21:32:51.000,0.0405,-0.82,0.052,36.1463,-17.4634,-0.7196,dead,medium,C,51.0,30.0
2024-02-15 15:27:09.800,-0.121667,0.881667,-0.278,6.5244,-6.0364,3.9754,bench,heavy,E,5.0,12.0
2024-02-15 16:04:09.800,0.111,-1.338333,-0.102,17.549,-8.049,-6.0122,row,heavy,E,91.0,8.0
2024-02-12 17:10:10.400,-0.131,0.950667,-0.149667,-2.4024,-0.7804,10.5,bench,heavy,E,7.0,16.0


In [20]:
df.groupby('types')['duration_time'].mean()

types
heavy       14.743501
medium      24.942529
sitting     33.000000
standing    39.000000
Name: duration_time, dtype: float64

In [28]:
avg_duration_by_types = df.groupby('types')['duration_time'].mean()

# Create a Pie chart using Plotly
fig = go.Figure(go.Pie(
    labels=avg_duration_by_types.index,
    values=avg_duration_by_types,
    textinfo='percent',
    marker=dict(colors=colors)
))

# Update layout for better readability and adjust the size of the circle
fig.update_layout(
    title='Average Duration Time by Types',
    title_font=dict(size=20),
    legend=dict(x=1, y=0.5),
    template='plotly_dark',
)

# Show the plot
fig.show()


------

# **Butterworth low-pass filter**


In [29]:
df_butter = df.copy()
PassFilter = LowPassFilter()

In [30]:
fs = 1000 / 200  # sampling frequency
cutoff = 1.27 # desired cutoff frequency of the filter, Hz

In [31]:
df_butter = PassFilter.apply_low_pass_filter(df_butter, 'mean_xc', fs, cutoff)
df_butter

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set,duration_time,mean_xc_lowpass
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2024-02-11 17:08:05.200,0.013500,0.977000,-0.071000,-1.8904,2.4392,0.9388,bench,heavy,B,64.0,16.0,0.013496
2024-02-11 17:08:05.400,-0.001500,0.970500,-0.079500,-1.6826,-0.8904,2.1708,bench,heavy,B,64.0,16.0,0.008260
2024-02-11 17:08:05.600,0.001333,0.971667,-0.064333,2.5608,-0.2560,-1.4146,bench,heavy,B,64.0,16.0,-0.008713
2024-02-11 17:08:05.800,-0.024000,0.957000,-0.073500,8.0610,-4.5244,-2.0730,bench,heavy,B,64.0,16.0,-0.024124
2024-02-11 17:08:06.000,-0.028000,0.957667,-0.115000,2.4390,-1.5486,-3.6098,bench,heavy,B,64.0,16.0,-0.021642
...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-20 19:33:27.000,-0.048000,-1.041500,-0.076500,1.4146,-5.6218,0.2926,row,medium,E,71.0,19.0,-0.043888
2024-02-20 19:33:27.200,-0.037000,-1.030333,-0.053333,-2.7684,-0.5854,2.2440,row,medium,E,71.0,19.0,-0.041612
2024-02-20 19:33:27.400,-0.060000,-1.031000,-0.082000,2.8416,-5.1342,-0.1220,row,medium,E,71.0,19.0,-0.047711
2024-02-20 19:33:27.600,-0.038667,-1.025667,-0.044667,-0.2318,0.2562,1.1220,row,medium,E,71.0,19.0,-0.050471


In [32]:
subset_df = df_butter[df_butter['set'] == 45]
subset_df

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set,duration_time,mean_xc_lowpass
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2024-02-16 21:30:35.000,0.081000,-1.054000,0.043000,3.914800,-3.432800,-2.207400,dead,medium,E,45.0,34.0,0.091073
2024-02-16 21:30:35.200,0.081333,-1.036333,0.034333,-11.902400,-6.853600,0.207200,dead,medium,E,45.0,34.0,0.056843
2024-02-16 21:30:35.400,0.020500,-0.981500,-0.098500,-16.780800,-10.024400,-7.366000,dead,medium,E,45.0,34.0,0.050290
2024-02-16 21:30:35.600,0.095667,-1.039667,-0.056667,1.256000,0.280600,-3.475600,dead,medium,E,45.0,34.0,0.078640
2024-02-16 21:30:35.800,0.112500,-1.019000,-0.066500,1.073200,-11.719600,0.304800,dead,medium,E,45.0,34.0,0.111992
...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-16 21:31:08.800,0.077444,-1.236667,-0.055333,11.366000,-9.632067,5.461467,dead,medium,E,45.0,34.0,0.078706
2024-02-16 21:31:09.000,0.078500,-1.036000,0.087500,28.256200,-10.398333,3.739933,dead,medium,E,45.0,34.0,0.082932
2024-02-16 21:31:09.200,0.094000,-0.944667,0.130000,19.640383,-11.164600,2.018400,dead,medium,E,45.0,34.0,0.088031
2024-02-16 21:31:09.400,0.077000,-0.842500,0.295500,11.024567,-11.930867,0.296867,dead,medium,E,45.0,34.0,0.078571


In [34]:
fig = go.Figure()

# Plot raw data
fig.add_trace(go.Scatter(x=subset_df.index, y=subset_df["mean_xc"], mode='lines', name="Raw Data", line=dict(color='blue')))

# Plot low-pass filtered data
fig.add_trace(go.Scatter(x=subset_df.index, y=subset_df["mean_xc_lowpass"], mode='lines', name="Butterworth", line=dict(color='orange')))

# Update layout for better readability
fig.update_layout(
    title="Raw Data vs Low-pass Filtered Data",
    xaxis_title="Index",
    yaxis_title="Value",
    template='plotly_dark',
    legend=dict(x=0.5, y=1.15, orientation='h', bgcolor='rgba(255, 255, 255, 0)', bordercolor='rgba(255, 255, 255, 0)'),
    margin=dict(l=0, r=0, t=50, b=0),  # Adjust margin for title visibility
)

# Show the plot
fig.show()


In [35]:
# Loop over all columns to apply low-pass filter
for col in predicator_columns:
    df_butter = PassFilter.apply_low_pass_filter(df_butter, col, fs, cutoff)
    # overwrite the original column with the low-pass filtered column
    df_butter[col] = df_butter[col + "_lowpass"]
    del df_butter[col + "_lowpass"]

In [36]:
df_butter

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set,duration_time
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2024-02-11 17:08:05.200,0.013496,0.977014,-0.071002,-1.891570,2.438321,0.937917,bench,heavy,B,64.0,16.0
2024-02-11 17:08:05.400,0.008260,0.965687,-0.066392,0.744158,0.512027,0.769823,bench,heavy,B,64.0,16.0
2024-02-11 17:08:05.600,-0.008713,0.964964,-0.072315,3.729410,-2.209350,-1.100146,bench,heavy,B,64.0,16.0
2024-02-11 17:08:05.800,-0.024124,0.973185,-0.084830,3.502448,-3.556205,-1.863756,bench,heavy,B,64.0,16.0
2024-02-11 17:08:06.000,-0.021642,0.963600,-0.095993,2.250385,-0.890134,-1.990244,bench,heavy,B,64.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...
2024-02-20 19:33:27.000,-0.043888,-0.983254,-0.062298,2.477314,-4.130471,0.211078,row,medium,E,71.0,19.0
2024-02-20 19:33:27.200,-0.041612,-1.045090,-0.060838,-1.825996,-2.528451,1.485679,row,medium,E,71.0,19.0
2024-02-20 19:33:27.400,-0.047711,-1.047383,-0.066148,-0.261306,-1.974637,1.149337,row,medium,E,71.0,19.0
2024-02-20 19:33:27.600,-0.050471,-1.017967,-0.063349,2.227568,-2.549377,0.387236,row,medium,E,71.0,19.0


-----

# **Principal Component Analysis (PCA):**

In [37]:
PCA = PrincipalComponentAnalysis()
df_pca = df_butter.copy()

In [38]:
pca_val = PCA.determine_pc_explained_variance(df_pca, predicator_columns)
pca_val

array([5.45891542e-01, 2.87930042e-01, 1.63988872e-01, 1.82277593e-03,
       2.81675267e-04, 8.50921262e-05])

## elbow method:

In [44]:
fig = go.Figure()
# Plot PCA values
fig.add_trace(go.Scatter(x=list(range(0, len(pca_val)+1)), y=pca_val, mode='lines'))
# Update layout
fig.update_layout(
    title="PCA Explained Variance",
    xaxis_title="PCA Component",
    yaxis_title="Explained Variance",
    template='plotly_dark'
)

# Show plot
fig.show()


- We can see that the elbow point is at 3 (from here we can start to reduce the dimensionality of the data)

In [45]:
df_pca = PCA.apply_pca(df_pca, predicator_columns, 3)
df_pca

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set,duration_time,pca_1,pca_2,pca_3
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2024-02-11 17:08:05.200,0.013496,0.977014,-0.071002,-1.891570,2.438321,0.937917,bench,heavy,B,64.0,16.0,-2.134776,-4.437109,1.623949
2024-02-11 17:08:05.400,0.008260,0.965687,-0.066392,0.744158,0.512027,0.769823,bench,heavy,B,64.0,16.0,0.050828,-2.010353,1.483171
2024-02-11 17:08:05.600,-0.008713,0.964964,-0.072315,3.729410,-2.209350,-1.100146,bench,heavy,B,64.0,16.0,3.466693,0.352307,-0.118136
2024-02-11 17:08:05.800,-0.024124,0.973185,-0.084830,3.502448,-3.556205,-1.863756,bench,heavy,B,64.0,16.0,3.667741,0.963988,-1.544392
2024-02-11 17:08:06.000,-0.021642,0.963600,-0.095993,2.250385,-0.890134,-1.990244,bench,heavy,B,64.0,16.0,2.811832,-1.674566,-0.545874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-20 19:33:27.000,-0.043888,-0.983254,-0.062298,2.477314,-4.130471,0.211078,row,medium,E,71.0,19.0,1.621374,2.037572,-0.961822
2024-02-20 19:33:27.200,-0.041612,-1.045090,-0.060838,-1.825996,-2.528451,1.485679,row,medium,E,71.0,19.0,-2.575507,-0.219284,-1.010687
2024-02-20 19:33:27.400,-0.047711,-1.047383,-0.066148,-0.261306,-1.974637,1.149337,row,medium,E,71.0,19.0,-1.080412,-0.236535,-0.215415
2024-02-20 19:33:27.600,-0.050471,-1.017967,-0.063349,2.227568,-2.549377,0.387236,row,medium,E,71.0,19.0,1.374087,0.778228,0.010993


- Okay here we see that the data of 6 acclemter and gyroscope has been reduced to 3 dimensions 

In [46]:
subset_df = df_pca[df_pca['set'] == 35]
subset_df.sample(8)

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set,duration_time,pca_1,pca_2,pca_3
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2024-02-16 16:04:12.800,0.134272,-1.051519,0.049941,-29.628734,2.044027,11.000275,row,heavy,E,35.0,8.0,-30.634769,-9.60759,-4.004314
2024-02-16 16:04:08.600,0.123199,-1.141736,0.031508,29.024733,-7.213835,-8.10749,row,heavy,E,35.0,8.0,28.022941,10.357179,3.187867
2024-02-16 16:04:07.000,0.145599,-1.283699,-0.090443,23.672153,-6.002334,-6.402213,row,heavy,E,35.0,8.0,22.705542,8.226353,2.730042
2024-02-16 16:04:13.200,0.113549,-1.346038,-0.073156,18.291638,-7.24826,-3.824865,row,heavy,E,35.0,8.0,16.776264,8.452201,1.338275
2024-02-16 16:04:14.400,0.101482,-1.242457,-0.034433,-20.957798,2.408931,5.840818,row,heavy,E,35.0,8.0,-20.564583,-9.158467,-3.429683
2024-02-16 16:04:11.400,0.073047,-0.91975,-0.066957,1.571827,-2.993998,4.15279,row,heavy,E,35.0,8.0,-1.332594,2.708327,1.949001
2024-02-16 16:04:10.800,0.113378,-1.093762,0.051983,-31.018425,3.656772,8.390741,row,heavy,E,35.0,8.0,-30.222546,-12.665868,-5.353024
2024-02-16 16:04:15.000,0.070143,-0.964982,-0.031115,0.974137,-2.181379,0.173626,row,heavy,E,35.0,8.0,0.482432,-0.084194,-0.450783


In [48]:
fig = go.Figure()

# Add traces for each PCA component
for col in ['pca_1', 'pca_2', 'pca_3']:
    fig.add_trace(go.Scatter(x=subset_df.index, y=subset_df[col], mode='lines', name=col, line=dict(width=2)))

# Update layout
fig.update_layout(
    title="PCA Components",
    xaxis_title="Index",
    yaxis_title="PCA Component Value",
    template='plotly_dark',
    legend=dict(x=1, y=1)
)

# Add grid
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='gray', zeroline=False)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='gray')

# Show plot
fig.show()


------

# **Sum of squares**

- We using this equation to handle dynamic re-orientation of the data
- The equation is: r = sqrt(x^2 + y^2 + z^2 )

In [49]:
df_equation  = df_pca.copy()
df_equation

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set,duration_time,pca_1,pca_2,pca_3
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2024-02-11 17:08:05.200,0.013496,0.977014,-0.071002,-1.891570,2.438321,0.937917,bench,heavy,B,64.0,16.0,-2.134776,-4.437109,1.623949
2024-02-11 17:08:05.400,0.008260,0.965687,-0.066392,0.744158,0.512027,0.769823,bench,heavy,B,64.0,16.0,0.050828,-2.010353,1.483171
2024-02-11 17:08:05.600,-0.008713,0.964964,-0.072315,3.729410,-2.209350,-1.100146,bench,heavy,B,64.0,16.0,3.466693,0.352307,-0.118136
2024-02-11 17:08:05.800,-0.024124,0.973185,-0.084830,3.502448,-3.556205,-1.863756,bench,heavy,B,64.0,16.0,3.667741,0.963988,-1.544392
2024-02-11 17:08:06.000,-0.021642,0.963600,-0.095993,2.250385,-0.890134,-1.990244,bench,heavy,B,64.0,16.0,2.811832,-1.674566,-0.545874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-20 19:33:27.000,-0.043888,-0.983254,-0.062298,2.477314,-4.130471,0.211078,row,medium,E,71.0,19.0,1.621374,2.037572,-0.961822
2024-02-20 19:33:27.200,-0.041612,-1.045090,-0.060838,-1.825996,-2.528451,1.485679,row,medium,E,71.0,19.0,-2.575507,-0.219284,-1.010687
2024-02-20 19:33:27.400,-0.047711,-1.047383,-0.066148,-0.261306,-1.974637,1.149337,row,medium,E,71.0,19.0,-1.080412,-0.236535,-0.215415
2024-02-20 19:33:27.600,-0.050471,-1.017967,-0.063349,2.227568,-2.549377,0.387236,row,medium,E,71.0,19.0,1.374087,0.778228,0.010993


In [50]:
# Calculate the square root of the acceleration and gyro values by implementing the formula
acceleration_sqrt = df_equation['mean_xc'] ** 2 + df_equation['mean_yc'] ** 2 + df_equation['mean_zc'] ** 2
gyro_sqrt = df_equation['mean_xg'] ** 2 + df_equation['mean_yg'] ** 2 + df_equation['mean_zg'] ** 2

In [51]:
acceleration_sqrt

time
2024-02-11 17:08:05.200    0.959780
2024-02-11 17:08:05.400    0.937028
2024-02-11 17:08:05.600    0.936462
2024-02-11 17:08:05.800    0.954867
2024-02-11 17:08:06.000    0.938208
                             ...   
2024-02-20 19:33:27.000    0.972595
2024-02-20 19:33:27.200    1.097645
2024-02-20 19:33:27.400    1.103663
2024-02-20 19:33:27.600    1.042817
2024-02-20 19:33:27.800    1.075444
Length: 9009, dtype: float64

- now we need to make a new dataframe that has the square root of the acceleration and gyro values and add it to the original dataframe

In [52]:
df_equation['acc_sqrt'] = np.sqrt(acceleration_sqrt)
df_equation['gyro_sqrt'] = np.sqrt(gyro_sqrt)

In [53]:
df_equation

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set,duration_time,pca_1,pca_2,pca_3,acc_sqrt,gyro_sqrt
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2024-02-11 17:08:05.200,0.013496,0.977014,-0.071002,-1.891570,2.438321,0.937917,bench,heavy,B,64.0,16.0,-2.134776,-4.437109,1.623949,0.979684,3.225389
2024-02-11 17:08:05.400,0.008260,0.965687,-0.066392,0.744158,0.512027,0.769823,bench,heavy,B,64.0,16.0,0.050828,-2.010353,1.483171,0.968002,1.186832
2024-02-11 17:08:05.600,-0.008713,0.964964,-0.072315,3.729410,-2.209350,-1.100146,bench,heavy,B,64.0,16.0,3.466693,0.352307,-0.118136,0.967709,4.472142
2024-02-11 17:08:05.800,-0.024124,0.973185,-0.084830,3.502448,-3.556205,-1.863756,bench,heavy,B,64.0,16.0,3.667741,0.963988,-1.544392,0.977173,5.327976
2024-02-11 17:08:06.000,-0.021642,0.963600,-0.095993,2.250385,-0.890134,-1.990244,bench,heavy,B,64.0,16.0,2.811832,-1.674566,-0.545874,0.968612,3.133311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-20 19:33:27.000,-0.043888,-0.983254,-0.062298,2.477314,-4.130471,0.211078,row,medium,E,71.0,19.0,1.621374,2.037572,-0.961822,0.986202,4.821040
2024-02-20 19:33:27.200,-0.041612,-1.045090,-0.060838,-1.825996,-2.528451,1.485679,row,medium,E,71.0,19.0,-2.575507,-0.219284,-1.010687,1.047685,3.454645
2024-02-20 19:33:27.400,-0.047711,-1.047383,-0.066148,-0.261306,-1.974637,1.149337,row,medium,E,71.0,19.0,-1.080412,-0.236535,-0.215415,1.050554,2.299662
2024-02-20 19:33:27.600,-0.050471,-1.017967,-0.063349,2.227568,-2.549377,0.387236,row,medium,E,71.0,19.0,1.374087,0.778228,0.010993,1.021184,3.407540


## Plotting the square root of the acceleration and gyro :

In [54]:
subset_df = df_equation[df_equation['set'] == 35]
subset_df

Unnamed: 0_level_0,mean_xc,mean_yc,mean_zc,mean_xg,mean_yg,mean_zg,posture,types,specimen,set,duration_time,pca_1,pca_2,pca_3,acc_sqrt,gyro_sqrt
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2024-02-16 16:04:06.600,0.108631,-1.000453,-0.100114,8.722103,-3.297011,5.161295,row,heavy,E,35.0,8.0,3.940233,6.104463,5.540718,1.011301,10.657595
2024-02-16 16:04:06.800,0.127096,-1.16205,-0.119152,13.719749,-4.403636,0.926907,row,heavy,E,35.0,8.0,10.416763,6.79526,4.233562,1.175037,14.43893
2024-02-16 16:04:07.000,0.145599,-1.283699,-0.090443,23.672153,-6.002334,-6.402213,row,heavy,E,35.0,8.0,22.705542,8.226353,2.730042,1.295092,25.246528
2024-02-16 16:04:07.200,0.142715,-1.183836,0.020982,26.859014,-5.891506,-8.509379,row,heavy,E,35.0,8.0,26.526286,8.307836,2.787053,1.192592,28.784128
2024-02-16 16:04:07.400,0.114809,-0.949361,0.146433,14.385158,-3.52957,-3.682595,row,heavy,E,35.0,8.0,13.629515,4.12042,2.002354,0.967424,15.26277
2024-02-16 16:04:07.600,0.094414,-0.820265,0.191739,-7.57417,-0.557267,2.610928,row,heavy,E,35.0,8.0,-7.857788,-3.379744,-1.561642,0.847651,8.030912
2024-02-16 16:04:07.800,0.106037,-0.898622,0.129001,-23.089737,1.646032,7.235983,row,heavy,E,35.0,8.0,-23.13787,-8.674555,-3.898387,0.914006,24.252934
2024-02-16 16:04:08.000,0.133196,-1.105772,0.003256,-20.808243,2.626406,9.31864,row,heavy,E,35.0,8.0,-22.422464,-7.591114,-0.928457,1.113769,22.950339
2024-02-16 16:04:08.200,0.148878,-1.306671,-0.097373,-2.179431,1.496785,5.931597,row,heavy,E,35.0,8.0,-5.258872,-1.385263,4.237351,1.318725,6.494161
2024-02-16 16:04:08.400,0.144419,-1.350126,-0.088378,19.868563,-2.617634,-2.309784,row,heavy,E,35.0,8.0,17.372713,6.109088,5.852601,1.360702,20.172925


In [56]:
fig = go.Figure()

# Add traces for accelerometer and gyroscope data
fig.add_trace(go.Scatter(x=subset_df.index, y=subset_df['acc_sqrt'], mode='lines', name='Accelerometer', line=dict(width=2)))
fig.add_trace(go.Scatter(x=subset_df.index, y=subset_df['gyro_sqrt'], mode='lines', name='Gyroscope', line=dict(width=2)))

# Update layout
fig.update_layout(
    title="Accelerometer and Gyroscope Data",
    xaxis_title="Index",
    yaxis_title="Sensor Data",
    template='plotly_dark',
    legend=dict(x=1, y=1)
)
# Add grid
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='gray', zeroline=False)
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='gray')

# Show plot
fig.show()

-----

# **Export Data**

In [38]:
df_equation.to_pickle(EXPORT_PATH)