# Feature engineering and model training for E4-MET

In [85]:

import pandas as pd
import os 

from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

# Run the notebook from the project root so the relative data paths below resolve.
# A raw string is used because the Windows path contains backslashes: sequences
# such as "\P", "\O" and "\D" are invalid escapes in a normal string literal and
# raise a warning on recent Python versions.
# NOTE(review): this is a machine-specific absolute path; adjust it (or make it
# configurable) when running elsewhere.
os.chdir(r"E:\Projects\Oulu\Data Mining Project\DataMiningProject")
# Show floats with two decimals in rendered DataFrames.
pd.set_option("display.precision", 2)

In [86]:
# Read every participant's merged E4 recording from its pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that come from a trusted source.
participants_list = [
    'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9',
    'P10', 'P11', 'P12', 'P13', 'P14', 'P15', 'P16', 'P17',
]
# One path per participant, in the same order as participants_list.
pickle_paths = ("merged_dataset/{0}/E4.pkl".format(pid) for pid in participants_list)
merged_dataset = list(map(pd.read_pickle, pickle_paths))
# Preview the first participant's frame.
merged_dataset[0]

Unnamed: 0,Time[s],Delta_Time,MET,X,Y,Z,Magnitude
0,1.0,0 days 00:00:00,1.31,30.0,0.0,55.67,63.24
1,2.0,0 days 00:00:01,1.31,30.0,0.0,55.75,63.31
2,3.0,0 days 00:00:02,1.52,30.0,0.0,55.80,63.35
3,4.0,0 days 00:00:03,1.63,30.0,0.0,55.60,63.18
4,5.0,0 days 00:00:04,1.56,30.0,0.0,55.80,63.35
...,...,...,...,...,...,...,...
1797,1798.0,0 days 00:29:57,5.32,20.2,-49.4,36.00,64.38
1798,1799.0,0 days 00:29:58,5.11,18.0,-50.2,33.80,63.14
1799,1800.0,0 days 00:29:59,4.46,17.8,-50.2,34.20,63.30
1800,1801.0,0 days 00:30:00,4.12,15.2,-49.4,36.20,63.10


## Options

In [87]:
# Notebook switches; each one gates an `if` block further down.
# When True, the activity-splitting cell prints each recording's length in minutes.
activity_split = False
# When True, time-lagged copies of the X/Y/Z columns are added to every participant's frame.
include_time_lagged_features = True

## Feature engineering

### Time lagged features

In [88]:
# Time lagged feature generation for ACC data
def time_lagged_features(data, time_lags):
    """Add time-lagged copies of the X, Y and Z columns to ``data``.

    For every lag in ``time_lags`` three columns are created, named
    ``X_lag_<lag>``, ``Y_lag_<lag>`` and ``Z_lag_<lag>``, each holding the
    source column shifted by ``lag`` rows. The rows left empty by the shift
    are backward-filled with the next available value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``X``, ``Y`` and ``Z`` columns. It is modified in place.
    time_lags : iterable of int
        Row offsets to shift by.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the lag columns added.
    """
    for time_lag in time_lags:
        for axis in ('X', 'Y', 'Z'):
            # Shift and backward-fill in one assignment. Calling
            # fillna(..., inplace=True) on a selected column is a chained
            # in-place update that does not reach the frame under pandas
            # Copy-on-Write, and fillna(method=...) is deprecated.
            data['{}_lag_{}'.format(axis, time_lag)] = data[axis].shift(time_lag).bfill()
    return data

if include_time_lagged_features:
    # Create time lagged features for all participants
    time_lags = [1, 2, 3, 10, 15, 20]
    for i, participant_data in enumerate(merged_dataset):
        merged_dataset[i] = time_lagged_features(participant_data, time_lags)

# Preview the first participant's frame. This must be a top-level expression:
# Jupyter only renders the last top-level expression of a cell, so a bare
# expression nested inside the `if` block is never shown.
merged_dataset[0]
        

## Activity splitting
The different activities last 5 minutes each, and they are as follows:
1. Resting (sitting)
2. Resting (standing)
3. Cycling (low intensity)
4. Cycling (high intensity)
5. Running (low intensity)
6. Running (high intensity)

In [89]:
if activity_split:  # TODO: the activity split itself is not implemented yet
    # Sanity check: report how long each participant's recording is.
    # The 'Time[s]' value of the last row is taken as the recording length
    # in seconds and converted to minutes.
    for participant_number, participant_data in enumerate(merged_dataset, start=1):
        last_row = participant_data.iloc[-1]
        minutes = last_row['Time[s]'] / 60
        print("Participant {0} has {1} minutes of data".format(participant_number, minutes))


## Model training

### Train/test splitting

In [90]:
# Stack all participants' frames row-wise and discard rows containing NaN.
concatenated_data = pd.concat(merged_dataset, axis=0).dropna()
# Target: the "MET" column only.
Y_data = concatenated_data['MET']
# Features: every remaining column except the target and the two time columns.
X_data = concatenated_data.drop(['MET', 'Time[s]', 'Delta_Time'], axis="columns")
# Show the first five rows of features and target.
X_data[0:5], Y_data[0:5]

(      X    Y      Z  Magnitude
 0  30.0  0.0  55.67      63.24
 1  30.0  0.0  55.75      63.31
 2  30.0  0.0  55.80      63.35
 3  30.0  0.0  55.60      63.18
 4  30.0  0.0  55.80      63.35,
 0    1.31
 1    1.31
 2    1.52
 3    1.63
 4    1.56
 Name: MET, dtype: float64)

In [91]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): rows are shuffled across participants before splitting, so
# neighbouring samples of one recording can land on both sides — confirm this
# is the intended evaluation protocol.
X_train_data, X_test_data, Y_train_data, Y_test_data = train_test_split(
    X_data,
    Y_data,
    test_size=0.2,
    random_state=42,
)

# Learn the per-feature min/max on the training rows only, then apply the
# same scaling to both partitions.
normalizer = MinMaxScaler().fit(X_train_data)
X_train_data = normalizer.transform(X_train_data)
X_test_data = normalizer.transform(X_test_data)

### Linear regression

In [92]:
# Fit an ordinary least-squares model on the scaled training features.
linear_regression = LinearRegression().fit(X_train_data, Y_train_data)
# Score it on the held-out rows using mean squared error.
Y_pred = linear_regression.predict(X_test_data)
mse = mean_squared_error(y_true=Y_test_data, y_pred=Y_pred)
print("Linear Regression Mean Squared Error: ", mse)

Linear Regression Mean Squared Error:  4.472147703836399


### XGBoost

In [93]:
# Gradient-boosted regressor with the library's default hyperparameters.
model = XGBRegressor()
model.fit(X_train_data, Y_train_data)
# Score it on the same held-out rows using mean squared error.
Y_pred = model.predict(X_test_data)
mse = mean_squared_error(y_true=Y_test_data, y_pred=Y_pred)
print("XGBoost Mean Squared Error: ", mse)

XGBoost Mean Squared Error:  1.8585419909050513
