<a href="https://colab.research.google.com/github/Aniket-tempest/HAR-Employee-Identification/blob/main/HAR_FeatureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab session at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

In [None]:
import os
import numpy as np
from pandas import read_csv

class DataHandler(object):
    '''
    Helpers for reading whitespace-delimited signal files and for turning
    overlapping windows back into one continuous series.
    '''

    # load a single txt file as a dataframe
    def load_txt(self, filedir):
        '''
        Read one header-less, whitespace-delimited text file.

        input:
            filedir: file full path

        output: returns pandas DataFrame (use `.values` to get a numpy array)
        '''
        # sep=r'\s+' is the documented replacement for the deprecated
        # delim_whitespace=True flag and parses the file the same way
        return read_csv(filedir, header=None, sep=r'\s+')

    # load the files in the parent dir and stack them together as numpy arrays
    def load_files(self, parentdir=''):
        '''
        Load every file found in `parentdir` and stack them depth-wise.

        input:
            parentdir: parent directory of the target files
                       (with or without a trailing path separator)

        output: returns one numpy array of shape (rows, columns, n_files)

        NOTE: the files are stacked in the order returned by os.listdir,
        which is arbitrary; code that labels the third axis has to list the
        directory the same way.
        '''

        # get the name of files in the parent dir
        filelist = os.listdir(parentdir)

        # os.path.join inserts the separator only when it is missing, so the
        # directory may be given as 'dir' or 'dir/'
        loadedfiles = [self.load_txt(os.path.join(parentdir, file))
                       for file in filelist]

        return np.dstack(loadedfiles)

    def remove_overlap(self, data, overlap_per=0.5):
        '''
        Pre-processed raw data had fixed windows of 2.56 seconds (128 data points) with a 50% overlap.
        To avoid duplicates caused by that overlap (e.g. in plotting or normalization
        of the raw data), this function drops the overlapping part of every window
        and squashes the windows into one series.

        From each window only the leading, non-overlapping part is kept, i.e.
        len(window) - int(len(window) * overlap_per) samples (64 of 128 for the
        default 50% overlap). The overlapping tail of the last window is
        dropped as well.

        input:
            data: dataset (e.g X_train or X_test), iterated window by window
            overlap_per: overlapping fraction (default is 0.5, as data has an overlap of 50%)

        output: squashed NumPy data
        '''

        # accumulator: a list for storing the trimmed windows
        series_data = []

        for frame in data:

            # number of leading samples that the next window does not repeat
            keep = len(frame) - int(len(frame) * overlap_per)
            series_data.append(frame[:keep])

        return np.concatenate(series_data)

In [None]:
'''
Used in the feature-engineering part for feature generation.
Contains feature list/value handling and feature calculation methods.
'''

from scipy import stats
from scipy.signal import find_peaks

class FeatureBuilder(object):
    '''
    Keeps two feature dictionaries for one signal and fills them in:

    main_features:   statistics computed on the raw signal itself
    domain_features: peak based features for each transformed signal
                     (FFT, PSD, aCORR)
    '''

    # accepted transform names (upper-cased) -> key used in domain_features
    _TRANSFORM_KEYS = {'FFT': 'FFT', 'PSD': 'PSD', 'ACORR': 'aCORR'}

    def __init__(self, n_peaks=5):
        '''
        @params:
            n_peaks: number of the first n peaks to be selected from transformation signals

        '''

        self.n_peaks = n_peaks

        # create the dictionaries right away so the getters also work when
        # init_features() has not been called explicitly
        self.init_features()

    def init_features(self):
        '''
        (re)initialize both feature dictionaries with zero values
        '''

        # features calculated on the raw signal itself
        self.main_features = {
            'std': 0.0,  # standard deviation
            'mean': 0.0,  # mean value
            'mad': 0.0,  # median absolute deviation
            'max': 0.0,  # largest value in array
            'min': 0.0,  # smallest value in array
            'iqr': 0.0,  # interquartile range
            'correlation-1': 0.0,  # correlation with the first other axis
            'correlation-2': 0.0,  # correlation with the second other axis
        }

        # one slot per tracked peak, e.g. for 2 peaks: {'0': 0, '1': 0}
        peak_slots = {str(i_peak): 0 for i_peak in range(self.n_peaks)}

        # features for freq-domain (FFT, PSD) or autocorrelation signals;
        # 'peak-value' and 'peak-domain' each get their own copy of the slots
        self.domain_features = {
            name: {
                'peaks-mean': 0,  # mean of the first n selected peak values (not domains)
                'peak-value': dict(peak_slots),
                'peak-domain': dict(peak_slots),
            }
            for name in ('aCORR', 'PSD', 'FFT')
        }

    # get main_features values / key names
    def get_main_features(self, return_values=False):
        '''
        convert main_features dict to a list

        input:
            return_values: boolean variable - if it is true then return
                           the dict values, otherwise return the
                           feature (key) names

        output: list, in dictionary order
        '''
        if return_values:
            return list(self.main_features.values())

        return list(self.main_features)

    # get domain_features values / key names
    def get_domain_features(self, return_values=False):
        '''
        walk over the nested domain_features dict and flatten it to a list

        input:
            return_values: boolean variable - if it is true then return
                           the dict values, otherwise return the feature
                           names ('<signal>-<feature>' or
                           '<signal>-<feature>-<peak>')

        output: list, in dictionary order
        '''
        # temporary list
        output = []

        # first level: transformed signals [aCORR, PSD, FFT]
        for signal, features in self.domain_features.items():

            # second level: [peaks-mean, peak-value, peak-domain]
            for feature, f_val in features.items():

                # a plain value is a feature on its own
                if not isinstance(f_val, dict):
                    output.append(f_val if return_values else signal+'-'+feature)
                    continue

                # third level: one entry per peak ['0': 0, '1': 0, ...]
                for peaks, p_val in f_val.items():
                    output.append(p_val if return_values else signal+'-'+feature+'-'+peaks)

        return output

    # calculate the features for given arrays
    def calculate_main_features(self, signal, corr_signals):
        '''
        calculate the raw-signal statistics and store them in main_features

        input:
            signal: 1D signal array for the axis on which the calculation is done
            corr_signals: list that contains the signal arrays for the other two axes,
                          these are used for calculating correlations.

        output: nothing is returned, read the result with get_main_features

        raises: AssertionError when the inputs do not have the expected type/shape
        '''

        # raise explicitly: a plain `assert` disappears under `python -O`
        if not isinstance(signal, np.ndarray) or signal.ndim != 1:
            raise AssertionError('signal must be ndarray type with dimension 1')

        if not isinstance(corr_signals, list) or len(corr_signals) != 2:
            raise AssertionError('corr_signals must be list with length 2')

        median = np.median(signal)

        # calculate features
        self.main_features['std'] = np.std(signal)
        self.main_features['mean'] = np.mean(signal)
        # median absolute deviation: median distance from the median
        self.main_features['mad'] = np.median(np.abs(signal - median))
        self.main_features['max'] = np.max(signal)
        self.main_features['min'] = np.min(signal)
        self.main_features['iqr'] = stats.iqr(signal)
        self.main_features['correlation-1'] = np.corrcoef(signal, corr_signals[0])[0, 1]
        self.main_features['correlation-2'] = np.corrcoef(signal, corr_signals[1])[0, 1]

    # calculate the features for given arrays
    def calculate_domain_features(self, domain, signal, t_name=None):
        '''
        find the peaks of a transformed signal and store the first n_peaks of
        them (value and position on the domain axis) plus their mean height

        input:
            domain: frequency/time domain of the signal
            signal: transformed signal
            t_name: name of signal transformation ('FFT', 'PSD', 'aCORR'),
                    matched case-insensitively

        output: nothing is returned, read the result with get_domain_features

        raises: AssertionError when t_name is not one of the known names
        '''
        # raise explicitly: a plain `assert` disappears under `python -O`
        if not isinstance(t_name, str) or t_name.upper() not in self._TRANSFORM_KEYS:
            raise AssertionError('transform_name must be str type and can get one of (FFT, PSD or aCORR)')

        # map e.g. 'fft' or 'ACORR' to the key actually used in the dict
        target = self.domain_features[self._TRANSFORM_KEYS[t_name.upper()]]

        # FIND THE PEAKS FROM TRANSFORMED SIGNAL
        # define required minimum height for determining peaks in the signal.
        QR_5 = np.nanpercentile(signal, 5)
        QR_95 = np.nanpercentile(signal, 95)
        height = QR_5 + (QR_95 - QR_5)/10
        # get peak indices and peak properties based on given height
        indices_peaks, peak_values = find_peaks(signal, height=height)

        # CALCULATE/ASSIGN VALUES
        # zip stops at the shorter input, so when fewer than n_peaks peaks
        # were found the remaining slots keep their initial value
        for key, index in zip(target['peak-value'], indices_peaks):
            target['peak-value'][key] = signal[index]
            target['peak-domain'][key] = domain[index]

        # mean height of the first n_peaks peaks, or of all peaks when fewer
        # were found; left untouched when there is no peak at all
        if len(indices_peaks) != 0:
            target['peaks-mean'] = np.mean(peak_values['peak_heights'][:self.n_peaks])

In [None]:
'''
This class contains the implementation of signal transformations between the time and
frequency domains, and autocorrelation calculations with the lagged signal.
'''

import numpy as np
from scipy.fftpack import fft
from scipy.signal import welch

class SignalTransform(object):
    '''
    Transforms one fixed-size window of a signal: FFT amplitude spectrum,
    power spectral density (Welch) and autocorrelation.
    '''

    def __init__(self, N, F, t):
        '''
        @params:
            N: reading size for given window
            F: sampling rate in Hz
            t: size of fixed-width sliding window in seconds
            *T: sampling period, derived as t/N
        '''
        self.N = N
        self.F = F
        self.T = t/N

    # perform Fourier Transform
    def fft_transform(self, signal):
        '''
        input:
            signal: given signal (N readings)
        output:
            f_domain: frequency of each returned FFT bin
            FFT: amplitude of the first N//2 FFT bins
        '''
        half = self.N // 2

        # bin k of an N-point FFT sits at k / (N*T); np.linspace(0, 1/(2T), N//2)
        # would include the end point and stretch the axis by (N/2)/(N/2 - 1)
        f_domain = np.arange(half) / (self.N * self.T)

        # keep the first half of the fft as the values after half way are redundant
        FFT = 2.0/self.N * np.abs(fft(signal)[0:half])

        return f_domain, FFT

    # perform Power Spectral Density Transform
    def psd_transform(self, signal):
        '''
        input:
            signal: given signal
        output:
            f_domain: frequency domain of the signal
            PSD: power spectral density estimate (Welch's method)
        '''
        # welch defaults to 256-sample segments and falls back, with a
        # warning, to the signal length for shorter inputs; passing the capped
        # value directly gives the same estimate without the warning
        nperseg = min(256, np.shape(signal)[-1])
        f_domain, PSD = welch(signal, self.F, nperseg=nperseg)

        return f_domain, PSD

    # perform autocorrelation, calculates the serial correlation of a signal with its lagged signal
    def aCorr_transform(self, signal):
        '''
        input:
            signal: given signal
        output:
            t_domain: lag time of the n-th reading (n*T for n in 0..N-1)
            aCORR: autocorrelation for the non-negative lags
        '''
        # time domain for given T and n-th reading
        t_domain = np.arange(self.N) * self.T

        # "full" mode correlates the signal with itself for every lag where
        # both copies overlap; the result is symmetric around lag 0, which
        # sits in the middle, so only the second half (lag >= 0) is returned
        aCORR = np.correlate(signal, signal, mode='full')

        return t_domain, aCORR[len(aCORR)//2:]

In [None]:
# Small demo instance: track 2 peaks per transform and create the
# (zero-valued) feature dictionaries so their names can be inspected below.
featureBuilder = FeatureBuilder(n_peaks=2)
featureBuilder.init_features()

In [None]:
# Display the names of the features computed on the raw signal.
featureBuilder.get_main_features()

['std', 'mean', 'mad', 'max', 'min', 'iqr', 'correlation-1', 'correlation-2']

In [None]:
# Display the flattened names of the peak features for each transform.
featureBuilder.get_domain_features()

['aCORR-peaks-mean',
 'aCORR-peak-value-0',
 'aCORR-peak-value-1',
 'aCORR-peak-domain-0',
 'aCORR-peak-domain-1',
 'PSD-peaks-mean',
 'PSD-peak-value-0',
 'PSD-peak-value-1',
 'PSD-peak-domain-0',
 'PSD-peak-domain-1',
 'FFT-peaks-mean',
 'FFT-peak-value-0',
 'FFT-peak-value-1',
 'FFT-peak-domain-0',
 'FFT-peak-domain-1']

In [None]:
# init DataHandler (file loading helpers)
dataHandler = DataHandler()

# init SignalTransform
# N: reading size for given window
# F: sampling rate in Hz
# t: size of fixed-width sliding window in seconds
signalTransform = SignalTransform(N=128, F=50, t=2.56)

# init FeatureBuilder: keep the first 5 peaks of every transformed signal
featureBuilder = FeatureBuilder(n_peaks=5)
featureBuilder.init_features()

In [None]:
# get the names of the signal files in the train folder; they are used
# further down to build the feature labels
signal_names = os.listdir('drive/MyDrive/UCI HAR Dataset/train/Inertial Signals/')

# display the file names
signal_names

['body_acc_x_train.txt',
 'body_acc_y_train.txt',
 'body_acc_z_train.txt',
 'total_acc_x_train.txt',
 'total_acc_y_train.txt',
 'total_acc_z_train.txt',
 'body_gyro_x_train.txt',
 'body_gyro_y_train.txt',
 'body_gyro_z_train.txt']

In [None]:
# column labels of the engineered dataset, one group per signal file
feature_labels = []

for signal in signal_names:

    # file name without its last 9 characters (e.g. 'train.txt'),
    # which leaves the signal prefix such as 'body_acc_x_'
    stem = signal[:-9]

    main_features = [(stem + name).upper() for name in featureBuilder.get_main_features()]
    domain_features = [(stem + name).upper() for name in featureBuilder.get_domain_features()]

    # main features first, then the domain features of the same signal
    feature_labels += main_features + domain_features

# the target column comes last
feature_labels += ['ACTIVITY']

# peek at the first labels
feature_labels[:10]

['BODY_ACC_X_STD',
 'BODY_ACC_X_MEAN',
 'BODY_ACC_X_MAD',
 'BODY_ACC_X_MAX',
 'BODY_ACC_X_MIN',
 'BODY_ACC_X_IQR',
 'BODY_ACC_X_CORRELATION-1',
 'BODY_ACC_X_CORRELATION-2',
 'BODY_ACC_X_ACORR-PEAKS-MEAN',
 'BODY_ACC_X_ACORR-PEAK-VALUE-0']

In [None]:
# given signal calculate both main and domain features and return values
def get_all_features(signal, corr_signals):
    '''
    Calculate every feature for one axis of one window, using the
    module-level featureBuilder and signalTransform objects.

    input:
        signal: signal array for the axis on which the calculation is done
        corr_signals: list that contains the signal arrays for the other two axes,
                      these are used for calculating correlations.
                      e.g. if signal is x, then corr_signals=[y,z]

    output: return 1D array which contains both main and domain features
    '''

    # init/reset feature values before start, so nothing is left over
    # from the previously processed signal
    featureBuilder.init_features()

    # calculate main features on the signal that was passed in
    # (not on whatever a global `x_signal` happens to hold)
    featureBuilder.calculate_main_features(signal, corr_signals)

    # do time-frequency domain transformations
    domain_fft, signal_fft = signalTransform.fft_transform(signal)
    domain_psd, signal_psd = signalTransform.psd_transform(signal)
    domain_aCorr, signal_aCorr = signalTransform.aCorr_transform(signal)

    # calculate domain features on different transformations
    featureBuilder.calculate_domain_features(domain_fft, signal_fft, t_name='FFT')
    featureBuilder.calculate_domain_features(domain_psd, signal_psd, t_name='PSD')
    featureBuilder.calculate_domain_features(domain_aCorr, signal_aCorr, t_name='aCORR')

    # get features and concatenate them: main features first, then domain features
    main_features = featureBuilder.get_main_features(return_values=True)
    domain_features = featureBuilder.get_domain_features(return_values=True)

    return np.concatenate((main_features, domain_features))

# Generating Features

In [None]:
# build the engineered dataset for both splits and save each one as csv
for prefix in ['train', 'test']:

    # accumulator: one flattened feature row per window of this split
    data = []

    # load initial data: stacked inertial signals and the activity labels
    X_data = dataHandler.load_files('drive/MyDrive/UCI HAR Dataset/{p}/Inertial Signals/'.format(p=prefix))
    y_data = dataHandler.load_txt('drive/MyDrive/UCI HAR Dataset/{p}/y_{p}.txt'.format(p=prefix)).values

    for row in range(X_data.shape[0]):

        # accumulator for storing features at each row
        features = []

        # iterate over signal types by +3, because for each signal type we have 3 different axes values
        for signal in range(0, X_data.shape[2], 3):

            # get the signal for each axis
            x_signal = X_data[row][:, signal]
            y_signal = X_data[row][:, signal+1]
            z_signal = X_data[row][:, signal+2]

            # GIVEN SIGNAL CALCULATE BOTH MAIN AND DOMAIN FEATURES
            # corr_signals is used to calculate correlation of these signals with the given signal
            x_features = get_all_features(x_signal, corr_signals=[y_signal, z_signal])
            y_features = get_all_features(y_signal, corr_signals=[x_signal, z_signal])
            z_features = get_all_features(z_signal, corr_signals=[x_signal, y_signal])

            # complete features for each row
            features.append(np.concatenate((x_features, y_features, z_features)))

        # add new feature row to the data list
        data.append(np.array(features).flatten())

    # save data, add y_data as the last column (labelled ACTIVITY in feature_labels)
    dataframe = pd.DataFrame(np.hstack((data, y_data)), columns=feature_labels)

    # to_csv does not create missing folders, so make sure 'dataset' exists
    os.makedirs('dataset', exist_ok=True)
    dataframe.to_csv('dataset/{}.csv'.format(prefix), index=False, header=True)

# Engineered data

In [None]:
# Read the saved train split back and display its first 20 rows.
pd.read_csv('dataset/train.csv').head(20)

Unnamed: 0,BODY_ACC_X_STD,BODY_ACC_X_MEAN,BODY_ACC_X_MAD,BODY_ACC_X_MAX,BODY_ACC_X_MIN,BODY_ACC_X_IQR,BODY_ACC_X_CORRELATION-1,BODY_ACC_X_CORRELATION-2,BODY_ACC_X_ACORR-PEAKS-MEAN,BODY_ACC_X_ACORR-PEAK-VALUE-0,...,BODY_GYRO_Z_FFT-PEAK-VALUE-1,BODY_GYRO_Z_FFT-PEAK-VALUE-2,BODY_GYRO_Z_FFT-PEAK-VALUE-3,BODY_GYRO_Z_FFT-PEAK-VALUE-4,BODY_GYRO_Z_FFT-PEAK-DOMAIN-0,BODY_GYRO_Z_FFT-PEAK-DOMAIN-1,BODY_GYRO_Z_FFT-PEAK-DOMAIN-2,BODY_GYRO_Z_FFT-PEAK-DOMAIN-3,BODY_GYRO_Z_FFT-PEAK-DOMAIN-4,ACTIVITY
0,0.002941,0.002269,0.002025,0.01081,-0.004294,0.004812,0.374934,0.433372,0.001125,0.001205,...,0.002931,0.003118,0.002554,0.001492,1.587302,2.380952,3.571429,4.365079,6.349206,5.0
1,0.001981,0.000174,0.00011,0.005251,-0.006706,0.00197,-0.011562,-0.071672,3.8e-05,3.1e-05,...,0.007569,0.006204,0.00318,0.001679,1.587302,2.777778,3.571429,5.555556,7.539683,5.0
2,0.002908,0.000428,0.000627,0.008167,-0.010483,0.003138,-0.121905,-0.179492,9.7e-05,0.000255,...,0.013024,0.008747,0.002525,0.004418,0.396825,1.587302,2.380952,3.174603,3.968254,5.0
3,0.002678,0.000329,0.000269,0.008167,-0.010483,0.003128,-0.301393,-0.360048,0.000181,0.000304,...,0.003866,0.003486,0.003027,0.002108,2.380952,3.174603,4.365079,5.15873,6.349206,5.0
4,0.002015,-0.000195,-0.000144,0.00565,-0.006847,0.002622,-0.152752,-0.188102,5.2e-05,6e-05,...,0.004498,0.002764,0.001276,0.001086,3.174603,4.365079,6.746032,9.52381,10.714286,5.0
5,0.002276,-7.8e-05,-0.000182,0.00565,-0.006847,0.002883,-0.206476,-0.149615,4.5e-05,-6e-05,...,0.005337,0.003427,0.002866,0.00216,0.793651,3.174603,3.968254,4.761905,5.555556,5.0
6,0.002409,0.000387,0.000461,0.006637,-0.005558,0.003317,-0.093299,-0.134364,0.000103,0.000155,...,0.007314,0.004507,0.003368,0.002517,0.396825,1.587302,2.777778,3.571429,4.761905,5.0
7,0.002527,-3e-05,-0.000261,0.006637,-0.00603,0.003928,-0.158534,-0.016404,9.3e-05,0.00012,...,0.003816,0.003303,0.00347,0.002973,1.190476,2.777778,3.571429,4.761905,5.555556,5.0
8,0.002278,-5.8e-05,-0.000261,0.006897,-0.00603,0.002976,-0.161664,-0.03296,0.000111,0.000138,...,0.006379,0.004093,0.002423,0.00305,0.396825,1.190476,2.380952,3.571429,4.761905,5.0
9,0.003095,0.00062,0.000495,0.007276,-0.009268,0.003705,-0.173019,-0.44458,0.000278,0.000231,...,0.004295,0.005635,0.010387,0.003762,0.396825,1.190476,1.984127,3.174603,4.365079,5.0
