In [1]:
# Select the interactive ipympl ("widget") matplotlib backend for the figures in this notebook.
%matplotlib widget

In [9]:
import os
import random
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import RFECV
from sklearn.metrics import f1_score, make_scorer
from sklearn.tree import DecisionTreeClassifier

from PfyMU.gait.train_classifier.core import load_datasets
from PfyMU.features import *

# Apply matplotlib's 'ggplot' style sheet to the figures created in this notebook.
plt.style.use('ggplot')

In [3]:
# Per-activity window step, passed to `load_datasets` as `window_step`.
# NOTE(review): the float entries presumably are fractions of the window length and the
# integer entries ('standing', 'sitting') absolute sample counts -- confirm against
# `load_datasets`. 'default' presumably applies to any activity not listed here.
steps = dict([
    # dynamic, non-gait activities
    ('jumping-rope', 0.15),
    ('stairs-descending', 0.1),
    ('stairs-ascending', 0.1),
    ('jumping', 0.15),
    ('lying', 0.15),
    ('elevator-ascending', 0.15),
    ('elevator-descending', 0.15),
    ('running', 0.075),
    ('sweeping', 0.15),
    ('standing', 225),
    ('running-treadmill', 0.1),
    ('cycling-50W', 0.12),
    ('cycling-100W', 0.12),
    # walking variants
    ('walking-left', 0.2),
    ('walking-right', 0.2),
    ('walking-impaired', 0.2),
    ('walking', 0.25),
    ('sitting', 400),
    # fallback entry
    ('default', 0.5),
])

In [4]:
# Root folder holding the processed gait datasets. It differs between machines, so it can
# be overridden with the GAIT_DATASETS_DIR environment variable instead of editing this
# cell; when the variable is unset the original location is used.
gait_sets_path = Path(
    os.environ.get('GAIT_DATASETS_DIR', '/home/lukasadamowicz/Documents/Datasets/processed')
)

# one sub-folder per study
datasets = [
    gait_sets_path / 'bluesky2',
    gait_sets_path / 'daliac',
    gait_sets_path / 'ltmm',
    gait_sets_path / 'usc-had'
]

# Load all studies with a 50 Hz target sampling rate and 3 s windows; the per-activity
# window steps come from `steps`.
# NOTE(review): X/Y/subjects/activities are presumably aligned per window -- confirm
# against `load_datasets`.
X, Y, subjects, activities = load_datasets(
    datasets,
    goal_fs=50.0,
    acc_mag=True,
    window_length=3.0,
    window_step=steps
)

# Alternative label vector in which stair climbing is part of the positive class.
# `Y` itself is left untouched: the change is made on a copy.
mask = (activities == 'stairs-ascending') | (activities == 'stairs-descending')
Y_inc_str = Y.copy()
Y_inc_str[mask] = 1

In [11]:
# One row per sample: which subject it belongs to and which activity it contains.
sa_df = pd.DataFrame(data={'Subject': subjects, 'Activity': activities})
sa_df['col1'] = 1.0  # dummy column so the groupby count below has something to count

# get the subjects for which leaving them out actually makes sense: those with several
# different activities (more than 3 distinct ones, i.e. more than just walking)
gbc = sa_df.groupby(['Subject', 'Activity'], as_index=False).count()
loso_subjects = [i for i in gbc.Subject.unique() if gbc.loc[gbc.Subject == i].shape[0] > 3]

random.seed(5)  # fix the generation so that it's the same every time
random.shuffle(loso_subjects)

training_masks = []
validation_masks = []
testing_masks = []

# Subjects are held out in groups of 4: the first 2 of each group form the validation set,
# the last 2 the test set, and every sample of the 4 is removed from the training set.
# Only complete groups are used. Indexing past the end of `loso_subjects` used to raise an
# IndexError whenever the number of eligible subjects was not a multiple of 4; subjects that
# do not fill a complete group now simply stay in every training set.
group_size = 4
n_grouped = len(loso_subjects) - len(loso_subjects) % group_size
if n_grouped < len(loso_subjects):
    print(
        f'{len(loso_subjects) - n_grouped} subject(s) do not fill a complete group of '
        f'{group_size} and are never held out'
    )

for i in range(0, n_grouped, group_size):
    held_out = loso_subjects[i:i + group_size]

    v_m = sa_df.Subject.isin(held_out[:2]).values
    te_m = sa_df.Subject.isin(held_out[2:]).values
    tr_m = ~sa_df.Subject.isin(held_out).values

    training_masks.append(tr_m)
    validation_masks.append(v_m)
    testing_masks.append(te_m)

In [12]:
FB = Bank(window_length=None, window_step=None)

# Features are collected in the order in which they are added to the bank, then added one
# by one with the bank's `+` operator.

# features used with a single parameterisation
feature_queue = [
    Mean(),
    MeanCrossRate(),
    StdDev(),
    Skewness(),
    Kurtosis(),
    Range(),
    IQR(),
    RMS(),
    LinearSlope(),
    SignalEntropy(),
    SPARC(),
    ComplexityInvariantDistance(normalize=True),
    JerkMetric(normalize=True),
    DimensionlessJerk(log=True, signal_type='acceleration'),
]

# autocorrelation at several lags
feature_queue += [Autocorrelation(lag=lag, normalize=True) for lag in (1, 15, 14, 12)]

# sample entropy: (m, r) pairs
feature_queue += [
    SampleEntropy(m=m, r=r)
    for m, r in ((4, 1.0), (2, 0.75), (3, 0.75), (2, 0.5), (2, 0.25))
]

# permutation entropy: (order, delay) pairs
feature_queue += [
    PermutationEntropy(order=order, delay=delay, normalize=True)
    for order, delay in ((3, 1), (5, 1), (8, 1), (10, 1), (8, 2), (8, 8))
]

# share of samples inside a value range: (range_min, range_max) pairs
feature_queue += [
    RangeCountPercentage(range_min=lower, range_max=upper)
    for lower, upper in ((0, 1.0), (0.5, 1.4), (0.3, 1.4), (1, 1.4), (0, 1.5))
]

feature_queue += [RatioBeyondRSigma(r=r) for r in (1.0, 2.5, 0.5)]

# frequency-domain features: (low_cutoff, high_cutoff) bands per feature type
feature_queue += [
    DominantFrequency(low_cutoff=low, high_cutoff=high)
    for low, high in ((0.25, 5.0), (1.0, 3.5), (1.0, 3.0), (1.5, 6.0), (0.5, 3.0))
]
feature_queue += [
    DominantFrequencyValue(low_cutoff=low, high_cutoff=high)
    for low, high in ((0.25, 5.0), (1.0, 3.5), (1.0, 3.0), (1.5, 6.0), (0.5, 3.0))
]
feature_queue += [
    PowerSpectralSum(low_cutoff=low, high_cutoff=high)
    for low, high in ((0.25, 5.0), (1.0, 3.0), (1.5, 3.5), (0.25, 4.0), (0.25, 3.0))
]
feature_queue += [
    SpectralFlatness(low_cutoff=low, high_cutoff=high)
    for low, high in ((0.25, 5.0), (0.0, 6.0), (0.0, 8.0), (0.0, 3.5), (0.5, 3.5))
]
feature_queue += [
    SpectralEntropy(low_cutoff=low, high_cutoff=high)
    for low, high in ((0.25, 5.0), (0.0, 5.0), (0.0, 3.5), (0.25, 3.0), (1.5, 4.0))
]

# wavelet-based features
feature_queue += [
    DetailPower(wavelet='coif4', freq_band=[1.0, 3.0]),
    DetailPowerRatio(wavelet='coif4', freq_band=[1.0, 3.0]),
]

# add features
for feature in feature_queue:
    FB + feature

In [13]:
# Compute the feature bank on the loaded data; returns the feature values together with the
# matching feature names. `fs=50.0` repeats the `goal_fs` the data was loaded with.
# NOTE(review): `windowed=True` presumably tells the bank that X is already split into
# windows, and `columns=['']` presumably avoids a column prefix in the generated feature
# names -- confirm against `Bank.compute`.
X_feat, feature_names = FB.compute(X, fs=50.0, windowed=True, columns=[''])



In [14]:
# Feature table for the default labels: integer 'Label' in the first column, followed by
# one float column per computed feature.
feats_columns = ['Label'] + feature_names
feats = pd.DataFrame(index=range(X_feat.shape[0]), columns=feats_columns, dtype='float')

# everything after the label column holds the feature values
feats.iloc[:, 1:] = X_feat
feats = feats.assign(Label=Y).astype({'Label': 'int'})

In [15]:
# Same feature table, but labelled with the vector in which stair climbing counts as
# positive (`Y_inc_str`).
feats_istr_columns = ['Label'] + feature_names
feats_istr = pd.DataFrame(index=range(X_feat.shape[0]), columns=feats_istr_columns, dtype='float')

# everything after the label column holds the feature values
feats_istr.iloc[:, 1:] = X_feat
feats_istr = feats_istr.assign(Label=Y_inc_str).astype({'Label': 'int'})

# Feature Elimination

## 1. RFECV

In [16]:
def _make_rfecv(estimator):
    """Wrap `estimator` in an RFECV that drops one feature per step, down to one feature.

    Cross-validation uses the precomputed (training mask, validation mask) pairs, and
    candidates are scored with the F1 score on all available cores.
    """
    return RFECV(
        estimator,
        step=1,
        min_features_to_select=1,
        cv=tuple(zip(training_masks, validation_masks)),
        scoring=make_scorer(f1_score),
        n_jobs=-1,
        verbose=1,
    )


# base estimators with fixed seeds
lgb_cls = lgb.LGBMClassifier(n_estimators=75, random_state=82)
tree_cls = DecisionTreeClassifier(random_state=398)

# one selector per estimator, identically configured
lgb_rfecv = _make_rfecv(lgb_cls)
tree_rfecv = _make_rfecv(tree_cls)

In [20]:
from sklearn.base import clone

In [21]:
# Run the LightGBM feature elimination twice: on the default labels (`feats`) and, using a
# fresh clone of the same RFECV configuration, on the labels where stair climbing is
# positive (`feats_istr`). Column 0 is the label; the remaining columns are the features.
lgb_slc = lgb_rfecv.fit(feats.iloc[:, 1:], feats.Label)
lgb_slc_istr = clone(lgb_rfecv).fit(feats_istr.iloc[:, 1:], feats_istr.Label)

Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 fe

In [24]:
# Same two runs with the decision-tree selector: default labels first, then a fresh clone
# fitted on the labels where stair climbing is positive. The trailing `;` suppresses the
# cell's repr output.
tree_slc = tree_rfecv.fit(feats.iloc[:, 1:], feats.Label);
tree_slc_istr = clone(tree_rfecv).fit(feats_istr.iloc[:, 1:], feats_istr.Label);

Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 fe

In [34]:
def _mean_cv_scores(selector):
    """Return the mean cross-validation score per number of features for a fitted RFECV.

    Newer scikit-learn releases expose the scores as `cv_results_['mean_test_score']` and
    no longer provide `grid_scores_`; older releases only have `grid_scores_`. Both are
    supported, and a per-split 2D `grid_scores_` is averaged over the splits.
    """
    cv_results = getattr(selector, 'cv_results_', None)
    if cv_results is not None:
        return np.asarray(cv_results['mean_test_score'])
    scores = np.asarray(selector.grid_scores_)
    return scores if scores.ndim == 1 else scores.mean(axis=1)


print("LGB number of features : %d" % lgb_slc.n_features_)
print("LGB number of features, stairs=positive : %d" % lgb_slc_istr.n_features_)
print(f'Tree number of features: {tree_slc.n_features_}')
print(f'Tree number of features, stairs=positive: {tree_slc_istr.n_features_}')

lgb_score = _mean_cv_scores(lgb_slc)
lgb_s_score = _mean_cv_scores(lgb_slc_istr)
tr_score = _mean_cv_scores(tree_slc)
tr_s_score = _mean_cv_scores(tree_slc_istr)


# Left axis: cross-validated F1 (in percent) against the number of features kept.
# Colour encodes the estimator (C0 = LGB, C1 = tree); a dashed line marks the
# "stairs = positive" labelling.
f, ax = plt.subplots(figsize=(10, 5))
ax.set_xlabel("Number of features selected")
ax.set_ylabel("Cross validation F1 score (%)")
ax.plot(range(1, len(lgb_score) + 1), lgb_score * 100, '.-', label='LGB', color='C0')
ax.plot(range(1, len(lgb_s_score) + 1), lgb_s_score * 100, '.--', label='LGB, stairs', color='C0')
ax.plot(range(1, len(tr_score) + 1), tr_score * 100, '.-', label='Tree', color='C1')
# dashed, like the LGB stairs curve, so the two tree curves can be told apart
ax.plot(range(1, len(tr_s_score) + 1), tr_s_score * 100, '.--', label='Tree, stairs', color='C1')

# Right axis: relative change (in percent) of the score between consecutive feature counts.
axx = ax.twinx()
axx.grid(False)
axx.plot(range(1, len(lgb_score)), np.diff(lgb_score) / lgb_score[:-1] * 100, '.', label=f'LGB: N={lgb_slc.n_features_}', color='C0')
axx.plot(range(1, len(lgb_s_score)), np.diff(lgb_s_score) / lgb_s_score[:-1] * 100, '+', label=f'LGB, stairs: N={lgb_slc_istr.n_features_}', color='C0')
# explicit colour: without it the tree markers fall back to the colour cycle and do not
# match the tree curves on the left axis
axx.plot(range(1, len(tr_score)), np.diff(tr_score) / tr_score[:-1] * 100, '.', label=f'Tree: N={tree_slc.n_features_}', color='C1')
axx.plot(range(1, len(tr_s_score)), np.diff(tr_s_score) / tr_s_score[:-1] * 100, '+', label=f'Tree, stairs: N={tree_slc_istr.n_features_}', color='C1')
axx.axhline(0.2, linestyle='--', color='k')  # reference line at 0.2 % change
axx.text(24, 0.7, '0.2% Change')
axx.set_ylabel('% Score % Change')

axx.set_ylim(-0.5, 10)

axx.legend(loc=5)

f.tight_layout()
# `bbox_inches` is the savefig keyword for a tight bounding box (`bbox` is not one)
f.savefig('RFECV_results.png', bbox_inches='tight')

LGB number of features : 40
LGB number of features, stairs=positive : 20
Tree number of features: 46
Tree number of features, stairs=positive: 48


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [28]:
# Names of the features kept by the LightGBM selector fitted on the default labels
# (`support_` is used as a mask over `feature_names`).
np.array(feature_names)[lgb_slc.support_]

array(['mean', 'meancrossrate', 'stddev', 'skewness', 'kurtosis', 'range',
       'iqr', 'linearslope', 'signalentropy',
       'complexityinvariantdistance_True', 'jerkmetric',
       'dimensionlessjerk_True_acceleration', 'autocorrelation_1_True',
       'autocorrelation_15_True', 'autocorrelation_14_True',
       'autocorrelation_12_True', 'sampleentropy_4_1.00',
       'sampleentropy_2_0.50', 'sampleentropy_2_0.25',
       'permutationentropy_3_1_True', 'permutationentropy_8_1_True',
       'permutationentropy_8_2_True', 'permutationentropy_8_8_True',
       'rangecountpercentage_0_1.00', 'rangecountpercentage_1_1.40',
       'ratiobeyondrsigma_1.00', 'ratiobeyondrsigma_0.50',
       'dominantfrequency_0.25_5.00', 'dominantfrequency_1.00_3.00',
       'dominantfrequency_1.50_6.00', 'dominantfrequencyvalue_0.25_5.00',
       'dominantfrequencyvalue_1.00_3.50',
       'dominantfrequencyvalue_1.00_3.00', 'powerspectralsum_1.00_3.00',
       'spectralflatness_0.00_8.00', 'spectralflatn

In [30]:
# Names of the features kept by the LightGBM selector fitted on the labels where stair
# climbing is positive (`support_` is used as a mask over `feature_names`).
np.array(feature_names)[lgb_slc_istr.support_]

array(['mean', 'meancrossrate', 'stddev', 'skewness', 'range', 'iqr',
       'signalentropy', 'jerkmetric', 'autocorrelation_1_True',
       'autocorrelation_15_True', 'autocorrelation_12_True',
       'sampleentropy_2_0.50', 'permutationentropy_3_1_True',
       'permutationentropy_8_1_True', 'rangecountpercentage_0_1.00',
       'rangecountpercentage_0.50_1.40', 'ratiobeyondrsigma_0.50',
       'dominantfrequencyvalue_1.00_3.50', 'spectralentropy_0.00_3.50',
       'detailpowerratio_coif4_[1.0, 3.0]'], dtype='<U35')