In [1]:
%matplotlib widget

In [30]:
import numpy as np
import pandas as pd
import random

import lightgbm as lgb

from sklearn.feature_selection import RFECV
from sklearn.metrics import f1_score, make_scorer
from sklearn.tree import DecisionTreeClassifier

In [4]:
data = pd.read_hdf('../feature_exploration/features.h5', key='incl_stairs')

feats = data.iloc[:, 3:]
labels = data.Label

In [5]:
# get the subjects for which LOSO actually makes sense: those with multiple activities (ie more than just walking)
gbc = data.groupby(['Subject', 'Activity'], as_index=False).count()
loso_subjects = [i for i in gbc.Subject.unique() if gbc.loc[gbc.Subject == i].shape[0] > 3]

random.seed(5)  # fix the generation so that its the same every time
random.shuffle(loso_subjects)

training_masks = []
validation_masks = []
testing_masks = []

for i in range(0, len(loso_subjects), 4):
    tr_m = np.ones(data.shape[0], dtype='bool')
    v_m = np.zeros(data.shape[0], dtype='bool')
    te_m = np.zeros(data.shape[0], dtype='bool')
    
    for j in range(4):
        tr_m &= (data.Subject != loso_subjects[i+j]).values
    for j in range(2):
        v_m |= (data.Subject == loso_subjects[i+j]).values
    for j in range(2):
        te_m |= (data.Subject == loso_subjects[i+j+2]).values
    
    training_masks.append(tr_m)
    validation_masks.append(v_m)
    testing_masks.append(te_m)

In [32]:
estimator = lgb.LGBMClassifier(n_estimators=75, random_state=82)
tree = DecisionTreeClassifier(random_state=398)

rfecv = RFECV(
    estimator, 
    step=1, 
    min_features_to_select=1, 
    cv=tuple(zip(training_masks, validation_masks)),
    scoring=make_scorer(f1_score),
    n_jobs=-1,
    verbose=1
)

trfecv = RFECV(
    tree,
    step=1,
    min_features_to_select=1,
    cv=tuple(zip(training_masks, validation_masks)),
    scoring=make_scorer(f1_score),
    n_jobs=-1,
    verbose=1
)

In [33]:
selector = rfecv.fit(feats, labels);

Fitting estimator with 26 features.
Fitting estimator with 25 features.


In [34]:
trfecv.fit(feats, labels);

Fitting estimator with 26 features.


RFECV(cv=((array([False, False, False, ...,  True,  True,  True]),
           array([ True,  True,  True, ..., False, False, False])),
          (array([ True,  True,  True, ...,  True,  True,  True]),
           array([False, False, False, ..., False, False, False])),
          (array([ True,  True,  True, ...,  True,  True,  True]),
           array([False, False, False, ..., False, False, False])),
          (array([ True,  True,  True, ..., False, False, False]),
           array([Fals...
          (array([ True,  True,  True, ...,  True,  True,  True]),
           array([False, False, False, ..., False, False, False])),
          (array([ True,  True,  True, ...,  True,  True,  True]),
           array([False, False, False, ..., False, False, False])),
          (array([ True,  True,  True, ...,  True,  True,  True]),
           array([False, False, False, ..., False, False, False]))),
      estimator=DecisionTreeClassifier(random_state=398), n_jobs=-1,
      scoring=make_scorer(f

In [35]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [52]:
print("LGB number of features : %d" % selector.n_features_)
print(f'Tree number of features: {trfecv.n_features_}')

f, ax = plt.subplots(figsize=(10, 5))
ax.set_xlabel("Number of features selected")
ax.set_ylabel("Cross validation score (nb of correct classifications)")
ax.plot(range(1, len(selector.grid_scores_) + 1), selector.grid_scores_ * 100, '.-', label='LGB')
ax.plot(range(1, len(trfecv.grid_scores_)+1), trfecv.grid_scores_*100, '.-', label='Tree')

axx = ax.twinx()
axx.grid(False)
axx.plot(range(1, len(selector.grid_scores_)), np.diff(selector.grid_scores_) / selector.grid_scores_[:-1] * 100, '.--', label=f'LGB: N={selector.n_features_}')
axx.plot(range(1, len(trfecv.grid_scores_)), np.diff(trfecv.grid_scores_) / trfecv.grid_scores_[:-1] * 100, '.--', label=f'Tree: N={trfecv.n_features_}')
axx.axhline(0.2, linestyle='--', color='k')
axx.text(24, 0.7, '0.2% Change')
axx.set_ylabel('% Score % Change')

axx.legend(loc=5)

f.tight_layout()
f.savefig('RFECV_results.png', bbox='tight')

LGB number of features : 24
Tree number of features: 25


  f, ax = plt.subplots(figsize=(10, 5))


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [55]:
print(feats.columns[~trfecv.support_])
print(feats.columns[~selector.support_])

Index(['_sampleentropy'], dtype='object')
Index(['_rms', '_ratiobeyondrsigma'], dtype='object')
