In [1]:
# Select the interactive widget (ipympl) matplotlib backend for the figures in this notebook.
%matplotlib widget

In [2]:
import random

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the feature table stored under the 'incl_stairs' key of the HDF5 file.
data = pd.read_hdf('../feature_exploration/features.h5', key='incl_stairs')

# Target vector: the 'Label' column of the table.
labels = data['Label']

# Model inputs: every column from the fourth one onward.
# NOTE(review): presumably the first three columns are metadata (Subject / Activity / Label);
# if 'Label' is not among them it would leak into `feats` -- confirm against the file schema.
feats = data.iloc[:, 3:]

In [4]:
# Pick the subjects that are held out in turn: a subject qualifies when it has more than
# 3 distinct (Subject, Activity) groups in `data`.
# NOTE(review): this was originally described as "more than just walking", but the threshold
# actually enforced is "> 3 activities" -- confirm which one is intended.
gbc = data.groupby(['Subject', 'Activity'], as_index=False).count()
loso_subjects = [i for i in gbc.Subject.unique() if gbc.loc[gbc.Subject == i].shape[0] > 3]

random.seed(5)  # fixed seed so the shuffled subject order (and therefore the folds) is reproducible
random.shuffle(loso_subjects)

GROUP_SIZE = 4    # subjects removed from the training set in each fold
N_VALIDATION = 2  # the first N_VALIDATION of each group validate, the remainder test

training_masks = []
validation_masks = []
testing_masks = []

# Only complete groups become folds. Indexing `loso_subjects[i + j]` directly raises an
# IndexError as soon as the number of eligible subjects is not a multiple of GROUP_SIZE.
n_complete = len(loso_subjects) - len(loso_subjects) % GROUP_SIZE
leftover_subjects = loso_subjects[n_complete:]
if leftover_subjects:
    print(f'Note: {len(leftover_subjects)} subject(s) do not fill a group of {GROUP_SIZE} '
          f'and therefore stay in every training set: {leftover_subjects}')

for i in range(0, n_complete, GROUP_SIZE):
    held_out = loso_subjects[i:i + GROUP_SIZE]

    # Boolean row masks over `data`: training rows exclude all held-out subjects,
    # validation/testing rows each select their half of the held-out group.
    tr_m = ~data.Subject.isin(held_out).values
    v_m = data.Subject.isin(held_out[:N_VALIDATION]).values
    te_m = data.Subject.isin(held_out[N_VALIDATION:]).values

    training_masks.append(tr_m)
    validation_masks.append(v_m)
    testing_masks.append(te_m)

# Convenience bundle of the three mask lists (one entry per fold in each list).
masks = (training_masks, validation_masks, testing_masks)

In [10]:
# Baseline: for every fold, fit a LightGBM classifier (n_estimators left at the library
# default) on the training rows and score it on the validation rows.
acc, bacc, f1 = [], [], []
for train_mask, val_mask in zip(training_masks, validation_masks):
    clf = lgb.LGBMClassifier(learning_rate=0.2, random_state=42)
    clf.fit(feats.loc[train_mask], labels[train_mask])

    y_true = labels[val_mask]
    y_pred = clf.predict(feats.loc[val_mask])

    # Append this fold's score to each running per-metric list.
    for scores, metric in ((acc, accuracy_score), (bacc, balanced_accuracy_score), (f1, f1_score)):
        scores.append(metric(y_true, y_pred))

    print(f'Accuracy: {acc[-1]*100:.2f}   Balanced Accuracy: {bacc[-1]*100:.2f}   F1: {f1[-1]*100:.2f}')

# Summary line: averages over all folds.
print('\n', '-'*50)
print(f'Mean Accuracy: {np.mean(acc)*100:.2f}  Mean Bal. Acc.: {np.mean(bacc)*100:.2f}  Mean F1: {np.mean(f1)*100:.2f}')

Accuracy: 99.13   Balanced Accuracy: 99.07   F1: 98.58
Accuracy: 98.40   Balanced Accuracy: 98.80   F1: 96.98
Accuracy: 96.88   Balanced Accuracy: 96.59   F1: 94.04
Accuracy: 96.65   Balanced Accuracy: 97.57   F1: 94.10
Accuracy: 85.31   Balanced Accuracy: 86.80   F1: 85.47
Accuracy: 88.23   Balanced Accuracy: 88.70   F1: 87.89
Accuracy: 98.40   Balanced Accuracy: 98.50   F1: 98.71
Accuracy: 98.62   Balanced Accuracy: 98.73   F1: 98.72
Accuracy: 97.91   Balanced Accuracy: 97.99   F1: 97.20
Accuracy: 98.53   Balanced Accuracy: 98.25   F1: 98.88
Accuracy: 90.22   Balanced Accuracy: 92.92   F1: 86.06

 --------------------------------------------------
Mean Accuracy: 95.30  Mean Bal. Acc.: 95.81  Mean F1: 94.24


In [26]:
# Sweep the number of boosting rounds: for each candidate value, repeat the per-fold
# fit/validate loop and keep the per-fold scores (one inner list per candidate).
n_estimators = np.array([10, 20, 40, 50, 60, 65, 70, 75, 80, 90, 100, 125])

acc, bacc, f1 = [], [], []
for n_est in n_estimators:
    fold_acc, fold_bacc, fold_f1 = [], [], []

    for train_mask, val_mask in zip(training_masks, validation_masks):
        clf = lgb.LGBMClassifier(learning_rate=0.2, n_estimators=n_est, random_state=42)
        clf.fit(feats.loc[train_mask], labels[train_mask])

        y_true = labels[val_mask]
        y_pred = clf.predict(feats.loc[val_mask])

        fold_acc.append(accuracy_score(y_true, y_pred))
        fold_bacc.append(balanced_accuracy_score(y_true, y_pred))
        fold_f1.append(f1_score(y_true, y_pred))

    # Store the fold scores for this candidate, then report their means.
    acc.append(fold_acc)
    bacc.append(fold_bacc)
    f1.append(fold_f1)

    print(f'{n_est:3d}  Accuracy: {np.mean(acc[-1])*100:.2f}   Balanced Accuracy: {np.mean(bacc[-1])*100:.2f}   F1: {np.mean(f1[-1])*100:.2f}')

 10  Accuracy: 94.37   Balanced Accuracy: 95.15   F1: 92.98
 20  Accuracy: 94.62   Balanced Accuracy: 95.33   F1: 93.31
 40  Accuracy: 95.00   Balanced Accuracy: 95.57   F1: 93.86
 50  Accuracy: 95.17   Balanced Accuracy: 95.71   F1: 94.05
 60  Accuracy: 95.16   Balanced Accuracy: 95.69   F1: 94.06
 65  Accuracy: 95.24   Balanced Accuracy: 95.75   F1: 94.16
 70  Accuracy: 95.28   Balanced Accuracy: 95.80   F1: 94.20
 75  Accuracy: 95.33   Balanced Accuracy: 95.85   F1: 94.26
 80  Accuracy: 95.30   Balanced Accuracy: 95.83   F1: 94.22
 90  Accuracy: 95.30   Balanced Accuracy: 95.82   F1: 94.23
100  Accuracy: 95.30   Balanced Accuracy: 95.81   F1: 94.24
125  Accuracy: 95.38   Balanced Accuracy: 95.89   F1: 94.35


In [28]:
# Plot the fold-averaged validation scores from the n_estimators sweep (solid lines, left
# axis) together with their relative change from one candidate to the next (dashed lines,
# right axis). `plt` comes from the notebook's top import cell.
plt.style.use('ggplot')

# Mean over folds for every n_estimators candidate (acc/bacc/f1 hold one list per candidate).
macc = np.array([np.mean(i) for i in acc])
mbacc = np.array([np.mean(i) for i in bacc])
mf1 = np.array([np.mean(i) for i in f1])

f, ax = plt.subplots(figsize=(8, 5))
ax.plot(n_estimators, macc * 100, '.-', label='Accuracy')
ax.plot(n_estimators, mbacc * 100, '.-', label='Balanced Accuracy')
ax.plot(n_estimators, mf1 * 100, '.-', label='F1')

# Secondary axis: percentage change of each score relative to the previous candidate,
# so the first candidate has no value.
axx = ax.twinx()
axx.grid(False)
axx.plot(n_estimators[1:], np.diff(macc) / macc[:-1] * 100, '.--', label='Accuracy change')
axx.plot(n_estimators[1:], np.diff(mbacc) / mbacc[:-1] * 100, '.--', label='Balanced Accuracy change')
axx.plot(n_estimators[1:], np.diff(mf1) / mf1[:-1] * 100, '.--', label='F1 change')

# One legend covering both axes so the dashed curves are identified as well; it is attached
# to the twin axis, which is drawn last, so no curve is painted over it.
score_handles, score_names = ax.get_legend_handles_labels()
change_handles, change_names = axx.get_legend_handles_labels()
axx.legend(score_handles + change_handles, score_names + change_names)

ax.set_xlabel('# Estimators')
ax.set_ylabel('% Score')
axx.set_ylabel('% Score Change')

f.tight_layout()

# To export the figure (savefig takes `bbox_inches`, not `bbox`):
# f.savefig('lightgbm_n-estimators_performance.png', bbox_inches='tight')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …