In [1]:
%matplotlib widget

In [24]:
import numpy as np
import pandas as pd
from scipy.signal import butter, sosfiltfilt
import matplotlib.pyplot as plt
import random
from pathlib import Path

import lightgbm as lgb

from sklearn.metrics import f1_score, make_scorer, accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.model_selection import RandomizedSearchCV, cross_validate

from PfyMU.gait.train_classifier.core import load_datasets
from PfyMU.features import *

plt.style.use('ggplot')

In [3]:
def mag_band_filter(x, fs):
    sos = butter(1, [2 * 0.25 / fs, 2 * 5 / fs], btype='band', output='sos')
    return sosfiltfilt(sos, np.linalg.norm(x, axis=1))

steps = {
    'walking': 0.4,
    'walking-impaired': 0.2,
    'sitting': 900,
    'standing': 300,
    'stairs-ascending': 0.3,
    'stairs-descending': 0.3,
    'cycling-50W': 0.3,
    'cycling-100W': 0.3,
    'default': 1.0
}

In [4]:
# gait_sets_path = Path('/Users/adamol/Documents/Datasets/gait/processed')
gait_sets_path = Path('/home/lukasadamowicz/Documents/Datasets/processed')

datasets = [
    gait_sets_path / 'bluesky2',
    gait_sets_path / 'daliac',
    gait_sets_path / 'ltmm',
    gait_sets_path / 'usc-had'
]

kwargs = {'paths': datasets, 'goal_fs': 50.0, 'window_step': steps, 'window_length': 3.0}

In [5]:
X, Y, subjects, activities = load_datasets(
    acc_mag=False, 
    signal_function=mag_band_filter,
    **kwargs
)

In [6]:
random.seed(398)
rnd_subjects = [i for i in np.unique(subjects) if np.unique(activities[subjects==i]).size > 3]
random.shuffle(rnd_subjects)

training_masks, validation_masks, testing_masks = [], [], []

for i in range(0, len(rnd_subjects), 4):
    trm = np.ones(len(subjects), dtype='bool')
    vm = np.zeros_like(trm, dtype='bool')
    tem = np.zeros_like(trm, dtype='bool')
    
    for j in range(4):
        trm &= subjects != rnd_subjects[i + j]
        if j < 2:
            vm |= subjects == rnd_subjects[i + j]
        else:
            tem |= subjects == rnd_subjects[i + j]
    
    training_masks.append(trm)
    validation_masks.append(vm)
    testing_masks.append(tem)

In [7]:
with open('lgb_features.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        print(line, end='')

dominantfrequency_1.00_3.50
meancrossrate
range
rms
sampleentropy_2_0.50
autocorrelation_15_True
iqr
rangecountpercentage_0.40_1.50
complexityinvariantdistance_True
permutationentropy_3_1_True
spectralentropy_0.00_5.00
spectralflatness_0.00_6.00
mean
jerkmetric
dimensionlessjerk_True_acceleration
signalentropy
sparc_4_10.00_0.05
linearslope


In [8]:
FB = Bank(window_length=None, window_step=None)

FB + DominantFrequency(low_cutoff=1.0, high_cutoff=3.5)
FB + MeanCrossRate()
FB + Range()
FB + RMS()
FB + SampleEntropy(m=2, r=0.5)
FB + Autocorrelation(lag=15, normalize=True)
FB + IQR()
FB + RangeCountPercentage(range_min=0.4, range_max=1.5)
FB + ComplexityInvariantDistance(normalize=True)
FB + PermutationEntropy(order=3, delay=1, normalize=True)
FB + SpectralEntropy(low_cutoff=0.0, high_cutoff=5.0)
FB + SpectralFlatness(low_cutoff=0.0, high_cutoff=6.0)
FB + Mean()
FB + JerkMetric()
FB + DimensionlessJerk(log=True, signal_type='acceleration')
FB + SignalEntropy()
FB + SPARC(padlevel=4, fc=10.0, amplitude_threshold=0.05)
FB + LinearSlope()

In [9]:
X_feat, fnames = FB.compute(X, fs=50.0, windowed=True, columns=[''])

In [10]:
feats = pd.DataFrame(data=X_feat, columns=fnames)

# LightGBM Model

In [None]:
param_distributions = {
    'num_leaves': [25, 27, 29, 31, 33, 35, 40, 50],
    'max_depth': [8, 12, 16],
    'learning_rate': [0.005, 0.01, 0.05, 0.075, 0.1, 0.15, 0.18, 0.2, 0.25],
    'min_split_gain': [0., 0., 0.05, 0.1, 0.2, 0.3],
    'min_child_weight': [1e-4, 1e-3, 1e-2],
    'min_child_samples': [10, 20, 30],
    'reg_alpha': [0., 5e-2, 5e-1, 5],
    'reg_lambda': [0., 5e-2, 5e-1, 5],
}

rcv = RandomizedSearchCV(
    lgb.LGBMClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=600,
    scoring=make_scorer(f1_score),
    n_jobs=-1,
    cv=zip(training_masks, validation_masks),
    refit=False,
    verbose=1
)

rcv_results = rcv.fit(X_feat, Y)

In [None]:
res = pd.DataFrame(data=rcv.cv_results_)
res.to_csv('final_lgb_cv_search.csv')

pcol = [i for i in res.columns if 'param_' in i] + ['mean_test_score', 'rank_test_score']

res.sort_values('rank_test_score').loc[:, pcol].head(10)

In [None]:
best_params = res.loc[np.argmin(res.rank_test_score.values), 'params']


# with open('lgb_params.txt', 'w') as f:
#     for k in best_params:
#         f.write(f'{k}: {best_params[k]}\n')
        
best_params

In [None]:
nest = [20, 30, 50, 60, 70, 75, 80, 90, 100, 125]

mf1 = []
sf1 = []

for ne in nest:
    clf = lgb.LGBMClassifier(random_state=42, n_estimators=ne, **best_params)
    
    scores = cross_validate(
        clf,
        X_feat,
        Y,
        scoring=make_scorer(f1_score),
        cv=zip(training_masks, validation_masks),
        n_jobs=-1
    )
    
    mf1.append(np.mean(scores['test_score']))
    sf1.append(np.std(scores['test_score'], ddof=1))

In [None]:
plt.figure(figsize=(10, 5))

plt.errorbar(nest, mf1, yerr=sf1)

plt.tight_layout()

# SHAP

In [None]:
import shap

In [None]:
shap.initjs()

In [None]:
best_params = {}
with open('lgb_params.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        parts = line.strip('\n').split(': ')
        
        best_params[parts[0]] = float(parts[1]) if '.' in parts[1] else int(parts[1])

In [None]:
clf = lgb.LGBMClassifier(random_state=42, n_estimators=125, **best_params)

clf.fit(X_feat[training_masks[0]], Y[training_masks[0]])

In [None]:
explainer = shap.TreeExplainer(clf)

In [None]:
shap_values = explainer.shap_values(X_feat[training_masks[0]])

In [None]:
shap.force_plot(explainer.expected_value[0], shap_values[0][0], feats.loc[np.argwhere(training_masks[0])[0]])

In [None]:
plt.figure()

shap.summary_plot(shap_values[0], feats.loc[training_masks[0]], plot_type='dot')

plt.tight_layout()

In [None]:
?shap.summary_plot

# Thresholds

In [11]:
best_params = {}
with open('lgb_params.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        parts = line.strip('\n').split(': ')
        
        best_params[parts[0]] = float(parts[1]) if '.' in parts[1] else int(parts[1])

In [15]:
clf = lgb.LGBMClassifier(random_state=42, n_estimators=125, **best_params)

scorers = {
    'F1': make_scorer(f1_score), 
    'Accuracy': make_scorer(accuracy_score),
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score)
}

scores = cross_validate(
    clf,
    X_feat,
    Y,
    scoring=scorers,
    cv=zip(training_masks, validation_masks),
    n_jobs=-1,
    return_estimator=True
)

In [29]:
print('Accuracy:  ', np.mean(scores['test_Accuracy']))
print('Recall:    ', np.mean(scores['test_Recall']))
print('Precision: ', np.mean(scores['test_Precision']))
print('F1:        ', np.mean(scores['test_F1']))

Accuracy:   0.923359747665626
Recall:     0.9122837910107866
Precision:  0.9014838020614788
F1:         0.901129880970889


In [32]:
y_pred

array([0.02858763, 0.01016781, 0.0382296 , ..., 0.96583774, 0.95496513,
       0.85643185])

In [52]:
f, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 12))

f_scores = np.linspace(0.2, 0.8, num=4)
lines = []
labels = []
for f_score in f_scores:
    x = np.linspace(0.01, 1)
    y = f_score * x / (2 * x - f_score)
    l, = ax[1, 0].plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2)
    ax[1, 0].annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, y[45] + 0.02))

opt_thresh = []
opt_thresh2 = []
for i, est in enumerate(scores['estimator']):
    y_pred = est.predict_proba(X_feat[validation_masks[i]])[:, 1]
    y_true = Y[validation_masks[i]]
    
    fpr, tpr, trsh = roc_curve(y_true, y_pred)
    auc_ = roc_auc_score(y_true, y_pred)
    
    imn = np.argmin(np.sqrt(fpr**2 + (1 - tpr)**2))
    opt_thresh.append(trsh[imn])
    
    ax[0, 0].plot(fpr, tpr, label=f'Fold {i+1}: {auc_:.2f}')
    
    for th, mk in zip([0.4, 0.6, 0.7], ['o', '+', '^']):
        idx = np.argmin(np.abs(trsh - th))
        ax[0, 0].plot(fpr[idx], tpr[idx], marker=mk, color=f'C{i}')
    ax[0, 1].plot(trsh, np.sqrt(fpr**2 + (1 - tpr)**2), label=f'Fold {i+1}: {trsh[imn]:.2f}')

    
    prec, rec, pr_trsh = precision_recall_curve(y_true, y_pred)
    avg_p = average_precision_score(y_true, y_pred)
    
    imn = np.argmin(np.sqrt((1 - prec[:-1])**2 + (1 - rec[1:])**2))
    opt_thresh2.append(pr_trsh[imn])
    
    ax[1, 0].plot(rec, prec, label=f'Fold {i+1}: {avg_p:.2f}')
    ax[1, 1].plot(pr_trsh, np.sqrt((1 - prec[:-1])**2 + (1 - rec[1:])**2))

print(np.mean(opt_thresh), np.median(opt_thresh), np.std(opt_thresh, ddof=1))
print(np.mean(opt_thresh2), np.median(opt_thresh2), np.std(opt_thresh2, ddof=1))
    
ax[0, 0].plot([0, 1], [0, 1], 'k--')
ax[0, 0].legend()
ax[0, 0].set_xlim([0, 1])
ax[0, 0].set_ylim([0, 1.05])
ax[0, 0].set_xlabel('False Positive Rate')
ax[0, 0].set_ylabel('True Positive Rate')

ax[0, 1].set_xlim([0, 1])
ax[0, 1].set_ylim([0, 1.05])
ax[0, 1].set_xlabel('Threshold')

ax[1, 0].legend()
ax[1, 0].set_xlim([0, 1])
ax[1, 0].set_ylim([0, 1.05])
ax[1, 0].set_xlabel('Recall')
ax[1, 0].set_ylabel('Precision')

ax[1, 1].set_xlim([0, 1])
ax[1, 1].set_ylim([0, 1.05])
ax[1, 1].set_xlabel('Threshold')

f.tight_layout()

  f, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 12))


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

0.46508144746924757 0.44808947441453595 0.21743523426551897
0.5109362552505842 0.4285830726465222 0.2693552946963254
