In [1]:
"""
*** Instructions (assumes familiarity with Jupyter notebook) ***

1. To begin, create a training data by calling prep(window), this concatenates, pre-processes and builds the features
for all the collected accelerometer+gyroscope data in the /data folder.

2. (optional) test the accuracy other classifiers using `test_classifiers(training_data)`

3. To test the random forest classifier, run `test_model(training_data)`

4. To test the random forest + HMM combination, and view the difference, run `hmm_comparison(training_data)`

5. (optional) To run individual tests on the purpose built test data, prepare the specific file with `prep_test(file_to_test)`
- this function omits setting the state, but the name tells you what the expected outcome should be 
(see the data README file for file naming guidelines)

6. WIP: (optional) Use the state_reconciler on data involving stand ups to readjust the predicted states. Particularly
aimed at fixing OCG / YMOUNT errors

For algorithm details, see the readme file.

"""

'\n*** Instructions (assumes familiarity with Jupyter notebook) ***\n\n1. To begin, create a training data by calling prep(window), this concatenates, pre-processes and builds the features\nfor all the collected accelerometer+gyroscope data in the /data folder.\n\n2. (optional) test the accuracy other classifiers using `test_classifiers(training_data)`\n\n3. To test the random forest classifier, run `test_model(training_data)`\n\n4. To test the random forest + HMM combination, and view the difference, run `hmm_comparison(training_data)`\n\n5. (optional) To run individual tests on the purpose built test data, prepare the specific file with `prep_test(file_to_test)`\n- this function omits setting the state, but the name tells you what the expected outcome should be \n(see the data README file for file naming guidelines)\n\n6. WIP: (optional) Use the state_reconciler on data involving stand ups to readjust the predicted states. Particularly\naimed at fixing OCG / YMOUNT errors\n\nFor algo

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from hmmlearn import hmm

from data_prep import combine_state_features, prep, prep_test
from rolltec_features import create_features
from manage_state import set_state, set_stand_state, state_reconciler
from algorithm_tests import trial, trial_standup, test_model, test_model_stand, test_classifiers, hmm_comparison
from utilities import (convert_to_words, print_full, get_position_stats, combine_csv, resolve_acc_gyro,
                       blank_filter, concat_data, update_df)
from config import (TIME_SEQUENCE_LENGTH, HMM_TRANSITION_MATRIX, HMM_EMISSION_MATRIX, HMM_START_MATRIX, N_COMPONENTS)





In [3]:
# create training data. We use the default time sequence length (40 rows, i.e. 1.6 seconds)
training_data = prep(TIME_SEQUENCE_LENGTH)

	Series.rolling(window=38,center=True).mean()
  df2[element] = pd.rolling_mean(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).median()
  df2['rolling_median_'+element] = pd.rolling_median(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).max()
  df2['rolling_max_'+element] = pd.rolling_max(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).min()
  df2['rolling_min_'+element] = pd.rolling_min(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).sum()
  df2['rolling_sum_'+element] = pd.rolling_sum(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).std()
  df2['rolling_std_'+element] = pd.rolling_std(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).mean()
  df2['avg_tiltx'] = pd.rolling_mean(df2['tiltx'], TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).mean()
  df2['avg_tilty'] =

df2 length: 16240
sliding df length: 812
df2 length: 9352
sliding df length: 467
df2 length: 12820
sliding df length: 641
df2 length: 5923
sliding df length: 296
df2 length: 18582
sliding df length: 929
df2 length: 16760
sliding df length: 838
df2 length: 8720
sliding df length: 436
df2 length: 48840
sliding df length: 2442
df2 length: 20880
sliding df length: 1044
Removed 612 NaN rows


In [4]:
print training_data
print len(training_data)
# check the states have been created - you should see an array of values 0-7
pd.unique(training_data['state'].values.ravel())

      index     tiltx     tilty  stand   ACCEL_X   ACCEL_Y   ACCEL_Z  \
0        19 -0.260666  0.357542    0.0 -0.292316  0.012026 -0.900921   
1        39 -0.187004  0.056524    0.0 -0.279947  0.109053 -0.940895   
2        59 -0.190364  0.252252    0.0 -0.258079  0.114000 -0.950605   
3        79 -0.275254  0.042607    0.0 -0.188289  0.090132 -0.957342   
4        99  0.442236  0.669933    0.0 -0.073526  0.202816 -0.934947   
5       119 -0.253330  0.365558    0.0 -0.019816  0.199316 -0.978711   
6       139  0.044722  0.086329    0.0  0.115684  0.229842 -0.964263   
7       159 -0.468972  0.128312    0.0 -0.044368  0.179132 -0.955737   
8       179 -0.196384 -0.486311    0.0 -0.345605  0.081447 -0.913289   
9       199 -0.611937  0.220517    0.0 -0.356632  0.248789 -0.826474   
10      219 -0.348618 -0.175220    0.0 -0.071263  0.255737 -0.922553   
11      239 -0.346450  0.220219    0.0 -0.096158  0.272947 -0.908763   
12      259 -0.360857 -0.156293    0.0 -0.300395  0.352026 -0.85

array([0, 1, 2, 3, 4, 5, 6, 7])

In [5]:
# Optional Step: Test accuracy of a variety of classifiers
# test_classifiers(training_data)

In [5]:
# Test the accuracy of our classifier
test_model(training_data)

Random Forest prediction accuracy this time: 0.825222755312
Random Forest general accuracy: 0.73 (+/- 0.21)
Feature importance ranking:
1. feature ACCEL_Z (0.074076)
2. feature rolling_max_z (0.068993)
3. feature rolling_sum_z (0.066492)
4. feature rolling_min_z (0.053169)
5. feature rolling_median_z (0.052564)
6. feature rolling_sum_x (0.037420)
7. feature avg_tiltx (0.036446)
8. feature ACCEL_X (0.033756)
9. feature rolling_max_x (0.033347)
10. feature rolling_std_x (0.031464)
11. feature rolling_median_x (0.030133)
12. feature rolling_min_x (0.029522)
13. feature acc_magnitude (0.026860)
14. feature acc_rss (0.026055)
15. feature acc_rms (0.024583)
16. feature max_min_x (0.022912)
17. feature avg_stand (0.021564)
18. feature tiltx (0.020549)
19. feature stand (0.019054)
20. feature max_min_y (0.017169)
21. feature avg_tilty (0.015313)
22. feature rolling_sum_y (0.015030)
23. feature rolling_std_gz (0.014762)
24. feature rolling_std_gx (0.014373)
25. feature rolling_median_y (0.01420

In [4]:
# prepare a labelled test sequence - note the labels prefix on these test files
test_data = prep_test('labels_TEST1_user1.csv', real_test=True)

# second test not yet working - need to fix the parser
#test_data2 = prep_test('labels_TEST3_user2.csv', real_test=True)

ValueError: hour must be in 0..23

In [8]:
# This is the key comparison
hmm_result, rf_prediction = hmm_comparison(training_data, test_data2, label_test=True)

Random Forest prediction accuracy this time: 0.773109243697
Random Forest general accuracy: 0.73 (+/- 0.21)
COMPARISON
HMM final result: [7 7 7 3 3 3 3 3 3 3 2 2 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1
 1 1 1 1 1 1 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 7 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 0 0 0 0]
RF prediction result: [7 7 7 3 3 6 2 6 6 6 2 2 7 0 0 0 5 5 5 5 0 0 5 5 0 5 5 5 5 5 5 0 1 1 1 1 1
 1 1 1 1 1 0 2 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 4 2 0 0 5 5 5 5 0 0 0 5 5 5 5
 5 5 7 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 0 0 5 7]
y test values: [7 7 7 3 2 3 3 3 3 3 3 2 7 0 5 5 0 0 5 5 7 7 0 0 0 0 0 0 5 0 5 0 1 1 1 1 1
 1 1 1 1 1 0 2 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 4 2 1 0 5 0 0 0 0 0 5 5 5 0 5
 5 5 7 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 6 6 4 6 6 6 6 6 6 6 6 6
 6 6 6 6 1 0 5 7]
0.226890756303
RF accuracy: 0.773109243697
HMM accuracy: 0.789915966387


In [6]:
# optional - run trials on the test data - the file name tells you the sequence to expect

# create some test data from 3 different test users

# test_data_user1 = prep_test('GL_TEST1_CS.csv')
# test_data_user2 = prep_test('GL_TEST1_UrsWearing.csv')
# test_data_user3 = prep_test('DIO_OCG_YCG_YMOUNT_YBC.csv')

# my_trial = trial(training_data, test_data)

In [12]:
"""
This is for testing the effectiveness of the 
standing motion detection. The datasets below with STAND in the
name refer to sequences involving a stand up.

This is effectively a tangent to the main algorithm, but one which could
be very useful and widely applicable.
"""

test_data_stand = prep_test('CS_OCG_STAND_OCG.csv')

# TODO: DEBUG If I move the test data creation out of this cell it causes an error

foo = trial_standup(training_data, test_data_stand)

# NOTE: BE CAREFUL WITH CREATING COPIES OF DF
step2 = trial(training_data, foo)
print step2[['state', 'avg_stand']]

# step 3

fixed = state_reconciler(step2)

test_data1000 = prep_test('URS_OCG_STAND_OCG3.csv')
baz = trial_standup(training_data40, test_data1000)
step2again = trial(training_data40, baz)

Removed 1 NaN rows
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
['your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_side_control', 'your_mount', 'your_side_control', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount']
Your Mount: 0.947368421053
Your Side Control: 0.0526315789474
Your Closed Guard: 0.0
Your Back Control: 0.0
Opponent Mount or Opponent Side Control: 0.0
Opponent Closed Guard: 0.0
Opponent Back Control: 0.0
OTHER: 0.0

[7 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 7 7 5 5 5 5 5 5 5 5 5 7 7 

NameError: global name 'update_df' is not defined