In [1]:
"""
*** Instructions (assumes familiarity with Jupyter notebook) ***

1. To begin, create the training data by calling `prep(window)`. This concatenates, pre-processes and builds the features
for all the collected accelerometer+gyroscope data in the /data folder.

2. (optional) Test the accuracy of other classifiers using `test_classifiers(training_data)`

3. To test the random forest classifier, run `test_model(training_data)`

4. To test the random forest + HMM combination, and view the difference, run `hmm_comparison(training_data)`

5. (optional) To run individual tests on the purpose built test data, prepare the specific file with `prep_test(file_to_test)`
- this function omits setting the state, but the name tells you what the expected outcome should be 
(see the data README file for file naming guidelines)

6. (optional) Use the state_reconciler on data involving stand ups to readjust the predicted states. Particularly
aimed at fixing OCG / YMOUNT errors

For algorithm details, see the readme file.

"""

'\n*** Instructions (assumes familiarity with Jupyter notebook) ***\n\n1. To begin, create a training data by calling prep(window), this concatenates, pre-processes and builds the features\nfor all the collected accelerometer+gyroscope data in the /data folder.\n\n2. (optional) test the accuracy other classifiers using `test_classifiers(training_data)`\n\n3. To test the random forest classifier, run `test_model(training_data)`\n\n4. To test the random forest + HMM combination, and view the difference, run `hmm_comparison(training_data)`\n\n5. (optional) To run individual tests on the purpose built test data, prepare the specific file with `prep_test(file_to_test)`\n- this function omits setting the state, but the name tells you what the expected outcome should be \n(see the data README file for file naming guidelines)\n\n6. (optional) Use the state_reconciler on data involving stand ups to readjust the predicted states. Particularly\naimed at fixing OCG / YMOUNT errors\n\nFor algorithm

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from hmmlearn import hmm

from data_prep import combine_state_features, prep, prep_test
from rolltec_features import create_features
from manage_state import set_state, set_stand_state, state_reconciler
from algorithm_tests import trial, trial_standup, test_model, test_model_stand, test_classifiers, hmm_comparison
from utilities import (convert_to_words, print_full, get_position_stats, combine_csv, resolve_acc_gyro,
                       blank_filter, concat_data, update_df)
from config import (TIME_SEQUENCE_LENGTH, HMM_TRANSITION_MATRIX, HMM_EMISSION_MATRIX, HMM_START_MATRIX, N_COMPONENTS)





In [3]:
# Create the training data. prep() is called with the default time sequence
# length from config (TIME_SEQUENCE_LENGTH: 40 rows, i.e. 1.6 seconds).
training_data = prep(TIME_SEQUENCE_LENGTH)

	Series.rolling(window=38,center=True).mean()
  df2[element] = pd.rolling_mean(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).median()
  df2['rolling_median_'+element] = pd.rolling_median(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).max()
  df2['rolling_max_'+element] = pd.rolling_max(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).min()
  df2['rolling_min_'+element] = pd.rolling_min(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).sum()
  df2['rolling_sum_'+element] = pd.rolling_sum(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).std()
  df2['rolling_std_'+element] = pd.rolling_std(col, TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).mean()
  df2['avg_tiltx'] = pd.rolling_mean(df2['tiltx'], TIME_SEQUENCE_LENGTH-2, center=True)
	Series.rolling(window=38,center=True).mean()
  df2['avg_tilty'] =

df2 length: 16240
sliding df length: 16240
df2 length: 9352
sliding df length: 9352
df2 length: 12820
sliding df length: 12820
df2 length: 5923
sliding df length: 5923
df2 length: 18582
sliding df length: 18582
df2 length: 16760
sliding df length: 16760
df2 length: 8720
sliding df length: 8720
df2 length: 48840
sliding df length: 48840
df2 length: 20880
sliding df length: 20880
Removed 12555 NaN rows


In [4]:
print(training_data)
# Sanity check that the states were created: the cell output should be an
# array holding the values 0-7.
state_values = training_data['state'].values.ravel()
pd.unique(state_values)

        index     tiltx     tilty  stand   ACCEL_X   ACCEL_Y   ACCEL_Z  \
19         19 -0.260666  0.357542    0.0 -0.292316  0.012026 -0.900921   
20         20 -0.526274  0.270347    0.0 -0.298947 -0.009395 -0.941237   
21         21 -0.524159  0.266709    0.0 -0.301842 -0.004921 -0.932684   
22         22 -0.403561  0.116862    0.0 -0.312263  0.000447 -0.931447   
23         23 -0.653313 -0.007449    0.0 -0.318263  0.017368 -0.932711   
24         24 -0.337718 -0.074182    0.0 -0.328237  0.026658 -0.930500   
25         25 -0.318126 -0.062126    0.0 -0.340447  0.033421 -0.935763   
26         26 -0.325999 -0.030464    0.0 -0.348868  0.045711 -0.927211   
27         27 -0.241506  0.004088    0.0 -0.355789  0.060316 -0.924342   
28         28 -0.225242  0.217501    0.0 -0.356553  0.084000 -0.928895   
29         29 -0.328777 -0.155572    0.0 -0.352342  0.100763 -0.924289   
30         30 -0.173587 -0.224690    0.0 -0.352816  0.110237 -0.931816   
31         31 -0.267890 -0.029532    0

array([0, 1, 2, 3, 4, 5, 6, 7])

In [5]:
# Optional Step: Test accuracy of a variety of classifiers
# test_classifiers(training_data)

In [42]:
# Test the accuracy of our classifier (the random forest). test_model is
# imported from algorithm_tests; the captured output reports a prediction
# accuracy, a general accuracy and a feature importance ranking.
test_model(training_data)

Random Forest prediction accuracy this time: 0.995294198468
Random Forest general accuracy: 0.74 (+/- 0.22)
Feature importance ranking:
1. feature ACCEL_Z (0.058912)
2. feature rolling_max_z (0.056619)
3. feature rolling_sum_z (0.055756)
4. feature rolling_median_z (0.052767)
5. feature rolling_min_z (0.050440)
6. feature rolling_min_x (0.034505)
7. feature ACCEL_X (0.034134)
8. feature avg_tiltx (0.034053)
9. feature rolling_sum_x (0.033313)
10. feature rolling_max_x (0.031763)
11. feature rolling_median_x (0.027155)
12. feature rolling_std_x (0.026696)
13. feature max_min_x (0.026504)
14. feature acc_rss (0.024838)
15. feature acc_magnitude (0.024522)
16. feature acc_rms (0.024215)
17. feature max_min_y (0.021297)
18. feature avg_stand (0.019870)
19. feature rolling_min_y (0.017813)
20. feature rolling_median_y (0.017091)
21. feature tiltx (0.016821)
22. feature rolling_max_y (0.016219)
23. feature avg_tilty (0.016174)
24. feature ACCEL_Y (0.016148)
25. feature rolling_sum_y (0.01607

In [6]:
# Prepare a labelled test sequence from a single recording.
# NOTE(review): real_test=True presumably keeps the recorded labels in the
# resulting frame - confirm against data_prep.prep_test.
test_data = prep_test('labels_TEST1_user1.csv', real_test=True)

      state_x  ACCEL_X  ACCEL_Y  ACCEL_Z  state_y  GYRO_X  GYRO_Y  GYRO_Z  \
0           3   -0.935   -0.369    0.220      3.0   0.549   0.793   0.244   
1           3   -1.008   -0.054    0.132      3.0   0.671   0.427   0.793   
2           3   -1.018   -0.057    0.136      3.0   0.488   0.427   1.037   
3           3   -1.013   -0.053    0.139      3.0   0.793  -0.244   0.732   
4           3   -1.017   -0.056    0.137      3.0   0.671   0.549   0.976   
5           3   -1.017   -0.050    0.140      3.0   0.671   1.037   0.976   
6           3   -1.013   -0.052    0.141      3.0   0.915   0.183   0.854   
7           3   -1.014   -0.052    0.141      3.0   0.915  -0.061   0.915   
8           3   -1.013   -0.053    0.139      3.0   0.854   0.549   0.793   
9           3   -1.014   -0.052    0.138      3.0   0.366   0.244   0.183   
10          3   -1.021   -0.058    0.129      3.0  -0.305   0.427   0.244   
11          3   -1.008   -0.048    0.135      3.0  -0.061   0.000   0.427   

In [6]:
# Display the 'state' column of the prepared test sequence.
test_data['state']

19      3
20      3
21      3
22      3
23      3
24      3
25      3
26      3
27      3
28      3
29      3
30      3
31      3
32      3
33      3
34      3
35      3
36      3
37      3
38      3
39      3
40      3
41      3
42      3
43      3
44      3
45      3
46      3
47      3
48      3
       ..
2404    6
2405    6
2406    6
2407    6
2408    6
2409    6
2410    6
2411    6
2412    6
2413    6
2414    6
2415    6
2416    6
2417    6
2418    6
2419    6
2420    6
2421    6
2422    6
2423    6
2424    6
2425    6
2426    6
2427    6
2428    6
2429    6
2430    6
2431    6
2432    6
2433    6
Name: state, dtype: int64

In [32]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from hmmlearn import hmm

def hmm_comparison2(df_train, df_test, label_test=False):
    """Train a random forest on ``df_train``, predict ``df_test`` and decode with the HMM.

    The classifier is fitted on a random 70% split of ``df_train`` (no fixed
    seed, so results vary between runs), used to predict the state of every
    row in ``df_test``, and the predictions are then passed through a
    multinomial HMM whose start, transition and emission matrices come from
    ``config``. Both results are compared against ``df_test['state']``.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame. Must contain the columns 'state', 'index' and 'stand';
        every remaining column is used as a feature.
    df_test : pandas.DataFrame
        Labelled test frame. Must contain 'state' plus every feature column
        used for training.
    label_test : bool, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(hmm_result, rf_prediction)``: the pair returned by
        ``model.decode`` (log probability, decoded state sequence) and the
        raw random forest predictions.

    Raises
    ------
    ValueError
        If the training features contain NaN values, or if ``df_test`` lacks
        one of the training feature columns.
    """
    y = df_train['state'].values
    X = df_train.drop(['state', 'index', 'stand'], axis=1)

    # Stop here: carrying on without a fitted classifier only produced a
    # confusing NameError further down.
    if X.isnull().values.any():
        raise ValueError("Abort: Found NaN values")

    # The classifier must be fed exactly the columns it was trained on, in
    # the same order. DataFrame.drop returns a new frame, so the label column
    # has to be excluded explicitly rather than by a discarded drop() call.
    feature_columns = list(X.columns)
    missing = [col for col in feature_columns if col not in df_test.columns]
    if missing:
        raise ValueError(
            "df_test is missing training feature columns: {}".format(missing))
    X_eval = df_test[feature_columns]
    y_true = df_test['state'].values

    # Only the training part of the split is needed; evaluation uses df_test.
    X_train, _, y_train, _ = cross_validation.train_test_split(X, y, test_size=0.3)
    rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                                max_depth=None, max_features='auto', max_leaf_nodes=None,
                                min_samples_leaf=8, min_samples_split=4,
                                min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                                oob_score=False, random_state=None, verbose=0,
                                warm_start=False)
    rf.fit(X_train, y_train)
    rf_prediction = rf.predict(X_eval)
    rf_scores = cross_validation.cross_val_score(rf, X, y, cv=10, scoring='accuracy')

    # report on the accuracy
    print('Random Forest prediction accuracy this time: {}'.format(
        accuracy_score(y_true, rf_prediction)))
    print('Random Forest general accuracy: %0.2f (+/- %0.2f)' % (rf_scores.mean(), rf_scores.std() * 2))

    # Enhance with the HMM: multinomial (discrete) emissions, with all
    # parameters assigned from config instead of being fitted here.
    model = hmm.MultinomialHMM(n_components=N_COMPONENTS,
                               n_iter=10,
                               verbose=False)
    model.startprob_ = HMM_START_MATRIX
    model.transmat_ = HMM_TRANSITION_MATRIX
    model.emissionprob_ = HMM_EMISSION_MATRIX

    print("rf prediction values: {}".format(rf_prediction))
    print("rf prediction type: {}".format(type(rf_prediction)))

    # decode() receives the predictions as a single-column 2-D array.
    observations = np.array(rf_prediction)
    hmm_input_data = observations.reshape((len(observations), -1))
    hmm_result = model.decode(hmm_input_data, algorithm='viterbi')

    # NOW THE COMPARISON
    print("COMPARISON")
    print('HMM final result: {}'.format(hmm_result[1]))
    print('HMM length: {}'.format(len(hmm_result[1])))
    print("RF prediction result: {}".format(rf_prediction))
    print('RF length: {}'.format(len(rf_prediction)))
    print("Actual states length: {}".format(len(y_true)))
    print("y test values: {}".format(y_true))

    # Accuracy is the share of rows that MATCH the labels; the mismatch share
    # is reported separately as the error rate.
    rf_comparison_accuracy = np.mean(rf_prediction == y_true)
    hmm_comparison_accuracy = np.mean(hmm_result[1] == y_true)
    rf_error_rate = (rf_prediction != y_true).sum() / float(rf_prediction.size)
    print("RF error rate: {}".format(rf_error_rate))

    print_full(df_test['state'])
    print_full(rf_prediction)

    print("RF accuracy: {}".format(rf_comparison_accuracy))
    print("HMM accuracy: {}".format(hmm_comparison_accuracy))

    return hmm_result, rf_prediction

In [33]:
# Fit on the training data, evaluate against the labelled test sequence and
# keep both return values: the HMM decode result and the raw random forest
# predictions.
hmm_result, rf_prediction = hmm_comparison2(training_data, test_data, label_test=True)

Random Forest prediction accuracy this time: 0.0939958592133
Random Forest general accuracy: 0.74 (+/- 0.22)
rf prediction values: [1 1 1 ..., 1 1 1]
rf prediction type: <type 'numpy.ndarray'>
COMPARISON
HMM final result: [1 1 1 ..., 1 1 1]
HMM length: 2415
RF prediction result: [1 1 1 ..., 1 1 1]
RF length: 2415
Actual states length: 2415
y test values: [3 3 3 ..., 6 6 6]
0.906004140787
19      3
20      3
21      3
22      3
23      3
24      3
25      3
26      3
27      3
28      3
29      3
30      3
31      3
32      3
33      3
34      3
35      3
36      3
37      3
38      3
39      3
40      3
41      3
42      3
43      3
44      3
45      3
46      3
47      3
48      3
49      3
50      3
51      3
52      3
53      3
54      3
55      3
56      3
57      3
58      3
59      3
60      3
61      3
62      3
63      3
64      3
65      3
66      3
67      3
68      3
69      3
70      3
71      3
72      3
73      3
74      3
75      3
76      3
77      3
78      3
79      3

In [41]:
# Spot-check one positional window of the predictions against the 'state'
# column of the test sequence.
window = slice(1900, 1925)
print(rf_prediction[window])
print(test_data['state'][window])

[6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
1919    4
1920    4
1921    4
1922    4
1923    4
1924    4
1925    4
1926    4
1927    4
1928    4
1929    4
1930    4
1931    4
1932    4
1933    4
1934    4
1935    4
1936    4
1937    4
1938    4
1939    4
1940    4
1941    4
1942    4
1943    4
Name: state, dtype: int64


In [6]:
# Run trial() (imported from algorithm_tests) on the training data and the
# labelled test sequence; its implementation is not part of this notebook.
my_trial = trial(training_data, test_data)

[7 7 3 6 6 6 6 6 3 3 3 6 7 7 0 0 5 0 5 5 5 0 0 5 5 0 1 1 1 1 1 6 1 1 1 1 1
 1 1 0 7 2 4 4 2 4 4 2 2 2 2 4 4 2 2 4 2 2 2 2 2 2 4 2 5 0 5 5 5 5 5 5 5 5
 0 5 5 5 5 5 2 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 6 6 4 6 6 6 6 6 6
 6 6 6 6 6 6 6 1 1 0]
['OTHER', 'OTHER', 'your_back_control', 'opponent_back_control', 'opponent_back_control', 'opponent_back_control', 'opponent_back_control', 'opponent_back_control', 'your_back_control', 'your_back_control', 'your_back_control', 'opponent_back_control', 'OTHER', 'OTHER', 'your_mount', 'your_mount', 'opponent_closed_guard', 'your_mount', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'your_mount', 'your_mount', 'opponent_closed_guard', 'opponent_closed_guard', 'your_mount', 'your_side_control', 'your_side_control', 'your_side_control', 'your_side_control', 'your_side_control', 'opponent_back_control', 'your_side_control', 'your_side_control', 'your_side_control', 'your_side_control', 'your_side_control', 'your_side_co

In [7]:
# Number of entries returned by trial().
len(my_trial)

121

In [None]:
# File names of the GL test recordings. They are stored as strings because
# bare names such as GL_TEST1_CS.csv raise NameError when the cell is run.
# NOTE(review): presumably all of these can be passed to prep_test(); only the
# two TEST1 files are used that way in this notebook.
GL_TEST_FILES = (
    'GL_TEST1_CS.csv',
    'GL_TEST1_UrsWearing.csv',
    'GL_TEST2_CS.csv',
    'GL_TEST2_UrsWearing.csv',
    'GL_TEST3_CS_very_still.csv',
    'GL_TEST3_UrsWearing.csv',
)

In [10]:
# Compare the length of the decoded HMM state sequence with the number of
# rows in the training data.
decoded_states = hmm_result[1]
print(len(decoded_states))
print(len(training_data))

1459
7293


In [4]:
# Build one test frame per recording, for 3 different test users. The files
# are prepared in the order listed.
user_recordings = (
    'GL_TEST1_CS.csv',
    'GL_TEST1_UrsWearing.csv',
    'DIO_OCG_YCG_YMOUNT_YBC.csv',
)
test_data_user1, test_data_user2, test_data_user3 = map(prep_test, user_recordings)


Removed 2 NaN rows
Removed 1 NaN rows
Removed 1 NaN rows


In [12]:
"""
This is for testing the effectiveness of the 
standing motion detection. The datasets below with STAND in the
name refer to sequences involving a stand up.

This is effectively a tangent to the main algorithm, but one which could
be very useful and widely applicable.
"""

test_data_stand = prep_test('CS_OCG_STAND_OCG.csv')

# TODO: DEBUG If I move the test data creation out of this cell it causes an error

# The assignment target was missing here (a syntax error); the next step reads
# the result as `foo`.
foo = trial_standup(training_data, test_data_stand)

# NOTE: BE CAREFUL WITH CREATING COPIES OF DF
step2 = trial(training_data, foo)
print(step2[['state', 'avg_stand']])

# step 3
fixed = state_reconciler(step2)

# `training_data40` is never defined in this notebook. `training_data` is the
# frame built with the default 40-row time sequence length, so it is used here.
# NOTE(review): this second sequence comes from a different recording than
# `fixed` above - confirm that comparing the two below is intended.
test_data1000 = prep_test('URS_OCG_STAND_OCG3.csv')
baz = trial_standup(training_data, test_data1000)
step2again = trial(training_data, baz)
pre_pre_smooth = step2again['state'].values
pre_pre_smooth_words = convert_to_words(pre_pre_smooth)

pre_smooth = fixed['state'].values
pre_smooth_words = convert_to_words(pre_smooth)

print(pre_pre_smooth_words)
print(pre_smooth_words)

Removed 1 NaN rows
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
['your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_side_control', 'your_mount', 'your_side_control', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount', 'your_mount']
Your Mount: 0.947368421053
Your Side Control: 0.0526315789474
Your Closed Guard: 0.0
Your Back Control: 0.0
Opponent Mount or Opponent Side Control: 0.0
Opponent Closed Guard: 0.0
Opponent Back Control: 0.0
OTHER: 0.0

[7 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 7 7 5 5 5 5 5 5 5 5 5 7 7 

NameError: global name 'update_df' is not defined

In [41]:
# Multinomial (discrete-emission) hidden Markov model. The start, transition
# and emission matrices are assigned directly from the config constants;
# n_features is not set explicitly.
hmm_settings = dict(n_components=N_COMPONENTS, n_iter=10, verbose=False)
model = hmm.MultinomialHMM(**hmm_settings)

model.startprob_ = HMM_START_MATRIX
model.transmat_ = HMM_TRANSITION_MATRIX
model.emissionprob_ = HMM_EMISSION_MATRIX

In [42]:
# Copy the state sequence into an array and reshape it to (n_samples, -1),
# i.e. one row per observation, which is the form handed to model.decode.
observations = np.array(pre_smooth)
n_samples = len(observations)

data = observations.reshape(n_samples, -1)
print(data)

[[5]
 [5]
 [5]
 [5]
 [5]
 [5]
 [5]
 [5]
 [5]
 [7]
 [5]
 [5]
 [5]
 [5]
 [5]
 [5]
 [5]
 [5]
 [5]
 [0]
 [0]
 [5]
 [5]
 [7]
 [7]
 [7]]


In [43]:
print('TEST 1')

# decode() returns a pair: result[0] is the log probability of the decoded
# state path (not an accuracy percentage) and result[1] is the decoded state
# sequence.
result = model.decode(data, algorithm='viterbi')
# The label now matches the variable that is actually printed.
print('pre pre smooth: {}'.format(pre_pre_smooth))
print('Viterbi path log probability: {}'.format(result[0]))
print('final result: {}'.format(result[1]))

result_words = convert_to_words(result[1])
print('=====================')
print('pre pre smooth words: {}'.format(pre_pre_smooth_words))
print('=====================')
print('result words: {}'.format(result_words))


print('\n')
print("pre pre smooth stats (before stand up detection)")
print(get_position_stats(pre_pre_smooth_words))


print('\n')
print("pre smooth stats (before HMM)")
print(get_position_stats(pre_smooth_words))

print('\n')

print('result stats')
print(get_position_stats(result_words))

print('******************')




TEST 1
pre smooth: [5 5 5 0 0 5 5 5 5 5 5 7 5 5 5 5 5 0 5 0 0 5 5 7 7 7]
result accuracy -36.3301057979%
final result: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 7 7 7]
pre pre smooth words: ['opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'your_mount', 'your_mount', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'OTHER', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'your_mount', 'opponent_closed_guard', 'your_mount', 'your_mount', 'opponent_closed_guard', 'opponent_closed_guard', 'OTHER', 'OTHER', 'OTHER']
result words: ['opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponent_closed_guard', 'opponen