# Evaluation of different Parameters (Features: Section Means)

In [1]:
import pandas as pd
import numpy as np
from pivottablejs import pivot_ui

# my package
from packageMeinhart import PhysioDataHandler as PDH
from packageMeinhart.functionsMasterProjectMeinhart import print_precision_recall_accuracy
from packageMeinhart.functionsMasterProjectMeinhart import print_misclassified_data_points

## Using the class *PhysioData_SectionFeatures* for feature generation

In [2]:
# Print the documentation of the feature-generation class (its constructor
# parameters and selectable options) before instantiating it below.
help(PDH.PhysioData_SectionFeatures)

Help on class PhysioData_SectionFeatures in module packageMeinhart.PhysioDataHandler:

class PhysioData_SectionFeatures(builtins.object)
 |  Class for feature generation using section means.
 |  There are various selectable options --> see Parameters. 
 |  
 |  Parameters
 |  ----------
 |  num_sections : int
 |      Number of equally partitioned sections to split the single repetitions of the signals.
 |      
 |  test_subject_ids : int or list (of int)
 |      Subject IDs to select for testing (e.g. [1, 2, 3]).
 |      --> default -1: Select all subjects.
 |      --> if test_subject_ids is an empty list: empty DataFrame is returned by corresponding method.
 |      
 |  train_subject_ids : int or list
 |      Subject IDs to select for training (e.g. [1, 2, 3]).
 |      --> default -1: Select all subjects not in test_subject_ids.
 |      --> if train_subject_ids is an empty list: empty DataFrame is returned by corresponding method.
 |      
 |  test_rep_nums : int or list
 |      Repet

### Create instance of physio data class

In [3]:
# Build the feature generator used throughout this notebook.
# - num_sections=10: every repetition is split into 10 equal sections whose
#   means are the features (see the class help above).
# - test_subject_ids=1 holds out subject 1 for testing; train_subject_ids=-1
#   selects all subjects that are not in the test set for training.
# The Windows paths are written as raw strings so that backslashes are never
# interpreted as escape sequences ('\P' and '\D' are invalid escapes and emit a
# DeprecationWarning/SyntaxWarning on newer Python versions). The resulting
# path values are identical to the previous ones.
# NOTE(review): both paths are machine-specific absolute locations; adjust them
# when running the notebook on another machine.
PD1 = PDH.PhysioData_SectionFeatures(num_sections=10,
                                     test_subject_ids=1,
                                     train_subject_ids=-1,
                                     test_rep_nums=-1,
                                     train_rep_nums=-1,
                                     test_ex_abbrs=-1,
                                     train_ex_abbrs=-1,
                                     with_non_Ex=True,
                                     rot_axis_test_data=0,
                                     rot_angle_test_data=0,
                                     add_noise_test_data=False,
                                     add_noise_train_data=False,
                                     snr_db=20,
                                     csv_data_dir=r'E:\Physio_Data_Split_Ex_and_NonEx',
                                     csv_skiprows=0,
                                     csv_separator=',',
                                     data_base_path=r'E:\Physio_Data\DataBase_Physio_with_nonEx.db',
                                     print_progress=True,
                                     signal_abbrs=['Acc','Gyr'],
                                     signal_orientations=['x','y','z'],
                                     labels_abbr2num_dict={'RF':0,'RO':1,'RS':2,'LR':3,'BC':4,'TC':5,
                                                           'MP':6,'SA':7,'P1':8,'P2':9,'NE':10},
                                     sub_id_key='subject_id',
                                     num_rep_key='num_rep',
                                     abbreviation_key='abbreviation',
                                     start_time_key='start_time',
                                     stop_time_key='stop_time',
                                     csv_file_key='csv_file',
                                     sampling_rate=256)

### Inspecting selected data for testing

In [4]:
# Interactive pivot table of the data points selected for testing:
# one row per exercise abbreviation, columns grouped by subject and repetition count.
# The table is also written to PD1_test.html.
pivot_ui(
    PD1.get_test_data_points(),
    rows=['abbreviation'],
    cols=['subject_id', 'num_rep'],
    outfile_path="PD1_test.html",
)

### Inspecting selected data for training

In [5]:
# Interactive pivot table of the data points selected for training:
# one row per exercise abbreviation, columns grouped by subject and repetition count.
# The table is also written to PD1_train.html.
pivot_ui(
    PD1.get_train_data_points(),
    rows=['abbreviation'],
    cols=['subject_id', 'num_rep'],
    outfile_path="PD1_train.html",
)

## Classification (ML part)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# StandardScaler raises the following warning on this data:
# --> DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.
# The filter below silences it. Note that the filter is installed globally for
# the running kernel and matches every DataConversionWarning, not only the ones
# issued by StandardScaler.
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

### Try it with different ML models

In [21]:
# Classifier under evaluation: a random forest with a fixed seed.
# Alternative (Support Vector Classifier with input scaling):
#ML_model = make_pipeline(StandardScaler(), SVC(random_state=42))
ML_model = RandomForestClassifier(
    n_estimators=50,
    max_leaf_nodes=40,
    n_jobs=-1,
    random_state=42,
)

# Fit on the training split of PD1 ...
ML_model.fit(PD1.X_train(), PD1.y_train())

# ... and predict the labels of its test split.
y_pred = ML_model.predict(PD1.X_test())

# Report: model name, overall accuracy, per-exercise metrics and finally
# the list of misclassified test data points.
model_name = type(ML_model).__name__
print('Model: ' + model_name + '\n')
total_accuracy = accuracy_score(PD1.y_test(), y_pred)
print('Total Accuracy: {:.2f}%\n'.format((total_accuracy)*100))
print_precision_recall_accuracy(y_pred, PD1.y_test())
print('')
print_misclassified_data_points(y_pred, PD1.y_test())

Model: RandomForestClassifier

Total Accuracy: 98.87%

Exercise	Precision [%]	Recall [%]	Accuracy[%]
  RF		  100.00	   93.33	   99.72
  RO		   93.75	  100.00	   99.72
  RS		  100.00	  100.00	  100.00
  LR		  100.00	  100.00	  100.00
  BC		  100.00	  100.00	  100.00
  TC		  100.00	  100.00	  100.00
  MP		   85.71	  100.00	   99.29
  SA		  100.00	  100.00	  100.00
  P1		  100.00	   96.67	   99.86
  P2		  100.00	  100.00	  100.00
  NE		   99.75	   98.77	   99.15

8 misclassified (709 test data points):
RF classified as RO
RF classified as RO
P1 classified as NE
NE classified as MP
NE classified as MP
NE classified as MP
NE classified as MP
NE classified as MP


## Grid Search and Cross Validation

In [8]:
# Pipeline template for the search: standardise the features, then classify with an SVC.
# Variant with dimensionality reduction in between:
#pipe_elements = [('scale', StandardScaler()), ('reduce_dim', PCA()), ('clf', SVC())]
pipe_elements = [
    ('scale', StandardScaler()),
    ('clf', SVC()),
]
pipe = Pipeline(steps=pipe_elements)

# Hyperparameters to search (3 x 3 = 9 candidates), addressed as <step name>__<parameter>:
#   C     -- penalty parameter of the error term (regularisation parameter)
#   gamma -- kernel coefficient for 'rbf', 'poly' and 'sigmoid' (bandwidth of the kernel)
param_grid = {
    'clf__C': [0.1, 1, 10],
    'clf__gamma': ['scale', 0.01, 0.1],
}

# Grid for the PCA variant of the pipeline:
#param_grid = {'reduce_dim__n_components': [10, 20, 30],
#              'clf__C': [1, 10, 100],
#              'clf__gamma': [1, 10, 100]}

# Splitting strategy for the grid search: stratified CV with 5 folds,
# candidates ranked by accuracy, all available cores used.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=10,
    return_train_score=True,
)

In [9]:
# Apply grid search and cross validation on the training data of PD1
# (9 parameter candidates x 5 folds = 45 fits, see the log below).
# As the last expression of the cell, the fitted GridSearchCV object is echoed.
grid_search.fit(PD1.X_train(), PD1.y_train())

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  43 out of  45 | elapsed:  1.4min remaining:    3.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'clf__C': [0.1, 1, 10], 'clf__gamma': ['scale', 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=10)

In [10]:
# Score the finished grid search on the held-out test data and show it as a percentage.
test_accuracy = grid_search.score(PD1.X_test(), PD1.y_test())
print('Accuracy: {:.2f}%'.format(test_accuracy*100))

Accuracy: 98.73%


In [11]:
# Parameter combination that ranked first in the grid search.
best_params = grid_search.best_params_
print('Best parameters: {}'.format(best_params))

Best parameters: {'clf__C': 1, 'clf__gamma': 'scale'}


In [12]:
# Show the best score reached during cross validation, as a percentage
# (it corresponds to the highest mean_test_score in the results table).
# The message previously contained the typo "validatoin".
print('Best score at cross validation: {:.2f}%'.format(grid_search.best_score_*100))

Best score at cross validatoin: 98.95%


In [13]:
# Pipeline configured with the best parameter combination found by the search.
best_estimator = grid_search.best_estimator_
print('Best estimator: {}'.format(best_estimator))

Best estimator: Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])


In [14]:
# Tabulate the per-candidate cross-validation results (fit/score times,
# per-split and mean train/test scores, rank) and display the table.
grid_search_results = pd.DataFrame(data=grid_search.cv_results_)
grid_search_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__C,param_clf__gamma,params,rank_test_score,split0_test_score,split0_train_score,...,split2_test_score,split2_train_score,split3_test_score,split3_train_score,split4_test_score,split4_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,2.188325,0.488628,0.977288,0.979167,0.1,scale,"{'clf__C': 0.1, 'clf__gamma': 'scale'}",5,0.978013,0.981603,...,0.982829,0.978763,0.971358,0.976929,0.985246,0.978776,0.25562,0.133182,0.006289,0.001524
1,1.775702,0.561232,0.955392,0.957676,0.1,0.01,"{'clf__C': 0.1, 'clf__gamma': 0.01}",8,0.94544,0.956051,...,0.953393,0.960588,0.943535,0.947121,0.986885,0.975714,0.14571,0.123523,0.016059,0.010245
2,10.603607,0.652237,0.901961,0.934069,0.1,0.1,"{'clf__C': 0.1, 'clf__gamma': 0.1}",9,0.909609,0.933974,...,0.921504,0.932203,0.893617,0.930176,0.87623,0.936939,0.517072,0.055035,0.015592,0.002675
3,0.788645,0.17361,0.989542,0.991748,1.0,scale,"{'clf__C': 1, 'clf__gamma': 'scale'}",1,0.985342,0.992845,...,0.995094,0.990811,0.988543,0.991833,0.988525,0.992449,0.139816,0.010984,0.003193,0.000834
4,0.554832,0.18401,0.987908,0.990849,1.0,0.01,"{'clf__C': 1, 'clf__gamma': 0.01}",3,0.984528,0.991619,...,0.994276,0.990402,0.98527,0.991017,0.988525,0.991837,0.109707,0.041625,0.00347,0.000892
5,14.212413,0.506629,0.965523,0.99518,1.0,0.1,"{'clf__C': 1, 'clf__gamma': 0.1}",6,0.967427,0.996934,...,0.973017,0.994895,0.97054,0.994079,0.945082,0.995306,2.265312,0.061838,0.010362,0.000962
6,1.028259,0.205612,0.987092,0.994567,10.0,scale,"{'clf__C': 10, 'clf__gamma': 'scale'}",4,0.986971,0.995707,...,0.988553,0.994282,0.987725,0.993671,0.979508,0.995306,0.278948,0.048626,0.004266,0.000802
7,0.575833,0.118807,0.988072,0.99375,10.0,0.01,"{'clf__C': 10, 'clf__gamma': 0.01}",2,0.988599,0.99489,...,0.987735,0.992649,0.988543,0.993467,0.982787,0.994898,0.093972,0.008424,0.00315,0.000972
8,11.260044,0.456826,0.961765,0.999591,10.0,0.1,"{'clf__C': 10, 'clf__gamma': 0.1}",7,0.963355,0.999387,...,0.967294,1.0,0.968903,1.0,0.938525,0.999184,2.28058,0.13606,0.011846,0.000342


## Set parameters and evaluate directly

In [15]:
# Second configuration, evaluated directly without a grid search:
# subjects 1 and 2 are held out for testing, all remaining subjects are used
# for training (train_subject_ids=-1). The test data is generated with
# rot_angle_test_data=10 and add_noise_test_data=True (snr_db=20); see the class
# help for the exact meaning of these options. Every argument that is not
# passed here (paths, CSV options, column keys, ...) keeps the class default.
PD2 = PDH.PhysioData_SectionFeatures(
    num_sections=10,
    test_subject_ids=[1,2],
    train_subject_ids=-1,
    test_rep_nums=5,
    train_rep_nums=[10, 15],
    test_ex_abbrs=-1,
    train_ex_abbrs=-1,
    with_non_Ex=True,
    rot_axis_test_data=0,
    rot_angle_test_data=10,
    add_noise_test_data=True,
    add_noise_train_data=False,
    snr_db=20,
)

# Classifier under evaluation: a random forest with a fixed seed.
# Alternative (Support Vector Classifier with input scaling):
#ML_model = make_pipeline(StandardScaler(), SVC(random_state=42))
ML_model = RandomForestClassifier(
    n_estimators=50,
    max_leaf_nodes=40,
    n_jobs=-1,
    random_state=42,
)

# Fit on the training split of PD2 ...
ML_model.fit(PD2.X_train(), PD2.y_train())

# ... and predict the labels of its test split.
y_pred = ML_model.predict(PD2.X_test())

# Report: model name, overall accuracy, per-exercise metrics and finally
# the list of misclassified test data points.
model_name = type(ML_model).__name__
print('Model: ' + model_name + '\n')
total_accuracy = accuracy_score(PD2.y_test(), y_pred)
print('Total Accuracy: {:.2f}%\n'.format((total_accuracy)*100))
print_precision_recall_accuracy(y_pred, PD2.y_test())
print('')
print_misclassified_data_points(y_pred, PD2.y_test())

Model: RandomForestClassifier

Total Accuracy: 98.60%

Exercise	Precision [%]	Recall [%]	Accuracy[%]
  RF		   71.43	  100.00	   99.57
  RO		  100.00	   60.00	   99.57
  RS		  100.00	  100.00	  100.00
  LR		  100.00	  100.00	  100.00
  BC		  100.00	  100.00	  100.00
  TC		  100.00	  100.00	  100.00
  MP		   52.63	  100.00	   99.03
  SA		  100.00	  100.00	  100.00
  P1		  100.00	  100.00	  100.00
  P2		  100.00	  100.00	  100.00
  NE		  100.00	   98.91	   99.03

13 misclassified (926 test data points):
RO classified as RF
RO classified as RF
RO classified as RF
RO classified as RF
NE classified as MP
NE classified as MP
NE classified as MP
NE classified as MP
NE classified as MP
NE classified as MP
NE classified as MP
NE classified as MP
NE classified as MP


In [16]:
# Display the pipeline object that was handed to the grid search. The echoed
# SVC still shows its default parameters (C=1.0, gamma='auto_deprecated'); the
# tuned pipeline is the one printed from grid_search.best_estimator_.
pipe

Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [17]:
# Access a pipeline step by position: the first step is the ('scale', StandardScaler) tuple.
pipe.steps[0]  

('scale', StandardScaler(copy=True, with_mean=True, with_std=True))

In [18]:
# Access a pipeline step by name: 'clf' is the SVC estimator of the pipeline.
pipe.named_steps['clf'] 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [19]:

## set number of principal components
#number_principal_comp = 30
#
## make pca model
#pca = PCA(n_components=number_principal_comp)
#
## create new features from PCA projections
#X_train_pca = pca.fit_transform(X_train_for_pca)
#X_test_pca = pca.transform(X_test_for_pca)
#
#
## make LDA model
#lda = LDA()
## create new features from LDA projections
#X_train_lda = lda.fit_transform(X_train_for_lda, y_train_lda)
#X_test_lda = lda.transform(X_test_for_lda)
