## Define parameters and variables

In [22]:
## period length in ms
periodLengthMS = 1500

## sample rate in µs
sampleRateUS = 1000

## test/train ratio
trainDataRatio = 0.7
trainDataAbs = 100*trainDataRatio


## Import neccesary helper modules

In [2]:
#import neccessary modules
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import gc

In [3]:
import random
import statistics
import matplotlib.cm as cm
%matplotlib inline

In [4]:
# helper functions
from timeseries_helpers import datasetstorer
from timeseries_helpers import dataset_importer
from timeseries_helpers import database_importer
from timeseries_helpers import standardizer

In [5]:
# plotting functions
from timeseries_helpers import timeseries_plotter

In [6]:
# tsfresh modules (for feature extraction)
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, MinimalFCParameters, EfficientFCParameters

import logging
# Set logger-level to "error". Not recommed: Important warnings can be overseen
logging.basicConfig(level=logging.ERROR)

## Import ML Algorithm modules

In [35]:
# import machine learning algorithms
from sklearn.svm import SVC
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

# New functions for classification module

In [23]:
def prepareDataset(datasetPath,periodLengthInMS, sampleRateInUS, exportToCSV=False):
    """Reads the original (JSON) Dataset and prepares it for get used by the ML algorithms."""
    with open(datasetPath, 'r') as f:
        jsnDataset = json.load(f)
    ds_orig = database_importer.jsonData_to_dataset_in_timedifference_us(data=jsnDataset)
    ds_equalPeriod = standardizer.normate_dataset_period(periodLengthInMS,sampleRateInUS,ds_orig[:5])
    ds = standardizer.change_strings_to_numbs(ds_equalPeriod)
    ds_equalPeriod = []
    ds_orig = []
    if exportToCSV:
        fileName = r"dataset_periodMS" + str(periodLengthInMS)+"_sampleUS"+str(sampleRateInUS)+".csv";
        datasetstorer.export_list_of_dataframes_to_csv(ds,fileName)
    return ds

In [24]:
def listToDataframe(dataSet):
    df_list = []
    for idx,e in enumerate(dataSet):
        df = e.assign(punch_id=idx)
        df_list.append(df)
    df_res = pd.concat(df_list)
    df_allInOne = df_res.reset_index(drop=True)
    ds = df_allInOne.rename(index=str, columns={"x": "a_x", "y": "a_y", "z":"a_z"})
    return ds

In [13]:
def train_test_split(dataFrame,predictionColumn,trainsize,seed=5):
    """Returns an array of lists containing a list of the punch indizes to train and test. First list element: train dataset. Second list element: test dataset
           arg 1: dataFrame as dataframe object
           arg 2: trainsize as percentage of training data. e.g. 0.7
           arg3: seed . optional, for same results of random values at multiple test"""
    result = False
    if predictionColumn == 'label' or predictionColumn == 'annotator' or predictionColumn == 'hand':
        punchIdx = dataFrame['punch_id'].unique()
        random.seed(seed)
        data = dataFrame.copy()
        punchIdx = data['punch_id'].unique()
        list_in_cpy = punchIdx[:]
        random.shuffle(list_in_cpy)
        n = len(list_in_cpy)
        idx_train = list_in_cpy[:round(n*trainsize)]
        idx_test =  list_in_cpy[round(n*trainsize):]
        train_ds = data[data['punch_id'].isin(idx_train)]
        test_ds = data[data['punch_id'].isin(idx_test)]

        test_dataset_unique_label_id = test_ds.drop_duplicates(subset='punch_id', keep='first', inplace=False)
        y_test = pd.Series(data=test_dataset_unique_label_id[predictionColumn])
        train_dataset_unique_label_id = train_ds.drop_duplicates(subset='punch_id', keep='first', inplace=False)
        y_train = pd.Series(data=train_dataset_unique_label_id[predictionColumn])
        result = [train_ds.reset_index(drop=True),test_ds.reset_index(drop=True),y_train,y_test]
    else:
        print('Error: Chosen predictionColumn not valid! Accepted: label, annotator or hand.')
    return result

In [26]:
def get_train_test_ratio(dataSet, column = 'label'):
    """Returns the ratio of timeseries of the chosen prediction column as a pandas.core.series.Series"""
    return dataSet[column].value_counts()

In [27]:
def extractFeatures(dataSetToExtractFrom,feature_settings="minimal"):
    dataset_for_extraction = dataSetToExtractFrom.drop(columns=['label','hand','annotator'])
    
    if feature_settings == "minimal":
        extractedFeatures = MinimalFCParameters();
    elif feature_settings == "maximal":
        extractedFeatures = ComprehensiveFCParameters();
    elif feature_settings == "findBest":
        extractedFeatures = EfficientFCParameters();
    else:
        extractedFeatures = MinimalFCParameters();
    extracted_featureset = extract_features(dataset_for_extraction, column_id="punch_id",column_sort="timestamp", impute_function=impute, default_fc_parameters = extractedFeatures);
    return extracted_featureset

In [1]:
def get_available_classifier_labels():
    return ['Linear SVC (ovr)','Standard SVC', 'Logsitic Regression', 'KNN', 'Random Forest'];

In [2]:
def predict(X_train, y_train, X_test, y_test,estimators= 100,KNNneighbors=5):
    accuracy_scores = np.zeros(len(get_available_classifier_labels()))
    clf_labels = get_available_classifier_labels()
    # Linear Support Vector classifier
    linSupp_Vectr_clf = svm.LinearSVC()
    linSupp_Vectr_clf.fit(X_train, y_train) 
    prediction = linSupp_Vectr_clf.predict(X_test)
    accuracy_scores[0] = accuracy_score(y_test, prediction)*100
    print('Linear Vector Classifier accuracy (one-vs-rest): {}%'.format(accuracy_scores[0]))
    # Support Vector Classifier
    stdSupp_Vectr_clf = SVC().fit(X_train, y_train)
    prediction = stdSupp_Vectr_clf.predict(X_test)
    accuracy_scores[1] = accuracy_score(y_test, prediction)*100
    print('Support Vector Classifier accuracy: {}%'.format(accuracy_scores[1]))
    # Logistic Regression
    logistic_reggr_clf = LogisticRegression().fit(X_train, y_train)
    prediction = logistic_reggr_clf.predict(X_test)
    accuracy_scores[2] = accuracy_score(y_test, prediction)*100
    print('Logistic Regression accuracy: {}%'.format(accuracy_scores[2]))
    # K Nearest Neighbors
    knn_clf = KNeighborsClassifier(n_neighbors=KNNneighbors).fit(X_train, y_train)
    prediction = knn_clf.predict(X_test)
    accuracy_scores[3] = accuracy_score(y_test, prediction)*100
    print('K Nearest Neighbors Classifier accuracy: {}%'.format(accuracy_scores[3]))
    # Random Forest
    rndm_forest_clf = RandomForestClassifier(n_estimators = estimators).fit(X_train, y_train)
    prediction = rndm_forest_clf.predict(X_test)
    accuracy_scores[4] = accuracy_score(y_test, prediction)*100
    print('Random Forest Classifier accuracy: {}%'.format(accuracy_scores[4]))
    return [accuracy_scores,[linSupp_Vectr_clf,stdSupp_Vectr_clf,logistic_reggr_clf,knn_clf,rndm_forest_clf]]

In [3]:
def print_prediction_results(classifier_labels,acc_scores,plotTitle,xLabel='Classifiers',yLabel='Accuracy'):
    colors = cm.rainbow(np.linspace(0, 1, 4))
    plt.figure(figsize=(20,10))
    plt.bar(classifier_labels,
            acc_scores,
            color = colors)
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    plt.title(plotTitle)

# Test the new functions

In [28]:
ds_loaded = prepareDataset('../../Datasets/Raw_Data/complete_smartpunch_dataset_7606punches.json',periodLengthMS,sampleRateUS,False)

In [29]:
ds_loaded = listToDataframe(ds_loaded)

In [30]:
ds_ready.head(5)

Unnamed: 0,a_x,a_y,a_z,timestamp,label,hand,annotator,punch_id
0,6.7932233810424805,-2.383251428604126,2.9623143672943115,0,1,0,0,0
1,6.689701513074261,-2.6512779830073976,3.1807217824529057,1000,1,0,0,0
2,6.601797580718994,-2.871387243270874,3.3595230579376216,2000,1,0,0,0
3,6.5283193465743645,-3.04752244332295,3.502028963241094,3000,1,0,0,0
4,6.468074573238054,-3.183626817092021,3.6115502678559537,4000,1,0,0,0


## Create y_train and y_test for predicting: "label"

In [15]:
mixedPunches = train_test_split(ds_ready,'label',trainDataRatio,seed=5)
train_dataset = mixedPunches[0]
test_dataset = mixedPunches[1]
y_train = mixedPunches[2]
y_test = mixedPunches[3]

NameError: name 'ds_ready' is not defined

In [32]:
train_ratio = get_train_test_ratio(train_dataset)
test_ratio = get_train_test_ratio(train_dataset)

In [33]:
print(train_ratio)

1    6004
Name: label, dtype: int64


## Feature extraction

In [34]:
X_test = extractFeatures(test_dataset,feature_settings='minimal')

Feature Extraction: 100%|██████████| 3/3 [00:00<00:00, 41.86it/s]


In [None]:
X_train = extractFeatures(train_dataset,feature_settings='minimal')

## Prediction/Classification of: "label"

In [None]:
generalResult = predict(X_train,y_train,X_test,y_test)
label_accuracy_scores = generalResult[0]
label_classifier = generalResult[1]
print_prediction_results(get_available_classifier_labels,label_accuracy_scores,'Accuracy overview for label prediction')

# Keep on going beyond

## Hand label prediction

In [None]:
train_dataset = mixedPunches[0]
test_dataset = mixedPunches[1]

In [None]:
# try to use the minimal feature settings
settings_minimal = MinimalFCParameters() # only a few basic features
settings_minimal

In [None]:
# extract features for test dataset
testds_for_extraction = test_dataset.drop(columns=['label','hand','annotator'])
test_dataset_extracted = extract_features(testds_for_extraction, column_id="punch_id",column_sort="timestamp", impute_function=impute, default_fc_parameters = settings_minimal);

In [None]:
# extract features for train dataset
trainds_for_extraction = train_dataset.drop(columns=['label','hand','annotator'])
train_dataset_extracted = extract_features(trainds_for_extraction, column_id="punch_id",column_sort="timestamp", impute_function=impute, default_fc_parameters = settings_minimal);

In [None]:
test_dataset_unique_label_id = test_dataset.drop_duplicates(subset='punch_id', keep='first', inplace=False)
test_dataset_unique_label_id.head()

In [None]:
y_test = pd.Series(data=test_dataset_unique_label_id['hand'])

In [None]:
train_dataset_unique_label_id = train_dataset.drop_duplicates(subset='punch_id', keep='first', inplace=False)
train_dataset_unique_label_id.head()

In [None]:
y_train = pd.Series(data=train_dataset_unique_label_id['hand'])
y_train.head()

In [None]:
X_train = train_dataset_extracted
X_test = test_dataset_extracted

# Classification part

In [None]:
accuracy_scores_for_hand = np.zeros(5)
clf_for_hand = ['Linear SVC (ovr)','Standard SVC', 'Logsitic Regression', 'KNN', 'Random Forest']

In [None]:
linSupp_Vectr_clf_hand = svm.LinearSVC()
linSupp_Vectr_clf_hand.fit(X_train, y_train) 
prediction = linSupp_Vectr_clf_hand.predict(X_test)
accuracy_scores_for_hand[0] = accuracy_score(y_test, prediction)*100
print('Linear Vector Classifier accuracy (one-vs-rest): {}%'.format(accuracy_scores_for_hand[0]))

In [None]:
# Support Vector Classifier
stdSupp_Vectr_clf_hand = SVC().fit(X_train, y_train)
prediction = stdSupp_Vectr_clf_hand.predict(X_test)
accuracy_scores_for_hand[1] = accuracy_score(y_test, prediction)*100
print('Support Vector Classifier accuracy: {}%'.format(accuracy_scores_for_hand[1]))

In [None]:
# Logistic Regression
logistic_reggr_clf_hand = LogisticRegression().fit(X_train, y_train)
prediction = logistic_reggr_clf_hand.predict(X_test)
accuracy_scores_for_hand[2] = accuracy_score(y_test, prediction)*100
print('Logistic Regression accuracy: {}%'.format(accuracy_scores_for_hand[2]))

In [None]:
# K Nearest Neighbors
knn_clf_hand = KNeighborsClassifier(n_neighbors=5)
knn_clf_hand.fit(X_train, y_train)
prediction = knn_clf_hand.predict(X_test)
accuracy_scores_for_hand[3] = accuracy_score(y_test, prediction)*100
print('K Nearest Neighbors Classifier accuracy: {}%'.format(accuracy_scores_for_hand[3]))

In [None]:
# Random Forest
rndm_forest_clf_hand = RandomForestClassifier(n_estimators = 100)
rndm_forest_clf_hand.fit(X_train, y_train)
prediction = rndm_forest_clf_hand.predict(X_test)
accuracy_scores_for_hand[4] = accuracy_score(y_test, prediction)*100
print('Random Forest Classifier accuracy: {}%'.format(accuracy_scores_for_hand[4]))

## Visualization of the classifier accuracy for hand prediction

In [None]:
colors = cm.rainbow(np.linspace(0, 1, 4))
plt.figure(figsize=(20,10))
plt.bar(clf_for_hand,
        accuracy_scores_for_hand,
        color = colors)
plt.xlabel('Classifiers')
plt.ylabel('Accuracy')
plt.title('Accuracy overview for hand-prediction')

## Prediction of the annotator name

In [None]:
train_dataset = mixedPunches[0]
test_dataset = mixedPunches[1]

In [None]:
# try to use the minimal feature settings
settings_minimal = MinimalFCParameters() # only a few basic features
settings_minimal

In [None]:
# extract features for test dataset
testds_for_extraction = test_dataset.drop(columns=['label','hand','annotator'])
test_dataset_extracted = extract_features(testds_for_extraction, column_id="punch_id",column_sort="timestamp", impute_function=impute, default_fc_parameters = settings_minimal);

In [None]:
# extract features for train dataset
trainds_for_extraction = train_dataset.drop(columns=['label','hand','annotator'])
train_dataset_extracted = extract_features(trainds_for_extraction, column_id="punch_id",column_sort="timestamp", impute_function=impute, default_fc_parameters = settings_minimal);

In [None]:
test_dataset_unique_label_id = test_dataset.drop_duplicates(subset='punch_id', keep='first', inplace=False)
test_dataset_unique_label_id.head()

In [None]:
y_test = pd.Series(data=test_dataset_unique_label_id['annotator'])

In [None]:
train_dataset_unique_label_id = train_dataset.drop_duplicates(subset='punch_id', keep='first', inplace=False)
train_dataset_unique_label_id.head()

In [None]:
y_train = pd.Series(data=train_dataset_unique_label_id['annotator'])
y_train.head()

In [None]:
X_train = train_dataset_extracted
X_test = test_dataset_extracted

# Classification part for annotator name

In [None]:
accuracy_scores_for_subject = np.zeros(5)
clf_for_subject = ['Linear SVC (ovr)','Standard SVC', 'Logsitic Regression', 'KNN', 'Random Forest']
linSupp_Vectr_clf_annotator = svm.LinearSVC().fit(X_train, y_train) 
prediction = linSupp_Vectr_clf_annotator.predict(X_test)
accuracy_scores_for_subject[0] = accuracy_score(y_test, prediction)*100
print('Linear Vector Classifier accuracy (one-vs-rest): {}%'.format(accuracy_scores_for_subject[0]))

In [None]:
# Support Vector Classifier
stdSupp_Vectr_clf_annotator = SVC().fit(X_train, y_train)
prediction = stdSupp_Vectr_clf_annotator.predict(X_test)
accuracy_scores_for_subject[1] = accuracy_score(y_test, prediction)*100
print('Support Vector Classifier accuracy: {}%'.format(accuracy_scores_for_subject[1]))

In [None]:
# Logistic Regression
logistic_reggr_clf_annotator = LogisticRegression().fit(X_train, y_train)
prediction = logistic_reggr_clf_annotator.predict(X_test)
accuracy_scores_for_subject[2] = accuracy_score(y_test, prediction)*100
print('Logistic Regression accuracy: {}%'.format(accuracy_scores_for_subject[2]))

In [None]:
# K Nearest Neighbors
knn_clf_annotator = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
prediction = knn_clf_annotator.predict(X_test)
accuracy_scores_for_subject[3] = accuracy_score(y_test, prediction)*100
print('K Nearest Neighbors Classifier accuracy: {}%'.format(accuracy_scores_for_subject[3]))

In [None]:
# Random Forest
rndm_forest_clf_annotator = RandomForestClassifier(n_estimators = 50).fit(X_train, y_train)
prediction = rndm_forest_clf_annotator.predict(X_test)
accuracy_scores_for_subject[4] = accuracy_score(y_test, prediction)*100
print('Random Forest Classifier accuracy: {}%'.format(accuracy_scores_for_subject[4]))

## Print classifier accuracy for annotator prediction

In [None]:
colors = cm.rainbow(np.linspace(0, 1, 4))
plt.figure(figsize=(20,10))
plt.bar(clf_for_subject,
        accuracy_scores_for_subject,
        color = colors)
plt.xlabel('Classifiers')
plt.ylabel('Accuracy')
plt.title('Accuracy overview for annotator prediction')

# Save the created models

## Store the models for the punch classification

### Linear SVC

In [None]:
pkl_filename = "model_linSupp_Vectr_clf_punchtype_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(linSupp_Vectr_clf_punchtype, file)

### Standard SVC

In [None]:
pkl_filename = "model_stdSupp_Vectr_clf_punchtype_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(stdSupp_Vectr_clf_punchtype, file)

### Logistic Regression

In [None]:
pkl_filename = "model_logistic_reggr_clf_punchtype_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(logistic_reggr_clf_punchtype, file)

### KNN

In [None]:
pkl_filename = "model_knn_clf_punchtype_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(knn_clf_punchtype, file)

### Random Forest

In [None]:
pkl_filename = "model_rndm_forest_clf_punchtype_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(rndm_forest_clf_punchtype, file)

## Store the models for the hand classification

### Linear SVC

In [None]:
pkl_filename = "model_linSupp_Vectr_clf_hand_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(linSupp_Vectr_clf_hand, file)

### Standard SVC

In [None]:
pkl_filename = "model_stdSupp_Vectr_clf_hand_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(stdSupp_Vectr_clf_hand, file)

### Logistic Regression

In [None]:
pkl_filename = "model_logistic_reggr_clf_hand_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(logistic_reggr_clf_hand, file)

### KNN

In [None]:
pkl_filename = "model_knn_clf_hand_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(knn_clf_hand, file)

### Random Forest

In [None]:
pkl_filename = "model_rndm_forest_clf_hand_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(rndm_forest_clf_hand, file)

## Store the models for the annotator classification

### Linear SVC

In [None]:
pkl_filename = "model_linSupp_Vectr_clf_annotator_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"    
with open(pkl_filename, 'wb') as file:  
    pickle.dump(linSupp_Vectr_clf_annotator, file)

### Standard SVC

In [None]:
pkl_filename = "model_stdSupp_Vectr_clf_annotator_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"    
with open(pkl_filename, 'wb') as file:  
    pickle.dump(stdSupp_Vectr_clf_annotator, file)

### Logistic Regression

In [None]:
pkl_filename = "model_logistic_reggr_clf_annotator_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"  
with open(pkl_filename, 'wb') as file:  
    pickle.dump(logistic_reggr_clf_annotator, file)

### KNN

In [None]:
pkl_filename = "model_knn_clf_annotator_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"    
with open(pkl_filename, 'wb') as file:  
    pickle.dump(knn_clf_annotator, file)

### Random Forest

In [None]:
pkl_filename = "model_rndm_forest_clf_annotator_periodMS" + str(periodLengthMS)+"_sampleUS"+str(sampleRateUS)+"_TrainSize"+str(trainDataAbs)+".pkl"    
with open(pkl_filename, 'wb') as file:  
    pickle.dump(rndm_forest_clf_annotator, file)