In [102]:
import pandas as pd
import numpy as np
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from sklearn.decomposition import PCA  
import pickle
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [93]:
# Configuration: the two sub-datasets to merge and the action classes.
dataset_name0 = 'lin'
dataset_name1 = 'zhong'
LABELS = ['right_standing', 'wrong_standing', 'right_turning', 'wrong_turning']
DATASET_NAME = 'dataset.pkl'
RANDOM_SEED_NUM = 0
actions = LABELS

# Load the pickled dataset. The context manager closes the file even when
# unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a dataset.pkl that comes from a trusted source.
with open(DATASET_NAME, 'rb') as pkl_file:
    dataset = pickle.load(pkl_file)
dataset0 = dataset[dataset_name0]
dataset1 = dataset[dataset_name1]

# Longest sample length across both sub-datasets; every sample is padded to it.
max_t = max(dataset0['length_range'][1], dataset1['length_range'][1])
sequence_length = max_t  # the longest length of sample

# Gather every sample of both sub-datasets into x and its integer class
# (the index of the action in `actions`) into y. No train/test split happens
# here; x_train/y_train/x_test/y_test stay empty and are not used in this cell.
x, y, x_train, y_train, x_test, y_test = [], [], [], [], [], []
for label, action in enumerate(actions):
    for subset in (dataset0, dataset1):  # 'lin' first, then 'zhong'
        labels = [label] * len(subset[action])
        x += subset[action]
        y += labels

# data augmentation
# x_train, y_train = crop(x_train, y_train)

# Drop the first column of each sample and zero-pad along the time axis up to
# max_t rows so that all samples share one shape.
# NOTE(review): column 0 is presumably a timestamp/index column - confirm.
x = [
    np.pad(sample[:, 1:], ((0, max_t - sample.shape[0]), (0, 0)),
           'constant', constant_values=0)
    for sample in x
]

# shuffle
x, y = shuffle(x, y, random_state=RANDOM_SEED_NUM+1)

# Convert to the long format tsfresh expects: one row per time step, with the
# sample index prepended as column 0 (used as column_id below). All per-sample
# blocks are stacked in a single vstack call instead of growing the array
# inside the loop, which avoids repeated copying and the `== None` sentinel.
x, y = np.array(x), np.array(y)
ts = np.vstack([
    np.insert(sample, 0, values=sample_id, axis=1)
    for sample_id, sample in enumerate(x)
])
ts = pd.DataFrame(ts)
ts_features = extract_features(ts, column_id=0)

Feature Extraction: 100%|██████████████████████| 20/20 [13:48<00:00, 28.44s/it]


In [97]:
# Replace non-finite feature values. The return value is ignored, so this
# relies on tsfresh's `impute` modifying the DataFrame in place.
impute(ts_features)

# Reduce to at most 150 principal components. PCA raises if n_components
# exceeds min(n_samples, n_features), so the target is clamped to the data
# shape; for data with at least 150 samples and features this is still 150.
# NOTE(review): PCA and the scaler are fit on all samples here, before the
# train/test split done in a later cell, so test data influences the transform.
# NOTE(review): this cell overwrites ts_features; re-run the feature
# extraction cell before running it a second time.
n_components = min(150, *ts_features.shape)
pca = PCA(n_components=n_components)
ts_features = pca.fit_transform(ts_features)

# Standardise each retained component to zero mean and unit variance.
ts_features = StandardScaler().fit_transform(ts_features)

In [98]:
# Sanity check: shape of the reduced, scaled feature matrix produced by the
# PCA / StandardScaler step.
ts_features.shape

(160, 150)

In [162]:
# Hold out 40% of the samples as a test set (seeded for reproducibility).
train_x, test_x, train_y, test_y = \
    train_test_split(ts_features, y, test_size=0.4, random_state=RANDOM_SEED_NUM)

# Display names, kept parallel to `classifiers` below.
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

# Estimators that draw random numbers during fitting are seeded with
# RANDOM_SEED_NUM so that the reported scores are repeatable across runs.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_SEED_NUM),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=RANDOM_SEED_NUM),
    MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_SEED_NUM),
    AdaBoostClassifier(random_state=RANDOM_SEED_NUM),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

# Fit each classifier on the training split and print its mean accuracy on the
# held-out split.
for name, clf in zip(names, classifiers):
    clf.fit(train_x, train_y)
    score = clf.score(test_x, test_y)
    print(name, score)

Nearest Neighbors 0.34375
Linear SVM 0.3125
RBF SVM 0.1875
Gaussian Process 0.265625
Decision Tree 0.78125
Random Forest 0.328125
Neural Net 0.125
AdaBoost 0.421875
Naive Bayes 0.6875




QDA 0.359375


In [163]:
# Persist the feature matrix and the label vector as a two-element list.
# The context manager flushes and closes the file even if pickling raises.
with open('features_vector.pkl', 'wb') as output:
    pickle.dump([ts_features, y], output)