In [3]:
import os
import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

In [4]:
#train test split
# Per-activity hold-out split: for every activity folder under DATA_ROOT,
# roughly TEST_FRACTION of its files (but at least one) go to `test` and the
# remaining files go to `train`. Both dicts map class index -> list of arrays.
DATA_ROOT = "HMP_Dataset"
TEST_FRACTION = 0.2
SPLIT_SEED = 0  # fixed so a fresh "Restart & Run All" reproduces the same split


def _read_recording(file_path):
    """Parse one recording file into an integer array, one row per text line.

    Every non-blank line is split on whitespace and each field is converted
    with int(). Blank lines are skipped instead of raising on int('').
    """
    rows = []
    with open(file_path, "r") as f:
        for row in f:
            fields = row.split()
            if fields:
                rows.append([int(value) for value in fields])
    return np.array(rows)


train = {}
test = {}
split_rng = np.random.RandomState(SPLIT_SEED)

# os.listdir returns entries in arbitrary order; sorting keeps the mapping
# "class index i -> activity name keys[i]" stable between runs and machines.
keys = sorted(
    name for name in os.listdir(DATA_ROOT)
    if os.path.isdir(os.path.join(DATA_ROOT, name))
)
for i, key in enumerate(keys):
    train[i] = []
    test[i] = []
    key_path = os.path.join(DATA_ROOT, key)
    # Sorted for the same reason as `keys`: the sampled indices below must
    # refer to the same files on every run.
    file_names = sorted(os.listdir(key_path))
    size = len(file_names)
    if size == 0:
        # Nothing to split; sampling 1 item out of 0 would raise.
        continue
    test_size = int(max(np.floor(TEST_FRACTION * size), 1))
    test_index = set(split_rng.choice(size, test_size, replace=False))
    for j, file_name in enumerate(file_names):
        recording = _read_recording(os.path.join(key_path, file_name))
        if j in test_index:
            test[i].append(recording)
        else:
            train[i].append(recording)

In [356]:
#vector quantize
class VectorQuantize:
    """Turn variable-length recordings into fixed-length codeword histograms.

    Each recording (a 2-D array, one row per sample) is cut into
    non-overlapping segments of `size` rows, and each segment is flattened
    into one vector. A k-means model with `k` clusters is fitted on all
    segment vectors. A recording is then described by how many of its
    segments fall into each cluster.

    Parameters
    ----------
    k : int
        Number of k-means clusters, i.e. the length of the histogram.
    size : int
        Number of consecutive rows per segment. Must be at least 1.
    random_state : int or None, optional
        Forwarded to KMeans. The default (None) keeps the original unseeded
        behaviour; pass an int for reproducible clustering.
    """

    def __init__(self, k, size, random_state=None):
        if size < 1:
            # A non-positive segment length can never advance through a signal.
            raise ValueError("size must be a positive integer")
        self.k = k
        self.k_mean_model = KMeans(n_clusters=self.k, random_state=random_state)
        self.size = size

    def fit(self, input_data):
        """Fit the k-means codebook on every segment of every recording.

        `input_data` is a dict mapping a class key to a list of recordings.
        """
        flatten_signals = []
        for key in input_data:
            for data in input_data[key]:
                flatten_signals.extend(self.cut(data))

        self.k_mean_model.fit(flatten_signals)

    def transform(self, input_data):
        """Return one row per recording: `k` cluster counts followed by the key.

        `input_data` has the same layout as in `fit`. The class key is
        appended as the last column so callers can slice it off as the label.
        """
        output_data = []
        for key in input_data:
            for data in input_data[key]:
                cut_data = self.cut(data)
                # A recording shorter than one segment yields no vectors;
                # predict() rejects empty input, so emit an all-zero histogram.
                if cut_data:
                    y_pred = self.k_mean_model.predict(cut_data)
                else:
                    y_pred = []
                output_data.append(self.count(y_pred) + [key])

        return np.array(output_data)

    def cut(self, input_data):
        """Split a recording into flattened, non-overlapping `size`-row segments.

        Trailing rows that do not fill a whole segment are discarded. A
        recording whose length is an exact multiple of `size` keeps its final
        segment. Returns a list of 1-D arrays (empty if the recording is
        shorter than `size`).
        """
        input_data = np.asarray(input_data)
        last_start = len(input_data) - self.size
        # reshape(-1) flattens whatever column count the recording has rather
        # than assuming exactly three channels.
        return [
            input_data[start:start + self.size].reshape(-1)
            for start in range(0, last_start + 1, self.size)
        ]

    def count(self, input_data):
        """Count occurrences of each cluster id; returns a list of `k` ints."""
        labels = np.asarray(input_data, dtype=np.intp)
        return np.bincount(labels, minlength=self.k).tolist()

    

In [357]:
# Learn the codebook from the training recordings only, then encode both
# splits with that same fitted quantizer (train first, then test).
N_CLUSTERS, SEGMENT_LEN = 10, 5
vq = VectorQuantize(k=N_CLUSTERS, size=SEGMENT_LEN)
vq.fit(train)
train_transform, test_transform = [vq.transform(split) for split in (train, test)]

In [410]:
#prediction
# Each transformed row is a cluster-count histogram followed by the class
# index that VectorQuantize.transform appended as the last column.
train_data = train_transform[:, :-1]
train_label = train_transform[:, -1]
test_data = test_transform[:, :-1]
test_label = test_transform[:, -1]

# Seed the forest so the accuracy below is reproducible on a re-run; without
# random_state every execution bootstraps differently.
rf = RandomForestClassifier(n_estimators=10, max_depth=None, random_state=0)
rf.fit(train_data, train_label)
predictions = rf.predict(test_data)
print(accuracy_score(test_label, predictions))
# Left as the cell's last expression so the notebook displays the matrix.
confusion_matrix(test_label, predictions)

0.775757575758


array([[ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 15,  0,  3,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1],
       [ 0,  0,  4,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0],
       [ 0,  1,  0,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0],
       [ 0,  0,  1,  1,  0,  0,  0, 11,  4,  0,  1,  2,  0,  0],
       [ 0,  0,  0,  1,  0,  0,  0,  3,  1,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 19,  1,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  1,  1,  0, 16,  2,  0,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  1,  1,  0,  1, 16,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  2,  1,  0, 15]])

Above are my accuracy score and confusion matrix, with k = 10 and size = 5. The highest accuracy I got is 0.84, with the average being about 0.8. These seem to be my best parameters so far. I tried increasing the number of clusters to 100 and even to 500, and the size to 10 and 50, but the accuracy seems to decrease.