# Welcome to Homework 5 (CS 498 AML)!

In [0]:
# import necessary libraries
import pandas as pd
import numpy  as np
import random
from math import ceil
from math import floor
from os import listdir
from sklearn.cluster  import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics  import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [0]:
# The Dataset is composed of the recordings of 14 simple ADL
ADL_Names = ["Brush_teeth", "Climb_stairs", "Comb_hair", "Descend_stairs", "Drink_glass", 
               "Eat_meat", "Eat_soup", "Getup_bed", "Liedown_bed", "Pour_water", 
               "Sitdown_chair", "Standup_chair", "Use_telephone", "Walk"]

# ALL INPUT in this cell
# dataIN[ADL][NUM_Matrix] stores one feature item (a nrow*3 array) for a certain ADL
# dataIN is split randomly into train and test (4:1)
# test is held out to evaluate accuracy
dataIN = []
train = []
test = []
for i,ADL in enumerate(ADL_Names):
    # create a new list for an ADL
    dataIN.append([])
    # append all the files of this ADL in the list
    prefix = 'HMP_Dataset/' + ADL + '/'
    for filename in listdir(prefix):
        if (filename == ".DS_Store"):
            continue
        dataIN[i].append(pd.read_table(prefix + filename, sep = ' ', header = None).values)
    # test-train split (80% for train, 20% for test)
    random.shuffle(dataIN[i])
    test.append(dataIN[i][0:ceil(len(dataIN[i])/5)])
    train.append(dataIN[i][ceil(len(dataIN[i])/5):])

In [0]:
# three important parameters for question (b)
FIXED_LEN  = 32;    # the size of the fixed length samples that you use (textbook value: 32)
NCLUSTER_1 = 40;    # the number of cluster centers at the first level in the hierarchical k-means (textbook value: 40)
NCLUSTER_2 = 12;    # the number of cluster centers at the second level in the hierarchical k-means (textbook value: 12)
NCLUSTER   = 480;   # the number of cluster centers for normal k-means

In [0]:
#####################################################################
#
# slice: function used to slice data into several non-overlapping
#        equal length pieces
# input:
#       @data: the data to be sliced
#       @fixed_len: length of each sliced piece
# output:
#       @segments: a 2-demension list that contains each file in its
#                  first dimension, and sliced pieces for each file 
#                  in the corresponding second dimension
#
#####################################################################
def slice(data, fixed_len, overlap_percent):
  step = floor(fixed_len * (1 -overlap_percent))
  segments = []
  # for each folder
  for i in range(len(data)):
    # for each file
    for j in range(len(data[i])):
      # perform slicing
      for x in range(fixed_len, len(data[i][j]), step):
        tmp = []
        # convert the matrix into one dimension vector
        # by joining each row
        for n in range(fixed_len):
          tmp.extend(data[i][j][x-fixed_len:x][n])
        segments.append(tmp)
  # print("Obtain ", len(segments), " pieces from slicing")
  return segments

In [0]:
#####################################################################
#
# get_label: function used to get the label of data
# input:
#       @data: the input data, whose structure is:
#              data[ADL][FileNumber]
#       @LabelNames: a list contain all the label names
# output:
#       @label: a 1-dimension list that contains the corresponding
#               label for the data
#
#####################################################################
def get_label(data, LabelNames):
  label = []
  # for each folder
  for i in range(len(data)):
    # for each file
    for j in range(len(data[i])):
      label.append(LabelNames[i])
  return label

In [0]:
#####################################################################
#
# _kmeans_classify: helper function called by make_histograms,
#                   used to classify each pieces of data into
#                   given clusters
# input:
#       @data: the input data, whose structure is:
#              data[ADL][FileNumber]
#       @fixed_len: length of each sliced piece
#       @kmeans_model: kmeans model used to apply cluster
# output:
#       @classification: a 2-demension list that contains each 
#                        file in its first dimension, 
#                        and sliced pieces classification for 
#                        each file in the corresponding 
#                        second dimension
#
##################################################################### 
def _kmeans_classify(data, fixed_len, kmeans_model):
  classification = []
  file_count = 0
  # for each folder
  for i in range(len(data)):
    # for each file
    for j in range(len(data[i])):
      file_count += 1
      file_classification = []
      # for each sliced piece
      for x in range(fixed_len, len(data[i][j]),fixed_len):
        # due to the constraint, sklearn kmeans,
        # we have to make sliced_piece into 2 dimensions
        sliced_piece = []
        tmp = []
        for n in range(fixed_len):
          tmp.extend(data[i][j][x-fixed_len:x][n])
        sliced_piece.append(tmp)
        # classify
        file_classification.extend(kmeans_model.predict(np.array(sliced_piece)))
      classification.append(file_classification)
  # print("Total number of test files is:", file_count)
  # print("Total number of prediction is:", len(classification))
  return classification

In [0]:
#####################################################################
#
# normalize: function used to normalize the given data set
# input:
#       @data: the input data, whose structure is:
#              data[file][feature]
# output:
#       @list_normalized: a 2-demension list that contains each 
#                         file in its first dimension, 
#                         and corresponding normalized features 
#                         in its second dimension
#
#####################################################################
def normalize(data):
    # for each file
    for row in range(len(data)):
        sum = 0
        # calculate the sum
        for col in range(len(data[row])):
            sum += data[row][col]
        # diviede each feature by the sum
        data[row] = list(np.array(data[row]) / sum)
    return data

In [0]:
#####################################################################
#
# make_histograms: function used to convert given data into 
#                   histograms (namely, extract equal length feature
#                   out of given data)
# input:
#       @data: the input data, whose structure is:
#              data[ADL][FileNumber]
#       @fixed_len: length of each sliced piece
#       @kmeans_model: kmeans model used to apply cluster
#       @clusterNum: total number of clusters
# output:
#       @histograms: a 2-demension list that contains each 
#                    file in its first dimension, 
#                    and corresponding features in its 
#                    second dimension 
#
##################################################################### 
def make_histograms(data, fixed_len, kmeans_model, clusterNum):
  # call _kmeans_classify to slice and classify
  prediction = _kmeans_classify(data, fixed_len, kmeans_model)
  histograms = []
  # for each file
  for fileNum in range(len(prediction)):
    tmp_histograms = [0] * clusterNum
    # count the number of each cluster and build histograms based on that
    for cluster in prediction[fileNum]:
      tmp_histograms[cluster] += 1
    histograms.append(tmp_histograms)
  return histograms

In [0]:
def plot_activity_histogram(all_data, all_label):
    # dict used to record plotted times
    plotted =dict()
    # for each file
    for row in range(len(all_data)):
        # plot each activity twice
        if all_label[row] in plotted.keys():
            if plotted[all_label[row]] < 2:
                plotted[all_label[row]] += 1
                plot_activity_histogram_helper(all_data[row], all_label[row], plotted[all_label[row]])
            else:
                continue
        else:
            plotted[all_label[row]] = 1
            plot_activity_histogram_helper(all_data[row], all_label[row], plotted[all_label[row]])
    return

def plot_activity_histogram_helper(data, label, times):
    fig = plt.figure()
    x = range(len(data))
    plt.title(label)
    plt.bar(x, data, width=3)
    filename = 'plot/'+ label + '_' + str(times) + '.png'
    plt.savefig(filename)
    plt.close('all')

In [0]:
#####################################################################
#
# classifier: function to classify based on vector quantization and k-means
# input:
#       @fixed_len: length of each sliced piece
#       @ncluster: total number of clusters
# output:
#       @confusionMatrix: the confusion matrix of the classifier
#       @accuracy: the accuracy on the held out test dataset
#
#####################################################################
def classifier(fixed_len, ncluster, overlap_percent, plot = False):
    # VECTOR QUANTIZE & PREPARE FEATURES + LABELS
    # break signals into sample segments
    segments = slice(train, fixed_len, overlap_percent)
    # normal k-means (480 cluster centers)
    kmeans = KMeans(n_clusters = ncluster, random_state = 0).fit(np.array(segments))
    # making features using histogram of cluster centers
    trainFeatures = make_histograms(train, fixed_len, kmeans, ncluster)
    testFeatures = make_histograms(test, fixed_len, kmeans, ncluster)
    # normalize the histograms to get rid of the influence caused by the length of files
    trainFeatures_normalized = normalize(trainFeatures)
    testFeatures_normalized = normalize(testFeatures)
    # get the ground truth labels for both train & test dataset
    trainLabel = get_label(train, ADL_Names)
    testLabel = get_label(test, ADL_Names)
    
    if (plot == True):
      plot_activity_histogram(trainFeatures_normalized, trainLabel)

    # CLASSIFICATION
    # Random Forest Prediction
    RandomForestModel = RandomForestClassifier(n_estimators = 100)
    RandomForestModel.fit(trainFeatures_normalized, trainLabel)
    test_pred = RandomForestModel.predict(testFeatures_normalized)

    # confusion matrix
    confusionMatrix = confusion_matrix(test_pred, testLabel)
    # accuracy of prediction
    accuracy = accuracy_score(test_pred, testLabel)
    
    return (confusionMatrix, accuracy)

In [0]:
FIXED_LEN = 10
NCLUSTER = 226
OVERLAP_PERCENT = 0.1
# evaluate the classifier & get confusion matrix + accuracy
(confusionMatrix, accuracy) = classifier(FIXED_LEN, NCLUSTER, OVERLAP_PERCENT, plot = False)
# report (a) the total error rate and (b) the class confusion matrix of the classifier
print("The total error rate is: ", 1 - accuracy)
print("The class confusion matrix is: \n", confusionMatrix)
dataframe = pd.DataFrame(confusionMatrix)
dataframe.to_csv("confusionMatrix.csv")

Obtain  39242  pieces from slicing
Total number of test files is: 666
Total number of prediction is: 666
Total number of test files is: 173
Total number of prediction is: 173
The total error rate is:  0.21387283236994215
The class confusion matrix is: 
 [[ 3  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 17  0  0  0  0  0  0  0  0  0  0  0  2]
 [ 0  0  7  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  8  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 20  0  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0 17  4  0  0  1  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 20  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  2  0 14  7  0  3]
 [ 0  0  0  0  0  0  0  3  0  0  6 13  0  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  1  0]
 [ 0  3  0  1  0  0  0  0  0  0  0  0  0 14]]


In [0]:
# Modify the number of cluster centers & size of the fixed length to optimize
max_accuracy = 0
best_fixed_len = 0
best_ncluster = 0
bset_overlap_percent = 0

# cache
input_cache = []
output_cache = []

total = ((100 - 10) / 10) * ((1000 - 200)/100) * ((0.6 - 0) / 0.1)
curr = 0

for fixed_len in range(10, 100, 10):
    for ncluster in range(200, 1000, 100):
        for overlap_percent in np.arange(0.0, 0.6, 0.1):
            curr += 1
            print("Processing ... ", (curr / total)*100, "%")
            
            curr_input = [fixed_len, ncluster, overlap_percent]
            input_cache.append(curr_input)

            count = 0
            for times in range(3):
                count += classifier(fixed_len, ncluster, overlap_percent, plot = False)[1]
            accuracy = count/3

            output_cache.append(accuracy)

            if (accuracy > max_accuracy):
                max_accuracy = accuracy
                best_fixed_len = fixed_len
                best_ncluster = ncluster
                bset_overlap_percent = overlap_percent

print(max_accuracy, " ", best_fixed_len, " ", best_ncluster, " ", bset_overlap_percent)
cache = {'input': input_cache, 'accuracy': output_cache}
df_cache = pd.DataFrame(data=cache)
df_cache.to_csv("cache.csv")

Processing ...  0.2314814814814815 %
Processing ...  0.462962962962963 %
Processing ...  0.6944444444444445 %
Processing ...  0.925925925925926 %
Processing ...  1.1574074074074074 %
Processing ...  1.388888888888889 %
Processing ...  1.6203703703703707 %


KeyboardInterrupt: ignored

LAB TEST AREA BELOW

In [0]:
print(FIXED_LEN, NCLUSTER)

NameError: ignored

In [0]:
kmeans = KMeans(n_clusters = NCLUSTER_1, random_state = 0).fit(np.array(segments))
print(kmeans.cluster_centers)

In [0]:
np.array([[0,2],[1,4]])

array([[0, 2],
       [1, 4]])

 Upload the data zip and unzip it (do it every time)

In [0]:
# unzip
import zipfile
Zipfile = zipfile.ZipFile('ADL_Dataset.zip', 'r')
Zipfile.extractall()
# remove the folder
# import shutil
# shutil.rmtree('ADL_Dataset')

In [0]:
# DRAFT
# vector quantize
# three important parameters for question (b)
FIXED_LEN = 32;     # the size of the fixed length samples that you use (textbook value: 32)
NCLUSTER_1 = 40;    # the number of cluster centers at the first level in the hierarchical k-means (textbook value: 40)
NCLUSTER_2 = 12;    # the number of cluster centers at the second level in the hierarchical k-means (textbook value: 12)

segments = []
for i in range(len(train)):
    for j in range(len(train[i])):
        # for a file (k*3 matrix), cut into a bunch of FIXED_LEN*3 matrix
        segments.extend([train[i][j][x-FIXED_LEN:x][:] for x in range(FIXED_LEN, len(train[i][j]),FIXED_LEN)])
#     print(len(segments))
# print(len(segments)) # around 11000 segments in total


# Hierarchical k-means (two-level)

In [0]:
# convert (m x n) matrix to (1 x (m*n)) vector 
# vector quantize
segments = []
for i in range(len(train)):
    for j in range(len(train[i])):
        # for a file (k*3 matrix), cut into a bunch of FIXED_LEN*3 matrix
        for x in range(FIXED_LEN, len(train[i][j]),FIXED_LEN):
            tmp = []
            for n in range(FIXED_LEN):
                tmp.extend(train[i][j][x-FIXED_LEN:x][n])
            segments.append(tmp)
# print the number of segments
print("The number of segments is: ", len(segments))