In [1]:
# All Includes

import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf  # Version 1.0.0 (some previous versions are used in past commits)
from sklearn import metrics
from sklearn import discriminant_analysis, naive_bayes, neighbors, svm, tree

In [2]:
# Useful Constants

# Separate normalised input features for the neural network: every signal
# prefix below combined with each of the x / y / z suffixes, prefix-major.
INPUT_SIGNAL_TYPES = list(
    prefix + "_" + axis + "_"
    for prefix in ("body_acc", "body_gyro", "total_acc")
    for axis in ("x", "y", "z")
)

# Output classes the model learns to tell apart, in their original order.
LABELS = (
    "WALKING WALKING_UPSTAIRS WALKING_DOWNSTAIRS "
    "SITTING STANDING LAYING"
).split()

In [3]:
# Folder that holds the dataset files, and the sub-folder names of the two
# splits. Every value ends with "/" so the pieces can be concatenated directly.
# NOTE(review): TRAIN is not referenced by the path-building cells visible in
# this notebook - confirm whether the training files really live at the root.
DATASET_PATH = "/FileStore/tables/"
TRAIN, TEST = "train/", "test/"


# Load "X" (the neural network's training and testing inputs)

def load_X(X_signals_paths, dbfs_root="/dbfs/"):
    """Load several signal text files into one float32 array.

    Every file is expected to hold one sample per line, the values of that
    sample being separated by whitespace. All files must contain the same
    number of lines and the same number of values per line.

    Args:
        X_signals_paths: iterable of file paths, one per signal type.
        dbfs_root: prefix concatenated verbatim in front of every path before
            opening it. Defaults to "/dbfs/" (presumably the local mount of
            the Databricks file system - confirm for other environments);
            pass "" to read the paths as given.

    Returns:
        numpy.ndarray of dtype float32 and shape
        (lines per file, values per line, number of files).

    Raises:
        ValueError: if ``X_signals_paths`` is empty or a value is not numeric.
    """
    X_signals = []

    for signal_type_path in X_signals_paths:
        # The context manager closes the handle even when a row fails to parse.
        with open(dbfs_root + signal_type_path, 'r') as file:
            X_signals.append([
                # str.split() with no argument copes with any run of
                # whitespace and with leading/trailing blanks.
                np.array(row.split(), dtype=np.float32)
                for row in file
                if row.strip()  # ignore blank lines (e.g. a trailing newline)
            ])

    if not X_signals:
        raise ValueError("load_X needs at least one signal file path")

    # (file, line, value) -> (line, value, file)
    return np.transpose(np.array(X_signals), (1, 2, 0))

# One file per signal type for each split.
# NOTE(review): the training paths are built without the TRAIN sub-folder
# while the test paths include TEST - confirm this matches the file layout.
X_train_signals_paths = [
    "{}{}train.txt".format(DATASET_PATH, signal)
    for signal in INPUT_SIGNAL_TYPES
]
X_test_signals_paths = [
    "{}{}{}test.txt".format(DATASET_PATH, TEST, signal)
    for signal in INPUT_SIGNAL_TYPES
]

# Read the training inputs first, then the testing inputs.
X_train = load_X(X_train_signals_paths)
X_test = load_X(X_test_signals_paths)


In [4]:
# Load "y" (the neural network's training and testing outputs)

def load_y(y_path, dbfs_root="/dbfs"):
    """Load a label text file and shift the labels to 0-based indexing.

    The file is expected to hold whitespace-separated integers, the same
    number of them on every non-blank line.

    Args:
        y_path: path of the label file.
        dbfs_root: prefix concatenated verbatim in front of ``y_path`` before
            opening it. Defaults to "/dbfs" (presumably the local mount of
            the Databricks file system - confirm for other environments);
            pass "" to read the path as given.

    Returns:
        numpy.ndarray of dtype int32 with one row per non-blank line, every
        value being the value read from the file minus 1.

    Raises:
        ValueError: if a value in the file is not an integer.
    """
    # The context manager closes the handle even when a row fails to parse.
    with open(dbfs_root + y_path, 'r') as file:
        y_ = np.array(
            # str.split() copes with any run of whitespace; blank lines
            # (e.g. a trailing newline) are ignored.
            [row.split() for row in file if row.strip()],
            dtype=np.int32
        )

    # Subtract 1 from each output class for friendly 0-based indexing
    return y_ - 1

# Label files of the two splits (the test file sits in the TEST sub-folder).
y_train_path = "".join([DATASET_PATH, "y_train.txt"])
y_test_path = "".join([DATASET_PATH, TEST, "y_test.txt"])

# Read the training labels first, then the testing labels.
y_train = load_y(y_train_path)
y_test = load_y(y_test_path)

In [5]:
# Collapse every trailing axis of each sample into one flat feature row so the
# inputs become 2-D (samples, features) frames. The feature count is derived
# from the loaded training array instead of being hard-coded; reshaping the
# test array with the same count fails loudly if its size does not fit.
n_features = int(np.prod(X_train.shape[1:]))

xtrain_df = pd.DataFrame(X_train.reshape(len(X_train), n_features))
ytrain_df = pd.DataFrame(y_train.reshape(len(y_train), 1))
xtest_df = pd.DataFrame(X_test.reshape(len(X_test), n_features))
ytest_df = pd.DataFrame(y_test.reshape(len(y_test), 1))

In [6]:
# Give the single training-label column (default name 0) the name 'feature'.
ytrain_df = ytrain_df.rename(
    columns={0: "feature"},
)

In [7]:
# Place the training labels next to the training inputs, column-wise.
trainingData = pd.concat(objs=[xtrain_df, ytrain_df], axis=1)

In [8]:
# Column labels of the combined training frame.
cols = trainingData.columns

In [9]:
# Show the column labels as the cell output.
cols

In [10]:
# Spark ML estimators. They are not used by the scikit-learn comparison in
# this cell; the imports are left untouched in case other cells rely on them.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier

# Baseline scikit-learn classifiers. They are referenced through their
# scikit-learn sub-modules so that `tree.DecisionTreeClassifier` cannot be
# confused with the Spark class of the same name imported above.
classifiers = [
    tree.DecisionTreeClassifier(),
    neighbors.KNeighborsClassifier(7),  # 7 neighbours (there are 6 different labels)
    svm.SVC(),
    naive_bayes.GaussianNB(),
    discriminant_analysis.QuadraticDiscriminantAnalysis()
]

# scikit-learn expects 1-D label vectors, not single-column frames.
ytrain_labels = ytrain_df.values.ravel()
ytest_labels = ytest_df.values.ravel()

names = []
scores = []

for clf in classifiers:
    clf = clf.fit(xtrain_df, ytrain_labels)
    ypred = clf.predict(xtest_df)

    names.append(clf.__class__.__name__)
    # accuracy_score takes the true labels first, then the predictions.
    scores.append(metrics.accuracy_score(ytest_labels, ypred))

# One row per model, indexed by the model's class name.
score_df = pd.DataFrame({'Model': names, 'Score': scores}).set_index('Model')
score_df

In [11]:
# Bar chart of the score table, with the model names slanted on the x axis.
ax = score_df.plot(kind='bar')
ax.set_xticklabels(score_df.index, rotation=45, fontsize=10)

In [12]:
# Linear-kernel support vector classifier (C = 1) fitted on the training
# frame; the labels are flattened because scikit-learn expects a 1-D target.
svc_model_linear = svm.SVC(kernel='linear', C=1).fit(
    xtrain_df, ytrain_df.values.ravel()
)
svc_predictions = svc_model_linear.predict(xtest_df)

In [13]:
# Accuracy of the linear SVC on the test inputs and labels.
accuracy = svc_model_linear.score(X=xtest_df, y=ytest_df)
accuracy

In [14]:
# Confusion matrix of the linear SVC predictions: true labels first, then the
# predictions. `confusion_matrix` is reached through the `metrics` module
# imported at the top of the notebook.
cm = metrics.confusion_matrix(ytest_df, svc_predictions)
cm