In [8]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.neural_network as nn
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold

In [9]:
def calc_metrics(confusion):
    # Evaluate the model based on accuracy, percision, recall, and BCR
    acc_list = []
    percision_list = []
    recall_list = []
    BCR_list = []

    for i in range(len(confusion)):
        # True positives
        tp = confusion[i][i]
        # False positives
        fp = np.sum(confusion[i]) - tp
        # False negitives
        fn = np.sum(confusion[:,i]) - tp
        # True negitives
        tn = np.sum(confusion) - tp - fp - fn

        # accuracy = (tp + tn) / (tp+ tn + fp + fn)
        acc_list.append((tp + tn) / (tp+ tn + fp + fn))
        # percision = tp / (tp + fp)
        if tp + fp != 0:
            percision_list.append(tp / (tp + fp))
        else:
            percision_list.append(0)
        # recall = tp / (tp + fn)
        recall_list.append(tp / (tp + fn))
        # BCR = (percision + recall) / 2
        BCR_list.append((percision_list[-1] + recall_list[-1]) / 2)
    
    return acc_list, percision_list, recall_list, BCR_list

First we load in the data into a dataframe to look at the data.

In [10]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [11]:
df_train.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627,1,STANDING
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317,1,STANDING
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118,1,STANDING
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663,1,STANDING
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892,1,STANDING


Now that we have loaded the data, let's look at the categories in this data.

In [12]:
np.unique(list(df_train['Activity']))

array(['LAYING', 'SITTING', 'STANDING', 'WALKING', 'WALKING_DOWNSTAIRS',
       'WALKING_UPSTAIRS'], dtype='<U18')

So we have 6 categories in this dataset.

# Training on Two Classes

So let's try out our network by using just 2 categories. We will do an easy task where we will compare two categories that are vastly differnet like laying and walking. In order to do that, we will filter the data to find only data that is of someone laying or walking.

In [13]:
activity_list = ["WALKING", "LAYING"]

train = df_train[df_train.Activity.isin(activity_list)].values
test = df_test[df_test.Activity.isin(activity_list)].values

# Create the training and testing set and shuffle the data
train_X = train[:,:-2]
train_Y = train[:,-1]
train_X, train_Y = shuffle(train_X, train_Y)

test_X = test[:,:-2]
test_Y = test[:,-1]
test_X, test_Y = shuffle(test_X, test_Y)

Now that we have a training and a testing set, we will train a simple neural net with only two layer with 100 neruons in the first layer and 50 in the second layer.

In [14]:
mlp = nn.MLPClassifier(hidden_layer_sizes=(100,50)).fit(train_X, train_Y)

In [15]:
mlp.loss_

0.00010978582085550533

By printing out the training loss of the net, we can roughly see how good the net is. The loss is caluculated using the log-loss function and gives a numerical estimate of how good a classifer is. The lower the loss the better. The network has a loss of less than 0.001 which is really good, but this could mean that the network is overfitting. To check this we could use a validation set or we can try the network on the test set to see how well it predicts the testing set.

In [16]:
pred = mlp.predict(test_X)

Now that we have the predictions that our netowrk made, we can check them using a confusion matrix. A confusion matrix is a way to check how the guesses made differ from the truth values. It works by creating a CxC matrix where C is the number of categories where the diagonals of the matrix show true postivies guesses, values along a row that are not on the diagonals show how many false postivies for that class were made and values along a column that are not on the diagonals show how many false negitives were made.   

In [17]:
confusion = confusion_matrix(pred, test_Y, labels=np.unique(test_Y))
confusion

array([[537,   0],
       [  0, 496]], dtype=int64)

Based on the confusion matrix, we can see that our network can perfectly guess the difference between sitting and standing as only the diagonals are nonzero values.

# Training on One Subject and Testing on Another

Now that we know that our network can dicern between two different activities, let us try it again on just one subject to see how well it does with getting the differences between all the classes.

In [18]:
# Set up the training and testing set once more
sub1_train = df_train[df_train['subject'] == 1].values
sub2_test = df_test[df_test['subject'] == 2].values

sub1_train_X = sub1_train[:,:-2]
sub1_train_Y = sub1_train[:,-1]
sub1_train_X, sub1_train_Y = shuffle(sub1_train_X, sub1_train_Y)

sub2_test_X = sub2_test[:,:-2]
sub2_test_Y = sub2_test[:,-1]
sub2_test_X, sub2_test_Y = shuffle(sub2_test_X, sub2_test_Y)

In [19]:
mlp = nn.MLPClassifier(hidden_layer_sizes=(100,50)).fit(sub1_train_X, sub1_train_Y)

In [20]:
mlp.loss_

0.002776433545265388

The loss here is around 0.003 which is still really good, but again this could be an over fitting issue so lets again use the confusion matrix to look at the quality of our network.

In [21]:
pred = mlp.predict(sub2_test_X)

confusion = confusion_matrix(pred, sub2_test_Y, labels=np.unique(sub2_test_Y))
confusion

array([[48,  0,  0,  0,  0,  0],
       [ 0, 16,  0,  0,  0,  0],
       [ 0, 30, 54,  0,  0,  0],
       [ 0,  0,  0, 54,  2,  0],
       [ 0,  0,  0,  5, 45, 40],
       [ 0,  0,  0,  0,  0,  8]], dtype=int64)

In [22]:
np.unique(sub2_test_Y)

array(['LAYING', 'SITTING', 'STANDING', 'WALKING', 'WALKING_DOWNSTAIRS',
       'WALKING_UPSTAIRS'], dtype=object)

So the network right now is not as good at this task as it was before. The confusion matrix shows that there are some errors in the network's predictions as there are nonzero values outside of the diagonal of the matrix. So here we will try and provide a metric to decide roughly how good our netowrk is. We want to see the accuracy, percision, recall, and (BCR) of our network. Each one of these calcualtions are some combination of true positives (tp), false positives (fp), true negitives (tn), and false negitives (fn). Accuracy is (tp + tn) / (tp + tn + fn + fp), percision is tp / (tp + fp), recall is tp / (tp + fn), and BCR is (percision + recall) / 2.

In [23]:
# Evaluate the model based on accuracy, percision, recall, and BCR
acc_list, percision_list, recall_list, BCR_list = calc_metrics(confusion)

In [24]:
# Average accuracy
print (np.sum(acc_list)/len(acc_list))

# Average percision
print (np.sum(percision_list)/len(percision_list))

# Average recall
print (np.sum(recall_list)/len(recall_list))

# Average accuracy
print (np.sum(BCR_list)/len(BCR_list))

0.9150110375275938
0.8511904761904763
0.731198966570327
0.7911947213804016


We can see that the average accuracy of the network is greater than 90%, but that does not tell the whole story. In the accuracy calculation a true negitive is when the network does not guess a particular class when it shouldn't. The problem here is that if we want to know the accuracy of class 1 any time the network is given a sample from outside class 1, no matter what guess the network makes so long as the guess is not class 1, it is counted as a tn. BCR is a more accurate calculation of how well the network is doing since it only looks at tp, fp, and fn. This allows is to penalize wrong guesses when it pertains to the class. 

So now lets look at those metrics per class to see what classes the network is having difficulties with.

In [25]:
act_list = np.unique(sub2_test_Y)
for i in range(len(act_list)):
    print("Metrics for %s" % act_list[i])
    print("  Accuracy: %f" % acc_list[i])
    print("  Percision: %f" % percision_list[i])
    print("  Recall: %f" % recall_list[i])
    print("  BCR: %f" % BCR_list[i])

Metrics for LAYING
  Accuracy: 1.000000
  Percision: 1.000000
  Recall: 1.000000
  BCR: 1.000000
Metrics for SITTING
  Accuracy: 0.900662
  Percision: 1.000000
  Recall: 0.347826
  BCR: 0.673913
Metrics for STANDING
  Accuracy: 0.900662
  Percision: 0.642857
  Recall: 1.000000
  BCR: 0.821429
Metrics for WALKING
  Accuracy: 0.976821
  Percision: 0.964286
  Recall: 0.915254
  BCR: 0.939770
Metrics for WALKING_DOWNSTAIRS
  Accuracy: 0.844371
  Percision: 0.500000
  Recall: 0.957447
  BCR: 0.728723
Metrics for WALKING_UPSTAIRS
  Accuracy: 0.867550
  Percision: 1.000000
  Recall: 0.166667
  BCR: 0.583333


The network is really good at getting laying as it will never make a mistake with laying (BCR = 100%). However, the network struggles with sitting and walking upstairs, both having a BCR of less than 70%.

# Experimenting with Structure of the Neural Net

Now that we see there are some problems with our network, let's see if we can improve our network by chaning things like the activation function, number of layers, and number of neurons in each layer (learning rate, maybe try doing in pytorch as well).

In [26]:
# Set up the training and testing set once more
sub1_train = df_train[df_train['subject'] == 1].values
sub2_test = df_test[df_test['subject'] == 2].values

sub1_train_X = sub1_train[:,:-2]
sub1_train_Y = sub1_train[:,-1]
sub1_train_X, sub1_train_Y = shuffle(sub1_train_X, sub1_train_Y)

sub2_test_X = sub2_test[:,:-2]
sub2_test_Y = sub2_test[:,-1]
sub2_test_X, sub2_test_Y = shuffle(sub2_test_X, sub2_test_Y)

### Activation function

Our old network used ReLU as its activation function so let's try out two other activation functions: tanh and sigmoid. (do k-fold validation)

In [27]:
mlp_tanh = nn.MLPClassifier(hidden_layer_sizes=(100,50), activation="tanh")
mlp_sig = nn.MLPClassifier(hidden_layer_sizes=(100,50), activation='logistic', max_iter=500)
mlp_ReLU = mlp = nn.MLPClassifier(hidden_layer_sizes=(100,50))

The sigmoid network needed more iterations in order to converge, since we use early stopping for the other network this does not change the performance of the other networks.

In order to compare the networks we will use K-Fold validation to expose each network to the full set of training data. KFold validation is a way for us to compare the quality of the networks by spliting the dataset into k different groups and training each network, leaving one of the groups out of training to use as a validation set, then testing the quality of the network on the validation set and repeating until every group has been used once. We will split the training data into 3 stratified sets and hold one set out to use as a validation set. This means that we seperate the data in a way that tries to keep the proportions of each class in the data set constant per set.

In [47]:
# Create lists to store the results of each fold for each network
metrics_tanh = []
metrics_sig = []
metrics_ReLU = []

# Create the k splits
kf = StratifiedKFold(n_splits=3)
for train_idx, val_idx in kf.split(sub1_train_X, sub1_train_Y):
    # For each split seperate the training and the validation set
    train_X, val_X = sub1_train_X[train_idx], sub1_train_X[val_idx]
    train_Y, val_Y = sub1_train_Y[train_idx], sub1_train_Y[val_idx]
    
    # Train each network on the split
    mlp_tanh_train = mlp_tanh.fit(train_X, train_Y)
    mlp_sig_train = mlp_tanh.fit(train_X, train_Y)
    mlp_ReLU_train = mlp_tanh.fit(train_X, train_Y)
    
    # Evaluate each network
    pred_tanh = mlp_tanh_train.predict(val_X)
    confusion_tanh = confusion_matrix(pred_tanh, val_Y, labels=np.unique(val_Y))
    metrics_tanh.append(calc_metrics(confusion_tanh))

    pred_sig = mlp_sig_train.predict(val_X)
    confusion_sig = confusion_matrix(pred_sig, val_Y, labels=np.unique(val_Y))
    metrics_sig.append(calc_metrics(confusion_sig))
    
    pred_ReLU = mlp_ReLU_train.predict(val_X)
    confusion_ReLU = confusion_matrix(pred_ReLU, val_Y, labels=np.unique(val_Y))
    metrics_ReLU.append(calc_metrics(confusion_sig))

['WALKING_UPSTAIRS' 'WALKING_DOWNSTAIRS' 'WALKING_DOWNSTAIRS' 'STANDING'
 'WALKING_UPSTAIRS' 'WALKING' 'WALKING_UPSTAIRS' 'SITTING' 'STANDING'
 'STANDING' 'WALKING_UPSTAIRS' 'SITTING' 'LAYING' 'WALKING' 'STANDING'
 'STANDING' 'WALKING' 'STANDING' 'LAYING' 'WALKING' 'WALKING' 'STANDING'
 'WALKING' 'STANDING' 'WALKING_UPSTAIRS' 'WALKING_DOWNSTAIRS' 'WALKING'
 'WALKING_UPSTAIRS' 'WALKING' 'LAYING' 'WALKING_UPSTAIRS' 'WALKING'
 'WALKING_DOWNSTAIRS' 'LAYING' 'WALKING' 'STANDING' 'WALKING' 'SITTING'
 'WALKING_UPSTAIRS' 'STANDING' 'WALKING_UPSTAIRS' 'WALKING' 'SITTING'
 'SITTING' 'LAYING' 'WALKING_UPSTAIRS' 'WALKING_DOWNSTAIRS' 'LAYING'
 'WALKING' 'LAYING' 'STANDING' 'WALKING_UPSTAIRS' 'WALKING' 'WALKING'
 'LAYING' 'LAYING' 'WALKING_DOWNSTAIRS' 'STANDING' 'WALKING_DOWNSTAIRS'
 'LAYING' 'SITTING' 'WALKING' 'WALKING_DOWNSTAIRS' 'STANDING'
 'WALKING_DOWNSTAIRS' 'STANDING' 'WALKING_DOWNSTAIRS' 'WALKING_UPSTAIRS'
 'WALKING' 'SITTING' 'WALKING_UPSTAIRS' 'WALKING' 'STANDING' 'WALKING'
 'WALKING_UPST

In [68]:
for i in range(len(metics_tanh))

array([1.5, 1.5, 1.5, 1.5])

In [72]:
metrics_tanh[0][3]

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

Comparing the BCR for each activation function it seems that tanh is better than the rest with a BCR of 80%.

### Larger and smaller networks

Now lets change around the netowrk size to see if a larger network or a smaller network would work better. Since we know that tanh is the better activation function, we will use that in these networks.

In [24]:
# TODO: Implement k-fold validation

In [25]:
mlp_onelayer = nn.MLPClassifier(hidden_layer_sizes=(100), activation="tanh").fit(sub1_train_X, sub1_train_Y)
mlp_less_neurons = nn.MLPClassifier(hidden_layer_sizes=(50, 25), activation="tanh").fit(sub1_train_X, sub1_train_Y)
mlp_more_neurons = nn.MLPClassifier(hidden_layer_sizes=(200, 100), activation="tanh").fit(sub1_train_X, sub1_train_Y)
mlp_threelayer = nn.MLPClassifier(hidden_layer_sizes=(100, 50, 50), activation="tanh").fit(sub1_train_X, sub1_train_Y)

In [26]:
pred_onelayer = mlp_onelayer.predict(sub2_test_X)
confusion_onelayer = confusion_matrix(pred_onelayer, sub2_test_Y, labels=np.unique(sub2_test_Y))
metrics_onelayer = calc_metrics(confusion_onelayer)
print("Metrics for tanh activation function:")
for metric in metrics_onelayer:
    print (np.sum(metric)/len(metric))

Metrics for tanh activation function:
0.9205298013245033
0.8495155150981709
0.7454774385856718
0.7974964768419214


In [27]:
pred_less_neurons = mlp_less_neurons.predict(sub2_test_X)
confusion_less_neurons = confusion_matrix(pred_less_neurons, sub2_test_Y, labels=np.unique(sub2_test_Y))
metrics_less_neurons = calc_metrics(confusion_less_neurons)
print("Metrics for tanh activation function:")
for metric in metrics_less_neurons:
    print (np.sum(metric)/len(metric))

Metrics for tanh activation function:
0.9172185430463577
0.8415352763178849
0.7347588395518553
0.7881470579348702


In [28]:
pred_more_neurons = mlp_more_neurons.predict(sub2_test_X)
confusion_more_neurons = confusion_matrix(pred_more_neurons, sub2_test_Y, labels=np.unique(sub2_test_Y))
metrics_more_neurons = calc_metrics(confusion_more_neurons)
print("Metrics for tanh activation function:")
for metric in metrics_more_neurons:
    print (np.sum(metric)/len(metric))

Metrics for tanh activation function:
0.9238410596026491
0.8569775132275131
0.7557399270223044
0.8063587201249088


In [29]:
pred_threelayer = mlp_threelayer.predict(sub2_test_X)
confusion_threelayer = confusion_matrix(pred_threelayer, sub2_test_Y, labels=np.unique(sub2_test_Y))
metrics_threelayer = calc_metrics(confusion_threelayer)
print("Metrics for tanh activation function:")
for metric in metrics_threelayer:
    print (np.sum(metric)/len(metric))

Metrics for tanh activation function:
0.9183222958057394
0.8435637891520243
0.7380800955905027
0.7908219423712636


Comparing the BCR for each of the trials to the one obtained from the tanh model, we can see that increasing the number of layers imporved the network.

# Final Neural Net

In [30]:
train = df_train.values
test = df_test.values

train_X = train[:,:-2]
train_Y = train[:,-1]
train_X, train_Y = shuffle(train_X, train_Y)

test_X = test[:,:-2]
test_Y = test[:,-1]
test_X, test_Y = shuffle(test_X, test_Y)

In [32]:
layer_makeup = (100,100,100,50,50)
mlp = nn.MLPClassifier(hidden_layer_sizes=layer_makeup, activation="tanh").fit(train_X, train_Y)

In [33]:
mlp.loss_

0.015939948597319648

Here we get a training loss of about 0.03. While this is higher than before, it is still fine. This could show that the network has stopped overfitting and is performing even better than before. We will check this by checking its results on the testing set.

In [34]:
pred = mlp.predict(test_X)
confusion = confusion_matrix(pred, test_Y, labels=np.unique(test_Y))
confusion

array([[537,   0,   0,   0,   0,   0],
       [  0, 458,  39,   0,   0,   0],
       [  0,  31, 492,   0,   0,   0],
       [  0,   0,   1, 489,   4,  27],
       [  0,   0,   0,   4, 400,  14],
       [  0,   2,   0,   3,  16, 430]], dtype=int64)

In [35]:
# Evaluate the model based on accuracy, percision, recall, and BCR
acc_list, percision_list, recall_list, BCR_list = calc_metrics(confusion)

# Average accuracy
print (np.sum(acc_list)/len(acc_list))

# Average percision
print (np.sum(percision_list)/len(percision_list))

# Average recall
print (np.sum(recall_list)/len(recall_list))

# Average accuracy
print (np.sum(BCR_list)/len(BCR_list))

0.9840515778758059
0.9518683355228538
0.9514702451651931
0.9516692903440234


In [36]:
act_list = np.unique(test_Y)
for i in range(len(act_list)):
    print("Metrics for %s" % act_list[i])
    print("  Accuracy: %f" % acc_list[i])
    print("  Percision: %f" % percision_list[i])
    print("  Recall: %f" % recall_list[i])
    print("  BCR: %f" % BCR_list[i])

Metrics for LAYING
  Accuracy: 1.000000
  Percision: 1.000000
  Recall: 1.000000
  BCR: 1.000000
Metrics for SITTING
  Accuracy: 0.975568
  Percision: 0.921529
  Recall: 0.932790
  BCR: 0.927160
Metrics for STANDING
  Accuracy: 0.975908
  Percision: 0.940727
  Recall: 0.924812
  BCR: 0.932769
Metrics for WALKING
  Accuracy: 0.986766
  Percision: 0.938580
  Recall: 0.985887
  BCR: 0.962233
Metrics for WALKING_DOWNSTAIRS
  Accuracy: 0.987106
  Percision: 0.956938
  Recall: 0.952381
  BCR: 0.954659
Metrics for WALKING_UPSTAIRS
  Accuracy: 0.978962
  Percision: 0.953437
  Recall: 0.912951
  BCR: 0.933194


The average BCR here has really imporved. It is at the point where the network can correctly guess what actiivity someone is doing about 95% of the time.