In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import time
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold

In [5]:
##Preprocessing

##Using dataset with ideal placement of sensors.
data = pd.read_csv("subject4_ideal.log",delim_whitespace=True,header=None)
#print(data)

##Check if dataframe contains NaN values, returns true if there are.
print("Does dataframe contain NaN values? {}".format(data.isnull().any().any()))

##Remove previous preprocessed file
if (os.path.exists('np.txt') == True):
    os.remove("np.txt")

##Feature selection with MEAN to reduce file size/rows. 
i = 0
rows_per_partition = 10
start = 0
end = rows_per_partition
remainder_rows = data.shape[0]%rows_per_partition
f = open("np.txt", "a")
while (i<data.shape[0]):
    
    np.savetxt(f, data[start:end].mean(axis=0).to_frame().T, fmt='%.6f', delimiter=' ')
    i+=rows_per_partition
    start += rows_per_partition
    end += rows_per_partition

##Add remainder rows
#print(data[data.shape[0] - remainder_rows :data.shape[0]])
np.savetxt(f, data[data.shape[0] - remainder_rows :data.shape[0]].mean(axis=0).to_frame().T, fmt='%.6f', delimiter=' ')


##Standardize/Normalize features (Mean = 0, S.D = 1) 
def standardise_dataset(X_train, X_test):
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.fit_transform(X_test)
    return X_train, X_test

##Dropping QUAT and MAG columns as they are not relevant to the context of our project
delete_columns = []
count = 0;
i = 2;
while i < len(data.columns):
    count += 1
    if(count==6):
        #print(i)
        for j in range(8):
            delete_columns.append(i)
            i += 1
        count = 0
        i -=1
    i += 1

##Use preprocessed txt file
data = pd.read_csv("np.txt",delim_whitespace=True,header=None)
#print(data)

X = data.drop(delete_columns, axis = 1)        
Y = data[119].round().astype(int)

#print(data[119].unique())
#print(Y.unique())

##Splitting dataset into 80% training data and 20% test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)
#print(y_train, y_test)
##Standardize features
X_train, X_test = standardise_dataset(X_train, X_test)


Does dataframe contain NaN values? False


In [6]:
def train_model(model, X_train, y_train):
    clf = model
    clf.fit(X_train, y_train)
    return clf

def eval_model(clf, X_test, y_test, X_train, y_train):
    pred_clf = clf.predict(X_test)
    acc_test = clf.score(X_test, y_test)
    acc_train = clf.score(X_train, y_train)
    
    print("Evluation for {}".format(clf))
    print("Train Accuracy: {}".format(acc_train))    
    print("Test Accuracy: {}".format(acc_test))
    #print(classification_report(y_test, pred_clf))
    #print(confusion_matrix(y_test, pred_clf))
    print(pd.crosstab(y_test, pred_clf, rownames=['True'], colnames=['Predicted'], margins=True))


start_time = time.time()
knn_clf = train_model(KNeighborsClassifier(), X_train, y_train)
eval_model(knn_clf, X_test, y_test, X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
print("")


start_time = time.time()
rf_clf = train_model(RandomForestClassifier(n_estimators = 20), X_train, y_train)
eval_model(rf_clf, X_test, y_test, X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))
print("")

# param_grid = { 
#     'n_estimators': [200, 700],
# }
# rfc = RandomForestClassifier(n_estimators=50) 
# CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid)
# CV_rfc.fit(X_train, y_train)
# print (CV_rfc.best_params_)

Evluation for KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Train Accuracy: 0.9671274443695212
Test Accuracy: 0.9582210242587601
Predicted    0   1   2   3   4   5   6   7  8   9  ...  23  24  25  26  27  \
True                                               ...                       
0          760  13   4   7   1   0   0   2  0   0  ...   0   0   1   0   0   
1            2  48   0   0   0   0   0   0  0   0  ...   0   0   0   0   0   
2            4   1  65   0   0   0   0   0  0   0  ...   0   0   0   0   0   
3            0   0   0  40   0   0   0   0  0   0  ...   0   0   0   0   0   
4            0   0   0   0  12   0   0   0  0   0  ...   0   0   0   0   0   
5            0   0   0   0   4  19   0   0  0   0  ...   0   0   0   0   0   
6            0   0   0   0   1   0  20   0  0   0  ...   0   0   0   0   0   
7            0   0   0   0  