# Exploratory Data Analysis[EDA]

## Import Libraries

In [26]:
import pandas as pd
import numpy as np

## Get Feature List

In [27]:
## Get Features
feature_list = list()

## Open features.txt and get feature names
with open('data/UCI HAR Dataset/features.txt') as f:
    feature_list = [line.split()[1] for line in f.readlines()]
    
## Print Statement
print('No of Features: {}'.format(len(feature_list)))
print("Feature List:", feature_list)

No of Features: 561
Feature List: ['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X', 'tBodyAcc-max()-Y', 'tBodyAcc-max()-Z', 'tBodyAcc-min()-X', 'tBodyAcc-min()-Y', 'tBodyAcc-min()-Z', 'tBodyAcc-sma()', 'tBodyAcc-energy()-X', 'tBodyAcc-energy()-Y', 'tBodyAcc-energy()-Z', 'tBodyAcc-iqr()-X', 'tBodyAcc-iqr()-Y', 'tBodyAcc-iqr()-Z', 'tBodyAcc-entropy()-X', 'tBodyAcc-entropy()-Y', 'tBodyAcc-entropy()-Z', 'tBodyAcc-arCoeff()-X,1', 'tBodyAcc-arCoeff()-X,2', 'tBodyAcc-arCoeff()-X,3', 'tBodyAcc-arCoeff()-X,4', 'tBodyAcc-arCoeff()-Y,1', 'tBodyAcc-arCoeff()-Y,2', 'tBodyAcc-arCoeff()-Y,3', 'tBodyAcc-arCoeff()-Y,4', 'tBodyAcc-arCoeff()-Z,1', 'tBodyAcc-arCoeff()-Z,2', 'tBodyAcc-arCoeff()-Z,3', 'tBodyAcc-arCoeff()-Z,4', 'tBodyAcc-correlation()-X,Y', 'tBodyAcc-correlation()-X,Z', 'tBodyAcc-correlation()-Y,Z', 'tGravityAcc-mean()-X', 'tGravityAcc-mean

In [28]:
#Numerically Encode the Activities
ACTIVITIES = {
    0: 'WALKING',
    1: 'WALKING_UPSTAIRS',
    2: 'WALKING_DOWNSTAIRS',
    3: 'SITTING',
    4: 'STANDING',
    5: 'LAYING',
}

## Explore Training Data[$X_{train}$]

In [29]:
#Base Directory
BASEDIR = "data/UCI HAR Dataset/train"

#Features and Labels
FEATURES = BASEDIR + "/X_train.txt"
LABELS = BASEDIR + "/y_train.txt"

#Training Subject Information
TEST_SUBJECTS = 'data/UCI HAR Dataset/train/subject_train.txt'

#Get Training Data
X_test = pd.read_csv(FEATURES, delim_whitespace = True, header = None)
X_test.columns = feature_list

## Add Subject Information to Training Dataframe
X_test['Subject'] = pd.read_csv(TEST_SUBJECTS, header=None).squeeze("columns")

## Set up Labels
Y_test = pd.read_csv(LABELS, names = ['Activity']).squeeze("columns")
Y_test_labels = Y_test.map(ACTIVITIES)

#Concatenate X_train and Y_train to one matrix
test = X_test
test['Activity'] = Y_test
test['ActivityName'] = Y_test_labels

## Plot a few samples of the training data!
test.sample(5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Subject,Activity,ActivityName
6552,0.295794,-0.014411,-0.113734,-0.955005,-0.968259,-0.977753,-0.957509,-0.965047,-0.975066,-0.891025,...,0.105827,-0.145812,0.172074,0.009528,0.562766,-0.286009,-0.728898,28,6,
5082,0.278372,-0.018324,-0.109368,-0.996676,-0.964301,-0.985914,-0.996683,-0.961909,-0.983512,-0.941514,...,-0.120295,0.037329,-0.108622,0.216319,-0.660024,0.200909,-0.201196,25,5,LAYING
4511,0.313266,-0.026318,-0.080982,0.074007,0.279732,-0.092542,0.071571,0.23353,-0.076896,0.01217,...,-0.460265,-0.788172,-0.758045,0.025444,-0.7055,0.30157,0.022229,22,2,WALKING_DOWNSTAIRS
4120,0.228344,-0.01285,-0.064602,-0.378818,0.101766,-0.20496,-0.398856,0.1078,-0.178296,-0.258528,...,0.552682,-0.778303,0.635419,-0.328803,-0.555368,0.350901,0.191569,21,1,WALKING_UPSTAIRS
672,0.12804,-0.17149,-0.201265,-0.317705,-0.076935,-0.303066,-0.35845,-0.114142,-0.291694,-0.198286,...,0.023843,-0.902173,-0.977837,0.467573,-0.678191,0.286501,0.138094,3,2,WALKING_DOWNSTAIRS


## Explore Test Data[$X_{test}$]

In [30]:
#Base Directory
BASEDIR = "data/UCI HAR Dataset/test"

#Features and Labels
FEATURES = BASEDIR + "/X_test.txt"
LABELS = BASEDIR + "/y_test.txt"

#Training Subject Information
TEST_SUBJECTS = 'data/UCI HAR Dataset/test/subject_test.txt'

#Get Training Data
X_test = pd.read_csv(FEATURES, delim_whitespace = True, header = None)
X_test.columns = feature_list

## Add Subject Information to Training Dataframe
X_test['Subject'] = pd.read_csv(TEST_SUBJECTS, header=None).squeeze("columns")

## Set up Labels
Y_test = pd.read_csv(LABELS, names = ['Activity']).squeeze("columns")
Y_test_labels = Y_test.map(ACTIVITIES)

#Concatenate X_test and Y_test to one matrix
test = X_test
test['Activity'] = Y_test
test['ActivityName'] = Y_test_labels

## Plot a few samples of the test data!
test.sample(5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Subject,Activity,ActivityName
1462,0.236398,-0.046664,-0.06749,-0.024004,-0.063897,-0.532306,-0.087457,-0.094151,-0.529782,0.375966,...,0.210497,0.982068,-0.305156,-0.425433,-0.8801,0.179049,0.036952,12,1,WALKING_UPSTAIRS
1998,0.226075,-0.014891,-0.2047,-0.432732,-0.360538,-0.381336,-0.50272,-0.364823,-0.386959,-0.146339,...,0.263353,-0.03363,0.932111,-0.017459,-0.704877,0.293349,-0.042078,18,2,WALKING_DOWNSTAIRS
529,0.2578,-0.016095,-0.106748,-0.987424,-0.980003,-0.98759,-0.987443,-0.980904,-0.988268,-0.937776,...,-0.051061,0.248941,0.532633,-0.687609,0.529496,-0.882329,-0.101303,4,6,
1208,0.278206,-0.016704,-0.110684,-0.995716,-0.988089,-0.981073,-0.996081,-0.988566,-0.980986,-0.940708,...,-0.104216,-0.135222,-0.660916,-0.340332,-0.863651,0.184778,0.056727,12,5,LAYING
1167,0.088293,-0.026905,-0.214148,0.277034,0.068286,-0.187604,0.224726,0.076189,-0.222217,0.571543,...,0.588381,-0.007652,0.94836,-0.165366,-0.90401,0.045601,-0.029669,10,3,SITTING
