# Exploratory Data Analysis (EDA)

## Import Libraries

In [63]:
# Data Analysis Libraries
import pandas as pd
import numpy as np

## Get Feature List

In [64]:
# Read the feature names from features.txt.
# Each non-empty line is split on whitespace and its second token is kept as
# the feature name; the first token on the line is discarded.
# NOTE(review): these names are later assigned as DataFrame column labels;
# confirm they are unique, since pandas permits duplicate labels.
with open('data/UCI HAR Dataset/features.txt', encoding='utf-8') as f:
    # Iterate the file lazily and skip blank lines so that a trailing empty
    # line cannot raise IndexError on `split()[1]`.
    feature_list = [line.split()[1] for line in f if line.strip()]

## Print Statement
print('No of Features: {}'.format(len(feature_list)))
print("Feature List:", feature_list)

No of Features: 561
Feature List: ['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X', 'tBodyAcc-max()-Y', 'tBodyAcc-max()-Z', 'tBodyAcc-min()-X', 'tBodyAcc-min()-Y', 'tBodyAcc-min()-Z', 'tBodyAcc-sma()', 'tBodyAcc-energy()-X', 'tBodyAcc-energy()-Y', 'tBodyAcc-energy()-Z', 'tBodyAcc-iqr()-X', 'tBodyAcc-iqr()-Y', 'tBodyAcc-iqr()-Z', 'tBodyAcc-entropy()-X', 'tBodyAcc-entropy()-Y', 'tBodyAcc-entropy()-Z', 'tBodyAcc-arCoeff()-X,1', 'tBodyAcc-arCoeff()-X,2', 'tBodyAcc-arCoeff()-X,3', 'tBodyAcc-arCoeff()-X,4', 'tBodyAcc-arCoeff()-Y,1', 'tBodyAcc-arCoeff()-Y,2', 'tBodyAcc-arCoeff()-Y,3', 'tBodyAcc-arCoeff()-Y,4', 'tBodyAcc-arCoeff()-Z,1', 'tBodyAcc-arCoeff()-Z,2', 'tBodyAcc-arCoeff()-Z,3', 'tBodyAcc-arCoeff()-Z,4', 'tBodyAcc-correlation()-X,Y', 'tBodyAcc-correlation()-X,Z', 'tBodyAcc-correlation()-Y,Z', 'tGravityAcc-mean()-X', 'tGravityAcc-mean

In [65]:
# Lookup table from each numeric activity code (1-6) to its readable name.
# Codes are assigned consecutively, starting at 1, in the order listed.
ACTIVITIES = dict(enumerate(
    (
        'WALKING',
        'WALKING_UPSTAIRS',
        'WALKING_DOWNSTAIRS',
        'SITTING',
        'STANDING',
        'LAYING',
    ),
    start=1,
))

## Explore Training Data ($X_{train}$)

In [66]:
#Base Directory
BASEDIR = "data/UCI HAR Dataset/train"

#Features and Labels
FEATURES_TRAINING = BASEDIR + "/X_train.txt"
LABELS_TRAINING = BASEDIR + "/y_train.txt"

#Training Subject Information (built from BASEDIR like the paths above)
TRAIN_SUBJECTS = BASEDIR + "/subject_train.txt"

#Get Training Data
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
X_train = pd.read_csv(FEATURES_TRAINING, sep = r"\s+", header = None)
X_train.columns = feature_list

## Add Subject Information to Training Dataframe
X_train['Subject'] = pd.read_csv(TRAIN_SUBJECTS, header=None).squeeze("columns")

## Set up Labels
# Y_train holds the numeric activity codes; Y_train_labels holds the matching
# names looked up in ACTIVITIES.
Y_train = pd.read_csv(LABELS_TRAINING, names = ['Activity']).squeeze("columns")
Y_train_labels = Y_train.map(ACTIVITIES)

#Combine X_train and Y_train into one frame
# NOTE: `train` is bound to the same object as X_train (no copy is made), so
# the two label columns added below also appear on X_train.
train = X_train
train['Activity'] = Y_train
train['ActivityName'] = Y_train_labels

## Display a few random rows of the training data
train.sample(5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Subject,Activity,ActivityName
5922,0.299526,-0.022923,-0.118008,-0.981942,-0.960797,-0.964511,-0.981426,-0.954479,-0.962861,-0.923241,...,-0.370835,0.194255,0.517065,-0.509891,-0.785649,0.019073,-0.115302,27,4,SITTING
3875,0.355876,-0.00701,-0.105277,-0.947398,-0.97251,-0.98402,-0.949436,-0.975052,-0.985482,-0.870228,...,0.028812,-0.855716,0.40111,0.435754,0.907195,-0.437676,-0.504009,19,6,LAYING
3627,0.264106,-0.038552,-0.098421,-0.97907,-0.87631,-0.958317,-0.984604,-0.889666,-0.961266,-0.912181,...,-0.015509,-0.230901,-0.26554,-0.09524,-0.729093,0.268088,-0.062517,19,5,STANDING
4442,0.280794,-0.014619,-0.112501,-0.993845,-0.993057,-0.991472,-0.99456,-0.992949,-0.991317,-0.936799,...,0.046933,0.559341,0.001402,0.145075,0.537677,-0.409825,-0.598759,22,6,LAYING
199,0.277963,-0.015041,-0.106399,-0.997229,-0.989634,-0.992676,-0.99724,-0.988831,-0.991881,-0.943439,...,0.158464,-0.172523,-0.605144,-0.924045,-0.675481,0.319782,0.051878,1,5,STANDING


## Explore Test Data ($X_{test}$)

In [67]:
#Base Directory
BASEDIR = "data/UCI HAR Dataset/test"

#Features and Labels
FEATURES_TEST = BASEDIR + "/X_test.txt"
LABELS_TEST = BASEDIR + "/y_test.txt"

#Test Subject Information (built from BASEDIR like the paths above)
TEST_SUBJECTS = BASEDIR + "/subject_test.txt"

#Get Test Data
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
X_test = pd.read_csv(FEATURES_TEST, sep = r"\s+", header = None)
X_test.columns = feature_list

## Add Subject Information to Test Dataframe
X_test['Subject'] = pd.read_csv(TEST_SUBJECTS, header=None).squeeze("columns")

## Set up Labels
# Y_test holds the numeric activity codes; Y_test_labels holds the matching
# names looked up in ACTIVITIES.
Y_test = pd.read_csv(LABELS_TEST, names = ['Activity']).squeeze("columns")
Y_test_labels = Y_test.map(ACTIVITIES)

#Combine X_test and Y_test into one frame
# NOTE: `test` is bound to the same object as X_test (no copy is made), so
# the two label columns added below also appear on X_test.
test = X_test
test['Activity'] = Y_test
test['ActivityName'] = Y_test_labels

## Display a few random rows of the test data
test.sample(5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Subject,Activity,ActivityName
2688,0.289094,-0.031074,-0.106702,-0.479297,-0.1333,-0.263706,-0.483602,-0.173264,-0.246997,-0.458304,...,-0.231193,0.464067,-0.678821,-0.246425,-0.840292,0.146159,0.122218,24,1,WALKING
564,0.304477,-0.017665,-0.089242,-0.32173,0.041882,-0.58054,-0.349436,0.010841,-0.573951,-0.073342,...,-0.518972,0.088687,0.667262,-0.584848,-0.898567,0.121028,-0.052488,4,1,WALKING
1731,0.278337,-0.020052,-0.111465,-0.995799,-0.986174,-0.99349,-0.995878,-0.984867,-0.992993,-0.939064,...,0.190481,0.156571,-0.513223,-0.475003,-0.610918,-0.159205,-0.070786,13,4,SITTING
1049,0.213687,0.032561,-0.10496,-0.197134,0.030564,-0.220324,-0.256309,0.07279,-0.202411,0.099647,...,0.476074,-0.820175,-0.853793,0.853178,-0.732997,0.282555,0.012282,10,2,WALKING_UPSTAIRS
130,0.410859,-0.024854,-0.148987,0.134885,0.317884,0.303422,-0.022184,0.304157,0.273941,0.656513,...,-0.923376,0.784068,0.817548,0.15829,-0.512183,0.270154,0.313362,2,3,WALKING_DOWNSTAIRS


## Sanity Checks for Duplicates/NaN

In [68]:
## Missing-value check: total number of NaN/null cells in each combined frame
train_null_total = train.isna().to_numpy().sum()
test_null_total = test.isna().to_numpy().sum()
print(f'We have {train_null_total} NaN/Null values in train')
print(f'We have {test_null_total} NaN/Null values in test')

We have 0 NaN/Null values in train
We have 0 NaN/Null values in test
