# Exploratory Data Analysis (EDA)

## Import Libraries

In [69]:
# Data Analysis Libraries
import pandas as pd
import numpy as np

## Get Feature List

In [70]:
# Read the feature names from features.txt.
# Only the second whitespace-separated token of each line is kept (the first
# token is presumably the feature's running index -- confirm against the file).
# NOTE(review): these names are later assigned verbatim as DataFrame column
# labels; if features.txt repeats a name, the columns will not be unique.
with open('data/UCI HAR Dataset/features.txt', encoding='utf-8') as f:
    # Iterate the file lazily and skip blank lines (e.g. a trailing newline),
    # which would otherwise raise IndexError on `split()[1]`.
    feature_list = [line.split()[1] for line in f if line.strip()]

## Print Statement
print('No of Features: {}'.format(len(feature_list)))
print("Feature List:", feature_list)

No of Features: 561
Feature List: ['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X', 'tBodyAcc-max()-Y', 'tBodyAcc-max()-Z', 'tBodyAcc-min()-X', 'tBodyAcc-min()-Y', 'tBodyAcc-min()-Z', 'tBodyAcc-sma()', 'tBodyAcc-energy()-X', 'tBodyAcc-energy()-Y', 'tBodyAcc-energy()-Z', 'tBodyAcc-iqr()-X', 'tBodyAcc-iqr()-Y', 'tBodyAcc-iqr()-Z', 'tBodyAcc-entropy()-X', 'tBodyAcc-entropy()-Y', 'tBodyAcc-entropy()-Z', 'tBodyAcc-arCoeff()-X,1', 'tBodyAcc-arCoeff()-X,2', 'tBodyAcc-arCoeff()-X,3', 'tBodyAcc-arCoeff()-X,4', 'tBodyAcc-arCoeff()-Y,1', 'tBodyAcc-arCoeff()-Y,2', 'tBodyAcc-arCoeff()-Y,3', 'tBodyAcc-arCoeff()-Y,4', 'tBodyAcc-arCoeff()-Z,1', 'tBodyAcc-arCoeff()-Z,2', 'tBodyAcc-arCoeff()-Z,3', 'tBodyAcc-arCoeff()-Z,4', 'tBodyAcc-correlation()-X,Y', 'tBodyAcc-correlation()-X,Z', 'tBodyAcc-correlation()-Y,Z', 'tGravityAcc-mean()-X', 'tGravityAcc-mean

In [71]:
# Lookup table from the integer activity code to its human-readable name.
# Codes are assigned consecutively, starting at 1, in the order listed here.
ACTIVITIES = dict(enumerate(
    (
        'WALKING',
        'WALKING_UPSTAIRS',
        'WALKING_DOWNSTAIRS',
        'SITTING',
        'STANDING',
        'LAYING',
    ),
    start=1,
))

## Explore Training Data ($X_{train}$)

In [72]:
# Directory holding the training split.
BASEDIR = "data/UCI HAR Dataset/train"

# Feature matrix and label files.
FEATURES_TRAINING = BASEDIR + "/X_train.txt"
LABELS_TRAINING = BASEDIR + "/y_train.txt"

# Subject identifiers, added below as the 'Subject' column. Derived from
# BASEDIR so the three training paths cannot drift apart.
TRAIN_SUBJECTS = BASEDIR + "/subject_train.txt"

# Load the whitespace-separated feature matrix. `sep=r"\s+"` replaces
# `delim_whitespace=True`, which is deprecated since pandas 2.2.
X_train = pd.read_csv(FEATURES_TRAINING, sep=r"\s+", header=None)
# Raises ValueError if the number of names differs from the number of columns.
X_train.columns = feature_list

# Read the per-row subject ids and activity codes as Series.
subject_train = pd.read_csv(TRAIN_SUBJECTS, header=None).squeeze("columns")
Y_train = pd.read_csv(LABELS_TRAINING, names=['Activity']).squeeze("columns")

# Columns are attached by index alignment, so a row-count mismatch would
# silently introduce NaN instead of failing; check it explicitly.
assert len(subject_train) == len(X_train), "subject_train.txt / X_train.txt row count mismatch"
assert len(Y_train) == len(X_train), "y_train.txt / X_train.txt row count mismatch"

## Add Subject Information to Training Dataframe
X_train['Subject'] = subject_train

## Set up Labels: map each numeric activity code to its name via ACTIVITIES
Y_train_labels = Y_train.map(ACTIVITIES)

# Combine features, subject and labels in one frame.
# NOTE: `train` is the same object as `X_train` (no copy is made), so the two
# label columns added below also appear in `X_train`.
train = X_train
train['Activity'] = Y_train
train['ActivityName'] = Y_train_labels

## Display a few random rows of the training data
train.sample(5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Subject,Activity,ActivityName
3510,0.23377,-0.029291,-0.077472,-0.966555,-0.959627,-0.95289,-0.975119,-0.968586,-0.956875,-0.9254,...,0.048249,0.03842,0.479779,0.397166,0.38689,-0.544104,-0.43049,17,6,LAYING
3252,0.284621,-0.012627,-0.098683,-0.998387,-0.990948,-0.993396,-0.998839,-0.990971,-0.992834,-0.941191,...,-0.00411,0.047987,0.115244,0.221741,-0.662064,0.222567,0.221912,17,5,STANDING
145,0.312776,-0.050409,-0.043029,0.026285,0.168278,-0.1862,-0.074635,0.090839,-0.161996,0.34324,...,-0.23707,0.340483,0.60283,-0.561069,-0.750669,0.269015,0.040438,1,3,WALKING_DOWNSTAIRS
3398,0.142888,-0.013431,-0.161956,0.11989,-0.32428,-0.015158,0.07692,-0.335978,0.00644,0.249222,...,0.49021,-0.982243,0.963837,-0.15994,-0.747093,0.123319,0.199163,17,3,WALKING_DOWNSTAIRS
395,0.274639,-0.014627,-0.096355,-0.997169,-0.984687,-0.980618,-0.997495,-0.983633,-0.977732,-0.943189,...,0.021699,-0.389085,-0.15918,-0.250565,-0.865639,0.028543,-0.050865,3,4,SITTING


## Explore Test Data ($X_{test}$)

In [73]:
# Directory holding the test split.
BASEDIR = "data/UCI HAR Dataset/test"

# Feature matrix and label files.
FEATURES_TEST = BASEDIR + "/X_test.txt"
LABELS_TEST = BASEDIR + "/y_test.txt"

# Subject identifiers, added below as the 'Subject' column. Derived from
# BASEDIR so the three test paths cannot drift apart.
TEST_SUBJECTS = BASEDIR + "/subject_test.txt"

# Load the whitespace-separated feature matrix. `sep=r"\s+"` replaces
# `delim_whitespace=True`, which is deprecated since pandas 2.2.
X_test = pd.read_csv(FEATURES_TEST, sep=r"\s+", header=None)
# Raises ValueError if the number of names differs from the number of columns.
X_test.columns = feature_list

# Read the per-row subject ids and activity codes as Series.
subject_test = pd.read_csv(TEST_SUBJECTS, header=None).squeeze("columns")
Y_test = pd.read_csv(LABELS_TEST, names=['Activity']).squeeze("columns")

# Columns are attached by index alignment, so a row-count mismatch would
# silently introduce NaN instead of failing; check it explicitly.
assert len(subject_test) == len(X_test), "subject_test.txt / X_test.txt row count mismatch"
assert len(Y_test) == len(X_test), "y_test.txt / X_test.txt row count mismatch"

## Add Subject Information to Test Dataframe
X_test['Subject'] = subject_test

## Set up Labels: map each numeric activity code to its name via ACTIVITIES
Y_test_labels = Y_test.map(ACTIVITIES)

# Combine features, subject and labels in one frame.
# NOTE: `test` is the same object as `X_test` (no copy is made), so the two
# label columns added below also appear in `X_test`.
test = X_test
test['Activity'] = Y_test
test['ActivityName'] = Y_test_labels

## Display a few random rows of the test data
test.sample(5)

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",Subject,Activity,ActivityName
1974,0.213183,-0.030436,-0.12124,-0.362066,-0.173584,-0.528167,-0.399511,-0.167644,-0.524934,-0.122507,...,0.552251,0.380194,0.622315,-0.516672,-0.755857,0.253589,-0.049572,18,1,WALKING
2566,0.278686,-0.012324,-0.038279,-0.976325,-0.956617,-0.884714,-0.982216,-0.954334,-0.900642,-0.908367,...,0.023628,0.097454,0.035566,0.58817,-0.880105,0.179368,0.035394,24,5,STANDING
2561,0.436585,0.016542,-0.107352,0.031037,0.41408,-0.165218,-0.006197,0.498857,-0.188523,0.185132,...,-0.347441,-0.48986,-0.898488,0.599798,-0.467906,0.46366,-0.018171,20,2,WALKING_UPSTAIRS
1711,0.27403,-0.021858,-0.099103,-0.998078,-0.960056,-0.979729,-0.998172,-0.960341,-0.978718,-0.945161,...,0.00105,0.067501,0.235405,-0.824184,-0.728208,0.28504,-0.000905,13,5,STANDING
1457,0.34349,-0.059665,-0.123232,-0.001658,0.110938,-0.507173,-0.066841,0.071493,-0.498359,0.334673,...,-0.428508,-0.923095,-0.145585,-0.329352,-0.859017,0.19151,0.04712,12,1,WALKING


## Sanity Checks for Duplicates/NaN

In [74]:
## Missing-value check
# isna() flags every missing cell; summing the boolean array counts them
# across the whole frame.
missing_in_train = train.isna().to_numpy().sum()
print(f'We have {missing_in_train} NaN/Null values in train')
missing_in_test = test.isna().to_numpy().sum()
print(f'We have {missing_in_test} NaN/Null values in test')

We have 0 NaN/Null values in train
We have 0 NaN/Null values in test


In [75]:
## Duplicate Entry Checks
# duplicated() marks each row that repeats an earlier row in full; the
# vectorised .sum() counts the marks without looping over the Series in Python.
print(f'No of duplicates in train: {train.duplicated().sum()}')
print(f'No of duplicates in test : {test.duplicated().sum()}')

No of duplicates in train: 0
No of duplicates in test : 0
