### Getting and Cleaning Data
### Course Project

In [1]:
# use the group_by, summarise and pipe (%>%) functions
library(dplyr)

# Load every input file with read.table(), supplying the column names up
# front through `col.names`. Paths are built with file.path() so the script
# is not tied to the Windows-only "\\" separator.

# Feature list: two columns, a running number and the feature name.
featuresDf <- read.table(file.path("data", "features.txt"),
                         col.names = c("No", "Features"))

# Activity list: two columns, the numeric code and the activity label.
activitiesDf <- read.table(file.path("data", "activity_labels.txt"),
                           col.names = c("No", "Activities"))

# The feature names are reused as column names for the measurement tables.
# as.character() guarantees a plain character vector even when read.table()
# returned the column as a factor (the default before R 4.0).
featureNames <- as.character(featuresDf$Features)

# Activity labels, lower-cased.
activityNames <- tolower(activitiesDf$Activities)

# Training data: measurements, activity codes and volunteer numbers.
trainMeasurementsDf <- read.table(file.path("data", "X_train.txt"),
                                  col.names = featureNames)
trainActivitiesDf   <- read.table(file.path("data", "y_train.txt"),
                                  col.names = "Activity")
trainVolunteersDf   <- read.table(file.path("data", "subject_train.txt"),
                                  col.names = "VolunteerNo")

# Testing data: same three tables, same column names.
testMeasurementsDf <- read.table(file.path("data", "X_test.txt"),
                                 col.names = featureNames)
testActivitiesDf   <- read.table(file.path("data", "y_test.txt"),
                                 col.names = "Activity")
testVolunteersDf   <- read.table(file.path("data", "subject_test.txt"),
                                 col.names = "VolunteerNo")

__Step 1 - Merge the training and test datasets into one large dataset__

In [2]:
# Stack each training table on top of its testing counterpart (row-wise), so
# that volunteers, activities and measurements each end up in a single table.
combinedVolunteersDf   <- rbind(trainVolunteersDf, testVolunteersDf)
combinedActivitiesDf   <- rbind(trainActivitiesDf, testActivitiesDf)
combinedMeasurementsDf <- rbind(trainMeasurementsDf, testMeasurementsDf)

In [3]:
# Place the three combined tables side by side (column-wise): volunteer
# number first, then the activity code, then all measurement columns.
combinedDf <- cbind(
  combinedVolunteersDf,
  combinedActivitiesDf,
  combinedMeasurementsDf
)

__Step 2 - Extract the mean and standard deviation measurements from the merged dataset__

In [4]:
# Build extractedDf from combinedDf: keep the VolunteerNo and Activity
# identifier columns plus every column whose name contains "std" or "mean".
extractedDf <- combinedDf[
  ,
  grepl('VolunteerNo|Activity|std|mean', names(combinedDf))
]

__Step 3 - Uses descriptive activity names to name the activities in the data set__

In [5]:
# Turn the integer "Activity" codes of extractedDf into a labelled factor.
# Each code is matched to its label explicitly through activitiesDf$No, so the
# result does not depend on the row order of the activity label file or on
# every code actually occurring in the data.
extractedDf$Activity <- factor(extractedDf$Activity,
                               levels = activitiesDf$No,
                               labels = activityNames)

__Step 4 - Appropriately labels the data set with descriptive variable names__

In [6]:
# Make the variable names more descriptive and readable:
#   * a leading "t" becomes "time" and a leading "f" becomes "freq"
#   * the first "mean" / "std" in each name is capitalised ("Mean" / "Std")
#   * the cryptic characters - | ( ) . are stripped from the names
# Every substitution below is applied to the whole vector of names in turn.
featureNames <- names(extractedDf)
featureNames <- sub('^[t]', 'time', featureNames)
featureNames <- sub('^[f]', 'freq', featureNames)
featureNames <- sub('mean', 'Mean', featureNames)
featureNames <- sub('std', 'Std', featureNames)
featureNames <- gsub('[-|().]', '', featureNames)

In [7]:
# Install the cleaned-up names as the column names of extractedDf
colnames(extractedDf) <- featureNames

In [8]:
# Write the labelled dataset to data/tidyDataset1.txt without row names.
# file.path() keeps the path portable instead of hard-coding "\\".
write.table(extractedDf,
            file = file.path("data", "tidyDataset1.txt"),
            row.names = FALSE)

__Step 5 - From the data set in step 4, creates a second, independent tidy data set with the average of each variable for each activity and each subject.__

In [9]:
# Average every remaining column for each (VolunteerNo, Activity) pair.
# summarise_all() with a named list appends "_mean" to each summarised
# column name. ungroup() removes the grouping that is still attached after
# summarising, so tidy2Df is a plain, ungrouped table.
tidy2Df <- extractedDf %>%
  group_by(VolunteerNo, Activity) %>%
  summarise_all(list(mean = 'mean')) %>%
  ungroup()

# Write the summary to data/tidyDataset2.txt without row names.
# file.path() keeps the path portable instead of hard-coding "\\".
write.table(tidy2Df,
            file = file.path("data", "tidyDataset2.txt"),
            row.names = FALSE)

In [10]:
# Read the summary file back in, treating its first line as column names.
# The path is relative, like every other read/write in this script, so no
# setwd() is needed; a hard-coded absolute working directory would only make
# the script fail on any other machine.
df <- read.table(file.path("data", "tidyDataset2.txt"), header = TRUE)

In [11]:
# Evaluate the re-loaded data frame so the notebook displays its contents
df

VolunteerNo,Activity,timeBodyAccMeanX_mean,timeBodyAccMeanY_mean,timeBodyAccMeanZ_mean,timeBodyAccStdX_mean,timeBodyAccStdY_mean,timeBodyAccStdZ_mean,timeGravityAccMeanX_mean,timeGravityAccMeanY_mean,...,freqBodyAccMagMeanFreq_mean,freqBodyBodyAccJerkMagMean_mean,freqBodyBodyAccJerkMagStd_mean,freqBodyBodyAccJerkMagMeanFreq_mean,freqBodyBodyGyroMagMean_mean,freqBodyBodyGyroMagStd_mean,freqBodyBodyGyroMagMeanFreq_mean,freqBodyBodyGyroJerkMagMean_mean,freqBodyBodyGyroJerkMagStd_mean,freqBodyBodyGyroJerkMagMeanFreq_mean
<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,walking,0.2773308,-0.017383819,-0.11114810,-0.28374026,0.114461337,-0.26002790,0.9352232,-0.28216502,...,0.190643724,-0.057119400,-0.10349240,0.093822181,-0.1992526,-0.3210180,0.2688443675,-0.3193086,-0.3816019,0.190663449
1,walking_upstairs,0.2554617,-0.023953149,-0.09730200,-0.35470803,-0.002320265,-0.01947924,0.8933511,-0.36215336,...,-0.097743350,-0.442652162,-0.53305985,0.085352409,-0.3259615,-0.1829855,-0.2193033761,-0.6346651,-0.6939305,0.114277342
1,walking_downstairs,0.2891883,-0.009918505,-0.10756619,0.03003534,-0.031935943,-0.23043421,0.9318744,-0.26661034,...,0.119187143,0.026218495,-0.10405226,0.076491547,-0.1857203,-0.3983504,0.3496138955,-0.2819634,-0.3919199,0.190000706
1,sitting,0.2612376,-0.001308288,-0.10454418,-0.97722901,-0.922618642,-0.93958629,0.8315099,0.20441159,...,0.236655012,-0.985262127,-0.98160618,0.351852202,-0.9584356,-0.9321984,-0.0002621867,-0.9897975,-0.9870496,0.184775928
1,standing,0.2789176,-0.016137590,-0.11060182,-0.99575990,-0.973190056,-0.97977588,0.9429520,-0.27298383,...,0.284555291,-0.992542478,-0.99253600,0.422220102,-0.9846176,-0.9784661,-0.0286057725,-0.9948154,-0.9946711,0.334498734
1,laying,0.2215982,-0.040513953,-0.11320355,-0.92805647,-0.836827406,-0.82606140,-0.2488818,0.70554977,...,0.086408563,-0.933300361,-0.92180398,0.266391154,-0.8621902,-0.8243194,-0.1397750127,-0.9423669,-0.9326607,0.176485907
2,walking,0.2764266,-0.018594920,-0.10550036,-0.42364284,-0.078091253,-0.42525752,0.9130173,-0.34660709,...,0.393206208,-0.169064353,-0.16409197,0.207500927,-0.5307048,-0.6517928,0.3052838253,-0.5832493,-0.5581046,0.126344614
2,walking_upstairs,0.2471648,-0.021412113,-0.15251390,-0.30437641,0.108027280,-0.11212102,0.7907174,-0.41621489,...,0.107680395,-0.189511137,-0.26042384,-0.011916823,-0.4506122,-0.4386204,-0.0227527690,-0.6007985,-0.6218202,0.046743399
2,walking_downstairs,0.2776153,-0.022661416,-0.11681294,0.04636668,0.262881789,-0.10283791,0.8618313,-0.32578010,...,0.093832130,0.222247410,0.22748073,0.018835511,-0.3208385,-0.3725768,0.1039712417,-0.3801753,-0.3436990,-0.051247964
2,sitting,0.2770874,-0.015687994,-0.10921827,-0.98682228,-0.950704499,-0.95982817,0.9404773,-0.10563002,...,0.130934203,-0.983874699,-0.98412419,0.280062424,-0.9718406,-0.9613857,-0.1063589092,-0.9898620,-0.9896329,0.194722149
