Skip to content

Commit

Permalink
updates for Linux
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexChristensen committed Aug 28, 2020
1 parent ef12bea commit eb3042d
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 133 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: SemNetCleaner
Title: An Automated Cleaning Tool for Semantic and Linguistic Data
Version: 1.2.1
Date: 2020-06-16
Date: 2020-08-28
Authors@R: c(person("Alexander P.", "Christensen", email = "alexpaulchristensen@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9798-7037")))
Maintainer: Alexander P. Christensen <alexpaulchristensen@gmail.com>
Description: Implements several functions that automates the cleaning and spell-checking of text data. Also converges, finalizes, removes plurals and continuous strings, and puts text data in binary format for semantic network analysis. Uses the 'SemNetDictionaries' package to make the cleaning process more accurate, efficient, and reproducible.
Expand All @@ -15,4 +15,4 @@ Depends: R (>= 3.6.0), SemNetDictionaries (>= 0.1.5)
Imports: stringdist, hunspell, searcher, tcltk, foreign, readxl, R.matlab, stringi
Suggests: knitr, rmarkdown, htmlTable
VignetteBuilder: knitr
RoxygenNote: 7.1.0
RoxygenNote: 7.1.1
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
Changes in version 1.2.1

o UPDATE: `correct.changes` included within `textcleaner` to streamline code for single function preprocessing

o UPDATE: `edit` function added within the `correct.changes` function to allow more seamless interface

o UPDATE: change tracking has been added to `correct.changes` in the output (`$spellcheck$changes`)
Expand Down
260 changes: 150 additions & 110 deletions R/correct.changes.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,38 @@
#' @export
#'
# Correct changes----
# Updated 16.06.2020
# Updated 21.08.2020
# Major update: 19.04.2020
correct.changes <- function(textcleaner.obj)
{

# Set up message to user
cat(colortext("\nYou will now have a chance to correct the changes that", defaults = "message"))
cat(colortext("\nwere made during the automated spell-checking process.", defaults = "message"))
cat(colortext("\nA spreadsheet will open allowing you to manually correct", defaults = "message"))
cat(colortext("\nthese changes.\n\n", defaults = "message"))

readline("Press ENTER to continue...")

cat(colortext("\n\nThe first column of the spreadsheet corresponds to the", defaults = "message"))
cat(colortext("\nrow number provided in the output object `$spellcheck$correspondence`", defaults = "message"))
cat(colortext("\n(see ?textcleaner for more information about this output).", defaults = "message"))

cat(colortext("\n\nThe second column is the original response the participant provided", defaults = "message"))
cat(colortext(paste("\nand columns 3 through", 3 + (ncol(textcleaner.obj$spellcheck$automated) - 2),
"are the automated spell-check responses."), defaults = "message"))
cat(colortext('\nThese columns will have names formatted with "to_#".\n\n', defaults = "message"))

readline("Press ENTER to continue...")

cat(colortext(paste("\nYou should change columns 3 through", 3 + (ncol(textcleaner.obj$spellcheck$automated) - 2),
"by manually typing responses."), defaults = "message"))
cat(colortext('\nFor inappropriate responses, "NA" should be typed. When finished,', defaults = "message"))
cat(colortext('\nyou can exit this process by clicking the "X" in the top right', defaults = "message"))
cat(colortext('\ncorner of the spreadsheet.\n\n', defaults = "message"))

readline("Press ENTER to proceed with spell-check.")

# Check if textcleaner object is input
if(!class(textcleaner.obj) == "textcleaner")
{stop("A 'textcleaner' class object was not input in the 'textcleaner.obj' argument")}
Expand All @@ -102,119 +130,131 @@ correct.changes <- function(textcleaner.obj)
## Find rows that have changed
target.changes <- which(apply(automated[,-1] != changes[,-1], 1, function(x){any(x)}))

## Initialize track changes
track.changes <- list()

## Loop through changes
for(i in 1:length(target.changes))
# If there are no changes, then return original object
if(length(target.changes) == 0)
{
## Set up change matrix
chn.mat <- rbind(automated[target.changes[i],-1], changes[target.changes[i],-1])
colnames(chn.mat) <- rep("to", ncol(chn.mat))
row.names(chn.mat) <- c("Automated", "Corrected")
message("\nNo responses changed.\n")

track.changes[[automated[target.changes[i],1]]] <- chn.mat
}

res$spellcheck$changes <- track.changes

## Original is used (rather than corrected) to run through same preprocessing
## as in textcleaner (far more efficient than actually changing through each
## object in the results list)
original <- as.matrix(res$responses$original)

# Create new correspondence matrix
correspondence <- res$spellcheck$correspondence

# Get number of columns between correspondence and changes matrices to match
if(ncol(correspondence) > ncol(changes))
{
## Difference in number of columns
diff <- ncol(correspondence) - ncol(changes)
return(textcleaner.obj)

## Tack on NA columns
for(i in 1:diff)
{changes <- as.matrix(cbind(changes, rep(NA, nrow(changes))))}
}else{

}else if(ncol(correspondence) < ncol(changes))
{
## Difference in number of columns
diff <- ncol(changes) - ncol(correspondence)
## Initialize track changes
track.changes <- list()

## Loop through changes
for(i in 1:length(target.changes))
{
## Set up change matrix
chn.mat <- rbind(automated[target.changes[i],-1], changes[target.changes[i],-1])
colnames(chn.mat) <- rep("to", ncol(chn.mat))
row.names(chn.mat) <- c("Automated", "Corrected")

track.changes[[automated[target.changes[i],1]]] <- chn.mat
}

res$spellcheck$changes <- track.changes

## Original is used (rather than corrected) to run through same preprocessing
## as in textcleaner (far more efficient than actually changing through each
## object in the results list)
original <- as.matrix(res$responses$original)

# Create new correspondence matrix
correspondence <- res$spellcheck$correspondence

# Get number of columns between correspondence and changes matrices to match
if(ncol(correspondence) > ncol(changes))
{
## Difference in number of columns
diff <- ncol(correspondence) - ncol(changes)

## Tack on NA columns
for(i in 1:diff)
{changes <- as.matrix(cbind(changes, rep(NA, nrow(changes))))}

}else if(ncol(correspondence) < ncol(changes))
{
## Difference in number of columns
diff <- ncol(changes) - ncol(correspondence)

## Tack on NA columns
for(i in 1:diff)
{correspondence <- as.matrix(cbind(correspondence, rep(NA, nrow(correspondence))))}
}

# Update correspondence matrix
correspondence[row.names(res$spellcheck$automated),] <- changes
res$spellcheck$correspondence <- correspondence

# Create 'from' list
from <- as.list(correspondence[,"from"])

# Create 'to' list
to <- apply(correspondence[,grep("to", colnames(correspondence))], 1, function(x){unname(na.omit(x))})

# Create correspondence matrix (error catch)
corr.mat <- try(
correspondence.matrix(from, to),
silent = TRUE
)

if(any(class(corr.mat) == "try-error"))
{return(error.fun(corr.mat, "correspondence.matrix", "correct.changes"))}

## Update with changes made by user
res$spellcheck$automated <- changes

# Get spell-corrected data (error catch)
corrected <- try(
correct.data(original, corr.mat),
silent = TRUE
)

if(any(class(corrected) == "try-error"))
{return(error.fun(corrected, "correct.data", "correct.changes"))}

## Collect behavioral data
behavioral <- corrected$behavioral

## Make sure to replace faux "NA" with real NA
corrected$corrected[which(corrected$corrected == "NA")] <- NA

## Cleaned responses (no instrusions or perseverations)
cleaned.list <- apply(corrected$corrected, 1, function(x){unique(na.omit(x))})

max.resp <- max(unlist(lapply(cleaned.list, length)))

cleaned.matrix <- t(sapply(
lapply(cleaned.list, function(x, max.resp){
c(x, rep(NA, max.resp - length(x)))
}, max.resp = max.resp)
,rbind))

colnames(cleaned.matrix) <- paste("Response_", formatC(1:ncol(cleaned.matrix),
digits = nchar(ncol(cleaned.matrix)) - 1,
flag = "0"), sep = "")

res$responses$clean <- cleaned.matrix

# Convert to binary response matrix (error catch)
res$responses$binary <- try(
resp2bin(corrected$corrected),
silent = TRUE
)

if(any(class(res$responses$binary) == "try-error"))
{return(error.fun(res$responses$binary, "resp2bin", "correct.changes"))}

behavioral <- cbind(behavioral, rowSums(res$responses$binary))
colnames(behavioral)[3] <- "Appropriate"
res$behavioral <- as.data.frame(behavioral)

#make 'textcleaner' class
class(res) <- "textcleaner"

return(res)

## Tack on NA columns
for(i in 1:diff)
{correspondence <- as.matrix(cbind(correspondence, rep(NA, nrow(correspondence))))}
}

# Update correspondence matrix
correspondence[row.names(res$spellcheck$automated),] <- changes

# Create 'from' list
from <- as.list(correspondence[,"from"])

# Create 'to' list
to <- apply(correspondence[,grep("to", colnames(correspondence))], 1, function(x){unname(na.omit(x))})

# Create correspondence matrix (error catch)
corr.mat <- try(
correspondence.matrix(from, to),
silent = TRUE
)

if(any(class(corr.mat) == "try-error"))
{return(error.fun(corr.mat, "correspondence.matrix", "correct.changes"))}

## Update with changes made by user
res$spellcheck$automated <- changes

# Get spell-corrected data (error catch)
corrected <- try(
correct.data(original, corr.mat),
silent = TRUE
)

if(any(class(corrected) == "try-error"))
{return(error.fun(corrected, "correct.data", "correct.changes"))}

## Collect behavioral data
behavioral <- corrected$behavioral

## Make sure to replace faux "NA" with real NA
corrected$corrected[which(corrected$corrected == "NA")] <- NA
res$responses$checked <- as.data.frame(corrected$corrected, stringsAsFactors = FALSE)

## Cleaned responses (no instrusions or perseverations)
cleaned.list <- apply(corrected$corrected, 1, function(x){unique(na.omit(x))})

max.resp <- max(unlist(lapply(cleaned.list, length)))

cleaned.matrix <- t(sapply(
lapply(cleaned.list, function(x, max.resp){
c(x, rep(NA, max.resp - length(x)))
}, max.resp = max.resp)
,rbind))

colnames(cleaned.matrix) <- paste("Response_", formatC(1:ncol(cleaned.matrix),
digits = nchar(ncol(cleaned.matrix)) - 1,
flag = "0"), sep = "")

res$responses$clean <- cleaned.matrix

# Convert to binary response matrix (error catch)
res$responses$binary <- try(
resp2bin(corrected$corrected),
silent = TRUE
)

if(any(class(res$responses$binary) == "try-error"))
{return(error.fun(res$responses$binary, "resp2bin", "correct.changes"))}

behavioral <- cbind(behavioral, rowSums(res$responses$binary))
colnames(behavioral)[3] <- "Appropriate"
res$behavioral <- as.data.frame(behavioral)

#make 'textcleaner' class
class(res) <- "textcleaner"

return(res)
}
}
20 changes: 16 additions & 4 deletions R/textcleaner.R
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@
#'
#' @export
# Text Cleaner----
# Updated 19.04.2020
# Updated 21.08.2020
# Major update: 19.04.2020
textcleaner <- function(data = NULL, miss = 99,
partBY = c("row","col"),
Expand Down Expand Up @@ -174,15 +174,19 @@ textcleaner <- function(data = NULL, miss = 99,
## Obtain unique responses for efficient spell-checking
uniq.resp <- na.omit(unique(unlist(data)))

# Sort out dictionaries
if(is.null(dictionary))
{dictionary <- "general"}

# Perform spell-check
spell.check <- try(
spellcheck.dictionary(uniq.resp = uniq.resp,
dictionary = ifelse(is.null(dictionary), "general", dictionary),
dictionary = dictionary,
data = data, walkthrough = walkthrough),
silent <- TRUE
)

}else if(length(continue) == 14) # Continue spell-check
}else if(length(continue) != 3) # Continue spell-check
{spell.check <- spellcheck.dictionary(continue = continue)
}else{spell.check <- continue}

Expand Down Expand Up @@ -284,8 +288,16 @@ textcleaner <- function(data = NULL, miss = 99,
colnames(behavioral)[3] <- "Appropriate"
res$behavioral <- as.data.frame(behavioral)

#make 'textcleaner' class
# Make 'textcleaner' class
class(res) <- "textcleaner"

# Correct auto-corrections
res <- correct.changes(res)

# Let user know spell-check is complete
Sys.sleep(1)
message("\nPreprocessing complete.\n")
Sys.sleep(1)

return(res)
}
Expand Down
Loading

0 comments on commit eb3042d

Please sign in to comment.