Skip to content

Commit

Permalink
Bug fix for punctuation with no surrounding spaces; closes #6
Browse files: browse the repository at this point in the history
  • Loading branch information
ChrisMuir committed Mar 21, 2018
1 parent c52e4df commit ffc50b4
Showing 1 changed file with 5 additions and 7 deletions.
12 changes: 5 additions & 7 deletions R/get_fingerprint.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#' @noRd
get_fingerprint_KC <- function(vect, bus_suffix = TRUE,
ignore_strings = NULL) {
vect <- gsub("[[:punct:]]", " ", vect, perl = TRUE)
if (bus_suffix) {
vect <- business_suffix(tolower(vect))
if (!is.null(ignore_strings)) {
Expand All @@ -16,8 +17,6 @@ get_fingerprint_KC <- function(vect, bus_suffix = TRUE,
} else {
vect <- tolower(vect)
}
# Perform initial transformations.
vect <- gsub("[[:punct:]]", "", vect, perl = TRUE)
vect <- strsplit(cpp_trimws_left(vect), "\\s+", perl = TRUE)
# If "ignore_strings" is not NULL, for each element of list "vect", remove
# any string that has a match within vector "ignore_strings".
Expand All @@ -35,6 +34,7 @@ get_fingerprint_KC <- function(vect, bus_suffix = TRUE,
#'@noRd
get_fingerprint_ngram <- function(vect, numgram = 2, bus_suffix = TRUE,
ignore_strings = NULL) {
vect <- gsub("[[:punct:]]", " ", vect, perl = TRUE)
# Compile variable ignore_strings.
if (bus_suffix) {
if (!is.null(ignore_strings)) {
Expand All @@ -49,18 +49,16 @@ get_fingerprint_ngram <- function(vect, numgram = 2, bus_suffix = TRUE,

if (!is.null(ignore_strings)) {
# Initial transformations given "ignore_strings" is not NULL.
#
# Use values in "ignore_strings" to create a regex of substrings to
# eliminate from each element of "vect" (also remove all punctuation
# and spaces).
# eliminate from each element of "vect" (also remove all spaces).
regex <- paste0("\\b(",
paste(ignore_strings, collapse = "|"),
")\\b|[[:punct:]]|\\s")
")\\b|\\s")
vect <- business_suffix(tolower(vect))
vect <- gsub(regex, "", vect, perl = TRUE)
} else {
# Initial transformations given "ignore_strings" is NULL.
gsub("[[:punct:]]|\\s", "", tolower(vect), perl = TRUE)
gsub("\\s", "", tolower(vect), perl = TRUE)
}

# Rest of the transformations. For each value in vect: get ngrams, filter by
Expand Down

0 comments on commit ffc50b4

Please sign in to comment.