Skip to content

Commit

Permalink
Excel time to numeric (#479)
Browse files Browse the repository at this point in the history
Add excel_time_to_numeric to convert many time formats to seconds

---------

Co-authored-by: Sam Firke <sfirke@users.noreply.github.com>
Co-authored-by: olivroy <52606734+olivroy@users.noreply.github.com>
  • Loading branch information
3 people committed Dec 7, 2023
1 parent bb23615 commit f16cf42
Show file tree
Hide file tree
Showing 10 changed files with 378 additions and 1 deletion.
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ S3method(clean_names,tbl_graph)
S3method(clean_names,tbl_lazy)
S3method(describe_class,default)
S3method(describe_class,factor)
S3method(excel_time_to_numeric,POSIXct)
S3method(excel_time_to_numeric,POSIXlt)
S3method(excel_time_to_numeric,character)
S3method(excel_time_to_numeric,logical)
S3method(excel_time_to_numeric,numeric)
S3method(fisher.test,default)
S3method(fisher.test,tabyl)
S3method(print,tabyl)
Expand All @@ -34,6 +39,7 @@ export(convert_to_datetime)
export(crosstab)
export(describe_class)
export(excel_numeric_to_date)
export(excel_time_to_numeric)
export(find_header)
export(fisher.test)
export(get_dupes)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ These are all minor breaking changes resulting from enhancements and are not exp

* `row_to_names()` now accepts multiple rows as input, and merges them using a new `sep` argument (#536). The default is `sep = "_"`. When handling multiple `NA` values, `row_to_names()` ignores them and only merges non-NA values for column names. When all values are `NA`, `row_to_names()` creates a column name of `"NA"`, a character, rather than `NA`.

* The new function `excel_time_to_numeric()` converts times from Excel that do not have accompanying dates into a number of seconds. (#245, thanks to **@billdenney** for the feature.)

## Bug fixes

* `adorn_totals("row")` now succeeds if the new `name` of the totals row is already a factor level of the input data.frame (#529, thanks @egozoglu for reporting).
Expand Down
1 change: 1 addition & 0 deletions R/excel_dates.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#' https://support.microsoft.com/en-us/help/2722715/support-for-the-leap-second).
#'
#' @export
#' @seealso \code{\link{excel_time_to_numeric}}
#' @examples
#' excel_numeric_to_date(40000)
#' excel_numeric_to_date(40000.5) # No time is included
Expand Down
166 changes: 166 additions & 0 deletions R/excel_time_to_numeric.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#' Convert a time from Microsoft Excel to a number of seconds
#'
#' Convert a time that may be inconsistently or inconveniently formatted from
#' Microsoft Excel to a numeric number of seconds between 0 and 86400.
#'
#' @details
#'
#' `time_value` may be one of the following formats:
#' \describe{
#' \item{numeric}{The input must be a value from 0 to 1 (exclusive of 1),
#'   interpreted as a fraction of a day; it is multiplied by 86400 to give
#'   seconds.}
#' \item{POSIXlt or POSIXct}{The time of day is extracted and converted to the
#'   number of seconds since midnight. Excel times without a date are expected
#'   to fall on 1899-12-31, but the date itself is not validated by these
#'   methods.}
#' \item{logical}{Only accepted when every value is `NA`; the result is a
#'   vector of `NA_real_` of the same length.}
#' \item{character}{Any of the following (or a mixture of the choices):
#'   \itemize{
#'   \item A character string that is a number between 0 and 1 (exclusive of
#'     1), optionally in scientific notation such as `"1.5E-3"`. This value
#'     will be converted like a numeric value.
#'   \item A character string that looks like a date-time on 1899-12-31
#'     (specifically, `"1899-12-31 "` followed by an optional time of day).
#'     The hour, minute, and optional second are read from the string.
#'   \item A character string that looks like a time. Choices are 12-hour
#'     time as hour, minute, and optionally second followed by "am" or "pm"
#'     (case insensitive) or 24-hour time when hour, minute, optionally
#'     second, and no "am" or "pm" is included.
#'   }
#' }
#' }
#'
#' `NA` values are returned as `NA`.
#'
#' @param time_value A vector of values to convert (see Details)
#' @param round_seconds Should the output number of seconds be rounded to an
#'   integer?
#' @return A numeric vector of seconds, each value >= 0 and < 86400 (or `NA`)
#' @family Date-time cleaning
#' @seealso \code{\link{excel_numeric_to_date}}
#' @examples
#' excel_time_to_numeric(0.5)
#' excel_time_to_numeric("8:00 AM")
#' excel_time_to_numeric("1899-12-31 10:15:30")
#' @export
excel_time_to_numeric <- function(time_value, round_seconds = TRUE) {
UseMethod("excel_time_to_numeric")
}

#' @export
excel_time_to_numeric.logical <- function(time_value, round_seconds = TRUE) {
  # Logical method: an all-`NA` vector is read as logical by default, so it is
  # accepted and mapped to `NA_real_`. Any `TRUE`/`FALSE` value cannot be
  # interpreted as a time and is an error. `round_seconds` is unused here and
  # exists only to match the generic's signature.
  if (all(is.na(time_value))) {
    rep(NA_real_, length(time_value))
  } else {
    stop("If given as a logical vector, all values must be `NA`")
  }
}

#' @export
excel_time_to_numeric.numeric <- function(time_value, round_seconds = TRUE) {
  # Numeric method: each value is a fraction of a day in [0, 1); `NA` is
  # allowed and passes through. Anything outside that range is an error.
  is_valid <- is.na(time_value) | (time_value >= 0 & time_value < 1)
  if (!all(is_valid)) {
    stop("When numeric, all `time_value`s must be between 0 and 1 (exclusive of 1)")
  }
  # 86400 seconds per day turns the day fraction into seconds.
  day_seconds <- time_value * 86400
  if (round_seconds) {
    round(day_seconds)
  } else {
    day_seconds
  }
}

#' @export
excel_time_to_numeric.POSIXct <- function(time_value, round_seconds = TRUE) {
  # POSIXct method: seconds elapsed since the start of the same day.
  # Truncating to the day (rather than using a fixed origin) removes timezone
  # inconsistency. Timezones aren't used in Excel.
  day_start <- trunc(time_value, units = "days")
  elapsed <- as.numeric(time_value) - as.numeric(day_start)
  out_of_range <- !(is.na(elapsed) | (elapsed >= 0 & elapsed < 86400))
  if (any(out_of_range)) {
    # This should be impossible except for leap seconds
    stop(sum(out_of_range), " `time_value`s were not at or above 0 and below 86400.") # nocov
  }
  if (round_seconds) {
    round(elapsed)
  } else {
    elapsed
  }
}

#' @export
excel_time_to_numeric.POSIXlt <- function(time_value, round_seconds = TRUE) {
  # POSIXlt method: convert to POSIXct and reuse that method's logic.
  as_ct <- as.POSIXct(time_value)
  excel_time_to_numeric.POSIXct(as_ct, round_seconds = round_seconds)
}

#' @export
excel_time_to_numeric.character <- function(time_value, round_seconds = TRUE) {
  # Character method: each element may independently be
  #   * `NA` (returned as `NA`),
  #   * a number in [0, 1) written plainly or in scientific notation (handled
  #     by the numeric method),
  #   * a 12-hour clock time with AM/PM (case insensitive),
  #   * a 24-hour clock time, or
  #   * a date-time string starting with "1899-12-31 ".
  # Any other string raises an error listing the offending values.
  ret <- rep(NA_real_, length(time_value))
  patterns <-
    list(
      number = "^0(\\.[0-9]*)?$",
      # SI numbers have to have the form [number]E-[number] because the number
      # has to be between 0 and 1 and can't be bigger than 1.
      si_number = "^[1-9](\\.[0-9]*)?E-[0-9]+$",
      "12hr" = "^([0]?[1-9]|1[0-2]):([0-5][0-9])(?::([0-5][0-9]))? ?([AP]M)$",
      "24hr" = "^([0-1]?[0-9]|2[0-3]):([0-5][0-9])(?::([0-5][0-9]))?$",
      # The ".*?" at the end of POSIX is to allow for a time zone, but it allows
      # for imperfect parsing if there were just a date and a space.
      # The entire time is optional to allow for midnight which shows as
      # just the date and time zone.
      # The leading "^" is required: without it, text before the date would
      # survive the gsub() extraction below and corrupt the captured fields.
      POSIX = "^1899-12-31 (?:([0-1]?[0-9]|2[0-3]):([0-5][0-9])(?::([0-5][0-9]))?)?.*?$"
    )
  # Extract capture group `group` of `pattern` from each element of `x`. Every
  # pattern is anchored at both ends, so the whole string is replaced by the
  # group (an empty string when an optional group did not participate).
  capture_group <- function(pattern, group, x, ignore.case = FALSE) {
    gsub(
      pattern = pattern,
      replacement = paste0("\\", group),
      x = x,
      ignore.case = ignore.case
    )
  }

  mask_na <- is.na(time_value)
  mask_number <-
    grepl(pattern = patterns[["number"]], x = time_value) |
    grepl(pattern = patterns[["si_number"]], x = time_value)
  mask_POSIX <- grepl(pattern = patterns[["POSIX"]], x = time_value)
  mask_12hr <- grepl(pattern = patterns[["12hr"]], x = time_value, ignore.case = TRUE)
  mask_24hr <- grepl(pattern = patterns[["24hr"]], x = time_value)
  unmatched <- !(mask_na | mask_number | mask_POSIX | mask_12hr | mask_24hr)
  if (any(unmatched)) {
    stop(
      "The following character strings did not match an interpretable ",
      "character format for time conversion: ",
      # collapse so the values are separated instead of being run together
      paste(unique(time_value[unmatched]), collapse = ", ")
    )
  }
  if (any(mask_number)) {
    ret[mask_number] <-
      excel_time_to_numeric.numeric(
        time_value = as.numeric(time_value[mask_number]),
        round_seconds = round_seconds
      )
  }
  mask_clock <- mask_12hr | mask_24hr | mask_POSIX
  if (any(mask_clock)) {
    # Captured fields are kept as character until the final conversion.
    hours <- minutes <- seconds <- rep(NA_character_, length(time_value))
    if (any(mask_POSIX)) {
      posix_values <- time_value[mask_POSIX]
      hours[mask_POSIX] <- capture_group(patterns[["POSIX"]], 1, posix_values)
      minutes[mask_POSIX] <- capture_group(patterns[["POSIX"]], 2, posix_values)
      seconds[mask_POSIX] <- capture_group(patterns[["POSIX"]], 3, posix_values)
    }
    if (any(mask_12hr)) {
      values_12hr <- time_value[mask_12hr]
      hours[mask_12hr] <-
        capture_group(patterns[["12hr"]], 1, values_12hr, ignore.case = TRUE)
      minutes[mask_12hr] <-
        capture_group(patterns[["12hr"]], 2, values_12hr, ignore.case = TRUE)
      seconds[mask_12hr] <-
        capture_group(patterns[["12hr"]], 3, values_12hr, ignore.case = TRUE)
      # 12 is 0 hours in the AM and the PM conversion below adds the needed 12
      # at noon.
      mask_0_hours <- mask_12hr & (hours %in% "12")
      hours[mask_0_hours] <- "0"
      mask_pm <- rep(FALSE, length(time_value))
      mask_pm[mask_12hr] <-
        tolower(
          capture_group(patterns[["12hr"]], 4, values_12hr, ignore.case = TRUE)
        ) %in% "pm"
      hours[mask_pm] <- 12 + as.numeric(hours[mask_pm])
    }
    if (any(mask_24hr)) {
      values_24hr <- time_value[mask_24hr]
      hours[mask_24hr] <- capture_group(patterns[["24hr"]], 1, values_24hr)
      minutes[mask_24hr] <- capture_group(patterns[["24hr"]], 2, values_24hr)
      seconds[mask_24hr] <- capture_group(patterns[["24hr"]], 3, values_24hr)
    }
    # Optional fields that were absent (e.g. no seconds given) count as zero.
    hours[hours %in% ""] <- "0"
    minutes[minutes %in% ""] <- "0"
    seconds[seconds %in% ""] <- "0"

    ret[mask_clock] <-
      as.numeric(hours[mask_clock]) * 3600 +
      as.numeric(minutes[mask_clock]) * 60 +
      as.numeric(seconds[mask_clock])
  }
  if (round_seconds) {
    ret <- round(ret)
  }
  ret
}
1 change: 1 addition & 0 deletions man/convert_to_date.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions man/excel_numeric_to_date.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 44 additions & 0 deletions man/excel_time_to_numeric.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/janitor-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion man/sas_numeric_to_date.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit f16cf42

Please sign in to comment.