Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split functions and other smaller improvements for 1.6.3 #53

Merged
merged 23 commits into from
Jul 25, 2015
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ Description: Provides easier interaction with
returns an R data frame.
Converts dates to 'POSIX' format.
Manages throttling by 'Socrata'.
Version: 1.6.2-10
Date: 2015-7-12
Version: 1.6.3
Date: 2015-07-15
Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc
Maintainer: "Tom Schenk Jr." <developers@cityofchicago.org>
Depends:
Expand Down
149 changes: 41 additions & 108 deletions R/RSocrata.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,36 +7,6 @@
# library('jsonlite') # for parsing data types from Socrata
# library('mime') # for guessing mime type

#' Time-stamped message
#'
#' Issue a time-stamped, origin-stamped log message.
#' @param s - a string
#' @return None (invisible NULL) as per cat
#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org}
#' @noRd
logMsg <- function(s) {
cat(format(Sys.time(), "%Y-%m-%d %H:%M:%OS3 "), as.character(sys.call(-1))[1], ": ", s, '\n', sep='')
}

#' Checks the validity of the syntax for a potential Socrata dataset Unique Identifier, also known as a 4x4.
#'
#' Will check the validity of a potential dataset unique identifier
#' supported by Socrata. It will provide an exception if the syntax
#' does not align to Socrata unique identifiers. It only checks for
#' the validity of the syntax, but does not check if it actually exists.
#' @param fourByFour - a string; character vector of length one
#' @return TRUE if is valid Socrata unique identifier, FALSE otherwise
#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org}
#' @export
isFourByFour <- function(fourByFour) {
fourByFour <- as.character(fourByFour)
if(nchar(fourByFour) != 9)
return(FALSE)
if(regexpr("[[:alnum:]]{4}-[[:alnum:]]{4}", fourByFour) == -1)
return(FALSE)
TRUE
}

#' Convert, if necessary, URL to valid REST API URL supported by Socrata.
#'
#' Will convert a human-readable URL to a valid REST API call
Expand All @@ -55,64 +25,43 @@ isFourByFour <- function(fourByFour) {
validateUrl <- function(url, app_token) {
url <- as.character(url)
parsedUrl <- httr::parse_url(url)
if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname) | is.null(parsedUrl$path))

if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname) | is.null(parsedUrl$path)) {
stop(url, " does not appear to be a valid URL.")
}

if(!is.null(app_token)) { # Handles the addition of API token and resolves invalid uses

if(is.null(parsedUrl$query[["$$app_token"]])) {
token_inclusion <- "valid_use"
} else {
token_inclusion <- "already_included" }
token_inclusion <- "already_included"
}

switch(token_inclusion,
"already_included"={ # Token already included in url argument
"already_included" = { # Token already included in url argument
warning(url, " already contains an API token in url. Ignoring user-defined token.")
},
"valid_use"={ # app_token argument is used, not duplicative.
"valid_use" = { # app_token argument is used, not duplicative.
parsedUrl$query[["app_token"]] <- as.character(paste("%24%24app_token=", app_token, sep=""))
})
}
)

}

if(substr(parsedUrl$path, 1, 9) == 'resource/') {
return(httr::build_url(parsedUrl)) # resource url already
}
fourByFour <- basename(parsedUrl$path)
if(!isFourByFour(fourByFour))

fourByFour <- basename(parsedUrl$path)
if(!isFourByFour(fourByFour)) {
stop(fourByFour, " is not a valid Socrata dataset unique identifier.")
else {
} else {
parsedUrl$path <- paste('resource/', fourByFour, '.csv', sep="")
httr::build_url(parsedUrl)
}
}

#' Convert Socrata human-readable column name to field name
#'
#' Convert Socrata human-readable column name,
#' as it might appear in the first row of data,
#' to field name as it might appear in the HTTP header;
#' that is, lower case, periods replaced with underscores#'
#' @param humanName - a Socrata human-readable column name
#' @return Socrata field name
#' @export
#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
#' @examples
#' fieldName("Number.of.Stations") # number_of_stations
fieldName <- function(humanName) {
tolower(gsub('\\.', '_', as.character(humanName)))
}

#' Convert Socrata calendar_date string to POSIX
#'
#' @param x - character vector in one of two Socrata calendar_date formats
#' @return a POSIX date
#' @export
#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
posixify <- function(x) {
x <- as.character(x)
if (length(x)==0) return(x)
# Two calendar date formats supplied by Socrata
if(any(regexpr("^[[:digit:]]{1,2}/[[:digit:]]{1,2}/[[:digit:]]{4}$", x[1])[1] == 1))
strptime(x, format="%m/%d/%Y") # short date format
else
strptime(x, format="%m/%d/%Y %I:%M:%S %p") # long date-time format
}

#' Wrap httr GET in some diagnostics
#'
Expand All @@ -125,15 +74,11 @@ posixify <- function(x) {
#' @noRd
getResponse <- function(url) {
response <- httr::GET(url)
# status <- httr::http_status(response)

if(response$status_code != 200) {
msg <- paste("Error in httr GET:", response$status_code, response$headers$statusmessage, url)
if(!is.null(response$headers$`content-length`) && (response$headers$`content-length` > 0)) {
details <- httr::content(response)
msg <- paste(msg, details$code[1], details$message[1])
}
logMsg(msg)
stop("Error in httr GET:", response$status_code, " during the request for ", response$url)
}

httr::stop_for_status(response)
return(response)
}
Expand All @@ -149,18 +94,25 @@ getResponse <- function(url) {
#' @noRd
getContentAsDataFrame <- function(response) { UseMethod('response') }
getContentAsDataFrame <- function(response) {

mimeType <- response$header$'content-type'

# skip optional parameters
sep <- regexpr(';', mimeType)[1]
if(sep != -1) mimeType <- substr(mimeType, 0, sep[1] - 1)

if(sep != -1) {
mimeType <- substr(mimeType, 0, sep[1] - 1)
}

switch(mimeType,
'text/csv' =
content(response), # automatic parsing
httr::content(response), # automatic parsing
'application/json' =
if(content(response, as='text') == "[ ]") # empty json?
if(httr::content(response, as ='text') == "[ ]") { # empty json?
data.frame() # empty data frame
else
data.frame(t(sapply(content(response), unlist)), stringsAsFactors=FALSE)
} else {
data.frame(t(sapply(httr::content(response), unlist)), stringsAsFactors = FALSE)
}
) # end switch
}

Expand All @@ -172,7 +124,6 @@ getContentAsDataFrame <- function(response) {
#' @return a named vector mapping field names to data types
#' @importFrom jsonlite fromJSON
#' @noRd
getSodaTypes <- function(response) { UseMethod('response') }
getSodaTypes <- function(response) {
result <- jsonlite::fromJSON(response$headers[['x-soda2-types']])
names(result) <- jsonlite::fromJSON(response$headers[['x-soda2-fields']])
Expand Down Expand Up @@ -202,46 +153,28 @@ read.socrata <- function(url, app_token = NULL) {
validUrl <- validateUrl(url, app_token) # check url syntax, allow human-readable Socrata url
parsedUrl <- httr::parse_url(validUrl)
mimeType <- mime::guess_type(parsedUrl$path)
if(!(mimeType %in% c('text/csv','application/json')))

if(!(mimeType %in% c('text/csv','application/json'))) {
stop("Error in read.socrata: ", mimeType, " not a supported data format.")
}

response <- getResponse(validUrl)
page <- getContentAsDataFrame(response)
result <- page
dataTypes <- getSodaTypes(response)

while (nrow(page) > 0) { # more to come maybe?
query <- paste(validUrl, if(is.null(parsedUrl$query)) {'?'} else {"&"}, '$offset=', nrow(result), sep='')
query <- paste0(validUrl, ifelse(is.null(parsedUrl$query), '?', "&"), '$offset=', nrow(result))
response <- getResponse(query)
page <- getContentAsDataFrame(response)
result <- rbind(result, page) # accumulate
}

# convert Socrata calendar dates to posix format
for(columnName in colnames(page)[!is.na(dataTypes[fieldName(colnames(page))]) & dataTypes[fieldName(colnames(page))] == 'calendar_date']) {
result[[columnName]] <- posixify(result[[columnName]])
}

return(result)
}

#' List datasets available from a Socrata domain
#'
#' @param url - A Socrata URL. This simply points to the site root.
#' @return an R data frame containing a listing of datasets along with
#' various metadata.
#' @author Peter Schmiedeskamp \email{pschmied@@uw.edu}
#' @examples
#' df <- ls.socrata("http://soda.demo.socrata.com")
#' @importFrom jsonlite fromJSON
#' @importFrom httr parse_url
#' @export
ls.socrata <- function(url) {
url <- as.character(url)
parsedUrl <- httr::parse_url(url)
if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname))
stop(url, " does not appear to be a valid URL.")
parsedUrl$path <- "data.json"
df <- jsonlite::fromJSON(httr::build_url(parsedUrl))
df <- as.data.frame(df$dataset)
df$issued <- as.POSIXct(df$issued)
df$modified <- as.POSIXct(df$modified)
df$theme <- as.character(df$theme)
return(df)
}
29 changes: 29 additions & 0 deletions R/fourByFour.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#' Checks the validity of the syntax for a potential Socrata dataset Unique Identifier, also known as a 4x4.
#'
#' @description Will check the validity of a potential dataset unique identifier
#' supported by Socrata. It will provide an exception if the syntax
#' does not align to Socrata unique identifiers. It only checks for
#' the validity of the syntax, but does not check if it actually exists.
#'
#' @param fourByFour - a string; character vector of length one
#' @return TRUE if is valid Socrata unique identifier, FALSE otherwise
#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org}
#' @examples
#' isFourByFour(fourByFour = "4334-bgaj")
#' isFourByFour("433-bgaj")
#' isFourByFour(fourByFour = "4334-!gaj")
#'
#' @export
isFourByFour <- function(fourByFour = "") {

if (nchar(fourByFour) == 9) {
if(identical(grepl("[[:alnum:]]{4}-[[:alnum:]]{4}", fourByFour), TRUE)) {
return(TRUE)
} else {
return(FALSE)
}
} else {
return(FALSE)
}

}
32 changes: 32 additions & 0 deletions R/listDatasets.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#' List datasets available from a Socrata domain
#'
#' @param url - A Socrata URL. This simply points to the site root.
#' @return an R data frame containing a listing of datasets along with
#' various metadata.
#' @author Peter Schmiedeskamp \email{pschmied@@uw.edu}
#' @note URLs such as \code{"soda.demo.socrata.com"} are not supported
#' @examples
#' df <- ls.socrata(url = "http://soda.demo.socrata.com")
#' ## df.ny <- ls.socrata("https://data.ny.gov/")
#'
#' @importFrom jsonlite fromJSON
#' @importFrom httr parse_url build_url
#'
#' @export
ls.socrata <- function(url = "") {

parsedUrl <- httr::parse_url(url)

if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname)) {
stop(url, " does not appear to be a valid URL.")
}
parsedUrl$path <- "data.json"

df <- jsonlite::fromJSON(httr::build_url(parsedUrl))
df <- df$dataset
df$issued <- as.POSIXct(df$issued)
df$modified <- as.POSIXct(df$modified)
df$theme <- as.character(df$theme)

return(df)
}
52 changes: 52 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#' Convert Socrata human-readable column name to field name
#'
#' @description Convert Socrata human-readable column name,
#' as it might appear in the first row of data,
#' to field name as it might appear in the HTTP header;
#' that is, lower case, periods replaced with underscores
#'
#' @param humanName - a Socrata human-readable column name
#' @return Socrata field name
#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
#' @examples
#' fieldName("Number.of.Stations") # number_of_stations
#'
#' @export
fieldName <- function(humanName) {
tolower(gsub('\\.', '_', as.character(humanName)))
}

#' Convert Socrata calendar_date string to POSIX
#'
#' @description Datasets will either specify what timezone they should be interpreted in,
#' or you can usually assume they’re in the timezone of the publisher. See examples below too.
#'
#' @seealso \url{http://dev.socrata.com/docs/datatypes/floating_timestamp.html}
#' @param x - character vector in one of two Socrata calendar_date formats
#' @return a POSIX date
#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org}
#' @examples
#' posixify("2014-10-13T23:00:00")
#' posixify("09/14/2012 10:38:01 PM")
#' posixify("09/14/2012")
#'
#' @export
posixify <- function(x = "") {

if (length(x) == 0) {
return(x)
}

# Three calendar date formats supplied by Socrata
# https://github.com/GregDThomas/jquery-localtime/issues/1
if (regexpr("^(-?(?:[1-9][0-9]*)?[0-9]{4})-(1[0-2]|0[1-9])-(3[0-1]|0[1-9]|[1-2][0-9])T(2[0-3]|[0-1][0-9]):([0-5][0-9]):([0-5][0-9])(.[0-9]+)?(Z|[+-](?:2[0-3]|[0-1][0-9]):[0-5][0-9])?$", x) == TRUE) {
strptime(x, format = "%Y-%m-%dT%H:%M:%S") # floating timestamp
} else {
if (any(regexpr("^[[:digit:]]{1,2}/[[:digit:]]{1,2}/[[:digit:]]{4}$", x[1])[1] == 1)) {
strptime(x, format="%m/%d/%Y") # short date format
} else {
strptime(x, format="%m/%d/%Y %I:%M:%S %p") # long date-time format
}
}

}
4 changes: 2 additions & 2 deletions man/fieldName.Rd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
% Generated by roxygen2 (4.1.1): do not edit by hand
% Please edit documentation in R/RSocrata.R
% Please edit documentation in R/utils.R
\name{fieldName}
\alias{fieldName}
\title{Convert Socrata human-readable column name to field name}
Expand All @@ -16,7 +16,7 @@ Socrata field name
Convert Socrata human-readable column name,
as it might appear in the first row of data,
to field name as it might appear in the HTTP header;
that is, lower case, periods replaced with underscores#'
that is, lower case, periods replaced with underscores
}
\examples{
fieldName("Number.of.Stations") # number_of_stations
Expand Down
Loading