Save data.json to file system; handle non-data files
Save data.json to file system
------------------------------
A copy of the data.json file is saved at the beginning of the download
process, alongside the actual downloaded data. Since `export.socrata()` uses
data.json as the index of data to download, this allows users to
cross-reference the downloaded data with other metadata available through
[Project Open Data](https://project-open-data.cio.gov).
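
Because each GZipped file is named after its dataset's "four-by-four"
identifier, the saved data.json copy can serve as a lookup table. Below is a
minimal sketch of that cross-reference; the file names are hypothetical
examples following the `{four-by-four}_{timestamp}` pattern this commit uses.

    # Sketch only: file names below are illustrative, not real output.
    library(jsonlite)

    # Re-read the data.json snapshot saved by export.socrata()
    catalog <- jsonlite::read_json(
      "data.cityofchicago.org/data_json_2018-10-27_120000.json",
      simplifyVector = TRUE
    )

    downloaded <- "data.cityofchicago.org/xzkq-xp2w_2018-10-27_120000.csv.gz"
    four_by_four <- substr(basename(downloaded), 1, 9) # e.g., "xzkq-xp2w"

    # Match the four-by-four against the catalog's identifier URLs
    entry <- catalog[grepl(four_by_four, catalog$identifier, fixed = TRUE), ]
    entry$title # metadata for the downloaded dataset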

Handle non-data files
---------------------
Socrata lists non-data files, such as Socrata Stories--HTML pages that
contain text but no machine-readable data--in the data.json file. These
entries caused download errors because they lack a "distribution URL". While
it is arguable that such pages should not be listed in the first place, the
script now simply skips them.

Since a copy of the data.json file is downloaded (see above), users will
have transparency into which URLs were not downloaded.
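
Since the skipped entries are exactly those without a download URL, the saved
catalog snapshot can be used to list them. A hedged sketch, reusing the
`catalog` object from the example above and assuming the `distribution`
metadata round-trips through JSON as a list:

    # Sketch only: flags catalog entries export.socrata() would skip.
    no_download_url <- vapply(
      catalog$distribution,
      function(d) is.null(d$downloadURL) || length(d$downloadURL) == 0,
      logical(1)
    )
    catalog$identifier[no_download_url] # URLs that were not downloaded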
Tom Schenk Jr committed Oct 28, 2018
1 parent 46a488d commit aafcf15
Showing 2 changed files with 28 additions and 15 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -10,8 +10,8 @@ Description: Provides easier interaction with
format and manages throttling by 'Socrata'.
Users can upload data to Socrata portals directly
from R.
Version: 1.8.0-4
Date: 2017-05-06
Version: 1.8.0-5
Date: 2018-10-27
Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc
Maintainer: "Tom Schenk Jr." <developers@cityofchicago.org>
Depends:
39 changes: 26 additions & 13 deletions R/RSocrata.R
@@ -465,41 +465,55 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
#' will download all CSV files (no other files supported) and save them in
#' a single directory named after the root URL (e.g., "data.cityofchicago.org/").
#' Downloaded files are compressed to GZip format and timestamped so the download
#' time is saved. No data is saved within the R workspace.
#' time is cataloged. The site's data.json file is downloaded as a canonical index
#' of data saved from the website. Users can cross-reference the data.json file
#' by matching the "four-by-four" in data.json with the first 9 characters of GZipped
#' files.
#' @param url - the base URL of a domain (e.g., "data.cityofchicago.org")
#' @param app_token - a string; SODA API token used to query the data
#' portal \url{http://dev.socrata.com/consumers/getting-started.html}
#' @return a GZipped file whose name contains the four-by-four and a timestamp of when the download began
#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org}
#' @importFrom httr GET
#' @importFrom jsonlite write_json
#' @importFrom utils write.csv
#' @export
export.socrata <- function(url, app_token = NULL) {
dir.create(basename(url), showWarnings = FALSE) # Create directory based on URL
ls <- ls.socrata(url = url)

downloadTime <- Sys.time() # Grab timestamp when data.json was downloaded
downloadTz <- Sys.timezone() # Timezone on system that downloaded data.json -- not used
ls <- ls.socrata(url = url) # Downloads data.json file

downloadTimeChr <- gsub('\\s+','_',downloadTime) # Removes spaces and replaces them with underscores
downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colons so the timestamp is a valid filename
ls_filename <- paste0(basename(url), "/", "data_json", "_", downloadTimeChr, ".json") # Creates path and filename for data.json file
jsonlite::write_json(ls, path = ls_filename) # Writes data.json contents to directory

for (i in 1:dim(ls)[1]) {
# Track timestamp before download
downloadTime <- Sys.time()
downloadTz <- Sys.timezone()
downloadTime <- Sys.time() # Denotes when the data download began
downloadTz <- Sys.timezone() # Timezone on system that downloaded the data -- not used

# Download data
downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element
if (grepl(".csv", downloadUrl)) {
if(is.null(downloadUrl)) { # Skips if not a data file (e.g., Socrata Pages)
next
} else if (grepl(".csv", downloadUrl)) { # Downloads if it's a CSV
d <- read.socrata(downloadUrl, app_token)

# Construct the filename output
default_format <- "csv"
downloadTimeChr <- gsub('\\s+','_',downloadTime) # Removes spaces and replaces them with underscores
downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colons so the timestamp is a valid filename
filename <- httr::parse_url(ls$identifier[i])
filename$path <- substr(filename$path, 11, 19)
filename <- httr::parse_url(ls$identifier[i]) # Parses the identifier URL to derive the file name
filename$path <- substr(filename$path, 11, 19) # Extracts the nine-character four-by-four
filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format, ".gz")

# Write file
write.csv(d, file = gzfile(filename))

write.csv(d, file = gzfile(filename)) # Writes GZipped file
} else {
response <- GET(downloadUrl)
response <- GET(downloadUrl) # Downloads non-CSVs

# Construct the filename output
content_disposition <- response$headers$`content-disposition`
@@ -513,8 +527,7 @@
filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format)

# Write file
writeBin(response$content, filename)
writeBin(response$content, filename) # Writes non-CSVs to directory
}

}
}
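
For context, a hypothetical end-to-end run of the updated function, following
the roxygen docs above. The domain is real but the resulting file names are
illustrative, since they depend on the download timestamp:

    # Hypothetical usage sketch of the updated export.socrata()
    library(RSocrata)

    export.socrata("https://data.cityofchicago.org")
    # Creates a directory "data.cityofchicago.org/" containing, e.g.:
    #   data_json_2018-10-27_120000.json   - the data.json snapshot (index)
    #   xzkq-xp2w_2018-10-27_120000.csv.gz - one GZipped file per dataset
    # Catalog entries without a download URL (e.g., Socrata Stories) are skipped.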
