Save data.json to file system; handle non-data files
Save data.json to file system
------------------------------
A copy of the data.json file is saved at the beginning of the download
process, alongside the actual downloaded data. Since `export.socrata()` uses
data.json as the index of data to download, this allows users to
cross-reference the downloaded data with other metadata available through
[Project Open Data](https://project-open-data.cio.gov).
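
Because each GZipped file is named after its dataset's "four-by-four"
identifier, the saved data.json copy can serve as a lookup table. Below is a
minimal sketch of that cross-reference; the file names are hypothetical
examples following the `{four-by-four}_{timestamp}` pattern this commit uses.

    # Sketch only: file names below are illustrative, not real output.
    library(jsonlite)

    # Re-read the data.json snapshot saved by export.socrata()
    catalog <- jsonlite::read_json(
      "data.cityofchicago.org/data_json_2018-10-27_120000.json",
      simplifyVector = TRUE
    )

    downloaded <- "data.cityofchicago.org/xzkq-xp2w_2018-10-27_120000.csv.gz"
    four_by_four <- substr(basename(downloaded), 1, 9) # e.g., "xzkq-xp2w"

    # Match the four-by-four against the catalog's identifier URLs
    entry <- catalog[grepl(four_by_four, catalog$identifier, fixed = TRUE), ]
    entry$title # metadata for the downloaded dataset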

Handle non-data files
---------------------
Socrata lists non-data files, such as Socrata Stories--HTML pages that
contain text but no machine-readable data--in the data.json file. These
entries caused download errors because they lack a "distribution URL". While
it is arguable that such pages should not be listed in the first place, the
script now simply skips them.

Since a copy of the data.json file is downloaded (see above), users will
have transparency into which URLs were not downloaded.
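
Since the skipped entries are exactly those without a download URL, the saved
catalog snapshot can be used to list them. A hedged sketch, reusing the
`catalog` object from the example above and assuming the `distribution`
metadata round-trips through JSON as a list:

    # Sketch only: flags catalog entries export.socrata() would skip.
    no_download_url <- vapply(
      catalog$distribution,
      function(d) is.null(d$downloadURL) || length(d$downloadURL) == 0,
      logical(1)
    )
    catalog$identifier[no_download_url] # URLs that were not downloaded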
Tom Schenk Jr committed Oct 28, 2018
1 parent 46a488d commit aafcf15
Showing 2 changed files with 28 additions and 15 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -10,8 +10,8 @@ Description: Provides easier interaction with
format and manages throttling by 'Socrata'.
Users can upload data to Socrata portals directly
from R.
Version: 1.8.0-4
Date: 2017-05-06
Version: 1.8.0-5
Date: 2018-10-27
Author: Hugh Devlin, Ph. D., Tom Schenk, Jr., and John Malc
Maintainer: "Tom Schenk Jr." <developers@cityofchicago.org>
Depends:
39 changes: 26 additions & 13 deletions R/RSocrata.R
@@ -465,41 +465,55 @@ write.socrata <- function(dataframe, dataset_json_endpoint, update_mode, email,
#' will download all CSV files (no other files supported) and save them in
#' a single directory named after the root URL (e.g., "data.cityofchicago.org/").
#' Downloaded files are compressed to GZip format and timestamped so the download
#' time is saved. No data is saved within the R workspace.
#' time is cataloged. The site's data.json file is downloaded as a canonical index
#' of data saved from the website. Users can cross-reference the data.json file
#' by matching the "four-by-four" in data.json with the first 9 characters of GZipped
#' files.
#' @param url - the base URL of a domain (e.g., "data.cityofchicago.org")
#' @param app_token - a string; SODA API token used to query the data
#' portal \url{http://dev.socrata.com/consumers/getting-started.html}
#' @return a GZipped file whose name contains the four-by-four and a timestamp of when the download began
#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org}
#' @importFrom httr GET
#' @importFrom jsonlite write_json
#' @importFrom utils write.csv
#' @export
export.socrata <- function(url, app_token = NULL) {
dir.create(basename(url), showWarnings = FALSE) # Create directory based on URL
ls <- ls.socrata(url = url)

downloadTime <- Sys.time() # Grab timestamp when data.json was downloaded
downloadTz <- Sys.timezone() # Timezone on system that downloaded data.json -- not used
ls <- ls.socrata(url = url) # Downloads data.json file

downloadTimeChr <- gsub('\\s+','_',downloadTime) # Removes spaces and replaces them with underscores
downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colons so the timestamp is a valid filename
ls_filename <- paste0(basename(url), "/", "data_json", "_", downloadTimeChr, ".json") # Creates path and filename for data.json file
jsonlite::write_json(ls, path = ls_filename) # Writes data.json contents to directory

for (i in 1:dim(ls)[1]) {
# Track timestamp before download
downloadTime <- Sys.time()
downloadTz <- Sys.timezone()
downloadTime <- Sys.time() # Denotes when the data download began
downloadTz <- Sys.timezone() # Timezone on system that downloaded the data -- not used

# Download data
downloadUrl <- ls$distribution[[i]]$downloadURL[1] # Currently grabs CSV, which is the first element
if (grepl(".csv", downloadUrl)) {
if(is.null(downloadUrl)) { # Skips if not a data file (e.g., Socrata Pages)
next
} else if (grepl(".csv", downloadUrl)) { # Downloads if it's a CSV
d <- read.socrata(downloadUrl, app_token)

# Construct the filename output
default_format <- "csv"
downloadTimeChr <- gsub('\\s+','_',downloadTime) # Removes spaces and replaces them with underscores
downloadTimeChr <- gsub(':', '', downloadTimeChr) # Removes colons so the timestamp is a valid filename
filename <- httr::parse_url(ls$identifier[i])
filename$path <- substr(filename$path, 11, 19)
filename <- httr::parse_url(ls$identifier[i]) # Parses the identifier URL to derive the file name
filename$path <- substr(filename$path, 11, 19) # Extracts the nine-character four-by-four
filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format, ".gz")

# Write file
write.csv(d, file = gzfile(filename))

write.csv(d, file = gzfile(filename)) # Writes GZipped file
} else {
response <- GET(downloadUrl)
response <- GET(downloadUrl) # Downloads non-CSVs

# Construct the filename output
content_disposition <- response$headers$`content-disposition`
@@ -513,8 +527,7 @@
filename <- paste0(filename$hostname, "/", filename$path, "_", downloadTimeChr, ".", default_format)

# Write file
writeBin(response$content, filename)
writeBin(response$content, filename) # Writes non-CSVs to directory
}

}
}
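
For context, a hypothetical end-to-end run of the updated function, following
the roxygen docs above. The domain is real but the resulting file names are
illustrative, since they depend on the download timestamp:

    # Hypothetical usage sketch of the updated export.socrata()
    library(RSocrata)

    export.socrata("https://data.cityofchicago.org")
    # Creates a directory "data.cityofchicago.org/" containing, e.g.:
    #   data_json_2018-10-27_120000.json   - the data.json snapshot (index)
    #   xzkq-xp2w_2018-10-27_120000.csv.gz - one GZipped file per dataset
    # Catalog entries without a download URL (e.g., Socrata Stories) are skipped.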
