diff --git a/pipeline/L1.qmd b/pipeline/L1.qmd index b7787ae..a2bd4e5 100644 --- a/pipeline/L1.qmd +++ b/pipeline/L1.qmd @@ -8,8 +8,10 @@ params: L1_NORMALIZE: "L1_normalize/" L1: "L1/" METADATA_ROOT: "metadata/" + METADATA_SITE_FILES: "site_files/" + RELEASE_README_FILES: "readme_files/" L1_METADATA: "L1_metadata/" - METADATA_VARS_TABLE: "L1_metadata_variables.csv" + METADATA_VARS_TABLE: "variables_metadata.csv" METADATA_COLUMNS_TABLE: "L1_metadata_columns.csv" # We use "Etc/GMT+5" rather than e.g. "America/New_York" for # L1_DATA_TIMEZONE because outputs should always be in STANDARD time @@ -67,6 +69,10 @@ column_md <- read.csv(file.path(params$METADATA_ROOT, L1_NORMALIZE <- file.path(params$DATA_ROOT, params$L1_NORMALIZE) dirs_to_process <- scan_folders(L1_NORMALIZE) +# Get the variable metadata +L1_VAR_MD <- read.csv(file.path(params$METADATA_ROOT, + params$METADATA_VARS_TABLE)) + L1 <- file.path(params$DATA_ROOT, params$L1) ``` @@ -81,6 +87,7 @@ HTML outfile is "`r params$html_outfile`". ## Processing ```{r processing} + f <- function(dir_name, dirs_to_process, out_dir) { message(Sys.time(), " Processing ", basename(dir_name)) d <- dirs_to_process[[dir_name]] @@ -120,6 +127,7 @@ f <- function(dir_name, dirs_to_process, out_dir) { data_level = "L1", site = site, plot = plot, + variable_metadata = L1_VAR_MD, version = params$L1_VERSION, write_plots = params$write_plots) } @@ -170,10 +178,14 @@ options(warn = oldwarn) L1 metadata template directory is `r params$L1_METADATA`. ```{r metadata} + +source("metadata-utils.R") + # Write the overall README -readme_fn <- file.path(params$METADATA_ROOT, params$L1_METADATA, +readme_fn <- file.path(params$METADATA_ROOT, + params$RELEASE_README_FILES, paste0("README_v", params$L1_VERSION, ".txt")) -if(!file.exists(readme_fn)) stop("Couln't find ", readme_fn) +if(!file.exists(readme_fn)) stop("Couldn't find ", readme_fn) readme <- readLines(readme_fn) readme <- gsub("[VERSION]", params$L1_VERSION, readme, fixed = TRUE) readme <- gsub("[DATESTAMP]", params$L1_RELEASE_DATE, readme, fixed = TRUE) @@ -196,62 +208,40 @@ L1_metadata_template <- readLines(template_file) col_md_for_insert <- paste(sprintf("%-15s", column_md$Column), column_md$Description) # Get the variable metadata -var_md <- read.csv(file.path(params$METADATA_ROOT, params$L1_METADATA, params$METADATA_VARS_TABLE)) -var_md_for_insert <- paste(sprintf("%-20s", c("research_name", var_md$research_name)), - sprintf("%-10s", c("Units", var_md$final_units)), - sprintf("%-12s", c("Bounds", paste0(var_md$low_bound, ", ", var_md$high_bound))), - c("Description", var_md$description)) +var_md_for_insert <- md_variable_info(file.path(params$METADATA_ROOT, + params$METADATA_VARS_TABLE)) message("Main template has ", length(L1_metadata_template), " lines") message("Column metadata info has ", length(col_md_for_insert), " lines") message("Variable metadata info has ", length(var_md_for_insert), " lines") -message("Inserting data version string") -L1_metadata_template <- gsub("[VERSION]", params$L1_VERSION, - L1_metadata_template, fixed = TRUE) - # Identify the main data directories in L1/{version}/, which are _ data_dirs <- list.files(L1, pattern = "^[a-zA-Z]+_[0-9]{4}$") +site_files_folder <- file.path(params$METADATA_ROOT, + params$METADATA_SITE_FILES) for(dd in data_dirs) { dd_full <- file.path(L1, dd) message("Generating metadata for ", dd_full) - message("\tInserting verion") - md <- gsub("[VERSION]", params$L1_VERSION, L1_metadata_template, fixed = TRUE) - message("\tInserting timestamp and folder name") - md <- gsub("[TIMESTAMP]", date(), md, fixed = TRUE) + md <- gsub("[TIMESTAMP]", date(), L1_metadata_template, fixed = TRUE) md <- gsub("[FOLDER_NAME]", dd, md, fixed = TRUE) - # File info - files <- list.files(path = dd_full, pattern = "csv$", full.names = TRUE) - message("\tFound ", length(files), " data files") - file_info <- c() - # Build up information about files... - for(f in files) { - fdata <- readLines(f) # just for a quick line count - file_info <- c(file_info, - basename(f), - paste("Rows:", length(fdata) - 1), - paste("md5:", digest::digest(f, file = TRUE)), - "") - } - # ...and insert into metadata - # We used the head(-1) to drop the final empty line, just to keep things pretty - file_info_pos <- grep("[FILE_INFO", md, fixed = TRUE) - md <- append(md, head(file_info, -1), after = file_info_pos) - md <- md[-file_info_pos] - + # Insert info on data files into metadata + md <- md_insert_fileinfo(dd_full, md) + # Insert column metadata col_info_pos <- grep("[COLUMN_INFO]", md, fixed = TRUE) md <- append(md, col_md_for_insert, after = col_info_pos) md <- md[-col_info_pos] - # The NA code is an in-line replacement - md <- gsub("[NA_STRING_L1]", NA_STRING_L1, md, fixed = TRUE) - # The time zone is an in-line replacement - md <- gsub("[TIMEZONE]", params$L1_DATA_TIMEZONE, md, fixed = TRUE) - + + # Insert NA code, time zone, and version information + md <- md_insert_miscellany(md, + NA_STRING_L1, + params$L1_DATA_TIMEZONE, + params$L1_VERSION) + # Insert variable metadata var_info_pos <- grep("[VARIABLE_INFO]", md, fixed = TRUE) md <- append(md, var_md_for_insert, after = var_info_pos) @@ -259,18 +249,8 @@ for(dd in data_dirs) { # Site information # Folders are _ - # There MUST be an informational file named .txt site <- strsplit(dd, "_")[[1]][1] - site_md_file <- file.path(params$METADATA_ROOT, params$L1_METADATA, paste0(site, ".txt")) - if(!file.exists(site_md_file)) { - stop("Couldn't find file ", site_md_file, " in ", params$L1_METADATA, " for dd=", dd) - } - site_md_for_insert <- readLines(site_md_file) - - # Insert site information - site_info_pos <- grep("[SITE_INFO]", md, fixed = TRUE) - md <- append(md, site_md_for_insert, after = site_info_pos) - md <- md[-site_info_pos] + md <- md_insert_siteinfo(site, site_files_folder, md) # Write the final metadata file mdfn <- paste0(dd, "_L1_v", params$L1_VERSION, "_metadata.txt") diff --git a/pipeline/L1_normalize.qmd b/pipeline/L1_normalize.qmd index 7f7db17..9a0889f 100644 --- a/pipeline/L1_normalize.qmd +++ b/pipeline/L1_normalize.qmd @@ -14,7 +14,7 @@ params: L1_DATA_TIMEZONE: "Etc/GMT+5" METADATA_ROOT: "metadata/" METADATA_TIMEZONES_TABLE: "L1_metadata/L1_metadata_timezones.csv" - METADATA_VARS_TABLE: "L1_metadata/L1_metadata_variables.csv" + METADATA_VARS_TABLE: "variables_metadata.csv" OOS: "out-of-service/" logfile: "" run_parallel: false diff --git a/pipeline/helpers.R b/pipeline/helpers.R index cf3bc10..4e1e94a 100644 --- a/pipeline/helpers.R +++ b/pipeline/helpers.R @@ -76,6 +76,7 @@ write_to_folders <- function(x, root_dir, data_level, site, plot, logger, table, # only provided for L1_normalize + variable_metadata, # only provided for L1 version = "???", quiet = FALSE, write_plots = TRUE) { # Sanity checks @@ -134,6 +135,9 @@ write_to_folders <- function(x, root_dir, filename <- paste(logger, table, y, rn, short_hash, sep = "_") na_string <- NA_STRING_L1 } else if(data_level == "L1") { + # Isolate this research name's metadata + vmd <- variable_metadata[variable_metadata$research_name == rn,] + folder <- file.path(root_dir, paste(site, y, sep = "_")) filename <- paste(site, plot, time_period, rn, data_level, vversion, sep = "_") na_string <- NA_STRING_L1 @@ -141,9 +145,24 @@ write_to_folders <- function(x, root_dir, p <- ggplot(x, aes(TIMESTAMP, Value, group = paste(Instrument_ID, Sensor_ID))) + geom_line(na.rm = TRUE) + facet_wrap(~research_name, scales = "free") + - ggtitle(filename) + - theme(axis.text = element_text(size = 6), - strip.text = element_text(size = 8)) + ylab(paste0(vmd$research_name, " (", vmd$final_units, ")")) + + theme(axis.text = element_text(size = 10), + strip.text = element_text(size = 10)) + + # If any data are out of bounds, show those bounds + if(any(x$F_OOB, na.rm = TRUE)) { + p <- p + geom_hline(yintercept = vmd$low_bound, + linetype = 2, color = "blue", + na.rm = TRUE) + + geom_hline(yintercept = vmd$high_bound, + linetype = 2, color = "blue", + na.rm = TRUE) + + ggtitle(filename, + subtitle = "Dashed lines show instrument bounds") + } else { + p <- p + ggtitle(filename) + + } } else if(data_level == "L2_qaqc") { folder <- file.path(root_dir, paste(site, y, sep = "_")) filename <- paste(site, y, rn, data_level, vversion, sep = "_") @@ -184,9 +203,10 @@ write_to_folders <- function(x, root_dir, } # Write basic QA/QC plot + # We use cairo_pdf to better handle Unicode chars in axis labels if(write_plots && write_this_plot) { fn_p <- gsub("csv$", "pdf", fqfn) - ggsave(fn_p, plot = p, width = 12, height = 8) + ggsave(fn_p, plot = p, width = 12, height = 8, device = cairo_pdf) } lines_written[[fqfn]] <- nrow(dat) diff --git a/pipeline/metadata-utils.R b/pipeline/metadata-utils.R new file mode 100644 index 0000000..e957dfe --- /dev/null +++ b/pipeline/metadata-utils.R @@ -0,0 +1,60 @@ +# metadata-utils.R +# Helper functions for generating both L1 and L2 metadata +# BBL June 2025 + +# Get information about files in a folder and insert into metadata +# (a character vector) +md_insert_fileinfo <- function(folder, md) { + files <- list.files(path = folder, pattern = "csv$", full.names = TRUE) + message("\tFound ", length(files), " data files") + file_info <- c() + # Build up information about files... + for(f in files) { + fdata <- readLines(f) # just for a quick line count + file_info <- c(file_info, + basename(f), + paste("Rows:", length(fdata) - 1), + paste("md5:", digest::digest(f, file = TRUE)), + "") + } + # ...and insert into metadata + # We used the head(-1) to drop the final empty line, just to keep things pretty + file_info_pos <- grep("[FILE_INFO", md, fixed = TRUE) + md <- append(md, head(file_info, -1), after = file_info_pos) + md[-file_info_pos] +} + +# Read the variable metadata table and return a formatted extract from it +md_variable_info <- function(variable_md_file) { + var_md <- read.csv(variable_md_file) + paste(sprintf("%-20s", c("research_name", var_md$research_name)), + sprintf("%-10s", c("Units", var_md$final_units)), + sprintf("%-12s", c("Bounds", paste0(var_md$low_bound, ", ", var_md$high_bound))), + c("Description", var_md$description)) +} + +# Insert site information into metadata +# There MUST be an informational file named .txt +md_insert_siteinfo <- function(site, site_files_folder, md) { + site_md_file <- file.path(site_files_folder, paste0(site, ".txt")) + if(!file.exists(site_md_file)) { + stop("Couldn't find file ", site_md_file, " in ", site_files_folder) + } + site_md_for_insert <- readLines(site_md_file) + + # Insert site information + site_info_pos <- grep("[SITE_INFO]", md, fixed = TRUE) + md <- append(md, site_md_for_insert, after = site_info_pos) + md[-site_info_pos] +} + +md_insert_miscellany <- function(md, na_string, time_zone, version) { + message("Inserting NA code, time zone, and version strings") + + # The NA code is an in-line replacement + md <- gsub("[NA_STRING]", na_string, md, fixed = TRUE) + # The time zone is an in-line replacement + md <- gsub("[TIMEZONE]", time_zone, md, fixed = TRUE) + + gsub("[VERSION]", version, md, fixed = TRUE) +} diff --git a/pipeline/metadata/L1_metadata/L1_metadata_columns.csv b/pipeline/metadata/L1_metadata/L1_metadata_columns.csv index b009ace..cb89d23 100644 --- a/pipeline/metadata/L1_metadata/L1_metadata_columns.csv +++ b/pipeline/metadata/L1_metadata/L1_metadata_columns.csv @@ -7,7 +7,7 @@ Instrument,Name of measurement instrument (character) Instrument_ID,Identifier of instrument within plot (character) Sensor_ID,"Identifier of individual sensor, tree, etc. being measured (character)" Location,"Spatial location; for TEMPEST, grid square (character)" -Value,Observed value (numeric). The no-data value is '[NA_STRING_L1]' +Value,Observed value (numeric). The no-data value is '[NA_STRING]' research_name,Measurement name (character) ID,Observation identifier (character) F_OOB,Flag: out of instrumental bounds (logical; 1=TRUE) diff --git a/pipeline/metadata/readme_files/README.md b/pipeline/metadata/readme_files/README.md new file mode 100644 index 0000000..f2d5479 --- /dev/null +++ b/pipeline/metadata/readme_files/README.md @@ -0,0 +1,4 @@ +# readme_files + +This folder holds the README files for each data release. The L1.qmd step +will error if a version doesn't have an associated README file here. diff --git a/pipeline/metadata/L1_metadata/README_v0-9.txt b/pipeline/metadata/readme_files/README_v0-9.txt similarity index 100% rename from pipeline/metadata/L1_metadata/README_v0-9.txt rename to pipeline/metadata/readme_files/README_v0-9.txt diff --git a/pipeline/metadata/L1_metadata/README_v1-0.txt b/pipeline/metadata/readme_files/README_v1-0.txt similarity index 100% rename from pipeline/metadata/L1_metadata/README_v1-0.txt rename to pipeline/metadata/readme_files/README_v1-0.txt diff --git a/pipeline/metadata/L1_metadata/README_v1-1.txt b/pipeline/metadata/readme_files/README_v1-1.txt similarity index 100% rename from pipeline/metadata/L1_metadata/README_v1-1.txt rename to pipeline/metadata/readme_files/README_v1-1.txt diff --git a/pipeline/metadata/L1_metadata/README_v1-2.txt b/pipeline/metadata/readme_files/README_v1-2.txt similarity index 100% rename from pipeline/metadata/L1_metadata/README_v1-2.txt rename to pipeline/metadata/readme_files/README_v1-2.txt diff --git a/pipeline/metadata/L1_metadata/README_v2-0.txt b/pipeline/metadata/readme_files/README_v2-0.txt similarity index 95% rename from pipeline/metadata/L1_metadata/README_v2-0.txt rename to pipeline/metadata/readme_files/README_v2-0.txt index e394199..64a8b56 100644 --- a/pipeline/metadata/L1_metadata/README_v2-0.txt +++ b/pipeline/metadata/readme_files/README_v2-0.txt @@ -34,7 +34,7 @@ Data are organized into {SITE}_{YEAR} folders, with comma-separated value (CSV) files in each folder for each plot and output variable at that site. -The data file naming convention is +The data file naming convention is {SITE}_{PLOT}_{DATE RANGE}_{OUTPUT VARIABLE}_L1_{VERSION}.csv Sites include CRC (Crane Creek), GCW (GCReW), GWI (Goodwin @@ -64,12 +64,13 @@ CHANGELOG Version 2-0 released [DATESTAMP] * Covers late 2019 through June 2025 for TEMPEST and all synoptic sites * Data files are now annual and single-variable, rather than monthly and multi-variable +* Data plots now include out-of-bounds indicators and informative axis labels * Back-corrected two years of corrupted AQ600 files at TEMPEST; thanks to SJW -* Minor data fixes: CD8 sapflux sensor, wx_par_tot15 calculation, MSM Buoy time zone, sapflux sensor depth, ClimaVue VP units +* Minor data fixes: CD8 sapflux sensor, wx_par_tot15 calculation, MSM Buoy time zone, sapflux sensor depth, ClimaVue VP units * New code examples, documentation improvements, and more * Many backend improvements; see https://github.com/COMPASS-DOE/sensor-data-pipeline/issues/244 -Version 1-2 released [DATESTAMP] +Version 1-2 released 2025-02-14 * Covers late 2019 through December 2024 for TEMPEST and all synoptic sites * All sonde (EXO) data now appear in their own "OW" (open water) plot * The TEMPEST (TMP) folder README files now include detailed information on flood timings, volumes, etc. diff --git a/pipeline/metadata/L1_metadata/README_v???.txt b/pipeline/metadata/readme_files/README_v???.txt similarity index 100% rename from pipeline/metadata/L1_metadata/README_v???.txt rename to pipeline/metadata/readme_files/README_v???.txt diff --git a/pipeline/metadata/L1_metadata/CRC.txt b/pipeline/metadata/site_files/CRC.txt similarity index 100% rename from pipeline/metadata/L1_metadata/CRC.txt rename to pipeline/metadata/site_files/CRC.txt diff --git a/pipeline/metadata/L1_metadata/GCW.txt b/pipeline/metadata/site_files/GCW.txt similarity index 100% rename from pipeline/metadata/L1_metadata/GCW.txt rename to pipeline/metadata/site_files/GCW.txt diff --git a/pipeline/metadata/L1_metadata/GWI.txt b/pipeline/metadata/site_files/GWI.txt similarity index 100% rename from pipeline/metadata/L1_metadata/GWI.txt rename to pipeline/metadata/site_files/GWI.txt diff --git a/pipeline/metadata/L1_metadata/MSM.txt b/pipeline/metadata/site_files/MSM.txt similarity index 100% rename from pipeline/metadata/L1_metadata/MSM.txt rename to pipeline/metadata/site_files/MSM.txt diff --git a/pipeline/metadata/L1_metadata/OWC.txt b/pipeline/metadata/site_files/OWC.txt similarity index 100% rename from pipeline/metadata/L1_metadata/OWC.txt rename to pipeline/metadata/site_files/OWC.txt diff --git a/pipeline/metadata/L1_metadata/PTR.txt b/pipeline/metadata/site_files/PTR.txt similarity index 100% rename from pipeline/metadata/L1_metadata/PTR.txt rename to pipeline/metadata/site_files/PTR.txt diff --git a/pipeline/metadata/L1_metadata/SWH.txt b/pipeline/metadata/site_files/SWH.txt similarity index 100% rename from pipeline/metadata/L1_metadata/SWH.txt rename to pipeline/metadata/site_files/SWH.txt diff --git a/pipeline/metadata/L1_metadata/TMP.txt b/pipeline/metadata/site_files/TMP.txt similarity index 100% rename from pipeline/metadata/L1_metadata/TMP.txt rename to pipeline/metadata/site_files/TMP.txt diff --git a/pipeline/metadata/L1_metadata/TZTEST.txt b/pipeline/metadata/site_files/TZTEST.txt similarity index 100% rename from pipeline/metadata/L1_metadata/TZTEST.txt rename to pipeline/metadata/site_files/TZTEST.txt diff --git a/pipeline/metadata/L1_metadata/L1_metadata_variables.csv b/pipeline/metadata/variables_metadata.csv similarity index 100% rename from pipeline/metadata/L1_metadata/L1_metadata_variables.csv rename to pipeline/metadata/variables_metadata.csv