COMPASS-DOE · bpbond · Jun 18, 2025 · Jun 18, 2025 · Jun 18, 2025 · Jun 18, 2025
diff --git a/pipeline/L1.qmd b/pipeline/L1.qmd
@@ -8,8 +8,10 @@ params:
   L1_NORMALIZE: "L1_normalize/"
   L1: "L1/"
   METADATA_ROOT: "metadata/"
+  METADATA_SITE_FILES: "site_files/"
+  RELEASE_README_FILES: "readme_files/"
   L1_METADATA: "L1_metadata/"
-  METADATA_VARS_TABLE: "L1_metadata_variables.csv"
+  METADATA_VARS_TABLE: "variables_metadata.csv"
   METADATA_COLUMNS_TABLE: "L1_metadata_columns.csv"
   # We use "Etc/GMT+5" rather than e.g. "America/New_York" for
   # L1_DATA_TIMEZONE because outputs should always be in STANDARD time
@@ -67,6 +69,10 @@ column_md <- read.csv(file.path(params$METADATA_ROOT,
 L1_NORMALIZE <- file.path(params$DATA_ROOT, params$L1_NORMALIZE)
 dirs_to_process <- scan_folders(L1_NORMALIZE)
 
+# Get the variable metadata
+L1_VAR_MD <- read.csv(file.path(params$METADATA_ROOT, 
+                                params$METADATA_VARS_TABLE))
+
 L1 <- file.path(params$DATA_ROOT, params$L1)
 ```
 
@@ -81,6 +87,7 @@ HTML outfile is "`r params$html_outfile`".
 ## Processing
 
 ```{r processing}
+
 f <- function(dir_name, dirs_to_process, out_dir) {
     message(Sys.time(), " Processing ", basename(dir_name))
     d <- dirs_to_process[[dir_name]]
@@ -120,6 +127,7 @@ f <- function(dir_name, dirs_to_process, out_dir) {
                          data_level = "L1",
                          site = site,
                          plot = plot,
+                         variable_metadata = L1_VAR_MD,
                          version = params$L1_VERSION,
                          write_plots = params$write_plots)
     }
@@ -170,10 +178,14 @@ options(warn = oldwarn)
 L1 metadata template directory is `r params$L1_METADATA`.
 
 ```{r metadata}
+
+source("metadata-utils.R")
+
 # Write the overall README
-readme_fn <- file.path(params$METADATA_ROOT, params$L1_METADATA, 
+readme_fn <- file.path(params$METADATA_ROOT,
+                       params$RELEASE_README_FILES, 
                        paste0("README_v", params$L1_VERSION, ".txt"))
-if(!file.exists(readme_fn)) stop("Couln't find ", readme_fn)
+if(!file.exists(readme_fn)) stop("Couldn't find ", readme_fn)
 readme <- readLines(readme_fn)
 readme <- gsub("[VERSION]", params$L1_VERSION, readme, fixed = TRUE)
 readme <- gsub("[DATESTAMP]", params$L1_RELEASE_DATE, readme, fixed = TRUE)
@@ -196,81 +208,49 @@ L1_metadata_template <- readLines(template_file)
 col_md_for_insert <- paste(sprintf("%-15s", column_md$Column), column_md$Description)
 
 # Get the variable metadata
-var_md <- read.csv(file.path(params$METADATA_ROOT, params$L1_METADATA, params$METADATA_VARS_TABLE))
-var_md_for_insert <- paste(sprintf("%-20s", c("research_name", var_md$research_name)),
-                           sprintf("%-10s", c("Units", var_md$final_units)),
-                           sprintf("%-12s", c("Bounds", paste0(var_md$low_bound, ", ", var_md$high_bound))),
-                           c("Description", var_md$description))
+var_md_for_insert <- md_variable_info(file.path(params$METADATA_ROOT,
+                                                params$METADATA_VARS_TABLE))
 
 message("Main template has ", length(L1_metadata_template), " lines")
 message("Column metadata info has ", length(col_md_for_insert), " lines")
 message("Variable metadata info has ", length(var_md_for_insert), " lines")
 
-message("Inserting data version string")
-L1_metadata_template <- gsub("[VERSION]", params$L1_VERSION,
-                             L1_metadata_template, fixed = TRUE)
-
 # Identify the main data directories in L1/{version}/, which are <site>_<year>
 data_dirs <- list.files(L1, pattern = "^[a-zA-Z]+_[0-9]{4}$")
+site_files_folder <- file.path(params$METADATA_ROOT, 
+                              params$METADATA_SITE_FILES)
 
 for(dd in data_dirs) {
     dd_full <- file.path(L1, dd)
     message("Generating metadata for ", dd_full)
 
-    message("\tInserting verion")
-    md <- gsub("[VERSION]", params$L1_VERSION, L1_metadata_template, fixed = TRUE)
-
     message("\tInserting timestamp and folder name")
-    md <- gsub("[TIMESTAMP]", date(), md, fixed = TRUE)
+    md <- gsub("[TIMESTAMP]", date(), L1_metadata_template, fixed = TRUE)
     md <- gsub("[FOLDER_NAME]", dd, md, fixed = TRUE)
 
-    # File info
-    files <- list.files(path = dd_full, pattern = "csv$", full.names = TRUE)
-    message("\tFound ", length(files), " data files")
-    file_info <- c()
-    # Build up information about files...
-    for(f in files) {
-        fdata <- readLines(f) # just for a quick line count
-        file_info <- c(file_info, 
-                       basename(f),
-                       paste("Rows:", length(fdata) - 1),
-                       paste("md5:", digest::digest(f, file = TRUE)),
-                       "")
-    }
-    # ...and insert into metadata
-    # We used the head(-1) to drop the final empty line, just to keep things pretty
-    file_info_pos <- grep("[FILE_INFO", md, fixed = TRUE)
-    md <- append(md, head(file_info, -1), after = file_info_pos)
-    md <- md[-file_info_pos]
-
+    # Insert info on data files into metadata
+    md <- md_insert_fileinfo(dd_full, md)
+
     # Insert column metadata
     col_info_pos <- grep("[COLUMN_INFO]", md, fixed = TRUE)
     md <- append(md, col_md_for_insert, after = col_info_pos)
     md <- md[-col_info_pos]
-    # The NA code is an in-line replacement
-    md <- gsub("[NA_STRING_L1]", NA_STRING_L1, md, fixed = TRUE)
-    # The time zone is an in-line replacement
-    md <- gsub("[TIMEZONE]", params$L1_DATA_TIMEZONE, md, fixed = TRUE)
-
+
+    # Insert NA code, time zone, and version information
+    md <- md_insert_miscellany(md, 
+                               NA_STRING_L1, 
+                               params$L1_DATA_TIMEZONE, 
+                               params$L1_VERSION)
+
     # Insert variable metadata
     var_info_pos <- grep("[VARIABLE_INFO]", md, fixed = TRUE)
     md <- append(md, var_md_for_insert, after = var_info_pos)
     md <- md[-var_info_pos]
 
     # Site information
     # Folders are <site>_<year>
-    # There MUST be an informational file named <site>.txt
     site <- strsplit(dd, "_")[[1]][1]
-    site_md_file <- file.path(params$METADATA_ROOT, params$L1_METADATA, paste0(site, ".txt"))
-    if(!file.exists(site_md_file)) {
-        stop("Couldn't find file ", site_md_file, " in ", params$L1_METADATA, " for dd=", dd)
-    }
-    site_md_for_insert <- readLines(site_md_file)
-
-    # Insert site information    
-    site_info_pos <- grep("[SITE_INFO]", md, fixed = TRUE)
-    md <- append(md, site_md_for_insert, after = site_info_pos)
-    md <- md[-site_info_pos]
+    md <- md_insert_siteinfo(site, site_files_folder, md)
 
     # Write the final metadata file
     mdfn <- paste0(dd, "_L1_v", params$L1_VERSION, "_metadata.txt")

diff --git a/pipeline/L1_normalize.qmd b/pipeline/L1_normalize.qmd
@@ -14,7 +14,7 @@ params:
   L1_DATA_TIMEZONE: "Etc/GMT+5"
   METADATA_ROOT: "metadata/"
   METADATA_TIMEZONES_TABLE: "L1_metadata/L1_metadata_timezones.csv"
-  METADATA_VARS_TABLE: "L1_metadata/L1_metadata_variables.csv"
+  METADATA_VARS_TABLE: "variables_metadata.csv"
   OOS: "out-of-service/"
   logfile: ""
   run_parallel: false

diff --git a/pipeline/helpers.R b/pipeline/helpers.R
@@ -76,6 +76,7 @@ write_to_folders <- function(x, root_dir,
                              data_level,
                              site, plot,
                              logger, table, # only provided for L1_normalize
+                             variable_metadata, # only provided for L1
                              version = "???",
                              quiet = FALSE, write_plots = TRUE) {
     # Sanity checks
@@ -134,16 +135,34 @@ write_to_folders <- function(x, root_dir,
                 filename <- paste(logger, table, y, rn, short_hash, sep = "_")
                 na_string <- NA_STRING_L1
             } else if(data_level == "L1") {
+                # Isolate this research name's metadata
+                vmd <- variable_metadata[variable_metadata$research_name == rn,]
+
                 folder <- file.path(root_dir, paste(site, y, sep = "_"))
                 filename <- paste(site, plot, time_period, rn, data_level, vversion, sep = "_")
                 na_string <- NA_STRING_L1
                 write_this_plot <- TRUE
                 p <- ggplot(x, aes(TIMESTAMP, Value, group = paste(Instrument_ID, Sensor_ID))) +
                     geom_line(na.rm = TRUE) +
                     facet_wrap(~research_name, scales = "free") +
-                    ggtitle(filename) +
-                    theme(axis.text = element_text(size = 6),
-                          strip.text = element_text(size = 8))
+                    ylab(paste0(vmd$research_name, " (", vmd$final_units, ")")) +
+                    theme(axis.text = element_text(size = 10),
+                          strip.text = element_text(size = 10))
+
+                # If any data are out of bounds, show those bounds
+                if(any(x$F_OOB, na.rm = TRUE)) {
+                    p <- p + geom_hline(yintercept = vmd$low_bound,
+                                        linetype = 2, color = "blue",
+                                        na.rm = TRUE) +
+                        geom_hline(yintercept = vmd$high_bound,
+                                   linetype = 2, color = "blue",
+                                   na.rm = TRUE) +
+                        ggtitle(filename,
+                                subtitle = "Dashed lines show instrument bounds")
+                } else {
+                    p <- p + ggtitle(filename)
+
+                }
             } else if(data_level == "L2_qaqc") {
                 folder <- file.path(root_dir, paste(site, y, sep = "_"))
                 filename <- paste(site, y, rn, data_level, vversion, sep = "_")
@@ -184,9 +203,10 @@ write_to_folders <- function(x, root_dir,
             }
 
             # Write basic QA/QC plot
+            # We use cairo_pdf to better handle Unicode chars in axis labels
             if(write_plots && write_this_plot) {
                 fn_p <- gsub("csv$", "pdf", fqfn)
-                ggsave(fn_p, plot = p, width = 12, height = 8)
+                ggsave(fn_p, plot = p, width = 12, height = 8, device = cairo_pdf)
             }
 
             lines_written[[fqfn]] <- nrow(dat)

diff --git a/pipeline/metadata-utils.R b/pipeline/metadata-utils.R
@@ -0,0 +1,60 @@
+# metadata-utils.R
+# Helper functions for generating both L1 and L2 metadata
+# BBL June 2025
+
+# Insert a listing of the data files in `folder` into the metadata.
+#
+# folder: directory to scan for data files (names ending in "csv")
+# md:     metadata template, a character vector with one element per line
+#
+# For each file this writes its base name, a row count (line count minus
+# one, presumably to exclude a header line), and its md5 hash, followed by
+# a blank separator line. The listing replaces the template line holding
+# the file-info placeholder. Returns the updated character vector.
+# Stops if the placeholder does not occur on exactly one line.
+md_insert_fileinfo <- function(folder, md) {
+    files <- list.files(path = folder, pattern = "csv$", full.names = TRUE)
+    message("\tFound ", length(files), " data files")
+
+    # Build up information about files; lapply() avoids re-copying the
+    # result vector on every iteration
+    file_info <- unlist(lapply(files, function(f) {
+        fdata <- readLines(f) # just for a quick line count
+        c(basename(f),
+          paste("Rows:", length(fdata) - 1),
+          paste("md5:", digest::digest(f, file = TRUE)),
+          "")
+    }), use.names = FALSE)
+
+    # Locate the placeholder line. The search string has no closing bracket,
+    # so this is a substring match on any line containing "[FILE_INFO"
+    file_info_pos <- grep("[FILE_INFO", md, fixed = TRUE)
+    if(length(file_info_pos) != 1) {
+        # Fail with a clear message rather than a cryptic one from append()
+        stop("Expected exactly one [FILE_INFO] placeholder in metadata but found ",
+             length(file_info_pos))
+    }
+
+    # ...and insert into metadata
+    # We use head(-1) to drop the final empty line, just to keep things pretty
+    md <- append(md, head(file_info, -1), after = file_info_pos)
+    md[-file_info_pos]
+}
+
+# Build the variable-information lines for insertion into a metadata file.
+# Reads the CSV at `variable_md_file` and returns a character vector with a
+# header line followed by one line per table row: name, units, and bounds
+# in padded columns, then the description
+md_variable_info <- function(variable_md_file) {
+    vars <- read.csv(variable_md_file)
+
+    # Each column is the header label followed by that column's values
+    name_col <- c("research_name", vars$research_name)
+    units_col <- c("Units", vars$final_units)
+    bounds_col <- c("Bounds", paste0(vars$low_bound, ", ", vars$high_bound))
+    desc_col <- c("Description", vars$description)
+
+    # Left-justify the first three columns to fixed widths
+    paste(sprintf("%-20s", name_col),
+          sprintf("%-10s", units_col),
+          sprintf("%-12s", bounds_col),
+          desc_col)
+}
+
+# Insert site information into metadata
+# There MUST be an informational file named <site>.txt
+#
+# site:              site code; "<site>.txt" is looked up in site_files_folder
+# site_files_folder: directory holding the per-site text files
+# md:                metadata template, a character vector (one line each)
+#
+# Returns `md` with the line holding the [SITE_INFO] placeholder replaced by
+# the contents of the site file. Stops if the site file is missing or if the
+# placeholder does not occur on exactly one line.
+md_insert_siteinfo <- function(site, site_files_folder, md) {
+    site_md_file <- file.path(site_files_folder, paste0(site, ".txt"))
+    if(!file.exists(site_md_file)) {
+        stop("Couldn't find file ", site_md_file, " in ", site_files_folder)
+    }
+    site_md_for_insert <- readLines(site_md_file)
+
+    # Fail with a clear message rather than a cryptic one from append()
+    site_info_pos <- grep("[SITE_INFO]", md, fixed = TRUE)
+    if(length(site_info_pos) != 1) {
+        stop("Expected exactly one [SITE_INFO] placeholder in metadata but found ",
+             length(site_info_pos))
+    }
+
+    # Insert site information after the placeholder, then drop the placeholder
+    md <- append(md, site_md_for_insert, after = site_info_pos)
+    md[-site_info_pos]
+}
+
+# Fill in the in-line placeholders of a metadata character vector `md`:
+# [NA_STRING] becomes `na_string`, [TIMEZONE] becomes `time_zone`, and
+# [VERSION] becomes `version`. Every occurrence is replaced, matching the
+# placeholder text literally. Returns the updated vector
+md_insert_miscellany <- function(md, na_string, time_zone, version) {
+    message("Inserting NA code, time zone, and version strings")
+
+    # Placeholders and their substitutes, applied in this order
+    placeholders <- c("[NA_STRING]", "[TIMEZONE]", "[VERSION]")
+    substitutes <- list(na_string, time_zone, version)
+
+    for(i in seq_along(placeholders)) {
+        md <- gsub(placeholders[i], substitutes[[i]], md, fixed = TRUE)
+    }
+    md
+}
diff --git a/pipeline/metadata/L1_metadata/L1_metadata_columns.csv b/pipeline/metadata/L1_metadata/L1_metadata_columns.csv
@@ -7,7 +7,7 @@ Instrument,Name of measurement instrument (character)
 Instrument_ID,Identifier of instrument within plot (character)
 Sensor_ID,"Identifier of individual sensor, tree, etc. being measured (character)"
 Location,"Spatial location; for TEMPEST, grid square (character)"
-Value,Observed value (numeric). The no-data value is '[NA_STRING_L1]'
+Value,Observed value (numeric). The no-data value is '[NA_STRING]'
 research_name,Measurement name (character)
 ID,Observation identifier (character)
 F_OOB,Flag: out of instrumental bounds (logical; 1=TRUE)

diff --git a/pipeline/metadata/readme_files/README.md b/pipeline/metadata/readme_files/README.md
@@ -0,0 +1,4 @@
+# readme_files
+
+This folder holds the README files for each data release. The L1.qmd step
+will error if a version doesn't have an associated README file here.
diff --git a/...line/metadata/L1_metadata/README_v0-9.txt → ...ine/metadata/readme_files/README_v0-9.txt b/...line/metadata/L1_metadata/README_v0-9.txt → ...ine/metadata/readme_files/README_v0-9.txt
diff --git a/...line/metadata/L1_metadata/README_v1-0.txt → ...ine/metadata/readme_files/README_v1-0.txt b/...line/metadata/L1_metadata/README_v1-0.txt → ...ine/metadata/readme_files/README_v1-0.txt
diff --git a/...line/metadata/L1_metadata/README_v1-1.txt → ...ine/metadata/readme_files/README_v1-1.txt b/...line/metadata/L1_metadata/README_v1-1.txt → ...ine/metadata/readme_files/README_v1-1.txt
diff --git a/...line/metadata/L1_metadata/README_v1-2.txt → ...ine/metadata/readme_files/README_v1-2.txt b/...line/metadata/L1_metadata/README_v1-2.txt → ...ine/metadata/readme_files/README_v1-2.txt
diff --git a/...line/metadata/L1_metadata/README_v2-0.txt → ...ine/metadata/readme_files/README_v2-0.txt b/...line/metadata/L1_metadata/README_v2-0.txt → ...ine/metadata/readme_files/README_v2-0.txt
@@ -34,7 +34,7 @@ Data are organized into {SITE}_{YEAR} folders, with comma-separated
 value (CSV) files in each folder for each plot and output variable at
 that site.
 
-The data file naming convention is 
+The data file naming convention is
 {SITE}_{PLOT}_{DATE RANGE}_{OUTPUT VARIABLE}_L1_{VERSION}.csv
 
 Sites include CRC (Crane Creek), GCW (GCReW), GWI (Goodwin
@@ -64,12 +64,13 @@ CHANGELOG
 Version 2-0 released [DATESTAMP]
 * Covers late 2019 through June 2025 for TEMPEST and all synoptic sites
 * Data files are now annual and single-variable, rather than monthly and multi-variable
+* Data plots now include out-of-bounds indicators and informative axis labels
 * Back-corrected two years of corrupted AQ600 files at TEMPEST; thanks to SJW
-* Minor data fixes: CD8 sapflux sensor, wx_par_tot15 calculation, MSM Buoy time zone, sapflux sensor depth, ClimaVue VP units 
+* Minor data fixes: CD8 sapflux sensor, wx_par_tot15 calculation, MSM Buoy time zone, sapflux sensor depth, ClimaVue VP units
 * New code examples, documentation improvements, and more
 * Many backend improvements; see https://github.com/COMPASS-DOE/sensor-data-pipeline/issues/244
 
-Version 1-2 released [DATESTAMP]
+Version 1-2 released 2025-02-14
 * Covers late 2019 through December 2024 for TEMPEST and all synoptic sites
 * All sonde (EXO) data now appear in their own "OW" (open water) plot
 * The TEMPEST (TMP) folder README files now include detailed information on flood timings, volumes, etc.

diff --git a/...line/metadata/L1_metadata/README_v???.txt → ...ine/metadata/readme_files/README_v???.txt b/...line/metadata/L1_metadata/README_v???.txt → ...ine/metadata/readme_files/README_v???.txt
diff --git a/pipeline/metadata/L1_metadata/CRC.txt → pipeline/metadata/site_files/CRC.txt b/pipeline/metadata/L1_metadata/CRC.txt → pipeline/metadata/site_files/CRC.txt
diff --git a/pipeline/metadata/L1_metadata/GCW.txt → pipeline/metadata/site_files/GCW.txt b/pipeline/metadata/L1_metadata/GCW.txt → pipeline/metadata/site_files/GCW.txt
diff --git a/pipeline/metadata/L1_metadata/GWI.txt → pipeline/metadata/site_files/GWI.txt b/pipeline/metadata/L1_metadata/GWI.txt → pipeline/metadata/site_files/GWI.txt
diff --git a/pipeline/metadata/L1_metadata/MSM.txt → pipeline/metadata/site_files/MSM.txt b/pipeline/metadata/L1_metadata/MSM.txt → pipeline/metadata/site_files/MSM.txt
diff --git a/pipeline/metadata/L1_metadata/OWC.txt → pipeline/metadata/site_files/OWC.txt b/pipeline/metadata/L1_metadata/OWC.txt → pipeline/metadata/site_files/OWC.txt
diff --git a/pipeline/metadata/L1_metadata/PTR.txt → pipeline/metadata/site_files/PTR.txt b/pipeline/metadata/L1_metadata/PTR.txt → pipeline/metadata/site_files/PTR.txt
diff --git a/pipeline/metadata/L1_metadata/SWH.txt → pipeline/metadata/site_files/SWH.txt b/pipeline/metadata/L1_metadata/SWH.txt → pipeline/metadata/site_files/SWH.txt
diff --git a/pipeline/metadata/L1_metadata/TMP.txt → pipeline/metadata/site_files/TMP.txt b/pipeline/metadata/L1_metadata/TMP.txt → pipeline/metadata/site_files/TMP.txt
diff --git a/pipeline/metadata/L1_metadata/TZTEST.txt → pipeline/metadata/site_files/TZTEST.txt b/pipeline/metadata/L1_metadata/TZTEST.txt → pipeline/metadata/site_files/TZTEST.txt
diff --git a/...ata/L1_metadata/L1_metadata_variables.csv → pipeline/metadata/variables_metadata.csv b/...ata/L1_metadata/L1_metadata_variables.csv → pipeline/metadata/variables_metadata.csv