Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 31 additions & 51 deletions pipeline/L1.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ params:
L1_NORMALIZE: "L1_normalize/"
L1: "L1/"
METADATA_ROOT: "metadata/"
METADATA_SITE_FILES: "site_files/"
RELEASE_README_FILES: "readme_files/"
L1_METADATA: "L1_metadata/"
METADATA_VARS_TABLE: "L1_metadata_variables.csv"
METADATA_VARS_TABLE: "variables_metadata.csv"
METADATA_COLUMNS_TABLE: "L1_metadata_columns.csv"
# We use "Etc/GMT+5" rather than e.g. "America/New_York" for
# L1_DATA_TIMEZONE because outputs should always be in STANDARD time
Expand Down Expand Up @@ -67,6 +69,10 @@ column_md <- read.csv(file.path(params$METADATA_ROOT,
L1_NORMALIZE <- file.path(params$DATA_ROOT, params$L1_NORMALIZE)
dirs_to_process <- scan_folders(L1_NORMALIZE)

# Get the variable metadata
L1_VAR_MD <- read.csv(file.path(params$METADATA_ROOT,
params$METADATA_VARS_TABLE))

L1 <- file.path(params$DATA_ROOT, params$L1)
```

Expand All @@ -81,6 +87,7 @@ HTML outfile is "`r params$html_outfile`".
## Processing

```{r processing}

f <- function(dir_name, dirs_to_process, out_dir) {
message(Sys.time(), " Processing ", basename(dir_name))
d <- dirs_to_process[[dir_name]]
Expand Down Expand Up @@ -120,6 +127,7 @@ f <- function(dir_name, dirs_to_process, out_dir) {
data_level = "L1",
site = site,
plot = plot,
variable_metadata = L1_VAR_MD,
version = params$L1_VERSION,
write_plots = params$write_plots)
}
Expand Down Expand Up @@ -170,10 +178,14 @@ options(warn = oldwarn)
L1 metadata template directory is `r params$L1_METADATA`.

```{r metadata}

source("metadata-utils.R")

# Write the overall README
readme_fn <- file.path(params$METADATA_ROOT, params$L1_METADATA,
readme_fn <- file.path(params$METADATA_ROOT,
params$RELEASE_README_FILES,
paste0("README_v", params$L1_VERSION, ".txt"))
if(!file.exists(readme_fn)) stop("Couln't find ", readme_fn)
if(!file.exists(readme_fn)) stop("Couldn't find ", readme_fn)
readme <- readLines(readme_fn)
readme <- gsub("[VERSION]", params$L1_VERSION, readme, fixed = TRUE)
readme <- gsub("[DATESTAMP]", params$L1_RELEASE_DATE, readme, fixed = TRUE)
Expand All @@ -196,81 +208,49 @@ L1_metadata_template <- readLines(template_file)
col_md_for_insert <- paste(sprintf("%-15s", column_md$Column), column_md$Description)

# Get the variable metadata
var_md <- read.csv(file.path(params$METADATA_ROOT, params$L1_METADATA, params$METADATA_VARS_TABLE))
var_md_for_insert <- paste(sprintf("%-20s", c("research_name", var_md$research_name)),
sprintf("%-10s", c("Units", var_md$final_units)),
sprintf("%-12s", c("Bounds", paste0(var_md$low_bound, ", ", var_md$high_bound))),
c("Description", var_md$description))
var_md_for_insert <- md_variable_info(file.path(params$METADATA_ROOT,
params$METADATA_VARS_TABLE))

message("Main template has ", length(L1_metadata_template), " lines")
message("Column metadata info has ", length(col_md_for_insert), " lines")
message("Variable metadata info has ", length(var_md_for_insert), " lines")

message("Inserting data version string")
L1_metadata_template <- gsub("[VERSION]", params$L1_VERSION,
L1_metadata_template, fixed = TRUE)

# Identify the main data directories in L1/{version}/, which are <site>_<year>
data_dirs <- list.files(L1, pattern = "^[a-zA-Z]+_[0-9]{4}$")
site_files_folder <- file.path(params$METADATA_ROOT,
params$METADATA_SITE_FILES)

for(dd in data_dirs) {
dd_full <- file.path(L1, dd)
message("Generating metadata for ", dd_full)

message("\tInserting verion")
md <- gsub("[VERSION]", params$L1_VERSION, L1_metadata_template, fixed = TRUE)

message("\tInserting timestamp and folder name")
md <- gsub("[TIMESTAMP]", date(), md, fixed = TRUE)
md <- gsub("[TIMESTAMP]", date(), L1_metadata_template, fixed = TRUE)
md <- gsub("[FOLDER_NAME]", dd, md, fixed = TRUE)

# File info
files <- list.files(path = dd_full, pattern = "csv$", full.names = TRUE)
message("\tFound ", length(files), " data files")
file_info <- c()
# Build up information about files...
for(f in files) {
fdata <- readLines(f) # just for a quick line count
file_info <- c(file_info,
basename(f),
paste("Rows:", length(fdata) - 1),
paste("md5:", digest::digest(f, file = TRUE)),
"")
}
# ...and insert into metadata
# We used the head(-1) to drop the final empty line, just to keep things pretty
file_info_pos <- grep("[FILE_INFO", md, fixed = TRUE)
md <- append(md, head(file_info, -1), after = file_info_pos)
md <- md[-file_info_pos]

# Insert info on data files into metadata
md <- md_insert_fileinfo(dd_full, md)

# Insert column metadata
col_info_pos <- grep("[COLUMN_INFO]", md, fixed = TRUE)
md <- append(md, col_md_for_insert, after = col_info_pos)
md <- md[-col_info_pos]
# The NA code is an in-line replacement
md <- gsub("[NA_STRING_L1]", NA_STRING_L1, md, fixed = TRUE)
# The time zone is an in-line replacement
md <- gsub("[TIMEZONE]", params$L1_DATA_TIMEZONE, md, fixed = TRUE)


# Insert NA code, time zone, and version information
md <- md_insert_miscellany(md,
NA_STRING_L1,
params$L1_DATA_TIMEZONE,
params$L1_VERSION)

# Insert variable metadata
var_info_pos <- grep("[VARIABLE_INFO]", md, fixed = TRUE)
md <- append(md, var_md_for_insert, after = var_info_pos)
md <- md[-var_info_pos]

# Site information
# Folders are <site>_<year>
# There MUST be an informational file named <site>.txt
site <- strsplit(dd, "_")[[1]][1]
site_md_file <- file.path(params$METADATA_ROOT, params$L1_METADATA, paste0(site, ".txt"))
if(!file.exists(site_md_file)) {
stop("Couldn't find file ", site_md_file, " in ", params$L1_METADATA, " for dd=", dd)
}
site_md_for_insert <- readLines(site_md_file)

# Insert site information
site_info_pos <- grep("[SITE_INFO]", md, fixed = TRUE)
md <- append(md, site_md_for_insert, after = site_info_pos)
md <- md[-site_info_pos]
md <- md_insert_siteinfo(site, site_files_folder, md)

# Write the final metadata file
mdfn <- paste0(dd, "_L1_v", params$L1_VERSION, "_metadata.txt")
Expand Down
2 changes: 1 addition & 1 deletion pipeline/L1_normalize.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ params:
L1_DATA_TIMEZONE: "Etc/GMT+5"
METADATA_ROOT: "metadata/"
METADATA_TIMEZONES_TABLE: "L1_metadata/L1_metadata_timezones.csv"
METADATA_VARS_TABLE: "L1_metadata/L1_metadata_variables.csv"
METADATA_VARS_TABLE: "variables_metadata.csv"
OOS: "out-of-service/"
logfile: ""
run_parallel: false
Expand Down
28 changes: 24 additions & 4 deletions pipeline/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ write_to_folders <- function(x, root_dir,
data_level,
site, plot,
logger, table, # only provided for L1_normalize
variable_metadata, # only provided for L1
version = "???",
quiet = FALSE, write_plots = TRUE) {
# Sanity checks
Expand Down Expand Up @@ -134,16 +135,34 @@ write_to_folders <- function(x, root_dir,
filename <- paste(logger, table, y, rn, short_hash, sep = "_")
na_string <- NA_STRING_L1
} else if(data_level == "L1") {
# Isolate this research name's metadata
vmd <- variable_metadata[variable_metadata$research_name == rn,]

folder <- file.path(root_dir, paste(site, y, sep = "_"))
filename <- paste(site, plot, time_period, rn, data_level, vversion, sep = "_")
na_string <- NA_STRING_L1
write_this_plot <- TRUE
p <- ggplot(x, aes(TIMESTAMP, Value, group = paste(Instrument_ID, Sensor_ID))) +
geom_line(na.rm = TRUE) +
facet_wrap(~research_name, scales = "free") +
ggtitle(filename) +
theme(axis.text = element_text(size = 6),
strip.text = element_text(size = 8))
ylab(paste0(vmd$research_name, " (", vmd$final_units, ")")) +
theme(axis.text = element_text(size = 10),
strip.text = element_text(size = 10))

# If any data are out of bounds, show those bounds
if(any(x$F_OOB, na.rm = TRUE)) {
p <- p + geom_hline(yintercept = vmd$low_bound,
linetype = 2, color = "blue",
na.rm = TRUE) +
geom_hline(yintercept = vmd$high_bound,
linetype = 2, color = "blue",
na.rm = TRUE) +
ggtitle(filename,
subtitle = "Dashed lines show instrument bounds")
} else {
p <- p + ggtitle(filename)

}
} else if(data_level == "L2_qaqc") {
folder <- file.path(root_dir, paste(site, y, sep = "_"))
filename <- paste(site, y, rn, data_level, vversion, sep = "_")
Expand Down Expand Up @@ -184,9 +203,10 @@ write_to_folders <- function(x, root_dir,
}

# Write basic QA/QC plot
# We use cairo_pdf to better handle Unicode chars in axis labels
if(write_plots && write_this_plot) {
fn_p <- gsub("csv$", "pdf", fqfn)
ggsave(fn_p, plot = p, width = 12, height = 8)
ggsave(fn_p, plot = p, width = 12, height = 8, device = cairo_pdf)
}

lines_written[[fqfn]] <- nrow(dat)
Expand Down
60 changes: 60 additions & 0 deletions pipeline/metadata-utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# metadata-utils.R
# Helper functions for generating both L1 and L2 metadata
# BBL June 2025

# Get information about files in a folder and insert into metadata
# (a character vector)
#
# folder: directory to scan (non-recursively) for files whose names end
#         in "csv"
# md:     character vector of metadata lines; exactly one line must contain
#         the "[FILE_INFO" placeholder
#
# Returns `md` with the placeholder line replaced by, for each file found,
# its base name, row count, and md5 hash, with a blank line between files.
# Stops with an error if the placeholder is missing or appears more than once.
md_insert_fileinfo <- function(folder, md) {
  # Locate the placeholder first, so that a bad template fails with a clear
  # message rather than a cryptic error from append()
  file_info_pos <- grep("[FILE_INFO", md, fixed = TRUE)
  if(length(file_info_pos) != 1) {
    stop("Expected exactly one [FILE_INFO] placeholder in metadata, found ",
         length(file_info_pos))
  }

  files <- list.files(path = folder, pattern = "csv$", full.names = TRUE)
  message("\tFound ", length(files), " data files")

  # Build up information about files...
  file_info <- unlist(lapply(files, function(f) {
    fdata <- readLines(f) # just for a quick line count
    c(basename(f),
      # Subtract the header line; an empty file reports 0 rows, not -1
      paste("Rows:", max(length(fdata) - 1, 0)),
      paste("md5:", digest::digest(f, file = TRUE)),
      "")
  }))

  # ...and insert into metadata
  # We use head(-1) to drop the final empty line, just to keep things pretty
  md <- append(md, head(file_info, -1), after = file_info_pos)
  md[-file_info_pos]
}

# Build the formatted variable table for insertion into a metadata file.
# Reads the variable metadata CSV at `variable_md_file` and returns a
# character vector: one header line followed by one line per table row
md_variable_info <- function(variable_md_file) {
  vmd <- read.csv(variable_md_file)

  # Each output column is its title followed by the table's values
  name_col <- c("research_name", vmd$research_name)
  units_col <- c("Units", vmd$final_units)
  bounds_col <- c("Bounds", paste0(vmd$low_bound, ", ", vmd$high_bound))
  desc_col <- c("Description", vmd$description)

  # The first three columns are left-justified to widths of 20, 10, and 12
  # and separated by single spaces; the description is appended as-is
  fixed_width <- sprintf("%-20s %-10s %-12s", name_col, units_col, bounds_col)
  paste(fixed_width, desc_col)
}

# Insert site information into metadata
# There MUST be an informational file named <site>.txt
#
# site:              site code; the file read is <site>.txt
# site_files_folder: folder holding the per-site informational files
# md:                character vector of metadata lines; exactly one line
#                    must contain the "[SITE_INFO]" placeholder
#
# Returns `md` with the placeholder line replaced by the lines of the site
# file. Stops with an error if the site file doesn't exist, or if the
# placeholder is missing or appears more than once.
md_insert_siteinfo <- function(site, site_files_folder, md) {
  site_md_file <- file.path(site_files_folder, paste0(site, ".txt"))
  if(!file.exists(site_md_file)) {
    stop("Couldn't find file ", site_md_file, " in ", site_files_folder)
  }

  # Check the placeholder explicitly: append() gives a cryptic error when
  # its `after` argument is not a single position
  site_info_pos <- grep("[SITE_INFO]", md, fixed = TRUE)
  if(length(site_info_pos) != 1) {
    stop("Expected exactly one [SITE_INFO] placeholder in metadata, found ",
         length(site_info_pos))
  }

  site_md_for_insert <- readLines(site_md_file)

  # Insert site information after the placeholder line, then drop that line
  md <- append(md, site_md_for_insert, after = site_info_pos)
  md[-site_info_pos]
}

# Fill in the in-line placeholders of a metadata character vector `md`:
# "[NA_STRING]" becomes `na_string`, "[TIMEZONE]" becomes `time_zone`, and
# "[VERSION]" becomes `version`. Returns the updated vector
md_insert_miscellany <- function(md, na_string, time_zone, version) {
  message("Inserting NA code, time zone, and version strings")

  # Placeholder -> replacement, applied in this order. A list (rather than
  # c()) is used so each value is handed to gsub() exactly as it was given
  substitutions <- list("[NA_STRING]" = na_string,
                        "[TIMEZONE]" = time_zone,
                        "[VERSION]" = version)

  # All of these are in-line (within-line) literal replacements
  for(placeholder in names(substitutions)) {
    md <- gsub(placeholder, substitutions[[placeholder]], md, fixed = TRUE)
  }
  md
}
2 changes: 1 addition & 1 deletion pipeline/metadata/L1_metadata/L1_metadata_columns.csv
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Instrument,Name of measurement instrument (character)
Instrument_ID,Identifier of instrument within plot (character)
Sensor_ID,"Identifier of individual sensor, tree, etc. being measured (character)"
Location,"Spatial location; for TEMPEST, grid square (character)"
Value,Observed value (numeric). The no-data value is '[NA_STRING_L1]'
Value,Observed value (numeric). The no-data value is '[NA_STRING]'
research_name,Measurement name (character)
ID,Observation identifier (character)
F_OOB,Flag: out of instrumental bounds (logical; 1=TRUE)
Expand Down
4 changes: 4 additions & 0 deletions pipeline/metadata/readme_files/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# readme_files

This folder holds the README files for each data release. The L1.qmd step
will error if a version doesn't have an associated README file here.
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ Data are organized into {SITE}_{YEAR} folders, with comma-separated
value (CSV) files in each folder for each plot and output variable at
that site.

The data file naming convention is
The data file naming convention is
{SITE}_{PLOT}_{DATE RANGE}_{OUTPUT VARIABLE}_L1_{VERSION}.csv

Sites include CRC (Crane Creek), GCW (GCReW), GWI (Goodwin
Expand Down Expand Up @@ -64,12 +64,13 @@ CHANGELOG
Version 2-0 released [DATESTAMP]
* Covers late 2019 through June 2025 for TEMPEST and all synoptic sites
* Data files are now annual and single-variable, rather than monthly and multi-variable
* Data plots now include out-of-bounds indicators and informative axis labels
* Back-corrected two years of corrupted AQ600 files at TEMPEST; thanks to SJW
* Minor data fixes: CD8 sapflux sensor, wx_par_tot15 calculation, MSM Buoy time zone, sapflux sensor depth, ClimaVue VP units
* Minor data fixes: CD8 sapflux sensor, wx_par_tot15 calculation, MSM Buoy time zone, sapflux sensor depth, ClimaVue VP units
* New code examples, documentation improvements, and more
* Many backend improvements; see https://github.com/COMPASS-DOE/sensor-data-pipeline/issues/244

Version 1-2 released [DATESTAMP]
Version 1-2 released 2025-02-14
* Covers late 2019 through December 2024 for TEMPEST and all synoptic sites
* All sonde (EXO) data now appear in their own "OW" (open water) plot
* The TEMPEST (TMP) folder README files now include detailed information on flood timings, volumes, etc.
Expand Down