diff --git a/README.md b/README.md index 4604f6f..bf32ff9 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,8 @@ This repository contains the source code for an EESSI status page generator. The Create a configuration file (e.g., config.json). See [config.json](config.json) for an example. The only optional key is `backend_type` for servers. It defaults to `AutoDetect` if missing. See the section on server backend types for more information. +Note that `limit_scraping_to_repositories` controls how the scraper determines which repositories to scrape from each server. If set to `true`, only the repositories explicitly listed as `repositories` in the configuration will be scraped (and `ignored_repositories` will have no effect). If set to `false`, the scraper will also consider repositories detected from the server itself (if applicable), filtered by `ignored_repositories`. The default is `false`. + ## Usage Run the binary with the desired options: @@ -136,8 +138,17 @@ In this example, as the rules are applied in order, the engine will check, in or ## Prometheus Metrics Prometheus metrics can be enabled with the `--prometheus-metrics` option. The metrics are exposed as the file `metrics` in the -output directory and are generated with the timestamp being the start of the application. A typical metrics file will look -like this: +output directory and are generated with the timestamp being the start of the application. 
+ +The status codes used in the metrics are as follows: + +- `0`: OK +- `1`: Degraded +- `2`: Warning +- `3`: Failed +- `9`: Maintenance + +A typical metrics file might look like this: ```prometheus # HELP eessi_status EESSI status @@ -155,12 +166,80 @@ syncservers_status 0 1720525887957 # HELP repositories_status Repositories status # TYPE repositories_status gauge repositories_status 0 1720525887957 -``` - -The status codes are: +# HELP status_overview Status overview +# TYPE status_overview gauge +status_overview{category="overall"} 0 1761206997670 +status_overview{category="stratum0"} 0 1761206997670 +status_overview{category="stratum1"} 0 1761206997670 +status_overview{category="syncservers"} 0 1761206997670 +status_overview{category="repositories"} 0 1761206997670 +# HELP repo_catalogue_size Repository catalogue size +# TYPE repo_catalogue_size gauge +repo_catalogue_size{type="stratum0",server="rug-nl-s0.eessi.science",repository="dev.eessi.io"} 9526272 1761206997670 +repo_catalogue_size{type="stratum0",server="rug-nl-s0.eessi.science",repository="riscv.eessi.io"} 26624 1761206997670 +repo_catalogue_size{type="stratum0",server="rug-nl-s0.eessi.science",repository="software.eessi.io"} 133120 1761206997670 +repo_catalogue_size{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="dev.eessi.io"} 9526272 1761206997670 +repo_catalogue_size{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="riscv.eessi.io"} 26624 1761206997670 +repo_catalogue_size{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="software.eessi.io"} 133120 1761206997670 +repo_catalogue_size{type="stratum1",server="azure-us-east-s1.eessi.science",repository="dev.eessi.io"} 9526272 1761206997670 +repo_catalogue_size{type="stratum1",server="azure-us-east-s1.eessi.science",repository="riscv.eessi.io"} 26624 1761206997670 +repo_catalogue_size{type="stratum1",server="azure-us-east-s1.eessi.science",repository="software.eessi.io"} 133120 1761206997670 
+repo_catalogue_size{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="dev.eessi.io"} 9526272 1761206997670 +repo_catalogue_size{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="riscv.eessi.io"} 26624 1761206997670 +repo_catalogue_size{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="software.eessi.io"} 133120 1761206997670 +repo_catalogue_size{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="dev.eessi.io"} 9526272 1761206997670 +repo_catalogue_size{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="riscv.eessi.io"} 26624 1761206997670 +repo_catalogue_size{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="software.eessi.io"} 133120 1761206997670 +# HELP repo_revision Repository revision +# TYPE repo_revision gauge +repo_revision{type="stratum0",server="rug-nl-s0.eessi.science",repository="dev.eessi.io"} 415 1761206997670 +repo_revision{type="stratum0",server="rug-nl-s0.eessi.science",repository="riscv.eessi.io"} 522 1761206997670 +repo_revision{type="stratum0",server="rug-nl-s0.eessi.science",repository="software.eessi.io"} 9744 1761206997670 +repo_revision{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="dev.eessi.io"} 415 1761206997670 +repo_revision{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="riscv.eessi.io"} 522 1761206997670 +repo_revision{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="software.eessi.io"} 9744 1761206997670 +repo_revision{type="stratum1",server="azure-us-east-s1.eessi.science",repository="dev.eessi.io"} 415 1761206997670 +repo_revision{type="stratum1",server="azure-us-east-s1.eessi.science",repository="riscv.eessi.io"} 522 1761206997670 +repo_revision{type="stratum1",server="azure-us-east-s1.eessi.science",repository="software.eessi.io"} 9744 1761206997670 +repo_revision{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="dev.eessi.io"} 
415 1761206997670 +repo_revision{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="riscv.eessi.io"} 522 1761206997670 +repo_revision{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="software.eessi.io"} 9744 1761206997670 +repo_revision{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="dev.eessi.io"} 415 1761206997670 +repo_revision{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="riscv.eessi.io"} 522 1761206997670 +repo_revision{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="software.eessi.io"} 9744 1761206997670 +# HELP repo_timestamp Repository timestamp +# TYPE repo_timestamp gauge +repo_timestamp{type="stratum0",server="rug-nl-s0.eessi.science",repository="dev.eessi.io"} 1760706941 1761206997670 +repo_timestamp{type="stratum0",server="rug-nl-s0.eessi.science",repository="riscv.eessi.io"} 1750670430 1761206997670 +repo_timestamp{type="stratum0",server="rug-nl-s0.eessi.science",repository="software.eessi.io"} 1761150935 1761206997670 +repo_timestamp{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="dev.eessi.io"} 1760706941 1761206997670 +repo_timestamp{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="riscv.eessi.io"} 1750670430 1761206997670 +repo_timestamp{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="software.eessi.io"} 1761150935 1761206997670 +repo_timestamp{type="stratum1",server="azure-us-east-s1.eessi.science",repository="dev.eessi.io"} 1760706941 1761206997670 +repo_timestamp{type="stratum1",server="azure-us-east-s1.eessi.science",repository="riscv.eessi.io"} 1750670430 1761206997670 +repo_timestamp{type="stratum1",server="azure-us-east-s1.eessi.science",repository="software.eessi.io"} 1761150935 1761206997670 +repo_timestamp{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="dev.eessi.io"} 1760706941 1761206997670 
+repo_timestamp{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="riscv.eessi.io"} 1750670430 1761206997670 +repo_timestamp{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="software.eessi.io"} 1761150935 1761206997670 +repo_timestamp{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="dev.eessi.io"} 1760706941 1761206997670 +repo_timestamp{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="riscv.eessi.io"} 1750670430 1761206997670 +repo_timestamp{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="software.eessi.io"} 1761150935 1761206997670 +# HELP repo_ttl Repository TTL +# TYPE repo_ttl gauge +repo_ttl{type="stratum0",server="rug-nl-s0.eessi.science",repository="dev.eessi.io"} 240 1761206997670 +repo_ttl{type="stratum0",server="rug-nl-s0.eessi.science",repository="riscv.eessi.io"} 240 1761206997670 +repo_ttl{type="stratum0",server="rug-nl-s0.eessi.science",repository="software.eessi.io"} 240 1761206997670 +repo_ttl{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="dev.eessi.io"} 240 1761206997670 +repo_ttl{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="riscv.eessi.io"} 240 1761206997670 +repo_ttl{type="stratum1",server="aws-eu-central-s1.eessi.science",repository="software.eessi.io"} 240 1761206997670 +repo_ttl{type="stratum1",server="azure-us-east-s1.eessi.science",repository="dev.eessi.io"} 240 1761206997670 +repo_ttl{type="stratum1",server="azure-us-east-s1.eessi.science",repository="riscv.eessi.io"} 240 1761206997670 +repo_ttl{type="stratum1",server="azure-us-east-s1.eessi.science",repository="software.eessi.io"} 240 1761206997670 +repo_ttl{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="dev.eessi.io"} 240 1761206997670 +repo_ttl{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="riscv.eessi.io"} 240 1761206997670 
+repo_ttl{type="stratum1",server="cvmfs-ext.gridpp.rl.ac.uk:8000",repository="software.eessi.io"} 240 1761206997670 +repo_ttl{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="dev.eessi.io"} 240 1761206997670 +repo_ttl{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="riscv.eessi.io"} 240 1761206997670 +repo_ttl{type="syncserver",server="aws-eu-west-s1-sync.eessi.science",repository="software.eessi.io"} 240 1761206997670 -- `0`: OK -- `1`: Degraded -- `2`: Warning -- `3`: Failed -- `9`: Maintenance +``` diff --git a/config.json b/config.json index 3cd7b06..11c371b 100644 --- a/config.json +++ b/config.json @@ -32,6 +32,7 @@ "dev.eessi.io", "riscv.eessi.io" ], + "limit_scraping_to_repositories": false, "ignored_repositories": [ "test.eessi.io" ], diff --git a/src/main.rs b/src/main.rs index cd0c8e0..963e0e5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,12 +7,14 @@ use std::path::{Path, PathBuf}; mod config; mod dependencies; mod models; +mod prometheus; mod templating; use config::{get_config_manager, init_config}; use cvmfs_server_scraper::{Scraper, ScraperCommon, ServerType}; use dependencies::{atomic_write, populate}; -use models::{EESSIStatus, Status, StatusManager, StatusPageData, StratumStatus}; +use models::{EESSIStatus, Status, StatusManager, StatusPageData, StratumStatus, ToEESSILabel}; +use prometheus::MetricsBuilder; use templating::{render_template_to_file, RepoStatus, StatusInfo}; #[derive(Parser, Debug)] @@ -91,7 +93,7 @@ async fn main() -> Result<()> { render_output(&args, &status_page_data)?; if args.prometheus_metrics { - generate_prometheus_metrics(&args, &status_page_data, &run_start_time)?; + generate_prometheus_metrics(&args, &status_page_data, &status_manager, &run_start_time)?; } Ok(()) @@ -180,33 +182,132 @@ fn generate_status_page_data( fn generate_prometheus_metrics( args: &Opt, status_page_data: &StatusPageData, + status_manager: &StatusManager, timestamp: &DateTime, ) -> Result<()> { use 
crate::models::StatusLevel; + let filename = args.destination.join("metrics"); trace!("Generating Prometheus metrics file: {:?}", filename); - let ms_since_epoch = timestamp.timestamp_millis(); - - let metrics = format!( - "# HELP eessi_status EESSI status\n# TYPE eessi_status gauge\n\ - eessi_status {} {ms_since_epoch}\n\ - # HELP stratum0_status Stratum0 status\n# TYPE stratum0_status gauge\n\ - stratum0_status {} {ms_since_epoch}\n\ - # HELP stratum1_status Stratum1 status\n# TYPE stratum1_status gauge\n\ - stratum1_status {} {ms_since_epoch}\n\ - # HELP syncservers_status SyncServers status\n# TYPE syncservers_status gauge\n\ - syncservers_status {} {ms_since_epoch}\n\ - # HELP repositories_status Repositories status\n# TYPE repositories_status gauge\n\ - repositories_status {} {ms_since_epoch}\n", - status_page_data.eessi_status.level(), - status_page_data.stratum0.level(), - status_page_data.stratum1.level(), - status_page_data.syncservers.level(), - status_page_data.repositories_status.level() + let ts = timestamp.timestamp_millis(); + + let mut b = MetricsBuilder::new(); + b.add_gauge( + "eessi_status", + "EESSI status", + status_page_data.eessi_status.level() as f64, + &[], + Some(ts), + ) + .add_gauge( + "stratum0_status", + "Stratum0 status", + status_page_data.stratum0.level() as f64, + &[], + Some(ts), + ) + .add_gauge( + "stratum1_status", + "Stratum1 status", + status_page_data.stratum1.level() as f64, + &[], + Some(ts), + ) + .add_gauge( + "syncservers_status", + "SyncServers status", + status_page_data.syncservers.level() as f64, + &[], + Some(ts), + ) + .add_gauge( + "repositories_status", + "Repositories status", + status_page_data.repositories_status.level() as f64, + &[], + Some(ts), ); - atomic_write(&filename, metrics.as_bytes())?; + let maps = vec![ + ("overall", status_page_data.eessi_status.level() as f64), + ("stratum0", status_page_data.stratum0.level() as f64), + ("stratum1", status_page_data.stratum1.level() as f64), + ("syncservers", 
status_page_data.syncservers.level() as f64), + ( + "repositories", + status_page_data.repositories_status.level() as f64, + ), + ]; + + for (category, level) in maps { + b.add_gauge( + "status_overview", + "Status overview", + level, + &[("category", category)], + Some(ts), + ); + } + + for server in status_manager.get_all_servers() { + let ts_ms = Some(ts); + + for repo in server.repositories.iter() { + let repo_labels: [(&str, &str); 3] = [ + ("type", server.server_type.to_label()), + ("server", server.hostname.to_str()), + ("repository", repo.name.as_str()), + ]; + + // The fields are: + // - c: Cryptographic hash of the repository’s current root catalog + // - b: Size of the root file catalog in bytes + // - a: true if the catalog should be fetched under its alternative name + // - r: MD5 hash of the repository’s current root path (usually always d41d8cd98f00b204e9800998ecf8427e) + // - x: Cryptographic hash of the signing certificate + // - g: true if the repository is garbage-collectable + // - h: Cryptographic hash of the repository’s named tag history database + // - t: Unix timestamp of this particular revision + // - d: Time To Live (TTL) of the root catalog + // - s: Revision number of this published revision + // - n: The full name of the manifested repository + // - m: Cryptographic hash of the repository JSON metadata + // - y: Cryptographic hash of the reflog checksum + // - l: currently unused (reserved for micro catalogs) + b.add_gauge( + "repo_revision", + "Repository revision", + repo.revision as f64, + &repo_labels, + ts_ms, + ) + .add_gauge( + "repo_timestamp", + "Repository timestamp", + repo.manifest.t as f64, + &repo_labels, + ts_ms, + ) + .add_gauge( + "repo_ttl", + "Repository TTL", + repo.manifest.d as f64, + &repo_labels, + ts_ms, + ) + .add_gauge( + "repo_catalogue_size", + "Repository catalogue size", + repo.manifest.b as f64, + &repo_labels, + ts_ms, + ); + } + } + + let text = b.build(); + atomic_write(&filename, text.as_bytes())?; 
info!("Prometheus metrics file written to: {:?}", filename); Ok(()) } diff --git a/src/models.rs b/src/models.rs index 217af0c..aa76bb2 100644 --- a/src/models.rs +++ b/src/models.rs @@ -8,8 +8,8 @@ use strum::IntoEnumIterator; use strum_macros::{AsRefStr, EnumIter}; use cvmfs_server_scraper::{ - Hostname, PopulatedRepositoryOrReplica, PopulatedServer, ScrapedServer, ServerBackendType, - ServerMetadata, ServerType, + Hostname, Manifest, PopulatedRepositoryOrReplica, PopulatedServer, ScrapedServer, + ServerBackendType, ServerMetadata, ServerType, }; use crate::config::{Condition, ConfigFile}; @@ -204,6 +204,7 @@ impl StatusLevel for RepoStatus {} pub struct Repositories { pub name: String, pub revision: i32, + pub manifest: Manifest, pub status: Status, /// Is the revision in sync with either the stratum0 or the stratum1s? pub status_revision: Status, @@ -232,6 +233,20 @@ impl Server { } } +pub trait ToEESSILabel { + fn to_label(&self) -> &str; +} + +impl ToEESSILabel for ServerType { + fn to_label(&self) -> &str { + match self { + ServerType::Stratum0 => "stratum0", + ServerType::Stratum1 => "stratum1", + ServerType::SyncServer => "syncserver", + } + } +} + pub struct StatusManager { pub servers: Vec, } @@ -251,6 +266,7 @@ impl StatusManager { Repositories { name: repo.name.clone(), revision: repo.revision(), + manifest: repo.manifest.clone(), status: status_revision, status_revision, } @@ -292,6 +308,10 @@ impl StatusManager { self.servers.iter().map(Server::to_server_status).collect() } + pub fn get_all_servers(&self) -> Vec<&Server> { + self.servers.iter().collect() + } + pub fn get_by_type(&self, server_type: ServerType) -> Vec<&Server> { self.servers .iter() diff --git a/src/prometheus.rs b/src/prometheus.rs new file mode 100644 index 0000000..14d8cba --- /dev/null +++ b/src/prometheus.rs @@ -0,0 +1,223 @@ +use std::collections::BTreeMap; +use std::fmt::Write as _; + +#[derive(Clone, Copy)] +#[allow(dead_code)] +pub enum MetricType { + Gauge, + Counter, + 
Summary, + Histogram, + Untyped, +} +impl MetricType { + fn as_str(self) -> &'static str { + match self { + MetricType::Gauge => "gauge", + MetricType::Counter => "counter", + MetricType::Summary => "summary", + MetricType::Histogram => "histogram", + MetricType::Untyped => "untyped", + } + } +} + +#[derive(Clone)] +pub struct Sample { + pub labels: Vec<(String, String)>, + pub value: f64, + pub timestamp_ms: Option, +} + +impl Sample { + pub fn new(value: f64) -> Self { + Self { + labels: Vec::new(), + value, + timestamp_ms: None, + } + } + + #[allow(dead_code)] + pub fn with_label(mut self, k: impl Into, v: impl Into) -> Self { + self.labels.push((k.into(), v.into())); + self + } + + #[allow(dead_code)] + pub fn with_ts(mut self, ts_ms: i64) -> Self { + self.timestamp_ms = Some(ts_ms); + self + } +} + +struct MetricDef { + help: Option, + mtype: Option, + samples: Vec, +} +impl MetricDef { + fn new() -> Self { + Self { + help: None, + mtype: None, + samples: Vec::new(), + } + } +} + +pub struct MetricsBuilder { + metrics: BTreeMap, +} +impl MetricsBuilder { + pub fn new() -> Self { + Self { + metrics: BTreeMap::new(), + } + } + + pub fn set_help(&mut self, name: &str, help: impl Into) -> &mut Self { + self.metrics + .entry(name.to_string()) + .or_insert_with(MetricDef::new) + .help = Some(help.into()); + self + } + + pub fn set_type(&mut self, name: &str, mtype: MetricType) -> &mut Self { + self.metrics + .entry(name.to_string()) + .or_insert_with(MetricDef::new) + .mtype = Some(mtype); + self + } + + pub fn add_sample(&mut self, name: &str, sample: Sample) -> &mut Self { + self.metrics + .entry(name.to_string()) + .or_insert_with(MetricDef::new) + .samples + .push(sample); + self + } + + // Convenience helpers + pub fn add_gauge( + &mut self, + name: &str, + help: &str, + value: f64, + labels: &[(&str, &str)], + ts_ms: Option, + ) -> &mut Self { + self.set_help(name, help).set_type(name, MetricType::Gauge); + let mut s = Sample::new(value); + s.labels = labels + 
.iter() + .map(|(k, v)| ((*k).to_string(), (*v).to_string())) + .collect(); + s.timestamp_ms = ts_ms; + self.add_sample(name, s) + } + + #[allow(dead_code)] + pub fn add_counter( + &mut self, + name: &str, + help: &str, + value: f64, + labels: &[(&str, &str)], + ts_ms: Option, + ) -> &mut Self { + self.set_help(name, help) + .set_type(name, MetricType::Counter); + let mut s = Sample::new(value); + s.labels = labels + .iter() + .map(|(k, v)| ((*k).to_string(), (*v).to_string())) + .collect(); + s.timestamp_ms = ts_ms; + self.add_sample(name, s) + } + + #[allow(dead_code)] + pub fn add_untyped( + &mut self, + name: &str, + help: &str, + value: f64, + labels: &[(&str, &str)], + ts_ms: Option, + ) -> &mut Self { + self.set_help(name, help) + .set_type(name, MetricType::Untyped); + let mut s = Sample::new(value); + s.labels = labels + .iter() + .map(|(k, v)| ((*k).to_string(), (*v).to_string())) + .collect(); + s.timestamp_ms = ts_ms; + self.add_sample(name, s) + } + + /// Render to Prometheus text exposition format. 
+ pub fn build(self) -> String { + let mut out = String::with_capacity(1024); + for (name, def) in self.metrics { + if let Some(help) = &def.help { + let _ = writeln!(&mut out, "# HELP {} {}", name, escape_help(help)); + } + if let Some(mt) = def.mtype { + let _ = writeln!(&mut out, "# TYPE {} {}", name, mt.as_str()); + } + for s in def.samples { + let _ = write!(&mut out, "{}", name); + if !s.labels.is_empty() { + let _ = write!(&mut out, "{{"); + for (i, (k, v)) in s.labels.iter().enumerate() { + if i > 0 { + let _ = write!(&mut out, ","); + } + let _ = write!(&mut out, "{}=\"{}\"", k, escape_label(v)); + } + let _ = write!(&mut out, "}}"); + } + let _ = write!(&mut out, " {}", format_value(s.value)); + if let Some(ts) = s.timestamp_ms { + let _ = write!(&mut out, " {}", ts); + } + let _ = writeln!(&mut out); + } + } + out + } +} + +fn escape_label(s: &str) -> String { + let mut out = String::with_capacity(s.len() + 8); + for ch in s.chars() { + match ch { + '\\' => out.push_str(r#"\\"#), + '"' => out.push_str(r#"\""#), + '\n' => out.push_str(r#"\n"#), + _ => out.push(ch), + } + } + out +} +fn escape_help(s: &str) -> String { + s.replace('\n', r"\n") +} +fn format_value(v: f64) -> String { + if v.is_nan() { + "NaN".to_string() + } else if v.is_infinite() { + if v.is_sign_positive() { + "+Inf".into() + } else { + "-Inf".into() + } + } else { + format!("{}", v) + } +}