diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml new file mode 100644 index 0000000..f543be6 --- /dev/null +++ b/.github/workflows/build.yaml @@ -0,0 +1,24 @@ +name: Build and Deploy + +on: + push: + branches: + - master + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - name: Checkout master + uses: actions/checkout@v1 + with: + submodules: true + + - name: Hugo Deploy GitHub Pages + uses: benmatselby/hugo-deploy-gh-pages@master + env: + HUGO_EXTENDED: true + TARGET_REPO: FAIRDataPipeline/FAIRDataPipeline.github.io + TOKEN: ${{ secrets.FDP_HUGO }} \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..09bc988 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/resources \ No newline at end of file diff --git a/archetypes/default.md b/archetypes/default.md new file mode 100644 index 0000000..00e77bd --- /dev/null +++ b/archetypes/default.md @@ -0,0 +1,6 @@ +--- +title: "{{ replace .Name "-" " " | title }}" +date: {{ .Date }} +draft: true +--- + diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..d2e374e --- /dev/null +++ b/config.toml @@ -0,0 +1,98 @@ +# hugo server --minify --themesDir ... --baseURL=http://0.0.0.0:1313/theme/hugo-book/ + +baseURL = 'https://FAIRDataPipeline.github.io/' +title = 'FAIRDataPipeline' +theme = 'hugo-book' +enableEmoji = true + +# Book configuration +disablePathToLower = true +enableGitInfo = true +pygmentsUseClasses = true + +# Needed for mermaid/katex shortcodes +[markup] +[markup.goldmark.renderer] + unsafe = true + +[markup.tableOfContents] + startLevel = 1 + +[menu] +# [[menu.before]] +[[menu.after]] + name = "SCRC UoG" + url = "https://www.gla.ac.uk/research/az/scrc/" + weight = 10 + +[[menu.after]] + name = "Data Registry" + url = "https://data.scrc.uk/" + weight = 20 + +[params] + # (Optional, default true) Controls table of contents visibility on right side of pages. + # Start and end levels can be controlled with markup.tableOfContents setting. + # You can also specify this parameter per page in front matter. + BookToC = true + + # (Optional, default none) Set the path to a logo for the book. If the logo is + # /static/logo.png then the path would be logo.png + # BookLogo = 'logo.png' + + # (Optional, default none) Set leaf bundle to render as side menu + # When not specified file structure and weights will be used + # BookMenuBundle = '/menu' + + # (Optional, default docs) Specify root page to render child pages as menu. + # Page is resoled by .GetPage function: https://gohugo.io/functions/getpage/ + # For backward compatibility you can set '*' to render all sections to menu. Acts same as '/' + BookSection = 'docs' + + # Set source repository location. + # Used for 'Last Modified' and 'Edit this page' links. + BookRepo = 'https://github.com/FAIRDataPipeline/FDP_hugo' + + # Enable "Edit this page" links for 'doc' page type. + # Disabled by default. Uncomment to enable. Requires 'BookRepo' param. + # Edit path must point to root directory of repo. + BookEditPath = 'tree/master/content' + + # Configure the date format used on the pages + # - In git information + # - In blog posts + BookDateFormat = 'January 2, 2006' + + # (Optional, default true) Enables search function with flexsearch, + # Index is built on fly, therefore it might slowdown your website. + # Configuration for indexing can be adjusted in i18n folder per language. 
+ BookSearch = false + + SearchEnabled = true + + # (Optional, default true) Enables comments template on pages + # By default partals/docs/comments.html includes Disqus template + # See https://gohugo.io/content-management/comments/#configure-disqus + # Can be overwritten by same param in page frontmatter + BookComments = true + + # /!\ This is an experimental feature, might be removed or changed at any time + # (Optional, experimental, default false) Enables portable links and link checks in markdown pages. + # Portable links meant to work with text editors and let you write markdown without {{< relref >}} shortcode + # Theme will print warning if page referenced in markdown does not exists. + BookPortableLinks = true + + # /!\ This is an experimental feature, might be removed or changed at any time + # (Optional, experimental, default false) Enables service worker that caches visited pages and resources for offline use. + BookServiceWorker = true + + # To change the fenced code block theme. + # 1. Put this in config.toml: + # pygmentsUseClasses = true + # 2. Run this: + # hugo gen chromastyles --style=arduino > syntax.css + # 3. Copy the css into a new stylesheet, themes/hugo-book/assets/_scriptstyle.scss + # 4. Add an import to the new stylesheet in _custom.scss: + # @import "scriptstyle"; + + BookLogo = "logo.jpg" \ No newline at end of file diff --git a/content/_index.md b/content/_index.md new file mode 100644 index 0000000..ca22c3b --- /dev/null +++ b/content/_index.md @@ -0,0 +1,24 @@ +--- +title: Introduction +type: docs +--- + +# SCRC + +## Who are we? + +The Scottish COVID-19 Response Consortium is formed of [dozens of individuals from over 30 academic and commercial organisations](https://www.gla.ac.uk/research/az/scrc/ourpeople/#members). + +Researchers in these organisations jointly responded to a call by the [Royal Society](https://royalsociety.org/topics-policy/health-and-wellbeing/ramp/) to develop more epidemiological models of COVID-19 spread - [RAPID ASSISTANCE IN MODELLING THE PANDEMIC: RAMP](https://epcced.github.io/ramp/) - in order to develop a more robust and clearer understanding of the impacts of different exit strategies from lockdown. Scientists from several other organisations across the UK and abroad have now joined the consortium to provide additional expertise in specific areas. + +## Our outputs: + +During and since the initial three months of RAMP work, our major achievements have been: + +- Seven [software epidemiological models](https://scottishcovidresponse.github.io/docs/models/) in four different programming languages and using multiple scientific approaches. These models have been assessed favourably in internal review against a [software checklist](https://github.com/ScottishCovidResponse/modelling-resources/blob/main/software-checklist.md) we developed. +- Data APIs in five languages (python, Julia, R, Java and C++) that simplify provenance recording, allowing input data to be verified as it is used, and model outputs to be traced back to the model code and input data that were used to produce them. +- A set of curated, traceable source data useful for epidemiological modelling on COVID-19. +- A database to hold metadata and index the data. +- Data processing code in `R` to populate the database. + +These are discoverable via our [GitHub organisation](https://github.com/ScottishCovidResponse). 
diff --git a/content/docs/API/R/1_generate_dp/_index.md b/content/docs/API/R/1_generate_dp/_index.md new file mode 100644 index 0000000..1a1c3ea --- /dev/null +++ b/content/docs/API/R/1_generate_dp/_index.md @@ -0,0 +1,342 @@ +--- +title: Generate a data product +weight: 1 +--- + +# How to generate a data product + +The data product itself should be produced in the correct format: + +* Point estimates, distributions, and samples should be generated as TOML files (`*.toml`) +* Tables and arrays should be generated as HDF5 files (`*.h5`/`*.hdf5`) +* The filename (of the TOML or HDF5 file) should be the version number of the data product + +## Filenames and version numbers + +* The version of a data product is identified by its filename +* The version of a raw data file is the same as that of the data product + +### When a dataset is static (downloaded only once) + +Filenames are written `major.minor.patch.extension`, *e.g.* `0.1.0.toml`, `0.1.0.h5`, `0.1.0.csv`. + +Major +: Changes only for the initial stable release (go from `0.y.z` to `1.0.0`) and when incompatible changes are made + +Minor +: Changes when new functionality is added, such as a new component; for the initial release that is *probably* stable; or for a script that definitely works better even though the output is technically the same (go from `0.0.z` to `0.y.0`) + +Patch +: Changes for small bug fixes + +### When a dataset is dynamic (downloaded daily, weekly, etc.) + +Filenames are written thus (named after the download date): + +* `1.20200716.0.csv` +* `1.20200716.0.h5` + +For example, you might want to start the disease data with `0.20200722.0`, etc. until you're really confident that it's all good and you're happy to make a `1.2020mmdd.0` release. + +If there's a bugfix to the dataset, then you'd go from `1.20200716.0.csv` to `1.20200716.1.csv`. + +However, if you have a completely new storage format, where you have different components and/or different formats of the components, then you go from `1.20200716.0.csv` to `2.20200716.0.csv`, because major version changes are not expected to be backward compatible. + +## Data product names + +The name of a data product should be the same as its path. + +In the example below, the location of a data product, +ftp://boydorr.gla.ac.uk/scrc/records/SARS-CoV-2/scotland/cases-and-management/testing/0.20200923.0.h5[^1], is generated from: + +`ftp://boydorr.gla.ac.uk/[namespace]/[data_product_name]/[version_number].h5` + +![image alt text](testing.png) + +Thus, the `data_product_name` is used to locate the file, as well as describe its contents (since the filename is its version number). + +![image alt text](front.png) + +[^1]: Note that if you try to follow this URL using Safari, your file will be renamed to `Unknown.dms`. However, we don't recommend using this method to download files, so there's nothing to worry about. + +## TOML files + +There are only three types of TOML file: + +* point-estimate +* distribution +* samples + +These are all different ways of representing the estimate for a value, which can be anything: the mean of something, the standard deviation, etc. + +### TOML components + +You could have a data product called `latent-period` with a single point estimate: + +``` toml +[latent-period] +type = "point-estimate" +value = 1.0 +``` + +In this case, the component is taken as the last part of the name (in the above example, period).
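+
+As a quick illustration (the full step-by-step workflow is given later on this page), a file with exactly this content could be written from R with `create_estimate()` from the SCRCdataAPI package. This is only a sketch, and the data product name used here is a hypothetical example:
+
+``` R
+library(SCRCdataAPI)
+
+# Write a single point-estimate component into 0.1.0.toml
+# (the data product name below is a made-up example; create_estimate()
+# creates the directory structure if it doesn't already exist)
+create_estimate(filename = "0.1.0.toml",
+                path = "human/infection/SARS-CoV-2/latent-period",
+                parameters = list(`latent-period` = 1.0))
+```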
+ +Alternatively, the data product could have several components, for instance: + +``` toml +[latent-period] +type = "distribution" +distribution = "gamma" +shape = 1.0 +scale = 1.0 + +[mean] +type = "point-estimate" +value = 1.0 + +[standard-deviation] +type = "point-estimate" +value = 1.0 +``` + +and so on. + +As far as units are concerned, there will be a unit-respecting system in that (if we get the data pipeline grant) every data product component will have to say what units it is in, and the pipeline API will do conversions and error if the units aren't compatible (or aren't provided). However, it's not there yet. The only thing we can suggest at the moment is to decide to use the component names to specify the units, so add an additional component to the TOML file such as: + +``` toml +[hours] +type = "point-estimate" +value = 24.0 +``` + +The functions `create_estimate()` and `create_distribution()` can be used to generate a TOML file. Note that these functions can't be used to edit existing files. If a file already exists at the specified location, an error will be returned. + +### Generate a TOML file from a point-estimate + +1. Load the SCRCdataAPI package into R: + + ``` R + library(SCRCdataAPI) + ``` + +2. Choose an appropriate [filename]({{% ref "/docs/API/R/1_generate_dp/_index.md#filenames-and-version-numbers" %}}): + + ``` R + filename <- "0.1.0.toml" + ``` + +3. Choose an appropriate [data product name]({{% ref "/docs/API/R/1_generate_dp/_index.md#data-product-names" %}}): + + ``` R + data_product_name <- "human/infection/SARS-CoV-2/asymptomatic-period" + ``` + +4. List a single component (see [TOML components]({{% ref "/docs/API/R/1_generate_dp/_index.md#toml-components" %}})): + + ``` R + estimate <- list(`asymptomatic-period` = 192.0) + ``` + + or list multiple components with the same data product name: + + ``` R + estimate <- list(`asymptomatic-period-1` = 192.0, + `asymptomatic-period-2` = 190.2) + ``` + + Note that `asymptomatic-period-1` needs to be enclosed by backticks because of the dash. + +5. Write the point-estimate into a TOML file: + + ``` R + create_estimate(filename = filename, + path = data_product_name, + parameters = estimate) + ``` + + Note that, `create_estimate()` will create the directory structure for you if it doesn't already exist. Your TOML file should now exist at `[data_product_name]/[filename]`. + +### Generate a TOML file from a distribution + +1. Load the SCRCdataAPI package into R: + + ``` R + library(SCRCdataAPI) + ``` + +2. Choose an appropriate [filename]({{% ref "/docs/API/R/1_generate_dp/_index.md#filenames-and-version-numbers" %}}): + + ``` R + filename <- "0.1.0.toml" + ``` + +3. Choose an appropriate [data product name]({{% ref "/docs/API/R/1_generate_dp/_index.md#data-product-names" %}}): + + ``` R + data_product_name <- "human/infection/SARS-CoV-2/latency-period" + ``` + +4. 
List a single component (see [TOML components]({{% ref "/docs/API/R/1_generate_dp/_index.md#toml-components" %}})): + + ``` R + distribution <- list(name = "latency-period", + distribution = "gamma", + parameters = list(shape = 2.0, + scale = 3.0)) + ``` + + or list multiple components with the same data product name: + + ``` R + dist1 <- list(name = "latency-period-1", + distribution = "gamma", + parameters = list(shape = 2.0, scale = 3.0)) + dist2 <- list(name = "latency-period-2", + distribution = "gamma", + parameters = list(shape = 2.2, scale = 4.0)) + distribution <- list(dist1, dist2) + ``` + + or list a single distribution and associated point-estimates with the same data product name: + + ``` R + dist <- list(name = "latency-period", + distribution = "gamma", + parameters = list(shape = 2.0, scale = 3.0)) + estimate1 <- list(mean = 1.0) + estimate2 <- list(`standard-deviation` = 1.0) + distribution <- list(dist, estimate1, estimate2) + ``` + + Note that `standard-deviation` needs to be enclosed by backticks because of the dash. + +5. Write the distribution into a TOML file: + + ``` R + create_distribution(filename = filename, + path = data_product_name, + distribution = distribution) + ``` + +## HDF5 files + +An HDF5 file can be either a table or an array. A table is always 2-dimentional and might typically be used when each column contains different classes of data (*e.g.* integers and strings). Conversely, all elements in an array should be the same class, though the array itself might be 1-dimensional, 2-dimensional, or more (*e.g.* a 3-dimensional array comprising population counts, with rows as area, columns as age, and a third dimension representing gender). + +You should create a single HDF5 file for a single dataset. Unless you have a dataset that really should have been generated as multiple datasets in the first place (*e.g.* testing data mixed with carehome data), in which case use your own judgement. + +The functions `create_array()` and `create_table()` can be used to generate an HDF5 file. + +### Generate an HDF5 file from an array + +1. Load the SCRCdataAPI package into R: + + ``` R + library(SCRCdataAPI) + ``` + +2. Choose an appropriate [filename]({{% ref "/docs/API/R/1_generate_dp/_index.md#filenames-and-version-numbers" %}}): + + ``` R + filename <- "0.1.0.h5" + ``` + +3. Choose an appropriate [data product name]({{% ref "/docs/API/R/1_generate_dp/_index.md#data-product-names" %}}): + + ``` R + data_product_name <- "some/descriptive/name" + ``` + +4. Choose an appropriate component name: + + ``` R + component_name <- "row/column-constant" + ``` + + If your dataset contains a single data topic, then the component should be named "`array`". However, if your dataset contains multiple data topics, then these can be included as separate components within a single HDF5 file. In doing so, a particular naming convention is required. + + For example, the human-mortality data product contains the following components: + + * `age_group/week/gender-country-all_deaths` + * `age_group/week/gender-country-covid_related_deaths` + * `age_group/week-persons-country-all_deaths` + + Taking the first component as an example: + + * Dimensionality is represented by `age_group/week/gender`, corresponding to the first, second, and third dimensions of the component (rows, columns, and levels) + * A description of the contents follows the dash, where `-country-all_deaths` describes all elements + * Spaces are replaced with underscores + +5. 
Source your dataset: + + ``` R + # Here we're creating a fake dataset + df <- data.frame(column_1 = 1:2, column_2 = 3:4) + rownames(df) <- c("row_1", "row_2") + ``` + + Rather than creating a fake dataset like we did here, you might want to use `download_from_url()` or `download_from_database()` to source your data. These functions are well documented with examples provided in the SCRCdataAPI package. + +6. Generate an HDF5 file: + + ``` R + # Create an h5 file from a 2-dimensional array + create_array(filename = filename, + path = data_product_name, + component = component_name, + array = as.matrix(df), + dimension_names = list(rowvalue = rownames(df), + colvalue = colnames(df))) + ``` + +### Generate an HDF5 file from a table + +1. Load the SCRCdataAPI package into R: + + ``` R + library(SCRCdataAPI) + ``` + +2. Choose an appropriate [filename]({{% ref "/docs/API/R/1_generate_dp/_index.md#filenames-and-version-numbers" %}}): + + ``` R + filename <- "0.1.0.h5" + ``` + +3. Choose an appropriate [data product name]({{% ref "/docs/API/R/1_generate_dp/_index.md#data-product-names" %}}): + + ``` R + data_product_name <- "some/descriptive/name" + ``` + +4. Choose an appropriate component name: + + ``` R + component_name <- "descriptive_component_name" + ``` + + The component naming scheme used in the previous section (generating HDF5 files from an array) doesn't make sense here. So if you have multiple data topics and therefore need multiple components in a single HDF5 file, just pick a name that's suitably descriptive. If your dataset contains a single data topic, then name the component "`table`". + +5. Source your dataset: + + ``` R + # Here we're creating a fake dataset + df <- data.frame(column_1 = 1:2, column_2 = 3:4) + rownames(df) <- c("informative_rowname_1", "informative_rowname_2") + ``` + + Rather than creating a fake dataset like we did here, you might want to use `download_from_url()` or `download_from_database()` to source your data. These functions are well documented with examples provided in the SCRCdataAPI package. + +6. Generate an HDF5 file: + + ``` R + # Create an h5 file from a table + create_table(filename = filename, + path = data_product_name, + component = component_name, + df = df, + row_names = rownames(df), + column_units = c(NA, "m^2")) + ``` + + Note that `row_names` and `column_units` are optional arguments. In this case, `row_names` is informative, but it might not always be the case. Likewise, `column_units` is shown here to demonstrate how to input the lack of units in column 1. + diff --git a/content/docs/API/R/1_generate_dp/front.png b/content/docs/API/R/1_generate_dp/front.png new file mode 100644 index 0000000..cc109aa Binary files /dev/null and b/content/docs/API/R/1_generate_dp/front.png differ diff --git a/content/docs/API/R/1_generate_dp/testing.png b/content/docs/API/R/1_generate_dp/testing.png new file mode 100644 index 0000000..b153141 Binary files /dev/null and b/content/docs/API/R/1_generate_dp/testing.png differ diff --git a/content/docs/API/R/2_upload_dp/_index.md b/content/docs/API/R/2_upload_dp/_index.md new file mode 100644 index 0000000..580bb3f --- /dev/null +++ b/content/docs/API/R/2_upload_dp/_index.md @@ -0,0 +1,92 @@ +--- +title: Upload a data product +weight: 2 +--- + +# How to upload a data product + +## Workflow + +1. 
Push metadata associated with your data product to the data registry + * Choose a submission script (below) + * Edit the script + * Push the script to GitHub + * Check to make sure the version of the script you're about to run is exactly the same as the one you've pushed to GitHub – if it isn't, push all changes now! + * Run the script + +2. Upload your raw data (*e.g.* a csv file) to the Boydorr server + * Ask Richard Reeve for access to the server + * Upload your file to the Boydorr server, at this location:
+ `/srv/ftp/[namespace]/[data_product_name]/[version_number].csv` + + In terminal (MacOS): + + ``` bash + scp mylocaldir/[data_product_name]/[version_number].csv myusername@boydorr.gla.ac.uk:/srv/ftp/[namespace]/[data_product_name]/[version_number].csv + ``` + + or at the command prompt (Windows): + ``` cmd + scp c:\path\to\local\folder\[data_product_name]\[version_number].csv myusername@boydorr.gla.ac.uk:/srv/ftp/[namespace]/[data_product]/[version_number].csv + ``` + +3. Upload your data product + * If it's an HDF5 file, upload it to the Boydorr server at this location:
+ `/srv/ftp/[namespace]/[data_product_name]/[version_number].h5` + * If it's a TOML file, push it to the [ScottishCovidResponse/DataRepository](https://github.com/ScottishCovidResponse/DataRepository) repository on GitHub, at this location:
+ `/[namespace]/[data_product_name]/[version_number].toml` + +## Submission script templates + +Submission scripts are used to upload metadata to the data registry. A number of submission script templates are available in the SCRCdata package. To access them either clone the [repository](https://github.com/ScottishCovidResponse/SCRCdata) and go to the `inst/templates` directory, or just follow the links on this page. + +### Good templates + +Use these scripts if you have a single original source (*e.g.* a website or database), a single raw data file (*e.g.* a csv file), a single processing script (to convert the raw data file into an h5/hdf5 file), and a single data product (*e.g.* an h5/hdf5 file). + +[upload_everything2][10] +: You have everything mentioned above. + +[upload_everything][9] +: You have everything mentioned above. This is an unwrapped version of `upload_everything2` with more scope for tweaking. + +[upload_distribution_from_paper][8] +: You have a distribution (raw data) taken from a paper (original source). You can generate the toml file (data product) here so you don't need a processing script. + +[upload_multiple_estimates_from_paper][7] +: You have multiple point-estimates (raw data) taken from a paper (original source). You can generate the toml file (data product) here so you don't need a processing script. + +[upload_estimate_from_paper][6] +: You have a point-estimate (raw data) taken from a paper (original source). You can generate the toml file (data product) here so you don't need a processing script. + +:exclamation: If you have a more complicated situation, *e.g.* [multiple original sources](https://raw.githubusercontent.com/ScottishCovidResponse/SCRCdata/master/inst/SCRC/scotgov_dz_lookup.R), please seek guidance. + +### Try not to use these templates... + +Use these scripts if you don't have an original source or a processing script and want to upload a data product anyway. Bad! + +[upload_array][5] +: You have an array converted to an h5 file (data product), but no original source, no raw data, and no processing script. + +[upload_table][4] +: You have a table converted to an h5 file (data product), but no original source, no raw data, and no processing script. + +[upload_multiple_parameters_in_one_toml][3] +: You have a toml file containing multiple point-estimates and/or distributions (data product), but no original source, no raw data, and no processing script. + +[upload_distribution][2] +: You have a distribution (that you made up in your head) and you want to create a toml file (data product). You have no original source, no raw data, and no processing script. + +[upload_estimate][1] +: You have a point-estimate (that you made up in your head) and you want to create a toml file (data product). You have no original source, no raw data, and no processing script. 
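+
+Whichever template you use, a quick way to check that an upload has worked end to end is to download the data product back with the SCRCdataAPI package and list its components. This is a sketch; substitute the data product name you actually registered:
+
+``` R
+library(SCRCdataAPI)
+
+# Retrieve the newly registered data product
+# (the name below is an example placeholder)
+file <- download_data_product(name = "human/infection/SARS-CoV-2/asymptomatic-period",
+                              data_dir = "check_download")
+
+# List the components recorded in the downloaded file
+get_components(file$downloaded_to)
+```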
+ +[1]: https://raw.githubusercontent.com/ScottishCovidResponse/SCRCdata/master/inst/templates/upload_estimate.R +[2]: https://raw.githubusercontent.com/ScottishCovidResponse/SCRCdata/master/inst/templates/upload_distribution.R +[3]: https://raw.githubusercontent.com/ScottishCovidResponse/SCRCdata/master/inst/templates/upload_multiple_parameters_in_one_toml.R +[4]: https://raw.githubusercontent.com/ScottishCovidResponse/SCRCdata/master/inst/templates/upload_table.R +[5]: https://raw.githubusercontent.com/ScottishCovidResponse/SCRCdata/master/inst/templates/upload_array.R +[6]: https://raw.githubusercontent.com/ScottishCovidResponse/SCRCdata/master/inst/templates/upload_estimate_from_paper.R +[7]: https://raw.githubusercontent.com/ScottishCovidResponse/SCRCdata/master/inst/templates/upload_multiple_estimates_from_paper.R +[8]: https://raw.githubusercontent.com/ScottishCovidResponse/SCRCdata/master/inst/templates/upload_distribution_from_paper.R +[9]: https://raw.githubusercontent.com/ScottishCovidResponse/SCRCdata/master/inst/templates/upload_everything.R +[10]: https://raw.githubusercontent.com/ScottishCovidResponse/SCRCdata/master/inst/templates/upload_everything2.R diff --git a/content/docs/API/R/3_download_data/_index.md b/content/docs/API/R/3_download_data/_index.md new file mode 100644 index 0000000..6e862ea --- /dev/null +++ b/content/docs/API/R/3_download_data/_index.md @@ -0,0 +1,64 @@ +--- +title: Download a data product +weight: 3 +--- + +# How to download a data product + +1. Go to the [data registry](https://data.scrc.uk/) and browse the data products. + +2. Identify the name of the data product (**Data Product Name**): +
*e.g.* records/SARS-CoV-2/scotland/cases-and-management/testing. + +3. Download the H5 (or TOML) file: + + ``` R + library(SCRCdataAPI) + file <- download_data_product(name = "records/SARS-CoV-2/scotland/cases-and-management/testing", + data_dir = "my_data") + ``` + + where `name` corresponds to the data product name and `data_dir` identifies the location in which the data will be saved. If the directory does not already exist, it will be created. + + When this code is run, an H5 (or TOML) file will be downloaded to `data_dir`, unless a file by the same name already exists at the specified location, in which case a message will notify you. The `download_data_product()` function will return a list comprising two named elements: `downloaded_to`, the absolute path of the H5 file after downloading; and `components`, the components contained within the H5 (or TOML) file. + + Note that omitting the `version` argument will result in the most recent version of the file being downloaded. If you want to specify a particular version number, however, you can do so. + +4. An H5 (or TOML) file will always contain at least one component, each containing a particular dataset. These are listed in the data registry, or can be listed in R using: + + ``` R + get_components(file$downloaded_to) + ``` + +5. Now, pick one of these components and read it into R using either `read_array()`, `read_table()`, `read_estimate()`, or `read_distribution()`: + + ``` R + tmp <- read_array(filepath = file$downloaded_to, + component = file$components[4]) + head(tmp) + ``` + +## How to download raw data (an external object) + +You shouldn't need to download the raw data file, since the data product should already be available, but for the sake of completeness... + +An external object is an original source file. This might be a reference to a paper or a raw, unedited csv file. Papers are not stored by the SCRC. Raw data objects are stored, and can be downloaded in the following way. + +1. Go to the [data registry](https://data.scrc.uk/) and browse the external objects. + +2. Identify the name of the external object (**External Object DOI or Unique Name**): +
*e.g.* scottish coronavirus-covid-19-management-information. + +3. Download the data file (*e.g.* csv file): + + ``` R + library(SCRCdataAPI) + file <- download_external_object(name = "scottish coronavirus-covid-19-management-information", + data_dir = "my_data") + ``` + + where `name` corresponds to the external object name and `data_dir` identifies the location in which the data will be saved. If the directory does not already exist, it will be created. + + When this code is run, the data file will be downloaded, unless a file by the same name already exists at the specified location, in which case a message will notify you. The `download_external_object()` function will return a list comprising two named elements, `downloaded_to` (the absolute path of the file after downloading) and `components` (the components contained within the file). + + Note that omitting the `version` argument will result in the most recent version of the file being downloaded. If you want to specify a particular version number, however, you can do so. diff --git a/content/docs/API/R/4_helpers/_index.md b/content/docs/API/R/4_helpers/_index.md new file mode 100644 index 0000000..1137edd --- /dev/null +++ b/content/docs/API/R/4_helpers/_index.md @@ -0,0 +1,137 @@ +--- +title: Helpers +weight: 4 +--- + +# Helper functions + +These helpers are mostly for internal use. + +Detailed documentation and examples can be found [here](https://scottishcovidresponse.github.io/SCRCdataAPI/reference/index.html). + +## Pipeline helpers + +`attach_issue()` +: Attach an issue to an object in the data registry + +`check_exists()` +: Check if entry exists in the data registry + +`create_version_number()` +: Create version number + +`get_entry()` +: Return all fields associated with a table entry in the data registry + +`get_existing()` +: Return all entries posted to a table in the data registry + +`get_file_hash()` +: Calculate hash from file + +`get_github_hash()` +: Get current GitHub hash + +`get_package_info()` +: Get GitHub package info + +`get_url()` +: Get URL + +`get_version_numbers()` +: Get data product version numbers + +`increment_version()` +: Increment version number + +`paper_exists()` +: Check whether paper exists + +`upload_toml_to_github()` +: Upload TOML file to GitHub + +## Post new entry to table + +The following functions can be used to post an entry to a table in the data registry. + +The database schema can be viewed [here](https://data.scrc.uk/static/images/schema.svg). + +`new_author()` +: Post entry to author table + +`new_code_repo_release()` +: Post entry to code_repo_release table + +`new_coderun()` +: Post entry to code_run table + +`new_data_product()` +: Post entry to data_product table + +`new_external_object()` +: Post entry to external_object table + +`new_issue()` +: Post entry to issue table + +`new_keyword()` +: Post entry to keyword table + +`new_namespace()` +: Post entry to namespace table + +`new_object()` +: Post entry to object table + +`new_object_component()` +: Post entry to object_component table + +`new_source()` +: Post entry to source table + +`new_storage_location()` +: Post entry to storage_location table + +`new_storage_root()` +: Post entry to storage_root table + +`new_text_file()` +: Post entry to text_file table + +## Wrapper functions + +The following functions combine the `new_*()` functions to post entries across multiple tables in the data registry. 
+ +`upload_data_product()` +: Post data_product metadata to the data registry + +`upload_github_repo()` +: Post github_repo metadata to the data registry + +`upload_object_links()` +: Post object_links metadata to the data registry + +`upload_paper()` +: Post paper metadata to the data registry + +`upload_source_data()` +: Post source_data metadata to the data registry + +`upload_submission_script()` +: Post submission_script metadata to the data registry + +The following function combines the `upload_*()` functions to post entries across all tables in the data registry associated with a particular data product. + +`register_everything()` +: Post everything associated with a data product to the data registry + +## Dataset manipulation + +`bin_ages()` +: Bin ages + +`convert_to_grid()` +: Convert census geographies to grid based system + +`convert_to_lower()` +: Convert census geographies to lower resolution diff --git a/content/docs/API/R/_index.md b/content/docs/API/R/_index.md new file mode 100644 index 0000000..227c75b --- /dev/null +++ b/content/docs/API/R/_index.md @@ -0,0 +1,29 @@ +--- +bookCollapseSection: true +weight: 10 +--- + +# SCRCdataAPI + +The `SCRCdataAPI` package contains functions used to interface with the SCRC data pipeline (*e.g.* to upload metadata to the registry or download data products from the Boydorr server) in R. + +To install it, run: + +``` R +install.packages("devtools") +devtools::install_github("scottishcovidresponse/SCRCdataAPI") +``` + +To view the package documentation, go [here](https://scottishcovidresponse.github.io/SCRCdataAPI/index.html). + +# SCRCdata + +The `SCRCdata` package includes submission script templates that can be used to interact with the SCRC data pipeline. These can be accessed by cloning the repo and going to `inst/templates` or by browsing the GitHub [repository](https://github.com/ScottishCovidResponse/SCRCdata/tree/master/inst/templates). + +The `SCRCdata` package contains submission scripts and functions used to process specific datasets and upload their metadata to the SCRC data pipeline. If you haven't added code to this package, you likely don't need to install it. If you have added code to this package, please ensure the data registry has been updated appropriately. + +To install it (assuming `devtools` is already installed), run: + +``` R +devtools::install_github("scottishcovidresponse/SCRCdata") +``` diff --git a/content/docs/API/_index.md b/content/docs/API/_index.md new file mode 100644 index 0000000..4f66fd6 --- /dev/null +++ b/content/docs/API/_index.md @@ -0,0 +1,161 @@ +--- +weight: 6 +title: "Modelling API" +bookCollapseSection: true +--- + +# API + +The API is defined more in terms of file formats than it is in terms of data types. There are two file formats that are native to the data pipeline, and files in these formats are referred to as *data products*: TOML files, and HDF5 files. TOML files store “small” parameter data, representing individual parameters. HDF5 files are used to store structured data, encoded either as “arrays” or “tables”. Both formats are described in more detail below, alongside API functions used to interact with them. Data in any other file format are treated as binary blobs, and are referred to as *external objects*. 
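+
+To make the distinction concrete, the R implementation (the SCRCdataAPI package, documented elsewhere on this site) provides separate download functions for the two kinds of object. This sketch reuses example names from those pages:
+
+``` R
+library(SCRCdataAPI)
+
+# A data product: a TOML or HDF5 file in one of the pipeline-native formats
+testing <- download_data_product(name = "records/SARS-CoV-2/scotland/cases-and-management/testing",
+                                 data_dir = "my_data")
+
+# An external object: an arbitrary file (e.g. a raw csv) treated as a binary blob
+raw_data <- download_external_object(name = "scottish coronavirus-covid-19-management-information",
+                                     data_dir = "my_data")
+```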
+ +Different metadata is stored about each: data products record information about their internal structure and naming of their components, whereas external objects record information about their provenance (since data products are internal to the pipeline, provenance is recorded separately). A single object can be both an external object and a data product, and thus have both sets of metadata recorded. + +## Initialisation + +The API must be initialised with the model URI and git sha, which should then be set as run-level metadata. + +## Additional metadata on write + +The write functions all accept `description` and `issues` arguments. + +## TOML (parameter) files + +A parameter file contains representations of one or more parameters, each a single number, possibly with some associated uncertainty. Parameters may be represented as point-estimates, parametric distributions, and sample data. + +### File format + +Parameters are stored in toml-formatted files, with the extension “toml”, containing sections corresponding to different components. The following is an example of the internal encoding, defining three components: "`my-point-estimate`", "`my-distribution`", and "`my-samples`": + +``` +[my-point-estimate] +type = "point-estimate" +value = 0.1 + +[my-distribution] +type = "distribution" +distribution = "gamma" +shape = 1 +scale = 2 + +[my-samples] +type = "samples" +samples = [1.0, 2.0, 3.0, 4.0, 5.0] +``` + +Point estimates are used when our knowledge of the parameter is only sufficient for a single value, with no notion of uncertainty. A point estimate component must have type = "point-estimate" and a value that is either a float or an integer. + +Distributions are used when our knowledge of a parameter can be represented by a parametric distribution. A distribution component must have type = "distribution", a distribution set to a string name of the distribution, and other parameters determined by the distribution. The set of distributions required to be supported is currently undefined. + +Samples are used when our knowledge of a parameter is represented by samples, from either empirical measurements, or a posterior distribution. A samples component must have type = "samples" and a value that is a list of floats and integers. + +#### Distributions + +The supported distributions and their standardised parameter names are as follows: + +
| Distribution | Standardised parameter names |
| -------------------------- | ------------------------------------------ |
| categorical (non-standard) | bins (string array), weights (float array) |
| gamma | k (float), theta (float) |
| normal | mu (float), sigma (float) |
| uniform | a (float), b (float) |
| poisson | lambda (float) |
| exponential | lambda (float) |
| beta | alpha (float), beta (float) |
| binomial | n (int), p (float) |
| multinomial | n (int), p (float array) |
+ +### API functions + +`read_estimate(data_product, component) -> float or integer` + +If the component is represented as a point estimate, return that value. + +If the component is represented as a distribution, return the distribution mean. + +If the component is represented as samples, return the sample mean. + +`read_distribution(data_product, component) -> distribution object` + +If the component is represented as a point estimate, fail. + +If the component is represented as a distribution, return an object representing that distribution. 
+ +If the component is represented as samples, return an empirical distribution. + +`read_samples(data_product, component) -> list of floats or integers` + +If the component is represented as a point estimate, fail. + +If the component is represented as a distribution, fail. + +If the component is represented as samples, return the samples. + +`write_estimate(data_product, component, estimate, description, issues)` + +`write_distribution(data_product, component, distribution object, description, issues)` + +`write_samples(data_product, component, samples, description, issues)` + +## HDF5 files + +Note that the following is subject to change. For example, we may want to add all of the metadata as attributes. + +HDF5 files contain structured data, encoded as either an “array”, or a “table”, both of which are described in more detail below. + +### File format + +HDF5 files are stored with the extension “h5”. Internally, each component is stored in a different (possibly nested) group, where the full path defines the component name (*e.g.* “path/to/component”). Inside the group for each component is either a value named “array”, or a value named “table”. It is an error for there to be both. + +#### array format + +{component}/array +: An n-dimensional array of numerical data + +{component}/Dimension_{i}_title +: The string name of dimension {{< katex >}}i{{< /katex >}} + +{component}/Dimension_{i}_names +: String labels for dimension {{< katex >}}i{{< /katex >}} + +{component}/Dimension_{i}_values +: Values for dimension {{< katex >}}i{{< /katex >}} + +{component}/Dimension_{i}_units +: Units for dimension {{< katex >}}i{{< /katex >}} + +{component}/units +: Units for the data in array + +#### table format + +{component}/table +: A dataframe + +{component}/row_names +: String labels for the row axis + +{component}/column_units +: Units for the columns + +### API functions + +`read_array(data_product, component) -> array` + +If the component does not terminate in an array-formatted value, raise an error. + +Return an array, currently with no structural information. + +`read_table(data_product, component) -> dataframe` + +If the component does not terminate in a table-formatted value, raise an error. + +Return a dataframe, with labelled columns. + +`write_array(data_product, component, array, description, issues)` + +If the array argument is not array-formatted, raise an error. + +`write_table(data_product, component, table, description, issues)` + +If the table argument is not table-formatted, raise an error. diff --git a/content/docs/API/julia/_index.md b/content/docs/API/julia/_index.md new file mode 100644 index 0000000..b69f272 --- /dev/null +++ b/content/docs/API/julia/_index.md @@ -0,0 +1,33 @@ +--- +weight: 20 +--- + +# DataRegistryUtils.jl + +The `DataRegistryUtils.jl` package contains functions used to interface with the SCRC data pipeline (*e.g.* to upload metadata to the registry or download data products from the Boydorr server) in Julia. + +## Features +- Conveniently download Data Products from the SCRC [Data Registry](https://data.scrc.uk/). +- File hash-based version checking: new data is downloaded only when necessary. +- A SQLite layer for convenient pre-processing (typically aggregation, and the joining of disparate datasets based on common identifiers). +- Easily register model code or realisations (i.e. 'runs') with a single line of code. 
+ +## Installation + +The package is not yet registered and must be added via the package manager Pkg: + +``` julia +using Pkg +Pkg.add(url="https://github.com/ScottishCovidResponse/DataRegistryUtils.jl") +``` + +## Usage + +See the [package documentation][docs] for instructions and examples. + +## Source code + +See the package's [code repo][repo]. + +[docs]: https://scottishcovidresponse.github.io/DataRegistryUtils.jl/stable/ +[repo]: https://github.com/ScottishCovidResponse/DataRegistryUtils.jl diff --git a/content/docs/API/python/_index.md b/content/docs/API/python/_index.md new file mode 100644 index 0000000..89c3d35 --- /dev/null +++ b/content/docs/API/python/_index.md @@ -0,0 +1,8 @@ +--- +bookCollapseSection: true +weight: 30 +--- + +# Standardised data type API + +The standard API (or standardised data type API) is used to read/write arrays/tables/distributions/samples/estimates in python. diff --git a/content/docs/API/python/file_api.md b/content/docs/API/python/file_api.md new file mode 100644 index 0000000..a585a52 --- /dev/null +++ b/content/docs/API/python/file_api.md @@ -0,0 +1,177 @@ +--- +weight: 2 +--- + +# File API + +## The file API manages file access, provenance, and metadata + +The API is accessed as a "session". All reads and writes are recorded and logged into a file when the session closes. Files are identified by their metadata, though the metadata is handled differently for reads (where the files are expected to exist) and writes (where they typically do not), described in more detail below. + +The file API behaviour is entirely determined by a yaml configuration file (referred to here as a “config.yaml” file, and described below) provided at initialisation. This configuration file defines the “data directory” that the file API should interact with. That directory must contain a file called “metadata.yaml”, described below, that defines the metadata associated with the files in the data directory. The data directory and the metadata.yaml file can be automatically created by a [download script](https://github.com/ScottishCovidResponse/data_pipeline_api/tree/master/data_pipeline_api/registry) which reads the config.yaml file and downloads appropriate data and metadata. + +When a model or script is run (in the “run”), any output files are written to the data directory, and an “access.yaml” file is created that enumerates exactly which files were read and written to during the run. The access.yaml file contains sufficient information to upload all of the data and metadata from the run to the data store and data registry respectively. This can be carried out automatically using an [upload script](https://github.com/ScottishCovidResponse/data_pipeline_api/tree/master/data_pipeline_api/registry) if desired. Note that the access.yaml file may not be written until the connection to the API is closed (this is certainly true for the python implementation). When the file API is initialised a “run_id” is created to uniquely identify that invocation. It is constructed by forming the SHA1 hash of the configuration file content, plus the date time string. + +For normal modelling runs, the only interaction with the File API happens through setting the config.yaml file (and running the download and upload scripts), but the rest of the information (formats of the metadata.yaml and access.yaml file, and the low-level File API calls themselves are provided here for completeness). 
+ +## config.yaml file format + +The config file lets users specify metadata to be used during file lookup, and configure overall file API behaviour. A simple example: + +``` +data_directory: . +access_log: access-{run_id}.yaml +fail_on_hash_mismatch: True +run_metadata: + description: A test model + data_registry_url: https://data.scrc.uk/api/ + default_input_namespace: SCRC + default_output_namespace: model_test + submission_script: model.py + remote_uri: ssh://boydorr.gla.ac.uk/srv/ftp/scrc/ + remote_uri_override: ftp://boydorr.gla.ac.uk/scrc/ + +read: +- where: + data_product: human/commutes + use: + version: 1.0 +- where: + data_product: human/population + use: + filename: my-human-population.csv + +write: +- where: + data_product: human/outbreak-timeseries + Component: + use: + namespace: simple_network_sim + data_product: human/outbreak-timeseries +``` + +`data_directory` specifies the file system root used for data access (default “.”). It may be relative; in which case it is relative to the directory containing the config file. The data directory must contain a `metadata.yaml` file. + +`access_log` specifies the filename used to record the access log (default “access-{run_id}.yaml”). It may be relative; in which case it is relative to the directory containing the config file. It may contain the string `{run_id}`, which will be replaced with the run id. It may be set to the boolean value False to indicate that no access log should be written. + +`fail_on_hash_mismatch` will, if set to True (the default), cause the API to fail is an attempt is made to read a file whose computed hash differs from the one stored in the `metadata.yaml` file. + +`run_id` specifies the run id to be used, otherwise a hash of the config contents and the date will be used. + +`run_metadata` provides metadata for the run that will be passed through to the access log. + +The `where` sections specify metadata subsets that are matched in the read and write processes. The metadata values may use glob syntax, in which case matching is done against the glob. The corresponding `use` sections contain metadata that is used to update the call metadata before the file access is attempted. A `filename` may be specified directly, in which case it will be used without any further lookup. + +Any other attributes will be ignored. + +## metadata.yaml file format + +The metadata file contains metadata for all files in the file system “database”. A simple example: + +``` +- + data_product: human/commutes + version: 1 + extension: csv + filename: human/commutes/1.csv + verified_hash: 075abd810909918419cf7495c16f1afec6fa010c +- + data_product: human/compartment-transition + version: 1 + extension: csv + filename: human/compartment-transition/1.csv + verified_hash: 65662d0461471f36a06b32ca6d4003ca4493848f +``` + +Each section defines the metadata for a single file, including its filename, relative to the directory containing the metadata.yaml file. + +## access.yaml format + +The access file is generated whenever `close()` is called on the API. It records basic information about the run, and a log of file accesses. An example: + +``` +data_directory: . +run_id: 84b87c5f60 +open_timestamp: 2020-06-24 14:30:22.010927 +close_timestamp: 2020-06-24 14:30:22.038766 +config: + ... +run_metadata: + git_repo: https://github.com/ScottishCovidResponse/simple_network_sim + git_sha: 353697d0a04ef5d6d5a04ef9aef514cbd72a55fd + ... 
+io: +- type: read + timestamp: 2020-06-24 14:30:22.018370 + call_metadata: + data_product: human/mixing-matrix + extension: csv + access_metadata: + data_product: human/mixing-matrix + version: 1.0.0 + extension: csv + filename: human/mixing-matrix/1.csv + verified_hash: 075abd810909918419cf7495c16f1afec6fa010c + calculated_hash: 075abd810909918419cf7495c16f1afec6fa010c +- type: write + timestamp: 2020-06-24 14:30:22.038511 + call_metadata: + data_product: human/estimatec + extension: csv + access_metadata: + data_product: human/estimatec + extension: csv + source: simple_network_sim + filename: human/estimatec/84b87c5f60.csv + calculated_hash: 91a6791ab4f6d3a4616066ffcae641ca3da79567 +``` + +`data_directory` specifies the file system root used for data access, either as an absolute path, or relative to the config.yaml file used to generate the run. + +`run_id` specifies the run id of the run. + +`config` reproduces the config.yaml used to generate this run verbatim. + +`run_metadata` contains additional metadata about the file API execution taken from the config and possibly overridden using the `set_run_metadata` function. + +`open_timestamp` and `close_timestamp` record time at which the file API was initialised, and the time at which `close()` was called. + +`io` points to a list of file access sections containing a common format: + +`type` is either read or write. + +`timestamp` is the timestamp of the access. + +`call_metadata` contains the metadata provided to the `open_for_read` or `open_for_write` call. + +`access_metadata` contains the metadata used to open the file. The process for obtaining this metadata is described in the `open_for_read` process and `open_for_write` process sections below. + +## open_for_read process + +1. Search config for all read sections that are a subset of the given metadata and update the call metadata with the corresponding overrides. +2. Search the metadata file for all metadata sections that are a superset of the updated metadata; if any results, use the metadata section with the highest version. +3. Use the filename defined in the metadata; fail if there is no filename specified, or if the file is not found. +4. Calculate the hash of the file and store it in the metadata. +5. If hash verification is enabled, check that there is a verified hash in the metadata, and that it matches the calculated hash; fail otherwise. +6. Record the read. +7. Open the file for read and return the file handle. + +## open_for_write process + +1. Search config for all write sections that are a subset of the given metadata and update the call metadata with the corresponding overrides. +2. If the metadata does not contain a filename, use the metadata and the run id to construct a standard filename, and add it to the metadata. +3. Create all missing parent directories. +4. If the file already exists, open the file for update, else open the file for write (and thus create it implicitly). +5. Register a call-back to record the write on close and return the open file handle. + +The File API consists of five logical functions (in python, implementation details may vary): + + +| Function | Description | +| ----------------------------- | --------------------------------------------- | +| init(configuration_filename) | Initialise the API with a configuration file. | +| open_for_read(metadata) | Use metadata to open a file for reading. | +| open_for_write(metadata) | Use the metadata to open a file for writing. | +| close() | Write the access log to disk. May be called again. 
| +| set_run_metadata(key, value) | Associate a (key, value) metadata pair with the run. This is used by the Standard API to transmit the model uri and git_sha to the access.yaml. | + diff --git a/content/docs/API/python/standard_api.md b/content/docs/API/python/standard_api.md new file mode 100644 index 0000000..a87621c --- /dev/null +++ b/content/docs/API/python/standard_api.md @@ -0,0 +1,175 @@ +--- +weight: 1 +--- + +# Standardised data type API + +Unless otherwise specified this document uses the “current” metadata scheme defined in the "Metadata and data representation” document. You do not need to have read that document to be able to read and understand (the majority of) this one. + +The standardised data type API is defined more in terms of file formats than it is in terms of data types. There are two file formats: parameter files, and hdf5 files. Parameter files store “small” parameter data in toml format, representing individual parameters. HDF5 files are used to store structured data, encoded either as “arrays” or “tables”. Both formats are described in more detail below, alongside API functions used to interact with them. + +## Initialisation + +The standard API must be initialised with the model URI and git sha, which should then be set as run-level metadata using the file API set_metadata function. + +## Additional metadata on write + +The write functions all accept `description` and `issues` arguments, which are passed through to the file API as component-level metadata. + +## parameter files + +A parameter file contains representations of one or more parameters, each a single number, possibly with some associated uncertainty. Parameters may by represented as point-estimates, parametric distributions, and sample data. + +## Metadata + +``` +extension = "toml" +``` + +## Alternative metadata + +``` +format = "parameter" +extension = "toml" (could be inferred) +``` + +## File format + +Parameters are stored in toml-formatted files, with the extension “toml”, containing sections corresponding to different components. The following is an example of the internal encoding, defining three components: "`my-point-estimate`", "`my-distribution`", and "`my-samples`": + +``` +[my-point-estimate] +type = "point-estimate" +value = 0.1 + +[my-distribution] +type = "distribution" +distribution = "gamma" +shape = 1 +scale = 2 + +[my-samples] +type = "samples" +samples = [1.0, 2.0, 3.0, 4.0, 5.0] +``` + +Point estimates are used when our knowledge of the parameter is only sufficient for a single value, with no notion of uncertainty. A point estimate component must have type = "point-estimate" and a value that is either a float or an integer. + +Distributions are used when our knowledge of a parameter can be represented by a parametric distribution. A distribution component must have type = "distribution", a distribution set to a string name of the distribution, and other parameters determined by the distribution. The set of distributions required to be supported is currently undefined. + +Samples are used when our knowledge of a parameter is represented by samples, from either empirical measurements, or a posterior distribution. A samples component must have type = "samples" and a value that is a list of floats and integers. 
+ +## Distributions + +The supported distributions and their standardised parameter names are as follows: + +
| Distribution | Standardised parameter names |
| ----------------------------- | --------------------------------------------- |
| categorical (non-standard) | bins (string array), weights (float array) |
| gamma | k (float), theta (float) |
| normal | mu (float), sigma (float) |
| uniform | a (float), b (float) |
| poisson | lambda (float) |
| exponential | lambda (float) |
| beta | alpha (float), beta (float) |
| binomial | n (int), p (float) |
| multinomial | n (int), p (float array) |
+ +## API functions + +`read_estimate(data_product, component) -> float or integer` + +If the component is represented as a point estimate, return that value. + +If the component is represented as a distribution, return the distribution mean. + +If the component is represented as samples, return the sample mean. + +`read_distribution(data_product, component) -> distribution object` + +If the component is represented as a point estimate, fail. + +If the component is represented as a distribution, return an object representing that distribution. + +If the component is represented as samples, return an empirical distribution. + +`read_samples(data_product, component) -> list of floats or integers` + +If the component is represented as a point estimate, fail. + +If the component is represented as a distribution, fail. + +If the component is represented as samples, return the samples. + +`write_estimate(data_product, component, estimate, description, issues)` + +`write_distribution(data_product, component, distribution object, description, issues)` + +`write_samples(data_product, component, samples, description, issues)` + +## HDF5 files + +HDF5 files contain structured data, encoded as either an “array”, or a “table”, both of which are described in more detail below. + +## Metadata + +extension = "h5" + +## Alternative metadata + +format = "hdf5" extension = "h5" (could be inferred) + +## File format + +HDF5 files are stored with the extension “h5”. Internally, each component is stored in a different (possibly nested) group, where the full path defines the component name (*e.g.* “path/to/component”). Inside the group for each component is either a value named “array”, or a value named “table”. It is an error for there to be both. + +## array format + +{component}/array +: An n-dimensional array of numerical data + +{component}/Dimension_{i}_title +: The string name of dimension {{< katex >}}i{{< /katex >}} + +{component}/Dimension_{i}_names +: String labels for dimension {{< katex >}}i{{< /katex >}} + +{component}/Dimension_{i}_values +: Values for dimension {{< katex >}}i{{< /katex >}} + +{component}/Dimension_{i}_units +: Units for dimension {{< katex >}}i{{< /katex >}} + +{component}/units +: Units for the data in array + +## table format + +{component}/table +: A dataframe + +{component}/row_names +: String labels for the row axis + +{component}/column_units +: Units for the columns + +## API functions + +`read_array(data_product, component) -> array` + +If the component does not terminate in an array-formatted value, raise an error. + +Return an array, currently with no structural information. + +`read_table(data_product, component) -> dataframe` + +If the component does not terminate in a table-formatted value, raise an error. + +Return a dataframe, with labelled columns. 
+ +`write_array(data_product, component, array, description, issues)` + +`write_tab` diff --git a/content/docs/API/python/terminology.md b/content/docs/API/python/terminology.md new file mode 100644 index 0000000..03291a2 --- /dev/null +++ b/content/docs/API/python/terminology.md @@ -0,0 +1,53 @@ +--- +title: Terminology +weight: 3 +--- + +# Terminology used in this document + +datum +: A specific value, encoded in a particular way, that travels through the data pipeline. + +config.yaml +: A file (potentially with a different name) used by the data pipeline to allow users to override default pipeline behaviour. See the File API specification for more details. + +metadata.yaml +: A file used by the data pipeline to describe available data files, listing their associated metadata. See the File API specification for more details. + +access.yaml +: A file (potentially with a different name) generated by the data pipeline API to record file access. See the File API specification for more details. + +## Metadata + +data_product +: Identifies which kind of quantity a datum represents (e.g. “human/mixing-matrix”). Path-formatted to permit structure in the filename scheme (defined below). The desired data_product is typically specified in model code, and it is a core part of the data identifiers used in config.yaml, metadata.yaml, and access.yaml. + +version +: A semver identifying a version of a data_product (the file API will select the most recent version if this is not specified). + +component +: Identifies a part of a data_product. + +filename +: Specifies the path to a file, typically relative to the data root. Only required on read, and typically inferred from metadata.yaml. + +extension +: Specifies the extension of a file. Required on write to generate a standard filename. Typically provided by a datatype API. + +run_id +: Specifies a unique identifier for a model run. Required on write to generate a standard filename, typically generated by the file API. + +verified_hash +: Specifies a "verified good” SHA1 hash for a file. Used by the file API to verify file contents. Typically defined in metadata.yaml. + +calculated_hash +: Specifies the SHA1 hash computed by the file API for a file. Typically only defined in access.yaml. + +max_warning +: Specifies the maximum known warning level for a particular datum. Could be used by the file API to filter “bad” data (currently not supported). Typically defined in metadata.yaml. + +## Filenames + +{data root}/{data_product}.../{run_id}.{extension} + +*e.g.* `{data root}/human/mixing-matrix/12345.h5` diff --git a/content/docs/edit-site/_index.md b/content/docs/edit-site/_index.md new file mode 100644 index 0000000..ffc8034 --- /dev/null +++ b/content/docs/edit-site/_index.md @@ -0,0 +1,8 @@ +--- +title: "Edit this site" +--- +# How to edit this site + +If you want to edit or add to any of the pages here, please click on the "Edit this page" link at the bottom of the page you want to edit. This will take you to a GitHub repository, which stores the source files for that particular page. From there you can click on the tiny pencil (on hover: Edit this file), which will allow you to make any edits you want. Please remember that any tables or fancy formatting will have to be written in markdown / html to be visible. Once you're finished, just click the link at the bottom of the page to create a new branch and start a pull request. + +If you want to add new pages to the website remember that the headings are weighted (ordered) in the _index file. 
Sub-headings are ordered within those directories, and I've given them a different weight increment for clarity. It's best to click through everything when you're done, to make sure pages don't rearrange themselves unexpectedly.

diff --git a/content/docs/hidden.md b/content/docs/hidden.md new file mode 100644 index 0000000..fb7c953 --- /dev/null +++ b/content/docs/hidden.md @@ -0,0 +1,11 @@
---
bookHidden: true
---

# This page is hidden in menu

# Quondam non pater est dignior ille Eurotas

Reference another [page]({{% ref "/docs/API/R/1_generate_dp" %}})

Reference same [page]({{% ref "/docs/API/R/1_generate_dp/_index.md#toml-components" %}})
\ No newline at end of file
diff --git a/content/docs/interface/_index.md b/content/docs/interface/_index.md new file mode 100644 index 0000000..b494d72 --- /dev/null +++ b/content/docs/interface/_index.md @@ -0,0 +1,19 @@
---
weight: 2
title: "API Interface"
bookCollapseSection: true
---

# Data pipeline

The data pipeline is moving towards this new structure:

![SCRC Data API design figure](scrc-api-new.svg)

The API is accessed as a *session*, which corresponds in the registry to a *code run*. All reads and writes are logged directly to a local registry as the session progresses, as is the script which generates that I/O. Files are identified by their metadata, though the metadata is different for reads (where the files must exist) and writes (where they must not). The metadata is also different for *files in core pipeline data formats*, where a significant amount of metadata is recorded, and *other arbitrary files*, where only a limited amount of data can be collected. Either type of file can be used for input or output, giving a total of four different interactions: two for input and two for output. These differences are described in more detail below.

The underlying data to which the API refers is determined by the interaction between a yaml configuration file (referred to here as a *config.yaml* file, and described below) provided at initialisation and the state of the remote registry at the time the *config.yaml* is processed by the download synchronisation script (the remote registry is considered to contain the definitive version of all data at that time). The specific remote registry used is itself defined in the *config.yaml* file.

This interaction between the configuration file and the remote registry defines the “*local filesystem data repository*” that the local pipeline interacts with. The data directory can be created automatically by a *download synchronisation script* [(currently found here)](https://github.com/ScottishCovidResponse/data_pipeline_api/tree/master/data_pipeline_api/registry), which reads the *config.yaml* file, queries the appropriate remote registry, downloads the appropriate data, and populates the local registry with the relevant metadata for those data.

When a model or script is run (as a *session* / “*code run*”), any output files are written to the data directory, and those outputs are logged in the local registry, which has itself been created (or updated) by the *download synchronisation script*. The local registry can be queried to determine whether the data generated is as intended, and if so it can then be synchronised back to the remote registry. This can be carried out automatically using an *upload synchronisation script* [(currently here)](https://github.com/ScottishCovidResponse/data_pipeline_api/tree/master/data_pipeline_api/registry).
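To give a feel for what a *session* looks like from model code, here is a minimal sketch using the Python Standard API shown in the working examples later in this section; the data product and component names are hypothetical, and the calls are illustrative rather than definitive.

```python
from data_pipeline_api.standard_api import StandardAPI

# One session corresponds to one "code run": every read and write performed
# inside the block is logged against that run in the local registry.
with StandardAPI.from_config("config.yaml") as api:
    # Read a component of a core-format data product (hypothetical names).
    population = api.read_array("human/population", "population")

    # Write a derived output; the access log links it back to the inputs
    # and to the script that produced it.
    api.write_array("human/outbreak-timeseries", "timeseries", population)
```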
When the *session* is initialised, a “*run id*” is created to uniquely identify that *code run*. It is constructed by forming the SHA1 hash of the configuration file content plus the date-time string.

diff --git a/content/docs/interface/config/_index.md b/content/docs/interface/config/_index.md new file mode 100644 index 0000000..f2bc59d --- /dev/null +++ b/content/docs/interface/config/_index.md @@ -0,0 +1,182 @@
---
weight: 10
title: "User written config file"
---

Note that this is a living document and the following is subject to change. Fields may be missing or named incorrectly!

# User written *config.yaml* file

The Data Pipeline API hinges on a *config.yaml* file, which lets users specify metadata to be used during file lookup for read or write, and configure overall API behaviour.

## Simple inputs and outputs

The following example reads various pieces of data and writes an external object.

```yaml
fail_on_hash_mismatch: True
run_metadata:
  description: A simple analysis
  local_data_registry_url: https://localhost:8000/api/
  remote_data_registry_url: https://data.scrc.uk/api/
  default_input_namespace: SCRC
  default_output_namespace: johnsmith
  default_data_store: /datastore/
  always_copy_to_store: False
  local_repo: /Users/johnsmith/git/myproject/
  # `script:` points to the submission script (relative to local_repo)
  script: python path/submission_script.py {CONFIG_PATH}
  # `script_path:` can be used instead of `script:`

read:
# Read version 1.0 of human/commutes
- data_product: human/commutes
  version: 1.0
# Read human/health from the cache
- data_product: human/health
  cache: /local/file.h5
# Read crummy_table with specific doi and title
- external_object: crummy_table
  doi: 10.1111/ddi.12887
  title: Supplementary Table 2
# Read secret_data with specific doi and title from the cache
- external_object: secret_data
  doi: 10.1111/ddi.12887
  title: Supplementary Table 3
  cache: /local/secret.csv
# Read weird_lost_file (which perhaps has no metadata) with specific hash
- object: weird_lost_file
  hash: b5a514810b4cb6dc795848464572771f

write:
# Write beautiful_figure and increment version number
- external_object: beautiful_figure
  unique_name: My amazing figure
  version: {MINOR}
```

- `fail_on_hash_mismatch:` will, if set to True (the default), cause the API to fail if an attempt is made to read a file whose computed hash differs from the one stored in the local registry

- `run_metadata:` provides metadata for the run:
  - `description:` is a human-readable description of the purpose of the config.yaml
  - `local_data_registry_url:` specifies the local data registry root, which defaults to https://localhost:8000/api/
  - `remote_data_registry_url:` specifies the remote data registry endpoint, which defaults to https://data.scrc.uk/api/
  - `default_input_namespace:` and `default_output_namespace:` specify the default namespaces for reading and writing
  - `default_data_store:` specifies the file system root used for data writes, which defaults to /datastore (it may be relative, in which case it is relative to the directory containing the config file)
  - `always_copy_to_store:` specifies whether files that already exist in the local filesystem (files specified in `read: use: cache:`) but not in the `default_data_store` should be copied to the data store (set to `True`) or not (set to `False`, the default)
  - The submission script itself should either be written in `script` or stored in a text file in `script_path`, which
can be absolute or relative to `local_repo:` (the root of the local repository) + - Any other fields will be ignored + +- `read:` and `write:` provide references to data: + - `data_product:` (within `read:` and `write:`), `external_object:` (`read:` and `write:`) and `object:` (`read:` only) specify metadata subsets that are matched in the read and write processes. The metadata values may use glob syntax, in which case matching is done against the glob. + - For reads, a `cache:` may be specified directly, in which case it will be used without any further lookup. + - If a write is carried out to a data product where no such `data_product:` entry exists, then a new data product is created with that name in the local namespace, or the patch version of an existing data product is suitably incremented. The level of incrementation or version number can be explicitly defined by `version:`. + - If a write is carried out to an object that is not a data product and no such `external_object:` entry exists, then a new object is created with no associated external object or data product, and an issue is raised with the object to note the absence of an appropriate reference, referencing the name given in the write API call. + - `version:` can be specified explicitly (*e.g.* `0.1.0` or `0.20210414.0`), by reference (*e.g.* `0.{DATETIME}.0`, meaning `0.20210414.0`), or by increment (*i.e.* `{MAJOR}`, `{MINOR}`, or `{PATCH}`). If an object already exists and no version is specified, it will be incremented by patch, by default. + +## Extended inputs and outputs + +The following example registers a new external object and writes a data product component. + +```yaml +run_metadata: + description: Register a file in the pipeline + local_data_registry_url: https://localhost:8000/api/ + remote_data_registry_url: https://data.scrc.uk/api/ + default_input_namespace: SCRC + default_output_namespace: johnsmith + default_data_store: /datastore/ + local_repo: /Users/johnsmith/git/myproject/ + script: # Points to the Python script, below (relative to local_repo) + python path/submission_script.py {CONFIG_PATH} +# `script_path:` can be used instead of `script:` + +register: +- external_object: raw-mortality-data + # Who owns the data? + source_name: Scottish Government Open Data Repository + source_abbreviation: Scottish Government Open Data Repository + source_website: https://statistics.gov.scot/ + # Where does the data come from? + root_name: Scottish Government Open Data Repository + root: https://statistics.gov.scot/sparql.csv?query= + path: |- + PREFIX qb: + PREFIX data: + PREFIX rdfs: + PREFIX dim: + PREFIX sdim: + PREFIX stat: + PREFIX mp: + SELECT ?featurecode ?featurename ?areatypename ?date ?cause ?location ?gender ?age ?type ?count + WHERE { + ?indicator qb:dataSet data:deaths-involving-coronavirus-covid-19; + mp:count ?count; + qb:measureType ?measType; + sdim:age ?value; + sdim:causeOfDeath ?causeDeath; + sdim:locationOfDeath ?locDeath; + sdim:sex ?sex; + dim:refArea ?featurecode; + dim:refPeriod ?period. + + ?measType rdfs:label ?type. + ?value rdfs:label ?age. + ?causeDeath rdfs:label ?cause. + ?locDeath rdfs:label ?location. + ?sex rdfs:label ?gender. + ?featurecode stat:code ?areatype; + rdfs:label ?featurename. + ?areatype rdfs:label ?areatypename. + ?period rdfs:label ?date. 
+ } + # Metadata + title: Deaths involving COVID19 + description: Nice description of the dataset + unique_name: Scottish deaths involving COVID19 # or doi + product_name: records/SARS-CoV-2/scotland/human-mortality + file_type: csv + release_date: {DATETIME} + version: 0.{DATETIME}.0 + primary: True + accessibility: open # Other option is "closed" + +write: +- data_product: records/SARS-CoV-2/scotland/human-mortality + description: human mortality data + version: 0.{DATETIME}.0 +``` + +## Flexible inputs and outputs + +The following example describes an analysis which typically reads *human/population* and writes *human/outbreak-timeseries*. Instead, a test model is run using Scottish data, whereby *scotland/human/population* is read from the *eera* namespace, rather than *human/population*. Likewise, the output is written as *scotland/human/outbreak-timeseries* rather than *human/outbreak-timeseries*. + +```yaml +fail_on_hash_mismatch: True +run_metadata: + description: A test model + local_data_registry_url: https://localhost:8000/api/ + remote_data_registry_url: https://data.scrc.uk/api/ + default_input_namespace: SCRC + default_output_namespace: johnsmith + default_data_store: /datastore/ + always_copy_to_store: False + +read: +- data_product: human/population + use: + namespace: eera + data_product: scotland/human/population + +write: +- data_product: human/outbreak-timeseries + use: + data_product: scotland/human/outbreak-timeseries +- data_product: human/outbreak/simulation_run + use: + data_product: human/outbreak/simulation_run-{RUN_ID} +``` + +- `read:` and `write:` provide references to data: + - The corresponding `use:` sections contain metadata that is used to update the call metadata before the file access is attempted + - Any part of a `use:` statement may contain the string `{RUN_ID}`, which will be replaced with the run id, otherwise a hash of the config contents and the date will be used diff --git a/content/docs/interface/example1/_index.md b/content/docs/interface/example1/_index.md new file mode 100644 index 0000000..b990183 --- /dev/null +++ b/content/docs/interface/example1/_index.md @@ -0,0 +1,277 @@ +--- +weight: 30 +title: "Working example (with Data Pipeline API functionality)" +--- + +Note that this is a living document and the following is subject to change. For reference, my R code lives [here](https://github.com/ScottishCovidResponse/SCRCdataAPI/tree/implement_yaml). Please post any questions on Zulip. + +# Working example (with Data Pipeline API functionality) + +The following example downloads some data from outside the pipeline, does some processing in R (for example), and records the original file and the resultant data product into the pipeline. + +In this simple example, the user should run the following from the terminal: + +```bash +fdp pull config.yaml +fdp run config.yaml +fdp push config.yaml +``` + +These functions require a *config.yaml* file to be supplied by the user. This file should specify various metadata associated with the code run, including where external objects comes from and the aliases that will be used in the submission script, data objects to be read and written, and the submission scipt location. 
+ +## User written *config.yaml* + +```yaml +run_metadata: + description: Register a file in the pipeline + local_data_registry_url: https://localhost:8000/api/ + remote_data_registry_url: https://data.scrc.uk/api/ + default_input_namespace: SCRC + default_output_namespace: soniamitchell + default_data_store: /Users/SoniaM/datastore/ + local_repo: /Users/Soniam/Desktop/git/SCRC/SCRCdata + script: |- + R -f inst/SCRC/scotgov_management/submission_script.R {CONFIG_DIR} +register: +- external_object: management-data + source_name: Scottish Government Open Data Repository + source_abbreviation: Scottish Government Open Data Repository + source_website: https://statistics.gov.scot/ + root_name: Scottish Government Open Data Repository + root: https://statistics.gov.scot/sparql.csv?query= + path: | + PREFIX qb: + PREFIX data: + PREFIX rdfs: + PREFIX mp: + PREFIX dim: + PREFIX sdim: + PREFIX stat: + SELECT ?featurecode ?featurename ?date ?measure ?variable ?count + WHERE { + ?indicator qb:dataSet data:coronavirus-covid-19-management-information; + dim:refArea ?featurecode; + dim:refPeriod ?period; + sdim:variable ?varname; + qb:measureType ?type. + {?indicator mp:count ?count.} UNION {?indicator mp:ratio ?count.} + + ?featurecode ?featurename. + ?period rdfs:label ?date. + ?varname rdfs:label ?variable. + ?type rdfs:label ?measure. + } + title: Data associated with COVID-19 + description: The data provide past data around COVID-19 for the daily updates provided by the Scottish Government. + unique_name: COVID-19 management information + product_name: records/SARS-CoV-2/scotland/cases-and-management + file_type: csv + release_date: {DATETIME} + version: 0.{DATETIME}.0 + primary: True + accessibility: open + +write: +- data_product: records/SARS-CoV-2/scotland/cases-and-management/ambulance + description: Ambulance data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/calls + description: Calls data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/carehomes + description: Care homes data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/hospital + description: Hospital data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/mortality + description: Mortality data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/nhsworkforce + description: NHS workforce data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/schools + description: Schools data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/testing + description: Testing data + version: 0.{DATETIME}.0 +``` + +## Working *config.yaml* + +`fdp run` should create a working *config.yaml* file, which is then read by the Data Pipeline API. 
+ +```yaml +run_metadata: + description: Register a file in the pipeline + local_data_registry_url: https://localhost:8000/api/ + remote_data_registry_url: https://data.scrc.uk/api/ + default_input_namespace: SCRC + default_output_namespace: soniamitchell + default_data_store: /Users/SoniaM/datastore/ + local_repo: /Users/Soniam/Desktop/git/SCRC/SCRCdata + script: R -f inst/SCRC/scotgov_management/submission_script.R /Users/SoniaM/datastore/coderun/20210511-231444/ +read: +- external_object: management-data + doi_or_unique_name: COVID-19 management information + title: Data associated with COVID-19 + version: 0.20210414.0 +write: +- data_product: records/SARS-CoV-2/scotland/cases-and-management/ambulance + description: Ambulance data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/calls + description: Calls data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/carehomes + description: Care homes data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/hospital + description: Hospital data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/mortality + description: Mortality data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/nhsworkforce + description: NHS workforce data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/schools + description: Schools data + version: 0.{DATETIME}.0 +- data_product: records/SARS-CoV-2/scotland/cases-and-management/testing + description: Testing data + version: 0.{DATETIME}.0 +``` + +## *submission_script.R* + +A submission script should be supplied by the user, which in this case registers an external object, reads it in, and then writes it back to the pipeline as a data product component. In the above example, this script is located in */inst/SCRC/scotgov_management/submission_script.R*. 
+ +```R +library(SCRCdataAPI) + +# Open the connection to the local registry with a given config file +# You can put in a file if you really want to, but otherwise read from +# the environment directly or from a command line argument +handle <- initialise(Sys.getenv("FDP_CONFIG_DIR")) + +# Return location of file stored in the pipeline +input_path <- link_read(handle, "raw-mortality-data") + +# Process raw data and write data product +data <- read.csv(input_path) +array <- some_processing(data) +index <- write_array(array, + handle, + data_product = "human/mortality", + component = "mortality_data", + dimension_names = list(location = rownames(array), + date = colnames(array))) +issue_with_component(index, + handle, + issue = "this data is bad", + severity = 7) + +finalise(handle) +``` + +### `initialise()` + +- read the working *config.yaml* file +- return a `handle` containing: + - the working *config.yaml* file contents + - the object id for this file + - the object id for the submission script file + +### `link_read()` + +- this function returns the path of an external object in the local data store +- if the alias is already recorded in the handle, return the path +- if the alias is not recorded in the handle, find the location of the file referenced by its `alias` + - in the above example the alias is `management-data` + - note that the alias is not recorded in the data registry, rather, it's a means to reference external objects in the *config.yaml*) +- store metadata associated with the external object + +### `read_array()` + +- responsible for reading the correct data product, which at this point has been downloaded from the remote data store by `fdp pull` +- should by default read the latest version of the data, which at this point has been downloaded from the remote data store by `fdp pull` + +### `link_write()` + +- when writing external objects, we use `link_read()` and `link_write()` to read and write objects, rather than the standard API `read_xxx()` and `write_xxx()` calls. + +### `write_array()` + +- responsible for writing an array as a component to an hdf5 file +- should allocate the correct data product name to the data (*e.g.* for *human/outbreak/simulation_run-{RUN_ID}*, `{RUN_ID}` is replaced with an appropriate index) +- should by default increment the data product version by PATCH if none is specified +- if the **component** is already recorded in the handle, return the index of this handle reference invisibly +- otherwise: + - if this is the first component to be written, record the save location in the handle, conversely, if this is not the first component to be written, reference the save location from the handle + - write the component to the hdf5 file + - determine the correct version number to be associated with the new data product + - update the `handle` with the component that was written and its location in the data store + +### `issue_with_component()` + +- **this is very much a work in progress** +- find the input or output reference (in the handle) that the issue is associated with +- note that issues can also be associated with scripts, etc. 
(I've not gone near this yet) +- record issue metadata in the handle +- NOTE that in the above example, `issue_with_component()` takes an `index` that references an object recorded in the handle, alternatively it may take a `dataproduct`, `component`, and `version` as identifiers +- NOTE that `issue_with_component()` is but one of a series of functions including `inssue_with_dataproduct()`, `issue_with_externalobject()`, and `issue_with_script()`; it might make more sense for you to write a generic `raise_issue()` function depending on language constraints + +### `finalise()` + +- rename the data product as *.h5* +- record data product metadata (*e.g.* location, components, various descriptions, issues) in the data registry +- record the code run in the data registry + +## *submission_script.py* + +Alternatively, the submission script may be written in Python. + +```python +from data_pipeline_api.standard_api import StandardAPI + +with StandardAPI.from_config("config.yaml") as api: + matrix = read(api.link_read("raw-mortality-data")) + api.write_array("human/mortality", "mortality_data", matrix) + api.issue_with_component("human/mortality", "mortality_data", "this data is bad", "7") + api.finalise() +``` + +## *submission_script.jl* + +Alternatively, the submission script may be written in Julia. + +```julia +using DataPipeline + +# Open the connection to the local registry with a given config file +handle = initialise("config.yaml") + +# Return location of file stored in the pipeline +input_path = link_read(handle) + +# Process raw data and write data product +data = read_csv(input_path) +array = some_processing(data) +index = write_estimate(array, + handle, + data_product = "human/mortality", + component = "mortality_data") + +issue_with_component(index, + handle, + issue = "this data is bad", + severity = 7) + +finalise(handle) +``` + +## C++ + +## Java \ No newline at end of file diff --git a/content/docs/interface/example2/_index.md b/content/docs/interface/example2/_index.md new file mode 100644 index 0000000..0ac356b --- /dev/null +++ b/content/docs/interface/example2/_index.md @@ -0,0 +1,135 @@ +--- +weight: 40 +title: "Additional examples" +--- + +Note that this is a living document and the following is subject to change. + +# Additional examples + +## Read data product, process, and write data product (with aliases) + +### User written *config.yaml* + +```yaml +run_metadata: + description: A test model + local_data_registry_url: https://localhost:8000/api/ + remote_data_registry_url: https://data.scrc.uk/api/ + default_input_namespace: SCRC + default_output_namespace: soniamitchell + default_data_store: /Users/SoniaM/datastore/ + local_repo: /Users/Soniam/Desktop/git/SCRC/SCRCdata + script: |- + R -f inst/SCRC/scotgov_management/submission_script.R {CONFIG_DIR} + +read: +- data_product: human/population + use: + namespace: johnsmith + data_product: scotland/human/population + +write: +- data_product: human/outbreak-timeseries + use: + data_product: scotland/human/outbreak-timeseries +- data_product: human/outbreak/simulation_run + use: + data_product: human/outbreak/simulation_run-{RUN_ID} +``` + +### Working *config.yaml* + +`fdp run` should create a working *config.yaml* file, which is read by the Data Pipeline API. In this example, the working *config.yaml* file is pretty much identical to the original *config.yaml* file, only `{CONFIG_DIR}` is replaced by the directory in which the working *config.yaml* file resides. 
+ +```yaml +run_metadata: + description: A test model + local_data_registry_url: https://localhost:8000/api/ + remote_data_registry_url: https://data.scrc.uk/api/ + default_input_namespace: SCRC + default_output_namespace: soniamitchell + default_data_store: /Users/SoniaM/datastore/ + local_repo: /Users/Soniam/Desktop/git/SCRC/SCRCdata + script: |- + R -f inst/SCRC/scotgov_management/submission_script.R /Users/SoniaM/datastore/coderun/20210511-231444/ + +read: +- data_product: human/population + use: + namespace: johnsmith + data_product: scotland/human/population + +write: +- data_product: human/outbreak-timeseries + use: + data_product: scotland/human/outbreak-timeseries +- data_product: human/outbreak/simulation_run + use: + data_product: human/outbreak/simulation_run-{RUN_ID} +``` + +## Read and write an external object + +A script to read and write an external object (*i.e.* something not in a core data pipeline format) in R. First, the yaml file, that gives the `doi_or_unique_name` and `title` of the external objects being read and written, and the aliases that will be used in the submission script: + +### User written *config.yaml* + +```yaml +run_metadata: + description: A simple example using external objects + local_data_registry_url: https://localhost:8000/api/ + remote_data_registry_url: https://data.scrc.uk/api/ + default_input_namespace: SCRC + default_output_namespace: johnsmith + default_data_store: /datastore/ + local_repo: /Users/johnsmith/git/myproject/ + script: # Points to the R script, below (relative to local_repo) + R -f path/submission_script.R {CONFIG_DIR} + +read: +- external_object: time-series + unique_name: An exciting time series + title: Table 1 + +write: +- external_object: revised-time-series + unique_name: An new, revised, time series + title: Table 1 + file_type: csv + primary: True +``` + +### Working *config.yaml* + +```yaml +... +``` + +## Read then write a data product component + +Now that the pipeline is populated, one of the simplest possible use cases is just to read in a value, calculate a new value from it, and write out the new value. Again, we need to write a *config.yaml* file: + +### User written *config.yaml* + +```yaml +run_metadata: + description: A simple example reading and writing data products + default_input_namespace: SCRC + local_repo: /Users/johnsmith/git/myproject + script: | # addresses are relative to local_repo + julia -f path/submission_script.jl {CONFIG_DIR} + +read: +- data_product: human/infection/SARS-CoV-2 + +write: +- data_product: human/infection/SARS-CoV-2/doubled + component: doubled-infectious-period +``` + +### Working *config.yaml* + +```yaml +... +``` diff --git a/content/docs/interface/fdp/_index.md b/content/docs/interface/fdp/_index.md new file mode 100644 index 0000000..92a547e --- /dev/null +++ b/content/docs/interface/fdp/_index.md @@ -0,0 +1,43 @@ +--- +weight: 20 +title: "Core FDP functionality" +--- + +Note that this is a living document and the following is subject to change. 
# Core `fdp` functionality

A simple example of how the data pipeline should be run from the command line:

```bash
fdp pull config.yaml
fdp run config.yaml
fdp push config.yaml
```

## `fdp pull`

- download any data required by `read:` from the remote data store and record metadata in the data registry (whilst editing relevant entries, *e.g.* `storage_root`)
- pull metadata associated with all previous versions of the objects listed in `write:` from the remote data registry
- download any data listed in `register:` from the original source and record metadata in the data registry

## `fdp run`

- read (and validate) the *config.yaml* file
- generate a working *config.yaml* file (see [Working example]({{% ref "/docs/interface/example1" %}}))
  - globbing (`*` and `**` replaced with all matching objects, all components listed), specific version numbers, and any variables in `run_metadata:`, `register:`, and `read:` (not `write:`) are replaced with true values
  - *e.g.* `{CONFIG_DIR}` is replaced by the directory within which the working *config.yaml* file resides, `release_date: {DATETIME}` is replaced by `release_date: 2021-04-14 11:34:37`, and `version: 0.{DATETIME}.0` is replaced by `version: 0.20210414.0`
  - `register:` is removed and the registered external objects / data products are written into `read:`
- save the working *config.yaml* file in the local data store (in */coderun/\-\