Merge pull request #374 from AlexsLemonade/development

Merging in `development` for `v0.5.2` release
AlexsLemonade · Jul 11, 2023 · 4b4de3c · 4b4de3c
2 parents b8daa2a + 363e94e
commit 4b4de3c
Show file tree

Hide file tree

Showing 92 changed files with 2,762 additions and 91 deletions.
diff --git a/.github/workflows/nextflow-config-check.yaml b/.github/workflows/nextflow-config-check.yaml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   nf-config-check:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v3
       - name: Check nextflow params

diff --git a/.github/workflows/nextflow-stub-check.yaml b/.github/workflows/nextflow-stub-check.yaml
@@ -0,0 +1,37 @@
+
+# Runs the workflow in stub mode on pull requests to `main` and `development`.
+name: Check nextflow stub
+
+on:
+  pull_request:
+    branches:
+      - main
+      - development
+
+jobs:
+  nf-stub-check:
+    runs-on: ubuntu-22.04
+    steps:
+
+      - name: Checkout repo
+        uses: actions/checkout@v3
+
+      - name: Check Nextflow workflow
+        uses: docker://nextflow/nextflow:21.10.6
+        with:
+          args: nextflow -log stub-run.log run main.nf -stub -profile stub -ansi-log false
+
+      # Same command as above, run a second time with a separate log file
+      - name: Check Nextflow with checkpoints from previous run
+        uses: docker://nextflow/nextflow:21.10.6
+        with:
+          args: nextflow -log checkpoint-run.log run main.nf -stub -profile stub -ansi-log false
+
+      # Must also run after a failed step: the upload step below always runs
+      # and expects `nextflow-runs.log` to exist.
+      - name: Join log files
+        if: ${{ always() }}
+        run: cat stub-run.log checkpoint-run.log > nextflow-runs.log
+
+      - name: Upload nextflow log
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: nextflow-log
+          path: nextflow-runs.log
+
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,11 @@
 
 # ignore work directory
 work/
+
+# ignore scpca-references
+scpca-references/
+
+
+# ignore template htmls
+qc_report.html
+*_qc.html
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -7,16 +7,25 @@ We welcome contributions to the `scpca-nf` workflow, including usage reports and
 Most updates to the `scpca-nf` workflow will begin with an issue that describes a change to be made and the reasoning behind that change.
 This provides an opportunity for discussion before implementation of any changes.
 
-## Pull requests and branch structure 
+## Pull requests and branch structure
 
 The `main` branch holds the current release version of the `scpca-nf` workflow.
 
-New features and other workflow updates that are ready in advance of a new release are found in the `development` branch. 
- 
-Contributions and updates to the `scpca-nf` repository operate on a pull request model. 
+New features and other workflow updates that are ready in advance of a new release are found in the `development` branch.
+
+Contributions and updates to the `scpca-nf` repository operate on a pull request model.
 Changes will typically be made in a new branch that is created from the `development` branch, followed by a pull request back to the `development` branch.
-All pull requests must be reviewed before merging to `development`. 
-To allow for efficient review, please include in any pull request a concise and clear explanation of the changes you have made and the issues addressed. 
+All pull requests must be reviewed before merging to `development`.
+To allow for efficient review, please include in any pull request a concise and clear explanation of the changes you have made and the issues addressed.
 
 When the changes in `development` merit a new release, a pull request will be filed to merge the current version of the `development` branch into `main`, followed by tagging a release on the `main` branch.
 
+## Stub workflows
+
+All Nextflow processes should include a [`stub` block](https://www.nextflow.io/docs/latest/process.html#stub) with a minimal script that can be run quickly to produce files in the expected output locations.
+At this stage, the stub blocks are used purely to test the main workflow logic rather than the internal logic of each process.
+
+The [`test/stub-metadata.tsv`](test/stub-metadata.tsv) file is used to define input libraries that will be used for testing.
+Any additions to the overall workflow that will allow processing of a new library type should be added into `test/stub-metadata.tsv`, along with the appropriate input files (usually empty files with the expected names) for that library type in the `test/runs/` directory.
+If a new reference type is needed, that should be defined in the [`test/stub-refs.json`](test/stub-refs.json) file.
+
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@ For more information on the contents of the output files and the processing of a
 
 The default configuration of the `scpca-nf` workflow is currently set up to process samples as part of [the ScPCA portal](https://scpca.alexslemonade.org/) and requires access to AWS through the Data Lab.
 For all other users, `scpca-nf` can be set up for your computing environment with a few configuration files.
-|[Instructions for using `scpca-nf` with external systems](external-instructions.md)|
+|[Instructions for using `scpca-nf`](external-instructions.md)|
 |---|
 
 :warning: Please note that processing single-cell and single-nuclei RNA-seq samples requires access to a high performance computing (HPC) environment with nodes that can accommodate jobs requiring up to 24 GB of RAM and 12 CPUs.

diff --git a/add-celltypes.nf b/add-celltypes.nf
@@ -0,0 +1,61 @@
+#!/usr/bin/env nextflow
+nextflow.enable.dsl=2
+
+include { annotate_celltypes } from './modules/classify-celltypes.nf'
+
+// parameter checks
+// each missing input is logged before exiting, so all problems are reported in a single run
+param_error = false
+
+if (!file(params.run_metafile).exists()) {
+  log.error("The 'run_metafile' file '${params.run_metafile}' can not be found.")
+  param_error = true
+}
+
+if (!file(params.celltype_refs_metafile).exists()) {
+  log.error("The 'celltype_refs_metafile' file '${params.celltype_refs_metafile}' can not be found.")
+  param_error = true
+}
+
+// stop before the workflow starts if any required input file is missing
+if (param_error) {
+  System.exit(1)
+}
+
+workflow {
+
+  // Decide which runs to process: when a project is given it supersedes
+  // run_ids, and all runs in that project are used
+  run_ids = params.project ? [] : (params.run_ids?.tokenize(',') ?: [])
+
+  run_all = run_ids[0] == "All"
+  if (run_all) {
+    log.info("Executing workflow for all runs in the run metafile.")
+  }
+
+  // One metadata map per row of the tab-separated run metafile
+  library_meta_ch = Channel.fromPath(params.run_metafile)
+    .splitCsv(header: true, sep: '\t')
+    .map{ row -> [
+        run_id: row.scpca_run_id,
+        library_id: row.scpca_library_id,
+        sample_id: row.scpca_sample_id,
+        project_id: row.scpca_project_id,
+        submitter: row.submitter,
+        technology: row.technology,
+        seq_unit: row.seq_unit,
+    ]}
+
+  // Keep only the libraries/projects of interest
+  selected_meta_ch = library_meta_ch
+    .filter{ meta -> meta.seq_unit in ['cell', 'nucleus'] }
+    // single-cell only; drops any CITE-seq or multiplexed data
+    .filter{ meta -> meta.technology.startsWith("10Xv") }
+    .filter{ meta -> run_all
+             || (meta.run_id in run_ids)
+             || (meta.library_id in run_ids)
+             || (meta.sample_id in run_ids)
+             || (meta.submitter == params.project)
+             || (meta.project_id == params.project)
+            }
+
+  // Pair each metadata map with the processed rds file used as input to cell type annotation
+  processed_sce_ch = selected_meta_ch
+    .map{ meta -> tuple(
+      meta,
+      file("${params.results_dir}/${meta.project_id}/${meta.sample_id}/${meta.library_id}_processed.rds")
+    )}
+
+  annotate_celltypes(processed_sce_ch)
+}
diff --git a/assets/NO_FILE.txt b/assets/NO_FILE.txt
diff --git a/bin/classify_SingleR.R b/bin/classify_SingleR.R
@@ -0,0 +1,118 @@
+#!/usr/bin/env Rscript
+
+# This script is used to classify and annotate cells using SingleR
+
+# import libraries
+suppressPackageStartupMessages({
+  library(optparse)
+  library(SingleCellExperiment)
+})
+
+# Command line interface -------------------------------------------------------
+
+# options accepted by this script; parsed values are stored in `opt`
+option_list <- list(
+  # input and output SCE files
+  make_option(
+    c("-i", "--input_sce_file"),
+    type = "character",
+    help = "path to rds file with input sce object"
+  ),
+  make_option(
+    c("-o", "--output_sce_file"),
+    type = "character",
+    help = "path to output rds file to store processed sce object. Must end in .rds"
+  ),
+  # SingleR model files
+  make_option(
+    "--singler_models",
+    type = "character",
+    help = "list of models generated for use with SingleR. Each input file contains 
+      a list of models generated from a single reference, one each for each label type:
+      `label.main`, `label.fine`, and `label.ont`."
+  ),
+  # run settings
+  make_option(
+    "--seed",
+    type = "integer",
+    help = "A random seed for reproducibility."
+  ),
+  make_option(
+    c("-t", "--threads"),
+    type = "integer",
+    default = 1,
+    help = "Number of multiprocessing threads to use."
+  )
+)
+
+option_parser <- OptionParser(option_list = option_list)
+opt <- parse_args(option_parser)
+
+# Set up -----------------------------------------------------------------------
+
+# set seed: the option is declared as `--seed`, so its value is in `opt$seed`
+# (when no seed is given this is NULL and R re-initializes the seed itself)
+set.seed(opt$seed)
+
+# check that an input file was provided and that it exists
+if (is.null(opt$input_sce_file) || !file.exists(opt$input_sce_file)) {
+  stop("Missing input SCE file")
+}
+
+# check that model files were provided; without this, a missing option would
+# pass the existence check below with an empty set of files
+if (is.null(opt$singler_models)) {
+  stop("No SingleR model files were provided with --singler_models.")
+}
+
+# `--singler_models` is split on commas into individual model file paths
+model_files <- unlist(stringr::str_split(opt$singler_models, ","))
+
+# check that references all exist, naming any missing files in the error
+missing_files <- model_files[!file.exists(model_files)]
+if (length(missing_files) > 0) {
+  stop(
+    "Missing model file(s): ", paste(missing_files, collapse = ", "), "\n",
+    "Please make sure that all provided SingleR models exist."
+  )
+}
+
+# set up multiprocessing params
+if (opt$threads > 1) {
+  bp_param <- BiocParallel::MulticoreParam(opt$threads)
+} else {
+  bp_param <- BiocParallel::SerialParam()
+}
+
+# read in input rds file
+sce <- readr::read_rds(opt$input_sce_file)
+
+# read in references as a list of lists
+# each file contains a named list of models generated using the same reference dataset
+# but unique labels in the reference dataset
+
+# the reference name is the file name without its `_model.rds` suffix;
+# the `.` is escaped and the pattern anchored so only the literal suffix is removed
+model_names <- stringr::str_remove(basename(model_files), "_model\\.rds$")
+names(model_files) <- model_names
+model_list <- purrr::map(model_files, readr::read_rds) |>
+  # ensure we have label type before reference name
+  # example: label.main_HumanPrimaryCellAtlasData
+  # where `label.main` is the name of the model stored in the file and
+  # `HumanPrimaryCellAtlasData` is the name of the reference used for each file containing a list of models
+  purrr::imap(\(ref_models, ref_name) {
+    names(ref_models) <- glue::glue("{names(ref_models)}_{ref_name}")
+    ref_models
+  }) |>
+  # collapse the per-reference lists into one flat list of models
+  purrr::flatten()
+
+# SingleR classify -------------------------------------------------------------
+
+# classify the cells in `sce` against a single trained model,
+# with the test object, fine tuning and parallel settings fixed
+classify_sce <- function(...) {
+  SingleR::classifySingleR(
+    ...,
+    test = sce,
+    fine.tune = TRUE,
+    BPPARAM = bp_param
+  )
+}
+
+# run SingleR once per provided model, keeping the model names
+all_singler_results <- purrr::map(model_list, classify_sce)
+
+# Annotate sce -----------------------------------------------------------------
+
+# one column of pruned labels per model, named after the model
+pruned_labels <- purrr::map_dfc(
+  all_singler_results,
+  \(singler_result) singler_result$pruned.labels
+)
+all_annotations_df <- DataFrame(pruned_labels)
+
+# append the annotation columns to the existing cell metadata
+colData(sce) <- cbind(colData(sce), all_annotations_df)
+
+# keep the full SingleR results alongside the object
+metadata(sce)$singler_results <- all_singler_results
+
+# write out the annotated sce
+readr::write_rds(
+  sce,
+  opt$output_sce_file,
+  compress = "gz"
+)
+