Merge pull request #680 from AlexsLemonade/allyhawkins/v0.7.2

Prep for v0.7.2
AlexsLemonade · Feb 2, 2024 · 6ae97cf · 6ae97cf
2 parents cd4499b + e2c41de
commit 6ae97cf
Show file tree

Hide file tree

Showing 19 changed files with 462 additions and 129 deletions.
diff --git a/bin/generate_unfiltered_sce.R b/bin/generate_unfiltered_sce.R
@@ -155,7 +155,7 @@ if (opt$feature_dir != "") {
 }
 
 
-# read in sample metadata and filter to sample ids
+# read in sample metadata
 sample_metadata_df <- readr::read_tsv(opt$sample_metadata_file) |>
   # rename sample id column
   dplyr::rename("sample_id" = "scpca_sample_id") |>
@@ -176,7 +176,28 @@ unfiltered_sce <- unfiltered_sce |>
   add_gene_symbols(gene_info = gtf) |>
   scuttle::addPerFeatureQCMetrics() |>
   # add dataframe with sample metadata to sce metadata
+  # `add_sample_metadata` will filter sample_metadata_df to the relevant sample ids
   add_sample_metadata(metadata_df = sample_metadata_df)
 
+# add explicit metadata field for the sample type (one label per sample id in this library)
+sample_type <- sample_metadata_df |>
+  dplyr::filter(sample_id %in% sample_ids) |> # `sample_ids` is defined outside this hunk; presumably the ids for this library
+  dplyr::mutate(
+    sample_type = dplyr::case_when(
+      is_xenograft ~ "patient-derived xenograft", # checked first, so it wins if both flags are TRUE
+      is_cell_line ~ "cell line", # NOTE(review): assumes both flag columns are logical; confirm against the metadata file
+      .default = "patient tissue" # anything not flagged above is labeled as patient tissue
+    )
+  ) |>
+  dplyr::select(sample_id, sample_type) |>
+  # convert the two columns into a vector of sample types named by sample id
+  tibble::deframe()
+
+# with a single sample, drop the name so a bare string is stored; then add to sce metadata
+if (length(sample_type) == 1) {
+  sample_type <- unname(sample_type)
+}
+metadata(unfiltered_sce)$sample_type <- sample_type
+
 # write to rds
 readr::write_rds(unfiltered_sce, opt$unfiltered_file, compress = "gz")
diff --git a/external-instructions.md b/external-instructions.md
@@ -78,12 +78,12 @@ Using the above command will run the workflow from the `main` branch of the work
 To update to the latest released version you can run `nextflow pull AlexsLemonade/scpca-nf` before the `nextflow run` command.
 
 To be sure that you are using a consistent version, you can specify use of a release tagged version of the workflow, set below with the `-r` flag.
-The command below will pull the `scpca-nf` workflow directly from Github using the `v0.7.1` version.
+The command below will pull the `scpca-nf` workflow directly from GitHub using the `v0.7.2` version.
 Released versions can be found on the [`scpca-nf` repository releases page](https://github.com/AlexsLemonade/scpca-nf/releases).
 
 ```sh
 nextflow run AlexsLemonade/scpca-nf \
-  -r v0.7.1 \
+  -r v0.7.2 \
   -config <path to config file>  \
   -profile <name of profile>
 ```
@@ -312,7 +312,7 @@ If you will be analyzing spatial expression data, you will also need the Cell Ra
 
 If your compute nodes do not have internet access, you will likely have to pre-pull the required container images as well.
 When doing this, it is important to be sure that you also specify the revision (version tag) of the `scpca-nf` workflow that you are using.
-For example, if you would run `nextflow run AlexsLemonade/scpca-nf -r v0.7.1`, then you will want to set `-r v0.7.1` for `get_refs.py` as well to be sure you have the correct containers.
+For example, if you run `nextflow run AlexsLemonade/scpca-nf -r v0.7.2`, then you will want to set `-r v0.7.2` for `get_refs.py` as well to be sure you have the correct containers.
 By default, `get_refs.py` will download files and images associated with the latest release.
 
 If your system uses Docker, you can add the `--docker` flag:

diff --git a/internal-instructions.md b/internal-instructions.md
@@ -82,7 +82,7 @@ Please refer to our [`CONTRIBUTING.md`](CONTRIBUTING.md#stub-workflows) for more
 When running the workflow for a project or group of samples that is ready to be released on ScPCA portal, please use the tag for the latest release:
 
 ```
-nextflow run AlexsLemonade/scpca-nf -r v0.7.1 -profile ccdl,batch --project SCPCP000000
+nextflow run AlexsLemonade/scpca-nf -r v0.7.2 -profile ccdl,batch --project SCPCP000000
 ```
 
 ### Processing example data

diff --git a/lib/Utils.groovy b/lib/Utils.groovy
@@ -44,7 +44,11 @@ class Utils {
    */
   static def getMetaVal(file, key){
     def obj = new JsonSlurper().parse(file)
-    def value = this.parseNA(obj[key])
+    def value = obj[key]
+
+    if (value instanceof String) {
+      value = this.parseNA(value)
+    }
 
     return(value)
   }

diff --git a/main.nf b/main.nf
@@ -25,7 +25,7 @@ citeseq_techs = single_cell_techs.findAll{it.startsWith('CITEseq')}
 cellhash_techs = single_cell_techs.findAll{it.startsWith('cellhash')}
 
 // report template paths
-report_template_dir = file("${projectDir}/templates/qc_report", type: 'dir')
+report_template_dir = file("${projectDir}/templates/qc_report", type: 'dir', checkIfExists: true)
 report_template_file = "main_qc_report.rmd"
 celltype_report_template_file = "celltypes_supplemental_report.rmd"
 report_template_tuple = tuple(report_template_dir, report_template_file, celltype_report_template_file)
@@ -51,7 +51,12 @@ if (!file(params.run_metafile).exists()) {
   param_error = true
 }
 
-sample_metafile = file(params.sample_metafile)
+sample_metafile = file(params.sample_metafile) // we make this for passing into later processes
+if (!sample_metafile.exists()) {
+  log.error("The 'sample_metafile' file '${params.sample_metafile}' can not be found.")
+  param_error = true
+}
+
 if (!sample_metafile.exists()) {
   log.error("The 'sample_metafile' file '${params.sample_metafile}' can not be found.")
   param_error = true
@@ -63,6 +68,17 @@ if (!resolution_strategies.contains(params.af_resolution)) {
   param_error = true
 }
 
+if (params.cellhash_pool_file && !file(params.cellhash_pool_file).exists()){ // optional parameter: only checked when it is set
+  log.error("The 'cellhash_pool_file' file '${params.cellhash_pool_file}' can not be found.") // interpolate the params value, quoted like the other file checks
+  param_error = true
+}
+
+// QC report check
+if (!file("${projectDir}/templates/qc_report/${report_template_file}").exists()) {
+  log.error("The 'report_template_file' file '${report_template_file}' can not be found.")
+  param_error = true
+}
+
 // cell type annotation file checks
 if (params.perform_celltyping) {
   if (!file(params.project_celltype_metafile).exists()) {
@@ -73,8 +89,14 @@ if (params.perform_celltyping) {
     log.error("The 'celltype_ref_metadata' file '${params.celltype_ref_metadata}' can not be found.")
     param_error = true
   }
+
+  if (!file("${projectDir}/templates/qc_report/${celltype_report_template_file}").exists()) {
+    log.error("The 'celltype_report_template_file' file '${celltype_report_template_file}' can not be found.")
+    param_error = true
+  }
 }
 
+
 if(param_error){
   System.exit(1)
 }

diff --git a/modules/af-features.nf b/modules/af-features.nf
@@ -145,7 +145,7 @@ workflow map_quant_feature{
     feature_barcodes_ch = feature_channel
       .map{meta -> tuple(
         meta.feature_barcode_file,
-        file("${meta.feature_barcode_file}")
+        file("${meta.feature_barcode_file}", checkIfExists: true)
       )}
       .unique()
     index_feature(feature_barcodes_ch)
@@ -159,14 +159,27 @@ workflow map_quant_feature{
         meta.barcode_file = "${params.barcode_dir}/${params.cell_barcodes[meta.technology]}";
         meta // return modified meta object
       }
+      // branch based on whether mapping should be run (make_rad) or skipped (has_rad)
+      // if neither fastq or rad dir are present, run goes into missing_inputs branch
       .branch{
-        has_rad: (
-          !params.repeat_mapping
-          && file(it.feature_rad_dir).exists()
-          && Utils.getMetaVal(file("${it.feature_rad_dir}/scpca-meta.json"), "ref_assembly") == "${it.ref_assembly}"
+        make_rad: (
+          // input files exist
+          it.files_directory && file(it.files_directory, type: "dir").exists() && (
+            // and repeat has been requested
+            params.repeat_mapping
+            // or the feature rad file directory does not exist
+            || !file(it.feature_rad_dir).exists()
+          )
         )
-        make_rad: true
-       }
+        has_rad: file(it.feature_rad_dir).exists()
+        missing_inputs: true
+      }
+
+    // send run ids in feature_ch.missing_inputs to log
+    feature_ch.missing_inputs
+      .subscribe{
+        log.error("The expected feature input fastq or rad files for ${it.run_id} are missing.")
+      }
 
     // pull out files that need to be repeated
     feature_reads_ch = feature_ch.make_rad
@@ -175,8 +188,8 @@ workflow map_quant_feature{
       .map{meta -> tuple(
         meta.feature_barcode_file,
         meta,
-        file("${meta.files_directory}/*_{R1,R1_*}.fastq.gz"),
-        file("${meta.files_directory}/*_{R2,R2_*}.fastq.gz")
+        file("${meta.files_directory}/*_{R1,R1_*}.fastq.gz", checkIfExists: true),
+        file("${meta.files_directory}/*_{R2,R2_*}.fastq.gz", checkIfExists: true)
       )}
       .combine(index_feature.out, by: 0) // combine by the feature_barcode_file (reused indices, so combine is needed)
       .map{it.drop(1)} // remove the first element (feature_barcode_file)
@@ -186,7 +199,7 @@ workflow map_quant_feature{
     feature_rad_ch = feature_ch.has_rad
       .map{meta -> tuple(
         Utils.readMeta(file("${meta.feature_rad_dir}/scpca-meta.json")),
-        file(meta.feature_rad_dir, type: 'dir')
+        file(meta.feature_rad_dir, type: 'dir', checkIfExists: true)
       )}
 
     // run Alevin on feature reads

diff --git a/modules/af-rna.nf b/modules/af-rna.nf
@@ -120,22 +120,37 @@ workflow map_quant_rna {
         meta.barcode_file = "${params.barcode_dir}/${params.cell_barcodes[meta.technology]}";
         meta // return modified meta object
       }
-       // split based in whether repeat_mapping is false and a previous dir exists
+       // branch based on whether mapping should be run (make_rad) or skipped (has_rad)
+       // if neither fastq or rad dir are present, run goes into missing_inputs branch
       .branch{
-        has_rad: (
-          !params.repeat_mapping
-          && file(it.rad_dir).exists()
-          && Utils.getMetaVal(file("${it.rad_dir}/scpca-meta.json"), "ref_assembly") == "${it.ref_assembly}"
+        make_rad: (
+          // input files exist
+          it.files_directory && file(it.files_directory, type: "dir").exists() && (
+            // and repeat has been requested
+            params.repeat_mapping
+            // the rad directory does not exist
+            || !file(it.rad_dir).exists()
+            // the assembly has changed; if rad_dir doesn't exist, this line won't get hit
+            || Utils.getMetaVal(file("${it.rad_dir}/scpca-meta.json"), "ref_assembly") != "${it.ref_assembly}"
+          )
         )
-        make_rad: true
-       }
+        has_rad: file(it.rad_dir).exists()
+        missing_inputs: true
+      }
+
+    // send run ids in rna_channel.missing_inputs to log
+    rna_channel.missing_inputs
+      .subscribe{
+        log.error("The expected input fastq or rad files for ${it.run_id} are missing.")
+      }
 
     // If we need to create rad files, create a new channel with tuple of (metadata map, [Read1 files], [Read2 files])
     rna_reads_ch = rna_channel.make_rad
       .map{meta -> tuple(
         meta,
-        file("${meta.files_directory}/*_{R1,R1_*}.fastq.gz"),
-        file("${meta.files_directory}/*_{R2,R2_*}.fastq.gz"),
+        // fail if the fastq files do not exist
+        file("${meta.files_directory}/*_{R1,R1_*}.fastq.gz", checkIfExists: true),
+        file("${meta.files_directory}/*_{R2,R2_*}.fastq.gz", checkIfExists: true),
         file(meta.salmon_splici_index, type: 'dir')
       )}
 
@@ -144,7 +159,7 @@ workflow map_quant_rna {
     rna_rad_ch = rna_channel.has_rad
       .map{meta -> tuple(
         Utils.readMeta(file("${meta.rad_dir}/scpca-meta.json")),
-        file(meta.rad_dir, type: 'dir')
+        file(meta.rad_dir, type: 'dir', checkIfExists: true) // fail if no rad directory
       )}
 
     // run Alevin for mapping on libraries that don't have RAD directory already created

diff --git a/modules/bulk-pileup.nf b/modules/bulk-pileup.nf
@@ -59,8 +59,8 @@ workflow pileup_multibulk{
         ],
         it[4], // bamfiles
         it[5], // bamfile indexes
-        file(it[2][0].ref_fasta),
-        file(it[2][0].ref_fasta_index)
+        file(it[2][0].ref_fasta, checkIfExists: true),
+        file(it[2][0].ref_fasta_index, checkIfExists: true)
       ]}
 
     mpileup(pileup_ch)

diff --git a/modules/bulk-salmon.nf b/modules/bulk-salmon.nf
@@ -121,32 +121,45 @@ workflow bulk_quant_rna {
         meta.salmon_results_dir = "${meta.salmon_publish_dir}/${meta.library_id}";
         meta // return modified meta object
       }
-      // split based on whether repeat_mapping is false and the salmon quant.sf file exists
+      // split based on whether repeat_mapping is true and the salmon results directory exists
       // and whether the assembly matches the current assembly
       .branch{
-        has_quants: (
-          !params.repeat_mapping
-          && file(it.salmon_results_dir).exists()
-          && Utils.getMetaVal(file("${it.salmon_results_dir}/scpca-meta.json"), "ref_assembly") == "${it.ref_assembly}"
-          && Utils.getMetaVal(file("${it.salmon_results_dir}/scpca-meta.json"), "t2g_bulk_path") == "${it.t2g_bulk_path}"
+        make_quants: (
+          // input files exist
+          it.files_directory && file(it.files_directory, type: "dir").exists() && (
+            // and repeat has been requested
+            params.repeat_mapping
+            // the results directory does not exist
+            || !file(it.salmon_results_dir).exists()
+            // the assembly has changed; if salmon_results_dir doesn't exist, these lines won't get hit
+            || Utils.getMetaVal(file("${it.salmon_results_dir}/scpca-meta.json"), "ref_assembly") != "${it.ref_assembly}"
+            || Utils.getMetaVal(file("${it.salmon_results_dir}/scpca-meta.json"), "t2g_bulk_path") != "${it.t2g_bulk_path}"
+          )
         )
-        make_quants: true
+        has_quants: file(it.salmon_results_dir).exists()
+        missing_inputs: true
+      }
+
+    // send run ids in bulk_channel.missing_inputs to log
+    bulk_channel.missing_inputs
+      .subscribe{
+        log.error("The expected input fastq or salmon results files for ${it.run_id} are missing.")
       }
 
     // If the quants are current and repeat_mapping is false
     // create tuple of metadata map (read from output), salmon output directory to use as input to merge_bulk_quants
     quants_ch = bulk_channel.has_quants
       .map{meta -> tuple(
         Utils.readMeta(file("${meta.salmon_results_dir}/scpca-meta.json")),
-        file(meta.salmon_results_dir, type: 'dir')
+        file(meta.salmon_results_dir, type: 'dir', checkIfExists: true)
       )}
 
     // If we need to run salmon, create tuple of (metadata map, [Read 1 files], [Read 2 files])
     bulk_reads_ch = bulk_channel.make_quants
       .map{meta -> tuple(
         meta,
-        file("${meta.files_directory}/*_{R1,R1_*}.fastq.gz"),
-        file("${meta.files_directory}/*_{R2,R2_*}.fastq.gz")
+        file("${meta.files_directory}/*_{R1,R1_*}.fastq.gz", checkIfExists: true),
+        file("${meta.files_directory}/*_{R2,R2_*}.fastq.gz", checkIfExists: true)
       )}
 
     // run fastp and salmon for libraries that are not skipping salmon
@@ -166,7 +179,7 @@ workflow bulk_quant_rna {
       .map{[
         it[1][0], // meta; relevant data should all be the same by project, so take the first
         it[2].sort(), // salmon directories, sorted for consistency (we can do this because there is only one tuple element)
-        file(it[1][0].t2g_bulk_path)
+        file(it[1][0].t2g_bulk_path, checkIfExists: true)
       ]}
 
     // create tsv file and combined metadata for each project containing all libraries

diff --git a/modules/bulk-star.nf b/modules/bulk-star.nf
@@ -38,9 +38,9 @@ workflow star_bulk{
     bulk_reads_ch = bulk_channel
         .map{meta -> tuple(
           meta,
-          file("${meta.files_directory}/*_{R1,R1_*}.fastq.gz"),
-          file("${meta.files_directory}/*_{R2,R2_*}.fastq.gz"),
-          file(meta.star_index, type: 'dir')
+          file("${meta.files_directory}/*_{R1,R1_*}.fastq.gz", checkIfExists: true),
+          file("${meta.files_directory}/*_{R2,R2_*}.fastq.gz", checkIfExists: true),
+          file(meta.star_index, type: 'dir', checkIfExists: true)
         )}
     // map and index
     bulkmap_star(bulk_reads_ch) \