Merge branch 'develop'

COMBINE-lab · Feb 15, 2022 · ec64b77 · ec64b77
2 parents 8073660 + 52ce365
commit ec64b77
Show file tree

Hide file tree

Showing 21 changed files with 490 additions and 382 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "alevin-fry"
-version = "0.4.3"
+version = "0.5.0"
 authors = ["Avi Srivastava <avi.srivastava@nyu.edu>", "Hirak Sarkar <hirak_sarkar@hms.harvard.edu>", "Dongze He <dhe17@umd.edu>", "Mohsen Zakeri <mzakeri@cs.umd.edu>", "Rob Patro <rob@cs.umd.edu>"]
 edition = "2018"
 description = "A suite of tools for the rapid, accurate and memory-frugal processing single-cell and single-nucleus sequencing data."
@@ -26,35 +26,35 @@ categories = ["command-line-utilities", "science"]
 
 [dependencies]	
 # for release
-libradicl = "0.4.3" 
+libradicl = "0.4.4" 
 # for local development
-# libradicl = { path = "libradicl", version = "0.4.3" }
-arrayvec = "0.7.1"
+# libradicl = { path = "libradicl", version = "0.4.4" }
+arrayvec = "0.7.2"
 ahash = "0.7.6"
 bincode = "1.3.3"
 bstr = "0.2.17"
-crossbeam-channel = "0.5.1"
-crossbeam-queue = "0.3.2"
+crossbeam-channel = "0.5.2"
+crossbeam-queue = "0.3.4"
 indicatif = "0.16.2"
 needletail = "0.4.1"
 petgraph = "0.6.0"
 flate2 = "1.0.22"
-scroll = "0.10.2"
-serde = { version = "1.0.130", features = ["derive"] }
-serde_json = "1.0.68"
+scroll = "0.11.0"
+serde = { version = "1.0.136", features = ["derive"] }
+serde_json = "1.0.79"
 sprs = "0.11.0"
 slog = "2.7.0"
-slog-term = "2.8.0"
-slog-async = "2.6.0"
-smallvec = "1.7.0"
+slog-term = "2.8.1"
+slog-async = "2.7.0"
+smallvec = "1.8.0"
 snap = "1"
-rand = "0.8.4"
+rand = "0.8.5"
 chrono = "0.4.19"
 csv = "1.1.6"
 mimalloc = { version = "0.1.26", default-features = false }
 num-format = "0.4.0"
 num_cpus = "1.13.0"
-bio-types = "0.12.0"
+bio-types = "0.12.1"
 itertools = "0.10.1"
 thiserror = "1.0.30"
 quickersort = "3.0.1"
@@ -63,8 +63,8 @@ rust-htslib = { version = "0.38.2", default-features = false, features = ["bzip2
 sce = { git = "https://github.com/parazodiac/SingleCellExperiment", version = "0.1.1" }
 
 [dependencies.clap]
-version = "=3.0.0-beta.5"
-features = ["wrap_help"]
+version = "3.0.14"
+features = ["wrap_help", "cargo"]
 
 [profile.release]
 #debug = true

diff --git a/README.md b/README.md
@@ -22,7 +22,11 @@ Are you curious about processing details like [whether to use a sparse or dense
 
 The generation of the reduced alignment data (RAD) files processed by alevin-fry is done by [salmon](https://github.com/COMBINE-lab/salmon). The latest version of salmon is available [on GitHub](https://github.com/COMBINE-lab/salmon/releases), via [bioconda](https://bioconda.github.io/recipes/salmon/README.html), and on [dockerhub](https://hub.docker.com/layers/combinelab/salmon/latest/images/sha256-f86324c6aeacb627e3c589562ab9e2564a6d51a3892a697669d3f23d0b9d81a8?context=explore). 
 
-The [`usefulaf`](https://github.com/COMBINE-lab/usefulaf) repository contains scripts in functions that are useful in helping to prepare input for alevin-fry processing, importing alevin-fry output into downstream analysis evnironemnts, and even [running common configurations of alevin-fry more simply](https://github.com/COMBINE-lab/usefulaf/blob/main/bash/simpleaf.sh).
+The [`usefulaf`](https://github.com/COMBINE-lab/usefulaf) repository contains scripts and functions that are useful in helping to prepare input for alevin-fry processing, importing alevin-fry output into downstream analysis environments, and even [running common configurations of alevin-fry more simply](https://github.com/COMBINE-lab/usefulaf/blob/main/bash/simpleaf.sh).  This repository also contains the relevant [Python function](https://github.com/COMBINE-lab/usefulaf/blob/main/python/load_fry.py) for loading fry output (specifically in USA mode) in a convenient way into [scanpy](https://scanpy.readthedocs.io/en/stable/) (i.e. as [AnnData](https://scanpy.readthedocs.io/en/latest/usage-principles.html#anndata) objects) for subsequent Python-based processing in scanpy.
+
+The [`fishpond`](https://github.com/mikelove/fishpond) package — maintained by @mikelove and his lab — contains the recommended relevant functions for reading `alevin-fry` output (particularly USA-mode output) into the R ecosystem, in the form of a [`SingleCellExperiment`](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) object.
+
+The [`alevinQC`](https://github.com/csoneson/alevinQC) package — maintained by @csoneson — provides tools and functions for performing quality control and assessment downstream of `alevin-fry`.
 
 ## Installing from bioconda
 
@@ -148,16 +152,16 @@ At the end of this process, the directory `$AF_SAMPLE_DIR/quants/pbmc1k_v3/quant
 **R** : In [R](https://www.r-project.org/), you can make use of the `R` [`load_fry()`](https://github.com/COMBINE-lab/usefulaf/blob/main/R/load_fry.R) function here, and read the input with the command:
 
 ```{R}
-m <- load_fry("$AF_SAMPLE_DIR/quants/pbmc1k_v3/quant", which_counts=c('S', 'A'))
+m <- load_fry("$AF_SAMPLE_DIR/quants/pbmc1k_v3/quant")
 ```
 
-where `$AF_SAMPLE_DIR` is appropriately replaced by the path to the working directory we chose at the start of this exercise.  This will return a [SingleCellExperiment](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) object containing the counts for this experiment.  The stand-alone `load_fry()` function has been merged into the [`fishpond`](https://bioconductor.org/packages/release/bioc/html/fishpond.html) package and will be part of the next release.
+where `$AF_SAMPLE_DIR` is appropriately replaced by the path to the working directory we chose at the start of this exercise.  This will return a [SingleCellExperiment](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) object containing the counts for this experiment.  The stand-alone `load_fry()` function is part of [`fishpond`](https://bioconductor.org/packages/release/bioc/html/fishpond.html), and the function is documented in detail [here](https://mikelove.github.io/fishpond/reference/loadFry.html).
 
 
 **Python** : In [python](https://www.python.org/), you can make use of the `python` [`load_fry()`](https://github.com/COMBINE-lab/usefulaf/blob/main/python/load_fry.py) function, which relies on [scanpy](https://scanpy.readthedocs.io/en/stable/).  To read the input you can use the following command:
 
 ```{python}
-m = load_fry("$AF_SAMPLE_DIR/quants/pbmc1k_v3/quant", which_counts=['S','A'])
+m = load_fry("$AF_SAMPLE_DIR/quants/pbmc1k_v3/quant")
 ```
 
 where, again `$AF_SAMPLE_DIR` is appropriately replaced by the path to the working directory we chose at the start of this exercise.  This will return a `scanpy` [`AnnData`](https://anndata.readthedocs.io/en/latest/) object with the counts.
@@ -195,3 +199,4 @@ that you should pass to `alevin-fry` during the `quant` phase.
 If you have any questions about preparing the splici reference, or otherwise about processing your data with `alevin-fry` please feel free to open an issue 
 here on GitHub!
 
+
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -18,11 +18,11 @@
 # -- Project information -----------------------------------------------------
 
 project = 'alevin-fry'
-copyright = '2021, Avi Srivastava, Hirak Sarkar, Dongze He, Mohsen Zakeri, Rob Patro'
-author = 'Avi Srivastava, Hirak Sarkar, Dongze He, Mohsen Zakeri, Rob Patro'
+copyright = '2021-2022, Dongze He, Mohsen Zakeri, Hirak Sarkar, Charlotte Soneson, Avi Srivastava, Rob Patro'
+author = 'Dongze He, Mohsen Zakeri, Hirak Sarkar, Charlotte Soneson, Avi Srivastava, Rob Patro'
 
 # The full version, including alpha/beta/rc tags
-release = '0.4.0'
+release = '0.4.3'
 
 master_doc = 'index'
 

diff --git a/docs/source/generate_permit_list.rst b/docs/source/generate_permit_list.rst
@@ -35,7 +35,7 @@ barcodes are decided):
 
 * ``--valid-bc <bcfile>``: This option will read the provided file <bcfile> and treat it as an explicitly-provided list of true, filtered barcodes (i.e. a list of barcodes believed to belong to a set of high-confidence cells truly present in the given sample). Barcodes appearing in this list will be considered to correspond to true and filtered cells, and barcodes will be corrected to this list. This flag is *not* designed to perform unfiltered quantification (i.e. correcting to a list of all *possible* barcodes generated by a technology, like e.g. the `10x v3 permit list <https://raw.githubusercontent.com/10XGenomics/cellranger/master/lib/python/cellranger/barcodes/translation/3M-february-2018.txt.gz>`_).  To correct against an *unfiltered* permit list, you should use the ``--unfiltered-pl`` flag described below (which is currently in beta).
 
-* ``--unfiltered-pl <plist>``: This option accepts as an argument a list of *possible* barcodes for the sample.  For example, this is the flag you should use if you wish to provide an "external permit list", like the 10x v2 or 10x v3 permit lists. Unilike with the ``--valid-bc`` flag, the list passed to this argument is the set of all possible barcodes for the technology being processed, and it is likely that most of the barcodes in the file may not correspond to cells present in this particular sample.  When using this argument, you may also pass the ``--min-reads`` argument to determine the minimum frequency with which a barcode must be seen in order to be retained.  The algorithm used here will pass over the input records (mapped reads) and count how many times each of the barcodes in the unfiltered permit list occur exactly.  Any barcode ocurring >= ``min-reads`` times will be considered as a present cell.  Subsequently, all barcodes that did not match a present cell will be searched (at an edit distance of up to 1) againt the barcodes determined to correspond to present cells.  If an initially non-matching barcode has a unique neighbor among the barcodes for present cells, it will be corrected to that barcode, but if it has no 1-edit neighbor, or if it has 2 or more 1-edit neighbors among that list (i.e. it's correction would be ambiguous), then the record is discarded. *Note* : support for unfiltered permit lists is currently in beta.
+* ``--unfiltered-pl <plist>``: This option accepts as an argument a list of *possible* barcodes for the sample.  For example, this is the flag you should use if you wish to provide an "external permit list", like the 10x v2 or 10x v3 permit lists. Unlike with the ``--valid-bc`` flag, the list passed to this argument is the set of all possible barcodes for the technology being processed, and it is likely that most of the barcodes in the file may not correspond to cells present in this particular sample.  When using this argument, you may also pass the ``--min-reads`` argument to determine the minimum frequency with which a barcode must be seen in order to be retained.  The algorithm used here will pass over the input records (mapped reads) and count how many times each of the barcodes in the unfiltered permit list occur exactly.  Any barcode occurring >= ``min-reads`` times will be considered as a present cell.  Subsequently, all barcodes that did not match a present cell will be searched (at an edit distance of up to 1) against the barcodes determined to correspond to present cells.  If an initially non-matching barcode has a unique neighbor among the barcodes for present cells, it will be corrected to that barcode, but if it has no 1-edit neighbor, or if it has 2 or more 1-edit neighbors among that list (i.e. its correction would be ambiguous), then the record is discarded.
 
 * ``--min-reads <threshold>``: This flag is meant to be used (and currently only applied) in conjunction with ``--unfiltered-pl``.  Any barcodes from the provided permit list that have >= ``<threshold>`` exact occurrences in the input file will be deemed as present cells and will be passed on to subsequent phases of quantification.  Barcodes occurring < ``threshold`` number of times will be corrected against the set of present cells using the procedure described above.
 
@@ -47,11 +47,11 @@ output
 The ``generate-permit-list`` command outputs a number of different files in the output directory.  Not all files are 
 relevant to users of ``alevin-fry``, but the files are described here.
 
-1. The file ``all_freq.tsv`` is a two-column tab-separated file that lists, for each distinct barcode in the input RAD file, the number of read records that were tagged with this barcode.
+1. The file ``all_freq.bin`` is a binary file that records, for each distinct barcode in the input RAD file, the number of read records that were tagged with this barcode.
 
-2. The file ``permit_freq.tsv`` is a two-column tab-separated file that lists, for each barcode in the input RAD file that is determined to be a *true* barcode, the number of read records associated with this barcode.
+2. The file ``permit_freq.bin`` is a binary file that lists, for each barcode in the input RAD file that is determined to be a *true* barcode, the number of read records associated with this barcode.
 
 3. The file ``permit_map.bin`` is a binary file (a serde serialized HashMap) that maps each barcode in the input RAD file that is within an edit distance of 1 to some *true* barcode to the barcode to which it corrects.  This allows the ``collate`` command to group together all of the read records corresponding to the same *corrected* barcode.
 
-4. The file  ``generate_permit_list.json`` that is a JSON file containing information about the run of the command (currently, just the expected orientation).
+4. The file ``generate_permit_list.json`` is a JSON file containing information about the run of the command (currently, just the expected orientation).
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -18,6 +18,7 @@ It makes use of the selective-alignment and tag processing framework of
    :maxdepth: 2
    :caption: Contents:
 
+   other_resources
    installing
    getting_started
    commands

diff --git a/docs/source/other_resources.rst b/docs/source/other_resources.rst
@@ -0,0 +1,34 @@
+Other resources for alevin-fry
+==============================
+
+In addition to the current documentation page, there are numerous other resources to help you learn more about alevin-fry, how to process data using
+this program, and how to further process the output of alevin-fry in downstream analysis.
+
+Tutorials
+---------
+
+A collection of tutorials describing how to process different types of data with ``alevin-fry`` and describing different features of ``alevin-fry`` is
+available `here <https://combine-lab.github.io/alevin-fry-tutorials/#blog>`_.
+
+FAQ
+---
+
+We hope to make use of GitHub discussions to answer frequently asked questions, and to discuss other issues relevant to the development and use
+of ``alevin-fry``.  You can visit the GitHub discussion page for `alevin-fry here <https://github.com/COMBINE-lab/alevin-fry/discussions>`_.
+GitHub discussions are also a good place to raise large-scale feature requests to see if they make sense in the context of ``alevin-fry``.  For
+small-scale feature requests, or to report bugs or unexpected behavior you encounter when processing data with ``alevin-fry``, please make use
+of our `GitHub issues page <https://github.com/COMBINE-lab/alevin-fry/issues>`_.
+
+Quality Control
+---------------
+
+Support for ``alevin-fry`` in the `alevinQC <https://github.com/csoneson/alevinQC>`_ package is imminent.
+
+Easy loading of USA-mode data
+-----------------------------
+
+The `fishpond <https://mikelove.github.io/fishpond/>`_ package contains many methods for making the ingestion of quantification results generated
+by `salmon <https://github.com/COMBINE-lab/salmon>`_ and ``alevin-fry`` into R easy.  In particular, you can find documentation on the
+`loadFry function here <https://mikelove.github.io/fishpond/reference/loadFry.html>`_.  This makes it easy to import USA-mode quantification
+results into a `SingleCellExperiment <https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html>`_ object, and to properly
+extract or combine the spliced, unspliced, and ambiguous count components.
diff --git a/docs/source/quant.rst b/docs/source/quant.rst
@@ -1,7 +1,7 @@
 quant
 =====
 
-The ``quant`` command takes a collated RAD file and performs feature (e.g. gene) quantification, outputting a sparse matrix of de-duplicated counts as well as a list of labels for the rows and columns.  The ``quant`` command takes an input directory containing the collated RAD file, a transcript-to-gene map, an output directory where the results will be written and a "resolution strategy" (described below).  Quantification is multi-threaded, so it also, optionally, takes as an arguments the number of threads to use concurrently.
+The ``quant`` command takes a collated RAD file and performs feature (e.g. gene) quantification, outputting a sparse matrix of de-duplicated counts as well as a list of labels for the rows and columns.  The ``quant`` command takes an input directory containing the collated RAD file, a transcript-to-gene map, an output directory where the results will be written, and a "resolution strategy" (described below).  Quantification is multi-threaded, so it also, optionally, takes as an argument the number of threads to use concurrently.
 
 The transcript-to-gene map should be either:
 
@@ -36,7 +36,7 @@ Additionally, this command can optionally take the following flags (note that no
 output
 ------
 
-The output of the ``quant`` command consists of 5 files: ``quants_mat_rows.txt``, ``counts.eds.gz`` (or ``quants_mat.mtx`` if run with the ``--use-mtx`` flag), ``quants_mat_cols.txt``, ``meta_info.json``, and ``features.txt``.  The ``meta_info.json`` file contains information about the quantification run, such as the method used for UMI resolution.  The ``features.txt`` file contains cell-level information designed to be useful in post-quantification cell filtering (better determining "true" cells from background, noise, doublets etc.).  The other three files all correspond to quantification information.
+The output of the ``quant`` command consists of 5 files: ``quants_mat_rows.txt``, ``counts.eds.gz`` (or ``quants_mat.mtx`` if run with the ``--use-mtx`` flag), ``quants_mat_cols.txt``, ``quant.json``, and ``featureDump.txt``.  The ``quant.json`` file contains information about the quantification run, such as the method used for UMI resolution.  The ``featureDump.txt`` file contains cell-level information designed to be useful in post-quantification cell filtering (better determining "true" cells from background, noise, doublets etc.).  The other three files all correspond to quantification information.
 
 If ``quant`` was executed in USA mode, then the resulting count matrix will be of dimension ``C``x``3G`` where ``C`` is the number of quantified cells (barcodes) and ``G`` is the number of genes.  This is because, in USA mode, ``alevin-fry`` quantifies the UMI count attributable to each splicing state of each gene in each cell, where the splicing state is one of spliced (S), unspliced (U) or ambiguous (A).  If ``quant`` was run with a two-column transcript-to-gene map (not in USA-mode), then the resulting count matrix will be a ``C``x``G`` matrix, as splicing status is not tracked.  For more details on USA mode and its uses, please read the ``alevin-fry`` `preprint <https://www.biorxiv.org/content/10.1101/2021.06.29.450377v1>`__, or the `corresponding tutorial <https://combine-lab.github.io/alevin-fry-tutorials/2021/improving-txome-specificity/>`__.
 

diff --git a/libradicl/Cargo.toml b/libradicl/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "libradicl"
-version = "0.4.3"
+version = "0.4.4"
 authors = ["Avi Srivastava <avi.srivastava@nyu.edu>", "Hirak Sarkar <hirak_sarkar@hms.harvard.edu>", "Dongze He <dhe17@umd.edu>", "Mohsen Zakeri <mzakeri@cs.umd.edu>", "Rob Patro <rob@cs.umd.edu>"]
 edition = "2018"
 description = "support library for alevin-fry"
@@ -21,21 +21,21 @@ categories = ["command-line-utilities", "science"]
 
 [dependencies]
 snap = "1"
-scroll = "0.10.2"
+scroll = "0.11.0"
 num = "0.4.0"
 ahash = "0.7.6"
 slog = "2.7.0"
 bstr = "0.2.17"
-serde = { version = "1.0.130", features = ["derive"] }
+serde = { version = "1.0.136", features = ["derive"] }
 csv = "1.1.6"
-dashmap = "^4.0.2"
-crossbeam-channel = "0.5.1"
-bio-types = "0.12.0"
+dashmap = "^5.1.0"
+crossbeam-channel = "0.5.2"
+bio-types = "0.12.1"
 quickersort = "3.0.1"
 needletail = "0.4.1"
 flate2 = "1.0.22"
-smallvec = "1.7.0"
-serde_json = "1.0.68"
+smallvec = "1.8.0"
+serde_json = "1.0.79"
 sprs = "0.11.0"
 rust-htslib = { version = "0.38.2", default-features = false, features = ["bzip2", "lzma"] }
 sce = { git = "https://github.com/parazodiac/SingleCellExperiment", version = "0.1.1" }
diff --git a/libradicl/src/rad_types.rs b/libradicl/src/rad_types.rs
@@ -57,6 +57,7 @@ pub struct Chunk {
 }
 
 #[derive(Debug)]
+#[allow(dead_code)]
 pub struct CorrectedCbChunk {
     pub(crate) remaining_records: u32,
     pub(crate) corrected_bc: u64,