add quant doc

COMBINE-lab · Aug 4, 2020 · e6786d7 · e6786d7
1 parent 735632a
commit e6786d7
Show file tree

Hide file tree

Showing 4 changed files with 89 additions and 17 deletions.
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -21,7 +21,8 @@ It makes use of the selective-alignment and barcode processing framework of
    building
    getting_started
    generate_permit_list  
-   collate 
+   collate
+   quant
    LICENSE.rst
 
 Indices and tables

diff --git a/docs/source/quant.rst b/docs/source/quant.rst
@@ -0,0 +1,58 @@
+quant
+=====
+
+The ``quant'' command takes a collated RAD file and performs feature (e.g. gene) quantification, outputting
+a sparse matrix of de-duplicated counts as well as a list of labels for the rows and columns.  The ``quant``
+command takes an input directory containing the collated RAD file, a transcript-to-gene map, an output directory
+where the results will be written and a "resolution strategy" (described below).  Quantification is 
+multi-threaded, so it also, optionally, takes as an arguments the number of threads to use concurrently.
+
+The transcript-to-gene map should be a two-column (headerless) tab separated file where the first column 
+contains a transcript name and the second column contains the corresponding gene name for this transcript.
+
+The ``quant`` command exposes a number of different resolution strategies.  They are:
+
+* ``full`` : This is the default resolution strategy.  It implements the algorithm described in the alevin_ 
+paper.  Briefly, it builds a graph among the set of reads that align to an overlapping set of transcripts 
+and that have similar (within an edit distance of 1) UMIs.  It then attempts to find a parsimonious cover 
+for this graph using the fewest number of possible transcripts.  If a unique parsimonious cover is found,
+then the (deduplicated) reads are assigned directly to the genes that yield the most parsimonious cover.
+If multiple equally-parsimonious covers exist, then the reads are considered multi-mapping at the gene 
+level and they are probabilistically resolved using an expectation maximization (EM) algorithm.
+
+* ``parsimony`` : This strategy is the same as ``full'', except that it does *not* probabilistically resolve
+reads that remain as gene-multimapping after applying the parsimony criterion.  Instead, reads that do 
+not have a unique most-parsimonious assignment are discarded.
+
+* ``trivial`` : This strategy does not search for 1 edit-distance neighbors of UMIs.  Instead, it first 
+discards any reads that multi-map at the gene level.  The reads that remain then all map uniquely to a 
+single gene.  These reads are deduplicated by (exact) UMI, and the number of distinct UMIs mapping to 
+each gene are taken as that gene's count in the current cell.
+
+* ``cr-like`` : This strategy is like the one adopted in cell-ranger, except that it does not first 
+collapse 1-edit-distance UMIs.  Within each cell barcode, a list of (gene, UMI, count) tuples is created.
+If a read maps to more than one gene, then it generates more than one such tuple.  The tuples are then 
+sorted lexicographically (first by gene id, then by UMI, and then by count).  Any UMI that aligns to only 
+a single gene is assigned to that gene.  UMIs that align to more than one gene are assigned to the gene 
+with the highest count for this UMI.  If there is a tie for the highest count gene for this UMI, then the 
+corresponding reads are simply discarded.
+
+output
+------
+
+The output of the ``quant`` command consists of 3 files: ``barcodes.txt``,
+``counts.mtx`` and ``gene_names.txt``. The ``counts.mtx`` is a matrix market
+coordinate format file where the number of *rows* is equal to the number of
+genes and the number of columns is equal to the number of *cells*. The header
+line encodes the number of rows, columns and non-zero entries. The subsequent
+lines (1-based indexing) encode the locations and values of the non-zero
+entries. The two other files provide the labels for the rows and columns of
+this matrix. The ``gene_names.txt`` file is a text file that contains the
+names of the rows of the matrix, in the order in which it is written, with
+one gene name written per line. The ``barcodes.txt`` file is a text file that
+contains the names of the columns of the matrix, in the order in which it is
+written, with one barcode name written per line.
+
+
+
+.. _alevin: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1670-y
diff --git a/libradicl/src/cellfilter.rs b/libradicl/src/cellfilter.rs
@@ -11,13 +11,13 @@ use crate as libradicl;
 use fasthash::sea::Hash64;
 use fasthash::RandomState;
 use libradicl::exit_codes;
-use num_format::{Locale, ToFormattedString};
 use needletail::bitkmer::*;
-use std::str::from_utf8;
+use num_format::{Locale, ToFormattedString};
 use std::collections::HashMap;
 use std::fs::File;
 use std::io::BufReader;
 use std::io::{BufWriter, Write};
+use std::str::from_utf8;
 
 pub enum CellFilterMethod {
     // cut off at this cell in
@@ -315,15 +315,27 @@ pub fn generate_permit_list(
 
     for (k, v) in permitted_map {
         let bc_mer: BitKmer = (k, ft_vals.bclen as u8);
-        writeln!(&mut writer, "{}\t{}", from_utf8(&bitmer_to_bytes(bc_mer)[..]).unwrap(), v).expect("couldn't write to output file.");
+        writeln!(
+            &mut writer,
+            "{}\t{}",
+            from_utf8(&bitmer_to_bytes(bc_mer)[..]).unwrap(),
+            v
+        )
+        .expect("couldn't write to output file.");
     }
 
     let o_path = parent.join("all_freq.tsv");
     let output = std::fs::File::create(&o_path).expect("could not create output.");
     let mut writer = BufWriter::new(&output);
     for (k, v) in hm {
         let bc_mer: BitKmer = (k, ft_vals.bclen as u8);
-        writeln!(&mut writer, "{}\t{}", from_utf8(&bitmer_to_bytes(bc_mer)[..]).unwrap(), v).expect("couldn't write to output file.");
+        writeln!(
+            &mut writer,
+            "{}\t{}",
+            from_utf8(&bitmer_to_bytes(bc_mer)[..]).unwrap(),
+            v
+        )
+        .expect("couldn't write to output file.");
     }
 
     let s_path = parent.join("permit_map.bin");

diff --git a/src/main.rs b/src/main.rs
@@ -11,7 +11,7 @@ extern crate slog;
 extern crate slog_term;
 
 use bio_types::strand::Strand;
-use clap::{App, Arg};
+use clap::{crate_authors, crate_version, App, Arg};
 use libradicl::cellfilter::{generate_permit_list, CellFilterMethod};
 use libradicl::schema::ResolutionStrategy;
 use mimalloc::MiMalloc;
@@ -22,8 +22,8 @@ use std::unimplemented;
 #[global_allocator]
 static GLOBAL: MiMalloc = MiMalloc;
 
-static VERSION: &str = "0.0.1";
-static AUTHORS: &str = "Avi Srivastava, Rob Patro";
+// static VERSION: &str = "0.0.1";
+// static AUTHORS: &str = "Avi Srivastava, Rob Patro";
 
 #[allow(dead_code)]
 fn gen_random_kmer(k: usize) -> String {
@@ -40,14 +40,15 @@ fn gen_random_kmer(k: usize) -> String {
 
 fn main() {
     let max_num_threads: String = (num_cpus::get() as u32).to_string();
-
+    let crate_authors = crate_authors!("\n");
+    let version = crate_version!();
     // [] add command for just counting barcode frequency
     // [] add other algorithms for determining barcode cutoff
 
     let gen_app = App::new("generate-permit-list")
         .about("Generate a permit list of barcodes from a RAD file")
-        .version(VERSION)
-        .author(AUTHORS)
+        .version(version)
+        .author(crate_authors)
         .arg(Arg::from("-i, --input=<input>  'input RAD file'"))
         .arg(Arg::from(
             "-o, --output-dir=<output-dir>  'output directory'",
@@ -73,8 +74,8 @@ fn main() {
 
     let collate_app = App::new("collate")
     .about("Collate a RAD file by corrected cell barcode")
-    .version(VERSION)
-    .author(AUTHORS)
+    .version(version)
+    .author(crate_authors)
     .arg(Arg::from("-i, --input-dir=<input-dir> 'input directory made by generate-permit-list'"))
     .arg(Arg::from("-r, --rad-file=<rad-file> 'the RAD file to be collated'"))
     .arg(Arg::from("-m, --max-records=[max-records] 'the maximum number of read records to keep in memory at once'")
@@ -84,8 +85,8 @@ fn main() {
 
     let quant_app = App::new("quant")
     .about("Quantify expression from a collated RAD file")
-    .version(VERSION)
-    .author(AUTHORS)
+    .version(version)
+    .author(crate_authors)
     .arg(Arg::from("-i, --input-dir=<input-dir>  'input directory containing collated RAD file'"))
     .arg(Arg::from("-m, --tg-map=<tg-map>  'transcript to gene map'"))
     .arg(Arg::from("-o, --output-dir=<output-dir> 'output directory where quantification results will be written'"))
@@ -97,8 +98,8 @@ fn main() {
         .about("the resolution strategy by which molecules will be counted"));
 
     let opts = App::new("alevin-fry")
-        .version(VERSION)
-        .author(AUTHORS)
+        .version(version)
+        .author(crate_authors)
         .about("Process RAD files from the command line")
         .subcommand(gen_app)
         .subcommand(collate_app)