Skip to content

Commit

Permalink
Merge pull request #59 from COMBINE-lab/parsimony_usa
Browse files Browse the repository at this point in the history
Parsimony USA  mode + other changes
  • Loading branch information
rob-p committed May 30, 2022
2 parents 41770ae + e279153 commit 73c2733
Show file tree
Hide file tree
Showing 6 changed files with 417 additions and 244 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "alevin-fry"
version = "0.5.1"
version = "0.6.0"
authors = [
"Avi Srivastava <avi.srivastava@nyu.edu>",
"Hirak Sarkar <hirak_sarkar@hms.harvard.edu>",
Expand Down
107 changes: 102 additions & 5 deletions src/eq_class.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ pub struct CellEqClass<'a> {
pub umis: Vec<(u64, u32)>,
}

#[derive(Debug, Clone, Copy)]
pub enum EqMapType {
TranscriptLevel,
GeneLevel,
}

#[derive(Debug)]
pub struct EqMapEntry {
pub umis: Vec<(u64, u32)>,
Expand Down Expand Up @@ -212,6 +218,7 @@ pub struct EqMap {
// all transcripts
pub ref_labels: Vec<u32>,
eqid_map: HashMap<Vec<u32>, u32, ahash::RandomState>,
pub map_type: EqMapType,
}

impl EqMap {
Expand All @@ -232,11 +239,11 @@ impl EqMap {

self.ref_offsets.clear();
self.ref_labels.clear();

// keep map_type
//self.eqid_map.clear();
}

pub fn new(nref_in: u32) -> EqMap {
pub fn new(nref_in: u32, map_type: EqMapType) -> EqMap {
let rand_state = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
EqMap {
eqc_info: vec![], //HashMap::with_hasher(rs),
Expand All @@ -248,6 +255,7 @@ impl EqMap {
ref_offsets: vec![],
ref_labels: vec![],
eqid_map: HashMap::with_hasher(rand_state),
map_type,
}
}

Expand Down Expand Up @@ -335,6 +343,98 @@ impl EqMap {
}
}

pub fn init_from_chunk_gene_level(
&mut self,
cell_chunk: &mut rad_types::Chunk,
tid_to_gid: &[u32],
) {
self.eqid_map.clear();

let mut gvec: Vec<u32> = vec![];
// gather the equivalence class info
for r in &mut cell_chunk.reads {
// project from txp-level to gene-level
gvec.extend(r.refs.iter().map(|x| tid_to_gid[*x as usize]));
gvec.sort_unstable();
gvec.dedup();

match self.eqid_map.get_mut(&gvec) {
// if we've seen this equivalence class before, just add the new
// umi.
Some(v) => {
self.eqc_info[*v as usize].umis.push((r.umi, 1));
}
// otherwise, add the new umi, but we also have some extra bookkeeping
None => {
// each reference in this equivalence class label
// will have to point to this equivalence class id
let eq_num = self.eqc_info.len() as u32;
self.label_list_size += gvec.len();
for r in gvec.iter() {
let ridx = *r as usize;
self.label_counts[ridx] += 1;
//ref_to_eqid[*r as usize].push(eq_num);
}
self.eq_label_starts.push(self.eq_labels.len() as u32);
self.eq_labels.extend(&gvec);
self.eqc_info.push(EqMapEntry {
umis: vec![(r.umi, 1)],
eq_num,
});
self.eqid_map.insert(gvec.clone(), eq_num);
}
}

gvec.clear();
}
// final value to avoid special cases
self.eq_label_starts.push(self.eq_labels.len() as u32);

self.fill_ref_offsets();
self.fill_label_sizes();

// initially we inserted duplicate UMIs
// here, collapse them and keep track of their count
for idx in 0..self.num_eq_classes() {
// for each reference in this
// label, put it in the next free spot
// TODO: @k3yavi, can we avoid this copy?
let label = self.refs_for_eqc(idx as u32).to_vec();
//println!("{:?}", label);
for r in label {
self.ref_offsets[r as usize] -= 1;
self.ref_labels[self.ref_offsets[r as usize] as usize] = self.eqc_info[idx].eq_num;
}

let v = &mut self.eqc_info[idx];
// sort so dups are adjacent
v.umis.sort_unstable();
// we need a copy of the vector b/c we
// can't easily modify it in place
// at least I haven't seen how (@k3yavi, help here if you can).
let cv = v.umis.clone();
// since we have a copy, clear the original to fill it
// with the new contents.
v.umis.clear();

let mut count = 1;
let mut cur_elem = cv.first().unwrap().0;
for e in cv.iter().skip(1) {
if e.0 == cur_elem {
count += 1;
} else {
v.umis.push((cur_elem, count));
cur_elem = e.0;
count = 1;
}
}

// remember to push the last element, since we
// won't see a subsequent "different" element.
v.umis.push((cur_elem, count));
}
}

pub fn init_from_chunk(&mut self, cell_chunk: &mut rad_types::Chunk) {
/*
if cell_chunk.reads.len() < 10 {
Expand Down Expand Up @@ -393,15 +493,12 @@ impl EqMap {
// initially we inserted duplicate UMIs
// here, collapse them and keep track of their count
for idx in 0..self.num_eq_classes() {
//} self.eqc_info.iter_mut().enumerate() {

// for each reference in this
// label, put it in the next free spot
// TODO: @k3yavi, can we avoid this copy?
let label = self.refs_for_eqc(idx as u32).to_vec();
//println!("{:?}", label);
for r in label {
// k.iter() {
self.ref_offsets[r as usize] -= 1;
self.ref_labels[self.ref_offsets[r as usize] as usize] = self.eqc_info[idx].eq_num;
}
Expand Down
86 changes: 78 additions & 8 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
* License: 3-clause BSD, see https://opensource.org/licenses/BSD-3-Clause
*/

use anyhow::anyhow;
use anyhow::{anyhow, bail};
use bio_types::strand::Strand;
use clap::{arg, crate_authors, crate_version, Command};
use csv::Error as CSVError;
Expand Down Expand Up @@ -131,15 +131,29 @@ fn main() -> anyhow::Result<()> {
.arg(arg!(-b --"num-bootstraps" <NUMBOOTSTRAPS> "number of bootstraps to use").default_value("0"))
.arg(arg!(--"init-uniform" "flag for uniform sampling").requires("num-bootstraps").takes_value(false).required(false))
.arg(arg!(--"summary-stat" "flag for storing only summary statistics").requires("num-bootstraps").takes_value(false).required(false))
.arg(arg!(--"use-mtx" "flag for writing output matrix in matrix market instead of EDS").takes_value(false).required(false))
.arg(arg!(--"use-mtx" "flag for writing output matrix in matrix market format (default)").takes_value(false).required(false))
.arg(arg!(--"use-eds" "flag for writing output matrix in EDS format").takes_value(false).required(false).conflicts_with("use-mtx"))
.arg(arg!(--"quant-subset" <SFILE> "file containing list of barcodes to quantify, those not in this list will be ignored").required(false))
.arg(arg!(-r --resolution <RESOLUTION> "the resolution strategy by which molecules will be counted")
.possible_values(&["full", "trivial", "cr-like", "cr-like-em", "parsimony", "parsimony-em"])
.possible_values(&["full", "trivial", "cr-like", "cr-like-em", "parsimony", "parsimony-em", "parsimony-gene", "parsimony-gene-em"])
.ignore_case(true))
.arg(arg!(--"sa-model" "preferred model of splicing ambiguity")
.possible_values(&["prefer-ambig", "winner-take-all"])
.default_value("winner-take-all")
.hide(true))
.arg(arg!(--"umi-edit-dist" <EDIST> "the Hamming distance within which potentially colliding UMIs will be considered for correction")
.default_value_ifs(&[
("resolution", Some("cr-like"), Some("0")),
("resolution", Some("cr-like-em"), Some("0")),
("resolution", Some("trivial"), Some("0")),
("resolution", Some("parsimony"), Some("1")),
("resolution", Some("parsimony-em"), Some("1")),
("resolution", Some("full"), Some("1")),
("resolution", Some("parsimony-gene"), Some("1")),
("resolution", Some("parsimony-gene-em"), Some("1")),
])
.required(false)
.hide(true))
.arg(arg!(--"small-thresh" <SMALLTHRESH> "cells with fewer than these many reads will be resolved using a custom approach").default_value("10")
.hide(true));

Expand All @@ -154,7 +168,8 @@ fn main() -> anyhow::Result<()> {
.arg(arg!(-t --threads <THREADS> "number of threads to use for processing").default_value(&max_num_threads))
.arg(arg!(--usa "flag specifying that input equivalence classes were computed in USA mode").takes_value(false).required(false))
.arg(arg!(--"quant-subset" <SFILE> "file containing list of barcodes to quantify, those not in this list will be ignored").required(false))
.arg(arg!(--"use-mtx" "flag for writing output matrix in matrix market instead of EDS").takes_value(false).required(false));
.arg(arg!(--"use-mtx" "flag for writing output matrix in matrix market format (default)").takes_value(false).required(false))
.arg(arg!(--"use-eds" "flag for writing output matrix in EDS format").takes_value(false).required(false).conflicts_with("use-mtx"));

let opts = Command::new("alevin-fry")
.subcommand_required(true)
Expand Down Expand Up @@ -339,14 +354,66 @@ fn main() -> anyhow::Result<()> {
let init_uniform = t.is_present("init-uniform");
let summary_stat = t.is_present("summary-stat");
let dump_eq = t.is_present("dump-eqclasses");
let use_mtx = t.is_present("use-mtx");
let use_mtx = !t.is_present("use-eds");
let input_dir: String = t.value_of_t("input-dir").unwrap();
let output_dir = t.value_of_t("output-dir").unwrap();
let tg_map = t.value_of_t("tg-map").unwrap();
let resolution: ResolutionStrategy = t.value_of_t("resolution").unwrap();
let sa_model: SplicedAmbiguityModel = t.value_of_t("sa-model").unwrap();
let small_thresh = t.value_of_t("small-thresh").unwrap();
let filter_list = t.value_of("quant-subset");
let umi_edit_dist: u32 = t.value_of_t("umi-edit-dist").unwrap();
let mut pug_exact_umi = false;

match umi_edit_dist {
0 => {
match resolution {
ResolutionStrategy::Trivial
| ResolutionStrategy::CellRangerLike
| ResolutionStrategy::CellRangerLikeEm => {
// already false, not pug_exact_umi because
// these methods don't use PUG
}
ResolutionStrategy::Parsimony
| ResolutionStrategy::ParsimonyEm
| ResolutionStrategy::ParsimonyGene
| ResolutionStrategy::ParsimonyGeneEm => {
pug_exact_umi = true;
}
}
}
1 => {
match resolution {
ResolutionStrategy::Trivial
| ResolutionStrategy::CellRangerLike
| ResolutionStrategy::CellRangerLikeEm => {
// these methods don't currently support 1 edit UMIs
crit!(
log,
"\n\nResolution strategy {:?} doesn't currently support 1-edit UMI resolution",
resolution
);
bail!("Invalid command line option");
}
ResolutionStrategy::Parsimony
| ResolutionStrategy::ParsimonyEm
| ResolutionStrategy::ParsimonyGene
| ResolutionStrategy::ParsimonyGeneEm => {
pug_exact_umi = false;
}
}
}
j => {
// no method currently supported edit distance 2 or greater correction
crit!(
log,
"\n\nResolution strategy {:?} doesn't currently support {}-edit UMI resolution",
resolution,
j
);
bail!("Invalid command line option");
}
}

if dump_eq && (resolution == ResolutionStrategy::Trivial) {
crit!(
Expand All @@ -358,12 +425,14 @@ fn main() -> anyhow::Result<()> {

if num_bootstraps > 0 {
match resolution {
ResolutionStrategy::CellRangerLikeEm | ResolutionStrategy::Full => {
ResolutionStrategy::CellRangerLikeEm
| ResolutionStrategy::ParsimonyEm
| ResolutionStrategy::ParsimonyGeneEm => {
// sounds good
}
_ => {
eprintln!(
"\n\nThe num_bootstraps argument was set to {}, but bootstrapping can only be used with the cr-like-em or full resolution strategies",
"\n\nThe num_bootstraps argument was set to {}, but bootstrapping can only be used with the cr-like-em, parsimony-em, or parsimony-gene-em resolution strategies",
num_bootstraps
);
std::process::exit(1);
Expand Down Expand Up @@ -436,6 +505,7 @@ fn main() -> anyhow::Result<()> {
dump_eq,
use_mtx,
resolution,
pug_exact_umi,
sa_model,
small_thresh,
filter_list,
Expand Down Expand Up @@ -478,7 +548,7 @@ fn main() -> anyhow::Result<()> {
// and output a target-by-cell count matrix.
if let Some(t) = opts.subcommand_matches("infer") {
let num_threads = t.value_of_t("threads").unwrap();
let use_mtx = t.is_present("use-mtx");
let use_mtx = !t.is_present("use-eds");
let output_dir = t.value_of_t("output-dir").unwrap();
let count_mat = t.value_of_t("count-mat").unwrap();
let eq_label_file = t.value_of_t("eq-labels").unwrap();
Expand Down

0 comments on commit 73c2733

Please sign in to comment.