Permalink
Fetching contributors…
Cannot retrieve contributors at this time
3193 lines (3091 sloc) 111 KB
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <cassert>
#include <stdexcept>
#include <seqan/find.h>
#include <getopt.h>
#include <vector>
#include <time.h>
#include "alphabet.h"
#include "assert_helpers.h"
#include "endian_swap.h"
#include "ebwt.h"
#include "formats.h"
#include "sequence_io.h"
#include "tokenize.h"
#include "hit.h"
#include "pat.h"
#include "bitset.h"
#include "threading.h"
#include "range_cache.h"
#include "refmap.h"
#include "annot.h"
#include "aligner.h"
#include "aligner_0mm.h"
#include "aligner_1mm.h"
#include "aligner_23mm.h"
#include "aligner_seed_mm.h"
#include "aligner_metrics.h"
#include "sam.h"
#include "ebwt_search.h"
#ifdef CHUD_PROFILING
#include <CHUD/CHUD.h>
#endif
#ifdef WITH_TBB
#include <tbb/compat/thread>
#endif
using namespace std;
using namespace seqan;
static vector<string> mates1; // mated reads (first mate)
static vector<string> mates2; // mated reads (second mate)
static vector<string> mates12; // mated reads (1st/2nd interleaved in 1 file)
static string adjustedEbwtFileBase;
static bool verbose; // be talkative
static bool startVerbose; // be talkative at startup
bool quiet; // print nothing but the alignments
static int sanityCheck; // enable expensive sanity checks
static int format; // default read format is FASTQ
static string origString; // reference text, or filename(s)
static int seed; // srandom() seed
static int timing; // whether to report basic timing data
static bool allHits; // for multihits, report just one
static bool rangeMode; // report BWT ranges instead of ref locs
static int showVersion; // just print version and quit?
static int ipause; // pause before maching?
static uint32_t qUpto; // max # of queries to read
static int trim5; // amount to trim from 5' end
static int trim3; // amount to trim from 3' end
static int reportOpps; // whether to report # of other mappings
static int offRate; // keep default offRate
static int isaRate; // keep default isaRate
static int mismatches; // allow 0 mismatches by default
static char *patDumpfile; // filename to dump patterns to
static bool solexaQuals; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
static bool phred64Quals; // quality chars are phred, but must subtract 64 (not 33)
static bool integerQuals; // quality strings are space-separated strings of integers, not ASCII
static int maqLike; // do maq-like searching
static int seedLen; // seed length (changed in Maq 0.6.4 from 24)
static int seedMms; // # mismatches allowed in seed (maq's -n)
static int qualThresh; // max qual-weighted hamming dist (maq's -e)
static int maxBtsBetter; // max # backtracks allowed in half-and-half mode
static int maxBts; // max # backtracks allowed in half-and-half mode
static int nthreads; // number of pthreads operating concurrently
static output_types outType; // style of output
static bool randReadsNoSync; // true -> generate reads from per-thread random source
static int numRandomReads; // # random reads (see Random*PatternSource in pat.h)
static int lenRandomReads; // len of random reads (see Random*PatternSource in pat.h)
static bool noRefNames; // true -> print reference indexes; not names
static string dumpAlBase; // basename of same-format files to dump aligned reads to
static string dumpUnalBase; // basename of same-format files to dump unaligned reads to
static string dumpMaxBase; // basename of same-format files to dump reads with more than -m valid alignments to
static uint32_t khits; // number of hits per read; >1 is much slower
static uint32_t mhits; // don't report any hits if there are > mhits
static bool better; // true -> guarantee alignments from best possible stratum
static bool strata; // true -> don't stop at stratum boundaries
static bool refOut; // if true, alignments go to per-ref files
static int partitionSz; // output a partitioning key in first field
static int readsPerBatch; // # reads to read from input file at once
static int outBatchSz; // # alignments to write to output file at once
static bool noMaqRound; // true -> don't round quals to nearest 10 like maq
static bool fileParallel; // separate threads read separate input files in parallel
static bool useShmem; // use shared memory to hold the index
static bool useMm; // use memory-mapped files to hold the index
static bool mmSweep; // sweep through memory-mapped files immediately after mapping
static bool stateful; // use stateful aligners
static uint32_t prefetchWidth; // number of reads to process in parallel w/ --stateful
static uint32_t minInsert; // minimum insert size (Maq = 0, SOAP = 400)
static uint32_t maxInsert; // maximum insert size (Maq = 250, SOAP = 600)
static bool mate1fw; // -1 mate aligns in fw orientation on fw strand
static bool mate2fw; // -2 mate aligns in rc orientation on fw strand
static bool mateFwSet; // true -> user set --ff/--fr/--rf
static uint32_t mixedThresh; // threshold for when to switch to paired-end mixed mode (see aligner.h)
static uint32_t mixedAttemptLim; // number of attempts to make in "mixed mode" before giving up on orientation
static bool dontReconcileMates; // suppress pairwise all-versus-all way of resolving mates
static uint32_t cacheLimit; // ranges w/ size > limit will be cached
static uint32_t cacheSize; // # words per range cache
static int offBase; // offsets are 0-based by default, but configurable
static bool tryHard; // set very high maxBts, mixedAttemptLim
static uint32_t skipReads; // # reads/read pairs to skip
static bool nofw; // don't align fw orientation of read
static bool norc; // don't align rc orientation of read
static bool strandFix; // attempt to fix strand bias
static bool stats; // print performance stats
static int chunkPoolMegabytes; // max MB to dedicate to best-first search frames per thread
static int chunkSz; // size of single chunk disbursed by ChunkPool
static bool chunkVerbose; // have chunk allocator output status messages?
static bool useV1;
static bool reportSe;
static const char * refMapFile; // file containing a map from index coordinates to another coordinate system
static const char * annotMapFile; // file containing a map from reference coordinates to annotations
static size_t fastaContLen;
static size_t fastaContFreq;
static bool hadoopOut; // print Hadoop status and summary messages
static bool fullRef;
static bool samNoQnameTrunc; // don't truncate QNAME field at first whitespace
static bool samNoHead; // don't print any header lines in SAM output
static bool samNoSQ; // don't print @SQ header lines
bool color; // true -> inputs are colorspace
bool colorExEnds; // true -> nucleotides on either end of decoded cspace alignment should be excluded
static string rgs; // SAM outputs for @RG header line
int snpPhred; // probability of SNP, for scoring colorspace alignments
static Bitset suppressOuts(64); // output fields to suppress
static bool sampleMax; // whether to report a random alignment when maxed-out via -m/-M
static int defaultMapq; // default mapping quality to print in SAM mode
bool colorSeq; // true -> show colorspace alignments as colors, not decoded bases
bool colorQual; // true -> show colorspace qualities as original quals, not decoded quals
static bool printCost; // true -> print stratum and cost
bool showSeed;
static vector<string> qualities;
static vector<string> qualities1;
static vector<string> qualities2;
static string wrapper; // Type of wrapper script
bool gAllowMateContainment;
bool gReportColorPrimer;
bool noUnal; // don't print unaligned reads
MUTEX_T gLock;
static void resetOptions() {
mates1.clear();
mates2.clear();
mates12.clear();
adjustedEbwtFileBase = "";
verbose = 0;
startVerbose = 0;
quiet = false;
sanityCheck = 0; // enable expensive sanity checks
format = FASTQ; // default read format is FASTQ
origString = ""; // reference text, or filename(s)
seed = 0; // srandom() seed
timing = 0; // whether to report basic timing data
allHits = false; // for multihits, report just one
rangeMode = false; // report BWT ranges instead of ref locs
showVersion = 0; // just print version and quit?
ipause = 0; // pause before maching?
qUpto = 0xffffffff; // max # of queries to read
trim5 = 0; // amount to trim from 5' end
trim3 = 0; // amount to trim from 3' end
reportOpps = 0; // whether to report # of other mappings
offRate = -1; // keep default offRate
isaRate = -1; // keep default isaRate
mismatches = 0; // allow 0 mismatches by default
patDumpfile = NULL; // filename to dump patterns to
solexaQuals = false; // quality strings are solexa quals, not phred, and subtract 64 (not 33)
phred64Quals = false; // quality chars are phred, but must subtract 64 (not 33)
integerQuals = false; // quality strings are space-separated strings of integers, not ASCII
maqLike = 1; // do maq-like searching
seedLen = 28; // seed length (changed in Maq 0.6.4 from 24)
seedMms = 2; // # mismatches allowed in seed (maq's -n)
qualThresh = 70; // max qual-weighted hamming dist (maq's -e)
maxBtsBetter = 125; // max # backtracks allowed in half-and-half mode
maxBts = 800; // max # backtracks allowed in half-and-half mode
nthreads = 1; // number of pthreads operating concurrently
outType = OUTPUT_FULL; // style of output
randReadsNoSync = false; // true -> generate reads from per-thread random source
numRandomReads = 50000000; // # random reads (see Random*PatternSource in pat.h)
lenRandomReads = 35; // len of random reads (see Random*PatternSource in pat.h)
noRefNames = false; // true -> print reference indexes; not names
dumpAlBase = ""; // basename of same-format files to dump aligned reads to
dumpUnalBase = ""; // basename of same-format files to dump unaligned reads to
dumpMaxBase = ""; // basename of same-format files to dump reads with more than -m valid alignments to
khits = 1; // number of hits per read; >1 is much slower
mhits = 0xffffffff; // don't report any hits if there are > mhits
better = false; // true -> guarantee alignments from best possible stratum
strata = false; // true -> don't stop at stratum boundaries
refOut = false; // if true, alignments go to per-ref files
partitionSz = 0; // output a partitioning key in first field
readsPerBatch = 16; // # reads to read from input file at once
outBatchSz = 16; // # alignments to wrote to output file at once
noMaqRound = false; // true -> don't round quals to nearest 10 like maq
fileParallel = false; // separate threads read separate input files in parallel
useShmem = false; // use shared memory to hold the index
useMm = false; // use memory-mapped files to hold the index
mmSweep = false; // sweep through memory-mapped files immediately after mapping
stateful = false; // use stateful aligners
prefetchWidth = 1; // number of reads to process in parallel w/ --stateful
minInsert = 0; // minimum insert size (Maq = 0, SOAP = 400)
maxInsert = 250; // maximum insert size (Maq = 250, SOAP = 600)
mate1fw = true; // -1 mate aligns in fw orientation on fw strand
mate2fw = false; // -2 mate aligns in rc orientation on fw strand
mateFwSet = false; // true -> user set mate1fw/mate2fw with --ff/--fr/--rf
mixedThresh = 4; // threshold for when to switch to paired-end mixed mode (see aligner.h)
mixedAttemptLim = 100; // number of attempts to make in "mixed mode" before giving up on orientation
dontReconcileMates = true; // suppress pairwise all-versus-all way of resolving mates
cacheLimit = 5; // ranges w/ size > limit will be cached
cacheSize = 0; // # words per range cache
offBase = 0; // offsets are 0-based by default, but configurable
tryHard = false; // set very high maxBts, mixedAttemptLim
skipReads = 0; // # reads/read pairs to skip
nofw = false; // don't align fw orientation of read
norc = false; // don't align rc orientation of read
strandFix = true; // attempt to fix strand bias
stats = false; // print performance stats
chunkPoolMegabytes = 64; // max MB to dedicate to best-first search frames per thread
chunkSz = 256; // size of single chunk disbursed by ChunkPool (in KB)
chunkVerbose = false; // have chunk allocator output status messages?
useV1 = true;
reportSe = false;
refMapFile = NULL; // file containing a map from index coordinates to another coordinate system
annotMapFile = NULL; // file containing a map from reference coordinates to annotations
fastaContLen = 0;
fastaContFreq = 0;
hadoopOut = false; // print Hadoop status and summary messages
fullRef = false; // print entire reference name instead of just up to 1st space
samNoQnameTrunc = false; // don't truncate at first whitespace?
samNoHead = false; // don't print any header lines in SAM output
samNoSQ = false; // don't print @SQ header lines
color = false; // don't align in colorspace by default
colorExEnds = true; // true -> nucleotides on either end of decoded cspace alignment should be excluded
rgs = ""; // SAM outputs for @RG header line
snpPhred = 30; // probability of SNP, for scoring colorspace alignments
suppressOuts.clear(); // output fields to suppress
sampleMax = false;
defaultMapq = 255;
colorSeq = false; // true -> show colorspace alignments as colors, not decoded bases
colorQual = false; // true -> show colorspace qualities as original quals, not decoded quals
printCost = false; // true -> print cost and stratum
showSeed = false; // true -> print per-read pseudo-random seed
qualities.clear();
qualities1.clear();
qualities2.clear();
wrapper.clear();
gAllowMateContainment = false; // true -> alignments where one mate lies inside the other are valid
gReportColorPrimer = false; // true -> print flag with trimmed color primer and downstream color
noUnal = false; // true -> do not report unaligned reads
}
// mating constraints
static const char *short_options = "fF:qbzhcu:rv:s:at3:5:o:e:n:l:w:p:k:m:M:1:2:I:X:x:B:ySCQ:";
enum {
ARG_ORIG = 256,
ARG_SEED,
ARG_DUMP_PATS,
ARG_RANGE,
ARG_CONCISE,
ARG_SOLEXA_QUALS,
ARG_MAXBTS,
ARG_VERBOSE,
ARG_STARTVERBOSE,
ARG_QUIET,
ARG_NOOUT,
ARG_FAST,
ARG_AL,
ARG_UN,
ARG_MAXDUMP,
ARG_REFIDX,
ARG_SANITY,
ARG_OLDBEST,
ARG_BETTER,
ARG_BEST,
ARG_REFOUT,
ARG_ISARATE,
ARG_PARTITION,
ARG_READS_PER_BATCH,
ARG_integerQuals,
ARG_NOMAQROUND,
ARG_FILEPAR,
ARG_SHMEM,
ARG_MM,
ARG_MMSWEEP,
ARG_STATEFUL,
ARG_PREFETCH_WIDTH,
ARG_FF,
ARG_FR,
ARG_RF,
ARG_MIXED_ATTEMPTS,
ARG_NO_RECONCILE,
ARG_CACHE_LIM,
ARG_CACHE_SZ,
ARG_NO_FW,
ARG_NO_RC,
ARG_SKIP,
ARG_STRAND_FIX,
ARG_STATS,
ARG_ONETWO,
ARG_PHRED64,
ARG_PHRED33,
ARG_CHUNKMBS,
ARG_CHUNKSZ,
ARG_CHUNKVERBOSE,
ARG_STRATA,
ARG_PEV2,
ARG_REFMAP,
ARG_ANNOTMAP,
ARG_REPORTSE,
ARG_HADOOPOUT,
ARG_FUZZY,
ARG_FULLREF,
ARG_USAGE,
ARG_SNPPHRED,
ARG_SNPFRAC,
ARG_SAM_NO_QNAME_TRUNC,
ARG_SAM_NOHEAD,
ARG_SAM_NOSQ,
ARG_SAM_RG,
ARG_SUPPRESS_FIELDS,
ARG_DEFAULT_MAPQ,
ARG_COLOR_SEQ,
ARG_COLOR_QUAL,
ARG_COST,
ARG_COLOR_KEEP_ENDS,
ARG_SHOWSEED,
ARG_QUALS1,
ARG_QUALS2,
ARG_ALLOW_CONTAIN,
ARG_COLOR_PRIMER,
ARG_WRAPPER,
ARG_INTERLEAVED_FASTQ,
ARG_SAM_NO_UNAL,
};
static struct option long_options[] = {
{(char*)"verbose", no_argument, 0, ARG_VERBOSE},
{(char*)"startverbose", no_argument, 0, ARG_STARTVERBOSE},
{(char*)"quiet", no_argument, 0, ARG_QUIET},
{(char*)"sanity", no_argument, 0, ARG_SANITY},
{(char*)"pause", no_argument, &ipause, 1},
{(char*)"orig", required_argument, 0, ARG_ORIG},
{(char*)"all", no_argument, 0, 'a'},
{(char*)"concise", no_argument, 0, ARG_CONCISE},
{(char*)"noout", no_argument, 0, ARG_NOOUT},
{(char*)"solexa-quals", no_argument, 0, ARG_SOLEXA_QUALS},
{(char*)"integer-quals",no_argument, 0, ARG_integerQuals},
{(char*)"time", no_argument, 0, 't'},
{(char*)"trim3", required_argument, 0, '3'},
{(char*)"trim5", required_argument, 0, '5'},
{(char*)"seed", required_argument, 0, ARG_SEED},
{(char*)"qupto", required_argument, 0, 'u'},
{(char*)"al", required_argument, 0, ARG_AL},
{(char*)"un", required_argument, 0, ARG_UN},
{(char*)"max", required_argument, 0, ARG_MAXDUMP},
{(char*)"offrate", required_argument, 0, 'o'},
{(char*)"isarate", required_argument, 0, ARG_ISARATE},
{(char*)"reportopps", no_argument, &reportOpps, 1},
{(char*)"version", no_argument, &showVersion, 1},
{(char*)"reads-per-batch", required_argument, 0, ARG_READS_PER_BATCH},
{(char*)"dumppats", required_argument, 0, ARG_DUMP_PATS},
{(char*)"maqerr", required_argument, 0, 'e'},
{(char*)"seedlen", required_argument, 0, 'l'},
{(char*)"seedmms", required_argument, 0, 'n'},
{(char*)"filepar", no_argument, 0, ARG_FILEPAR},
{(char*)"help", no_argument, 0, 'h'},
{(char*)"threads", required_argument, 0, 'p'},
{(char*)"khits", required_argument, 0, 'k'},
{(char*)"mhits", required_argument, 0, 'm'},
{(char*)"minins", required_argument, 0, 'I'},
{(char*)"maxins", required_argument, 0, 'X'},
{(char*)"quals", required_argument, 0, 'Q'},
{(char*)"Q1", required_argument, 0, ARG_QUALS1},
{(char*)"Q2", required_argument, 0, ARG_QUALS2},
{(char*)"best", no_argument, 0, ARG_BEST},
{(char*)"better", no_argument, 0, ARG_BETTER},
{(char*)"oldbest", no_argument, 0, ARG_OLDBEST},
{(char*)"strata", no_argument, 0, ARG_STRATA},
{(char*)"nomaqround", no_argument, 0, ARG_NOMAQROUND},
{(char*)"refidx", no_argument, 0, ARG_REFIDX},
{(char*)"range", no_argument, 0, ARG_RANGE},
{(char*)"maxbts", required_argument, 0, ARG_MAXBTS},
{(char*)"phased", no_argument, 0, 'z'},
{(char*)"partition", required_argument, 0, ARG_PARTITION},
{(char*)"stateful", no_argument, 0, ARG_STATEFUL},
{(char*)"prewidth", required_argument, 0, ARG_PREFETCH_WIDTH},
{(char*)"ff", no_argument, 0, ARG_FF},
{(char*)"fr", no_argument, 0, ARG_FR},
{(char*)"rf", no_argument, 0, ARG_RF},
{(char*)"mixthresh", required_argument, 0, 'x'},
{(char*)"pairtries", required_argument, 0, ARG_MIXED_ATTEMPTS},
{(char*)"noreconcile", no_argument, 0, ARG_NO_RECONCILE},
{(char*)"cachelim", required_argument, 0, ARG_CACHE_LIM},
{(char*)"cachesz", required_argument, 0, ARG_CACHE_SZ},
{(char*)"nofw", no_argument, 0, ARG_NO_FW},
{(char*)"norc", no_argument, 0, ARG_NO_RC},
{(char*)"offbase", required_argument, 0, 'B'},
{(char*)"tryhard", no_argument, 0, 'y'},
{(char*)"skip", required_argument, 0, 's'},
{(char*)"strandfix", no_argument, 0, ARG_STRAND_FIX},
{(char*)"stats", no_argument, 0, ARG_STATS},
{(char*)"12", required_argument, 0, ARG_ONETWO},
{(char*)"phred33-quals", no_argument, 0, ARG_PHRED33},
{(char*)"phred64-quals", no_argument, 0, ARG_PHRED64},
{(char*)"solexa1.3-quals", no_argument, 0, ARG_PHRED64},
{(char*)"chunkmbs", required_argument, 0, ARG_CHUNKMBS},
{(char*)"chunksz", required_argument, 0, ARG_CHUNKSZ},
{(char*)"chunkverbose", no_argument, 0, ARG_CHUNKVERBOSE},
{(char*)"mm", no_argument, 0, ARG_MM},
{(char*)"shmem", no_argument, 0, ARG_SHMEM},
{(char*)"mmsweep", no_argument, 0, ARG_MMSWEEP},
{(char*)"pev2", no_argument, 0, ARG_PEV2},
{(char*)"refmap", required_argument, 0, ARG_REFMAP},
{(char*)"annotmap", required_argument, 0, ARG_ANNOTMAP},
{(char*)"reportse", no_argument, 0, ARG_REPORTSE},
{(char*)"hadoopout", no_argument, 0, ARG_HADOOPOUT},
{(char*)"fullref", no_argument, 0, ARG_FULLREF},
{(char*)"usage", no_argument, 0, ARG_USAGE},
{(char*)"sam", no_argument, 0, 'S'},
{(char*)"sam-no-qname-trunc", no_argument, 0, ARG_SAM_NO_QNAME_TRUNC},
{(char*)"sam-nohead", no_argument, 0, ARG_SAM_NOHEAD},
{(char*)"sam-nosq", no_argument, 0, ARG_SAM_NOSQ},
{(char*)"sam-noSQ", no_argument, 0, ARG_SAM_NOSQ},
{(char*)"color", no_argument, 0, 'C'},
{(char*)"sam-RG", required_argument, 0, ARG_SAM_RG},
{(char*)"snpphred", required_argument, 0, ARG_SNPPHRED},
{(char*)"snpfrac", required_argument, 0, ARG_SNPFRAC},
{(char*)"suppress", required_argument, 0, ARG_SUPPRESS_FIELDS},
{(char*)"mapq", required_argument, 0, ARG_DEFAULT_MAPQ},
{(char*)"col-cseq", no_argument, 0, ARG_COLOR_SEQ},
{(char*)"col-cqual", no_argument, 0, ARG_COLOR_QUAL},
{(char*)"col-keepends", no_argument, 0, ARG_COLOR_KEEP_ENDS},
{(char*)"cost", no_argument, 0, ARG_COST},
{(char*)"showseed", no_argument, 0, ARG_SHOWSEED},
{(char*)"allow-contain",no_argument, 0, ARG_ALLOW_CONTAIN},
{(char*)"col-primer", no_argument, 0, ARG_COLOR_PRIMER},
{(char*)"wrapper", required_argument, 0, ARG_WRAPPER},
{(char*)"interleaved", required_argument, 0, ARG_INTERLEAVED_FASTQ},
{(char*)"no-unal", no_argument, 0, ARG_SAM_NO_UNAL},
{(char*)0, 0, 0, 0} // terminator
};
/**
* Print a summary usage message to the provided output stream.
*/
static void printUsage(ostream& out) {
#ifdef BOWTIE_64BIT_INDEX
string tool_name = "bowtie-align-l";
#else
string tool_name = "bowtie-align-s";
#endif
if(wrapper == "basic-0") {
tool_name = "bowtie";
}
out << "Usage: " << endl
<< tool_name << " [options]* <ebwt> {-1 <m1> -2 <m2> | --12 <r> | --interleaved <i> | <s>} [<hit>]" << endl
<< endl
<< " <m1> Comma-separated list of files containing upstream mates (or the" << endl
<< " sequences themselves, if -c is set) paired with mates in <m2>" << endl
<< " <m2> Comma-separated list of files containing downstream mates (or the" << endl
<< " sequences themselves if -c is set) paired with mates in <m1>" << endl
<< " <r> Comma-separated list of files containing Crossbow-style reads. Can be" << endl
<< " a mixture of paired and unpaired. Specify \"-\" for stdin." << endl
<< " <i> Files with interleaved paired-end FASTQ reads." << endl
<< " <s> Comma-separated list of files containing unpaired reads, or the" << endl
<< " sequences themselves, if -c is set. Specify \"-\" for stdin." << endl
<< " <hit> File to write hits to (default: stdout)" << endl
<< "Input:" << endl
<< " -q query input files are FASTQ .fq/.fastq (default)" << endl
<< " -f query input files are (multi-)FASTA .fa/.mfa" << endl
<< " -r query input files are raw one-sequence-per-line" << endl
<< " -c query sequences given on cmd line (as <mates>, <singles>)" << endl
<< " -C reads and index are in colorspace" << endl
<< " -Q/--quals <file> QV file(s) corresponding to CSFASTA inputs; use with -f -C" << endl
<< " --Q1/--Q2 <file> same as -Q, but for mate files 1 and 2 respectively" << endl
<< " -s/--skip <int> skip the first <int> reads/pairs in the input" << endl
<< " -u/--qupto <int> stop after first <int> reads/pairs (excl. skipped reads)" << endl
<< " -5/--trim5 <int> trim <int> bases from 5' (left) end of reads" << endl
<< " -3/--trim3 <int> trim <int> bases from 3' (right) end of reads" << endl
<< " --phred33-quals input quals are Phred+33 (default)" << endl
<< " --phred64-quals input quals are Phred+64 (same as --solexa1.3-quals)" << endl
<< " --solexa-quals input quals are from GA Pipeline ver. < 1.3" << endl
<< " --solexa1.3-quals input quals are from GA Pipeline ver. >= 1.3" << endl
<< " --integer-quals qualities are given as space-separated integers (not ASCII)" << endl;
if(wrapper == "basic-0") {
out << " --large-index force usage of a 'large' index, even if a small one is present" << endl;
}
out << "Alignment:" << endl
<< " -v <int> report end-to-end hits w/ <=v mismatches; ignore qualities" << endl
<< " or" << endl
<< " -n/--seedmms <int> max mismatches in seed (can be 0-3, default: -n 2)" << endl
<< " -e/--maqerr <int> max sum of mismatch quals across alignment for -n (def: 70)" << endl
<< " -l/--seedlen <int> seed length for -n (default: 28)" << endl
<< " --nomaqround disable Maq-like quality rounding for -n (nearest 10 <= 30)" << endl
<< " -I/--minins <int> minimum insert size for paired-end alignment (default: 0)" << endl
<< " -X/--maxins <int> maximum insert size for paired-end alignment (default: 250)" << endl
<< " --fr/--rf/--ff -1, -2 mates align fw/rev, rev/fw, fw/fw (default: --fr)" << endl
<< " --nofw/--norc do not align to forward/reverse-complement reference strand" << endl
<< " --maxbts <int> max # backtracks for -n 2/3 (default: 125, 800 for --best)" << endl
<< " --pairtries <int> max # attempts to find mate for anchor hit (default: 100)" << endl
<< " -y/--tryhard try hard to find valid alignments, at the expense of speed" << endl
<< " --chunkmbs <int> max megabytes of RAM for best-first search frames (def: 64)" << endl
<< " --reads-per-batch # of reads to read from input file at once (default: 16)" << endl
<< "Reporting:" << endl
<< " -k <int> report up to <int> good alignments per read (default: 1)" << endl
<< " -a/--all report all alignments per read (much slower than low -k)" << endl
<< " -m <int> suppress all alignments if > <int> exist (def: no limit)" << endl
<< " -M <int> like -m, but reports 1 random hit (MAPQ=0); requires --best" << endl
<< " --best hits guaranteed best stratum; ties broken by quality" << endl
<< " --strata hits in sub-optimal strata aren't reported (requires --best)" << endl
<< "Output:" << endl
<< " -t/--time print wall-clock time taken by search phases" << endl
<< " -B/--offbase <int> leftmost ref offset = <int> in bowtie output (default: 0)" << endl
<< " --quiet print nothing but the alignments" << endl
<< " --refidx refer to ref. seqs by 0-based index rather than name" << endl
<< " --al <fname> write aligned reads/pairs to file(s) <fname>" << endl
<< " --un <fname> write unaligned reads/pairs to file(s) <fname>" << endl
<< " --no-unal suppress SAM records for unaligned reads" << endl
<< " --max <fname> write reads/pairs over -m limit to file(s) <fname>" << endl
<< " --suppress <cols> suppresses given columns (comma-delim'ed) in default output" << endl
<< " --fullref write entire ref name (default: only up to 1st space)" << endl
<< "Colorspace:" << endl
<< " --snpphred <int> Phred penalty for SNP when decoding colorspace (def: 30)" << endl
<< " or" << endl
<< " --snpfrac <dec> approx. fraction of SNP bases (e.g. 0.001); sets --snpphred" << endl
<< " --col-cseq print aligned colorspace seqs as colors, not decoded bases" << endl
<< " --col-cqual print original colorspace quals, not decoded quals" << endl
<< " --col-keepends keep nucleotides at extreme ends of decoded alignment" << endl
<< "SAM:" << endl
<< " -S/--sam write hits in SAM format" << endl
<< " --mapq <int> default mapping quality (MAPQ) to print for SAM alignments" << endl
<< " --sam-nohead supppress header lines (starting with @) for SAM output" << endl
<< " --sam-nosq supppress @SQ header lines for SAM output" << endl
<< " --sam-RG <text> add <text> (usually \"lab=value\") to @RG line of SAM header" << endl
<< "Performance:" << endl
<< " -o/--offrate <int> override offrate of index; must be >= index's offrate" << endl
<< " -p/--threads <int> number of alignment threads to launch (default: 1)" << endl
#ifdef BOWTIE_MM
<< " --mm use memory-mapped I/O for index; many 'bowtie's can share" << endl
#endif
#ifdef BOWTIE_SHARED_MEM
<< " --shmem use shared mem for index; many 'bowtie's can share" << endl
#endif
<< "Other:" << endl
<< " --seed <int> seed for random number generator" << endl
<< " --verbose verbose output (for debugging)" << endl
<< " --version print version information and quit" << endl
<< " -h/--help print this usage message" << endl
;
if(wrapper.empty()) {
cerr << endl
<< "*** Warning ***" << endl
<< tool_name << " was run directly. It is recommended that you run the wrapper script 'bowtie' instead." << endl
<< endl;
}
}
/**
* Parse an int out of optarg and enforce that it be at least 'lower';
* if it is less than 'lower', than output the given error message and
* exit with an error and a usage message.
*/
static int parseInt(int lower, int upper, const char *errmsg, const char *arg) {
long l;
char *endPtr= NULL;
l = strtol(arg, &endPtr, 10);
if (endPtr != NULL) {
if (l < lower || l > upper) {
cerr << errmsg << endl;
printUsage(cerr);
throw 1;
}
return (int32_t)l;
}
cerr << errmsg << endl;
printUsage(cerr);
throw 1;
return -1;
}
/**
* Parse from optarg by default.
*/
static int parseInt(int lower, const char *errmsg) {
return parseInt(lower, INT_MAX, errmsg, optarg);
}
/**
* Upper is INT_MAX by default.
*/
static int parseInt(int lower, const char *errmsg, const char *arg) {
return parseInt(lower, INT_MAX, errmsg, arg);
}
/**
* Upper is INT_MAX, parse from optarg by default.
*/
static int parseInt(int lower, int upper, const char *errmsg) {
return parseInt(lower, upper, errmsg, optarg);
}
/**
* Parse a T string 'str'.
*/
template<typename T>
T parse(const char *s) {
T tmp;
stringstream ss(s);
ss >> tmp;
return tmp;
}
/**
* Parse a pair of Ts from a string, 'str', delimited with 'delim'.
*/
template<typename T>
pair<T, T> parsePair(const char *str, char delim) {
string s(str);
vector<string> ss;
tokenize(s, delim, ss);
pair<T, T> ret;
ret.first = parse<T>(ss[0].c_str());
ret.second = parse<T>(ss[1].c_str());
return ret;
}
/**
* Read command-line arguments
*/
static void parseOptions(int argc, const char **argv) {
int option_index = 0;
int next_option;
if(startVerbose) { cerr << "Parsing options: "; logTime(cerr, true); }
do {
next_option = getopt_long(
argc, const_cast<char**>(argv),
short_options, long_options, &option_index);
switch (next_option) {
case ARG_WRAPPER: wrapper = optarg; break;
case '1': tokenize(optarg, ",", mates1); break;
case '2': tokenize(optarg, ",", mates2); break;
case ARG_ONETWO: tokenize(optarg, ",", mates12); format = TAB_MATE; break;
case ARG_INTERLEAVED_FASTQ: tokenize(optarg, ",", mates12); format = INTERLEAVED; break;
case 'f': format = FASTA; break;
case 'F': {
format = FASTA_CONT;
pair<size_t, size_t> p = parsePair<size_t>(optarg, ',');
fastaContLen = p.first;
fastaContFreq = p.second;
break;
}
case 'q': format = FASTQ; break;
case 'r': format = RAW; break;
case 'c': format = CMDLINE; break;
case 'C': color = true; break;
case 'I':
minInsert = (uint32_t)parseInt(0, "-I arg must be positive");
break;
case 'X':
maxInsert = (uint32_t)parseInt(1, "-X arg must be at least 1");
break;
case 's':
skipReads = (uint32_t)parseInt(0, "-s arg must be positive");
break;
case ARG_FF: mate1fw = true; mate2fw = true; mateFwSet = true; break;
case ARG_RF: mate1fw = false; mate2fw = true; mateFwSet = true; break;
case ARG_FR: mate1fw = true; mate2fw = false; mateFwSet = true; break;
case ARG_RANGE: rangeMode = true; break;
case ARG_CONCISE: outType = OUTPUT_CONCISE; break;
case 'S': outType = OUTPUT_SAM; break;
case ARG_REFOUT: refOut = true; break;
case ARG_NOOUT: outType = OUTPUT_NONE; break;
case ARG_REFMAP: refMapFile = optarg; break;
case ARG_ANNOTMAP: annotMapFile = optarg; break;
case ARG_SHMEM: useShmem = true; break;
case ARG_COLOR_SEQ: colorSeq = true; break;
case ARG_COLOR_QUAL: colorQual = true; break;
case ARG_SHOWSEED: showSeed = true; break;
case ARG_ALLOW_CONTAIN: gAllowMateContainment = true; break;
case ARG_COLOR_PRIMER: gReportColorPrimer = true; break;
case ARG_SUPPRESS_FIELDS: {
vector<string> supp;
tokenize(optarg, ",", supp);
for(size_t i = 0; i < supp.size(); i++) {
int ii = parseInt(1, "--suppress arg must be at least 1", supp[i].c_str());
suppressOuts.set(ii-1);
}
break;
}
case ARG_MM: {
#ifdef BOWTIE_MM
useMm = true;
break;
#else
cerr << "Memory-mapped I/O mode is disabled because bowtie was not compiled with" << endl
<< "BOWTIE_MM defined. Memory-mapped I/O is not supported under Windows. If you" << endl
<< "would like to use memory-mapped I/O on a platform that supports it, please" << endl
<< "refrain from specifying BOWTIE_MM=0 when compiling Bowtie." << endl;
throw 1;
#endif
}
case ARG_MMSWEEP: mmSweep = true; break;
case ARG_HADOOPOUT: hadoopOut = true; break;
case ARG_AL: dumpAlBase = optarg; break;
case ARG_UN: dumpUnalBase = optarg; break;
case ARG_MAXDUMP: dumpMaxBase = optarg; break;
case ARG_SOLEXA_QUALS: solexaQuals = true; break;
case ARG_integerQuals: integerQuals = true; break;
case ARG_PHRED64: phred64Quals = true; break;
case ARG_PHRED33: solexaQuals = false; phred64Quals = false; break;
case ARG_NOMAQROUND: noMaqRound = true; break;
case ARG_COLOR_KEEP_ENDS: colorExEnds = false; break;
case ARG_SNPPHRED: snpPhred = parseInt(0, "--snpphred must be at least 0"); break;
case ARG_SNPFRAC: {
double p = parse<double>(optarg);
if(p <= 0.0) {
cerr << "Error: --snpfrac parameter must be > 0.0" << endl;
throw 1;
}
p = (log10(p) * -10);
snpPhred = (int)(p + 0.5);
if(snpPhred < 10)
cout << "snpPhred: " << snpPhred << endl;
break;
}
case 'z': {
cerr << "Error: -z/--phased mode is no longer supported" << endl;
throw 1;
}
case ARG_REFIDX: noRefNames = true; break;
case ARG_STATEFUL: stateful = true; break;
case ARG_REPORTSE: reportSe = true; break;
case ARG_FULLREF: fullRef = true; break;
case ARG_PREFETCH_WIDTH:
prefetchWidth = parseInt(1, "--prewidth must be at least 1");
break;
case 'B':
offBase = parseInt(-999999, "-B/--offbase cannot be a large negative number");
break;
case ARG_SEED:
seed = parseInt(0, "--seed arg must be at least 0");
break;
case 'u':
qUpto = (uint32_t)parseInt(1, "-u/--qupto arg must be at least 1");
break;
case 'k':
khits = (uint32_t)parseInt(1, "-k arg must be at least 1");
break;
case 'Q':
tokenize(optarg, ",", qualities);
integerQuals = true;
break;
case ARG_QUALS1:
tokenize(optarg, ",", qualities1);
integerQuals = true;
break;
case ARG_QUALS2:
tokenize(optarg, ",", qualities2);
integerQuals = true;
break;
case 'M':
sampleMax = true;
case 'm':
mhits = (uint32_t)parseInt(1, "-m arg must be at least 1");
break;
case 'x':
mixedThresh = (uint32_t)parseInt(0, "-x arg must be at least 0");
break;
case ARG_MIXED_ATTEMPTS:
mixedAttemptLim = (uint32_t)parseInt(1, "--pairtries arg must be at least 1");
break;
case ARG_CACHE_LIM:
cacheLimit = (uint32_t)parseInt(1, "--cachelim arg must be at least 1");
break;
case ARG_CACHE_SZ:
cacheSize = (uint32_t)parseInt(1, "--cachesz arg must be at least 1");
cacheSize *= (1024 * 1024); // convert from MB to B
break;
case ARG_NO_RECONCILE:
dontReconcileMates = true;
break;
case 'p':
nthreads = parseInt(1, "-p/--threads arg must be at least 1");
break;
case ARG_FILEPAR:
fileParallel = true;
break;
case 'v':
maqLike = 0;
mismatches = parseInt(0, 3, "-v arg must be at least 0 and at most 3");
break;
case '3': trim3 = parseInt(0, "-3/--trim3 arg must be at least 0"); break;
case '5': trim5 = parseInt(0, "-5/--trim5 arg must be at least 0"); break;
case 'o': offRate = parseInt(1, "-o/--offrate arg must be at least 1"); break;
case ARG_ISARATE: isaRate = parseInt(0, "--isarate arg must be at least 0"); break;
case 'e': qualThresh = parseInt(1, "-e/--err arg must be at least 1"); break;
case 'n': seedMms = parseInt(0, 3, "-n/--seedmms arg must be at least 0 and at most 3"); maqLike = 1; break;
case 'l': seedLen = parseInt(5, "-l/--seedlen arg must be at least 5"); break;
case 'h': printUsage(cout); throw 0; break;
case ARG_USAGE: printUsage(cout); throw 0; break;
case 'a': allHits = true; break;
case 'y': tryHard = true; break;
case ARG_CHUNKMBS: chunkPoolMegabytes = parseInt(1, "--chunkmbs arg must be at least 1"); break;
case ARG_CHUNKSZ: chunkSz = parseInt(1, "--chunksz arg must be at least 1"); break;
case ARG_CHUNKVERBOSE: chunkVerbose = true; break;
case ARG_BETTER: stateful = true; better = true; break;
case ARG_BEST: stateful = true; useV1 = false; break;
case ARG_STRATA: strata = true; break;
case ARG_VERBOSE: verbose = true; break;
case ARG_STARTVERBOSE: startVerbose = true; break;
case ARG_QUIET: quiet = true; break;
case ARG_SANITY: sanityCheck = true; break;
case 't': timing = true; break;
case ARG_NO_FW: nofw = true; break;
case ARG_NO_RC: norc = true; break;
case ARG_STATS: stats = true; break;
case ARG_PEV2: useV1 = false; break;
case ARG_SAM_NO_QNAME_TRUNC: samNoQnameTrunc = true; break;
case ARG_SAM_NOHEAD: samNoHead = true; break;
case ARG_SAM_NOSQ: samNoSQ = true; break;
case ARG_SAM_NO_UNAL: noUnal = true; break;
case ARG_SAM_RG: {
if(!rgs.empty()) rgs += '\t';
rgs += optarg;
break;
}
case ARG_COST: printCost = true; break;
case ARG_DEFAULT_MAPQ:
defaultMapq = parseInt(0, "--mapq must be positive");
break;
case ARG_MAXBTS: {
maxBts = parseInt(0, "--maxbts must be positive");
maxBtsBetter = maxBts;
break;
}
case ARG_DUMP_PATS: patDumpfile = optarg; break;
case ARG_STRAND_FIX: strandFix = true; break;
case ARG_PARTITION: partitionSz = parse<int>(optarg); break;
case ARG_READS_PER_BATCH: {
if(optarg == NULL || parse<int>(optarg) < 1) {
cerr << "--reads-per-batch arg must be at least 1" << endl;
printUsage(cerr);
throw 1;
}
// TODO: should output batch size be controlled separately?
readsPerBatch = outBatchSz = parse<int>(optarg);
break;
}
case ARG_ORIG:
if(optarg == NULL || strlen(optarg) == 0) {
cerr << "--orig arg must be followed by a string" << endl;
printUsage(cerr);
throw 1;
}
origString = optarg;
break;
case -1: break; /* Done with options. */
case 0:
if (long_options[option_index].flag != 0)
break;
default:
printUsage(cerr);
throw 1;
}
} while(next_option != -1);
//bool paired = mates1.size() > 0 || mates2.size() > 0 || mates12.size() > 0;
if(rangeMode) {
// Tell the Ebwt loader to ignore the suffix-array portion of
// the index. We don't need it because the user isn't asking
// for bowtie to report reference positions (just matrix
// ranges).
offRate = 32;
}
if(!maqLike && mismatches == 3) {
// Much faster than normal 3-mismatch mode
stateful = true;
}
if(mates1.size() != mates2.size()) {
cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << mates2.size() << endl
<< "mate files/sequences were specified with -2. The same number of mate files/" << endl
<< "sequences must be specified with -1 and -2." << endl;
throw 1;
}
if(qualities.size() && format != FASTA) {
cerr << "Error: one or more quality files were specified with -Q but -f was not" << endl
<< "enabled. -Q works only in combination with -f and -C." << endl;
throw 1;
}
if(qualities.size() && !color) {
cerr << "Error: one or more quality files were specified with -Q but -C was not" << endl
<< "enabled. -Q works only in combination with -f and -C." << endl;
throw 1;
}
if(qualities1.size() && format != FASTA) {
cerr << "Error: one or more quality files were specified with --Q1 but -f was not" << endl
<< "enabled. --Q1 works only in combination with -f and -C." << endl;
throw 1;
}
if(qualities1.size() && !color) {
cerr << "Error: one or more quality files were specified with --Q1 but -C was not" << endl
<< "enabled. --Q1 works only in combination with -f and -C." << endl;
throw 1;
}
if(qualities2.size() && format != FASTA) {
cerr << "Error: one or more quality files were specified with --Q2 but -f was not" << endl
<< "enabled. --Q2 works only in combination with -f and -C." << endl;
throw 1;
}
if(qualities2.size() && !color) {
cerr << "Error: one or more quality files were specified with --Q2 but -C was not" << endl
<< "enabled. --Q2 works only in combination with -f and -C." << endl;
throw 1;
}
if(qualities1.size() > 0 && mates1.size() != qualities1.size()) {
cerr << "Error: " << mates1.size() << " mate files/sequences were specified with -1, but " << qualities1.size() << endl
<< "quality files were specified with --Q1. The same number of mate and quality" << endl
<< "files must sequences must be specified with -1 and --Q1." << endl;
throw 1;
}
if(qualities2.size() > 0 && mates2.size() != qualities2.size()) {
cerr << "Error: " << mates2.size() << " mate files/sequences were specified with -2, but " << qualities2.size() << endl
<< "quality files were specified with --Q2. The same number of mate and quality" << endl
<< "files must sequences must be specified with -2 and --Q2." << endl;
throw 1;
}
// Check for duplicate mate input files
if(format != CMDLINE) {
for(size_t i = 0; i < mates1.size(); i++) {
for(size_t j = 0; j < mates2.size(); j++) {
if(mates1[i] == mates2[j] && !quiet) {
cerr << "Warning: Same mate file \"" << mates1[i] << "\" appears as argument to both -1 and -2" << endl;
}
}
}
}
if(tryHard) {
// Increase backtracking limit to huge number
maxBts = maxBtsBetter = INT_MAX;
// Increase number of paired-end scan attempts to huge number
mixedAttemptLim = UINT_MAX;
}
if(!stateful && sampleMax) {
if(!quiet) {
cerr << "Warning: -M was specified w/o --best; automatically enabling --best" << endl;
}
stateful = true;
}
if(strata && !stateful) {
cerr << "--strata must be combined with --best" << endl;
throw 1;
}
if(strata && !allHits && khits == 1 && mhits == 0xffffffff) {
cerr << "--strata has no effect unless combined with -m, -a, or -k N where N > 1" << endl;
throw 1;
}
// If both -s and -u are used, we need to adjust qUpto accordingly
// since it uses patid to know if we've reached the -u limit (and
// patids are all shifted up by skipReads characters)
if(qUpto + skipReads > qUpto) {
qUpto += skipReads;
}
if(useShmem && useMm && !quiet) {
cerr << "Warning: --shmem overrides --mm..." << endl;
useMm = false;
}
if(snpPhred <= 10 && color && !quiet) {
cerr << "Warning: the colorspace SNP penalty (--snpphred) is very low: " << snpPhred << endl;
}
if(outType == OUTPUT_SAM && refOut) {
cerr << "Error: --refout cannot be combined with -S/--sam" << endl;
throw 1;
}
if(!mateFwSet) {
if(color) {
// Set colorspace default (--ff)
mate1fw = true;
mate2fw = true;
} else {
// Set nucleotide space default (--fr)
mate1fw = true;
mate2fw = false;
}
}
if(outType != OUTPUT_FULL && suppressOuts.count() > 0 && !quiet) {
cerr << "Warning: Ignoring --suppress because output type is not default." << endl;
cerr << " --suppress is only available for the default output type." << endl;
suppressOuts.clear();
}
}
static const char *argv0 = NULL;
#define FINISH_READ(p) \
/* Don't do finishRead if the read isn't legit or if the read was skipped by the doneMask */ \
if(get_read_ret.first) { \
sink->finishRead(*p, true, !skipped); \
get_read_ret.first = false; \
} \
skipped = false;
/// Macro for getting the next read, possibly aborting depending on
/// whether the result is empty or the patid exceeds the limit, and
/// marshaling the read into convenient variables.
#define GET_READ(p) \
if(get_read_ret.second) break; \
get_read_ret = p->nextReadPair(); \
if(p->rdid() >= qUpto) { \
get_read_ret = make_pair(false, true); \
} \
if(!get_read_ret.first) { \
if(get_read_ret.second) { \
break; \
} \
continue; \
} \
String<Dna5>& patFw = p->bufa().patFw; \
patFw.data_begin += 0; /* suppress "unused" compiler warning */ \
String<Dna5>& patRc = p->bufa().patRc; \
patRc.data_begin += 0; /* suppress "unused" compiler warning */ \
String<char>& qual = p->bufa().qual; \
qual.data_begin += 0; /* suppress "unused" compiler warning */ \
String<char>& qualRev = p->bufa().qualRev; \
qualRev.data_begin += 0; /* suppress "unused" compiler warning */ \
String<Dna5>& patFwRev = p->bufa().patFwRev; \
patFwRev.data_begin += 0; /* suppress "unused" compiler warning */ \
String<Dna5>& patRcRev = p->bufa().patRcRev; \
patRcRev.data_begin += 0; /* suppress "unused" compiler warning */ \
String<char>& name = p->bufa().name; \
name.data_begin += 0; /* suppress "unused" compiler warning */ \
uint32_t patid = (uint32_t)p->rdid(); \
params.setPatId(patid);
#define WORKER_EXIT() \
patsrcFact->destroy(patsrc); \
delete patsrcFact; \
sinkFact->destroy(sink); \
delete sinkFact; \
return;
#ifdef CHUD_PROFILING
#define CHUD_START() chudStartRemotePerfMonitor("Bowtie");
#define CHUD_STOP() chudStopRemotePerfMonitor();
#else
#define CHUD_START()
#define CHUD_STOP()
#endif
/// Create a PatternSourcePerThread for the current thread according
/// to the global params and return a pointer to it
static PatternSourcePerThreadFactory*
createPatsrcFactory(PatternComposer& _patsrc, int tid, uint32_t max_buf) {
PatternSourcePerThreadFactory *patsrcFact;
patsrcFact = new PatternSourcePerThreadFactory(_patsrc, max_buf, skipReads, seed);
assert(patsrcFact != NULL);
return patsrcFact;
}
/**
* Allocate a HitSinkPerThreadFactory on the heap according to the
* global params and return a pointer to it.
*/
static HitSinkPerThreadFactory*
createSinkFactory(HitSink& _sink, size_t threadId) {
HitSinkPerThreadFactory *sink = NULL;
if(!strata) {
// Unstratified
if(!allHits) {
// First N good; "good" inherently ignores strata
sink = new NGoodHitSinkPerThreadFactory(_sink, khits, mhits, defaultMapq, threadId);
} else {
// All hits, spanning strata
sink = new AllHitSinkPerThreadFactory(_sink, mhits, defaultMapq, threadId);
}
} else {
// Stratified
assert(stateful);
if(!allHits) {
assert(stateful);
// Buffer best hits, assuming they're arriving in best-
// to-worst order
sink = new NBestFirstStratHitSinkPerThreadFactory(_sink, khits, mhits, defaultMapq, threadId);
} else {
assert(stateful);
// Buffer best hits, assuming they're arriving in best-
// to-worst order
sink = new NBestFirstStratHitSinkPerThreadFactory(_sink, 0xffffffff/2, mhits, defaultMapq, threadId);
}
}
assert(sink != NULL);
return sink;
}
/**
* Search through a single (forward) Ebwt index for exact end-to-end
* hits. Assumes that index is already loaded into memory.
*/
static PatternComposer* exactSearch_patsrc;
static HitSink* exactSearch_sink;
static Ebwt<String<Dna> >* exactSearch_ebwt;
static vector<String<Dna5> >* exactSearch_os;
static BitPairReference* exactSearch_refs;
#ifdef WITH_TBB
//void exactSearchWorker::operator()() const {
static void exactSearchWorker(void *vp) {
thread_tracking_pair *p = (thread_tracking_pair*) vp;
int tid = p->tid;
#else
static void exactSearchWorker(void *vp) {
int tid = *((int*)vp);
#endif
PatternComposer& _patsrc = *exactSearch_patsrc;
HitSink& _sink = *exactSearch_sink;
Ebwt<String<Dna> >& ebwt = *exactSearch_ebwt;
vector<String<Dna5> >& os = *exactSearch_os;
const BitPairReference* refs = exactSearch_refs;
// Per-thread initialization
PatternSourcePerThreadFactory *patsrcFact = createPatsrcFactory(_patsrc, tid, readsPerBatch);
PatternSourcePerThread *patsrc = patsrcFact->create();
HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, tid);
HitSinkPerThread* sink = sinkFact->create();
EbwtSearchParams<String<Dna> > params(
*sink, // HitSink
os, // reference sequences
true, // read is forward
true); // index is forward
GreedyDFSRangeSource bt(
&ebwt, params,
refs, // reference sequence (for colorspace)
0xffffffff, // qualThresh
0xffffffff, // max backtracks (no max)
0, // reportPartials (don't)
true, // reportExacts
rangeMode, // reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os,
false); // considerQuals
pair<bool, bool> get_read_ret = make_pair(false, false);
bool skipped = false;
#ifdef PER_THREAD_TIMING
uint64_t ncpu_changeovers = 0;
uint64_t nnuma_changeovers = 0;
int current_cpu = 0, current_node = 0;
get_cpu_and_node(current_cpu, current_node);
std::stringstream ss;
std::string msg;
ss << "thread: " << tid << " time: ";
msg = ss.str();
{
Timer timer(std::cout, msg.c_str());
#endif
while(true) {
#ifdef PER_THREAD_TIMING
int cpu = 0, node = 0;
get_cpu_and_node(cpu, node);
if(cpu != current_cpu) {
ncpu_changeovers++;
current_cpu = cpu;
}
if(node != current_node) {
nnuma_changeovers++;
current_node = node;
}
#endif
FINISH_READ(patsrc);
GET_READ(patsrc);
#include "search_exact.c"
}
FINISH_READ(patsrc);
#ifdef PER_THREAD_TIMING
ss.str("");
ss.clear();
ss << "thread: " << tid << " cpu_changeovers: " << ncpu_changeovers << std::endl
<< "thread: " << tid << " node_changeovers: " << nnuma_changeovers << std::endl;
std::cout << ss.str();
}
#endif
#ifdef WITH_TBB
p->done->fetch_and_add(1);
#endif
WORKER_EXIT();
}
/**
* A statefulness-aware worker driver. Uses UnpairedExactAlignerV1.
*/
#ifdef WITH_TBB
//void exactSearchWorkerStateful::operator()() const {
static void exactSearchWorkerStateful(void *vp) {
thread_tracking_pair *p = (thread_tracking_pair*) vp;
int tid = p->tid;
#else
static void exactSearchWorkerStateful(void *vp) {
int tid = *((int*)vp);
#endif
PatternComposer& _patsrc = *exactSearch_patsrc;
HitSink& _sink = *exactSearch_sink;
Ebwt<String<Dna> >& ebwt = *exactSearch_ebwt;
vector<String<Dna5> >& os = *exactSearch_os;
BitPairReference* refs = exactSearch_refs;
// Global initialization
PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid, readsPerBatch);
HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, tid);
ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
UnpairedExactAlignerV1Factory alSEfact(
ebwt,
NULL,
!nofw,
!norc,
_sink,
*sinkFact,
NULL, //&cacheFw,
NULL, //&cacheBw,
cacheLimit,
pool,
refs,
os,
!noMaqRound,
!better,
strandFix,
rangeMode,
verbose,
quiet,
seed);
PairedExactAlignerV1Factory alPEfact(
ebwt,
NULL,
color,
!nofw,
!norc,
useV1,
_sink,
*sinkFact,
mate1fw,
mate2fw,
minInsert,
maxInsert,
dontReconcileMates,
mhits, // for symCeiling
mixedThresh,
mixedAttemptLim,
NULL, //&cacheFw,
NULL, //&cacheBw,
cacheLimit,
pool,
refs, os,
reportSe,
!noMaqRound,
strandFix,
!better,
rangeMode,
verbose,
quiet,
seed);
{
MixedMultiAligner multi(
prefetchWidth,
qUpto,
alSEfact,
alPEfact,
*patsrcFact);
// Run that mother
multi.run(false, tid);
// MultiAligner must be destroyed before patsrcFact
}
delete patsrcFact;
delete sinkFact;
delete pool;
#ifdef WITH_TBB
p->done->fetch_and_add(1);
#endif
return;
}
#define SET_A_FW(bt, p, params) \
bt.setQuery(&p->bufa().patFw, &p->bufa().qual, &p->bufa().name); \
params.setFw(true);
#define SET_A_RC(bt, p, params) \
bt.setQuery(&p->bufa().patRc, &p->bufa().qualRev, &p->bufa().name); \
params.setFw(false);
#define SET_B_FW(bt, p, params) \
bt.setQuery(&p->bufb().patFw, &p->bufb().qual, &p->bufb().name); \
params.setFw(true);
#define SET_B_RC(bt, p, params) \
bt.setQuery(&p->bufb().patRc, &p->bufb().qualRev, &p->bufb().name); \
params.setFw(false);
/**
* Search through a single (forward) Ebwt index for exact end-to-end
* hits. Assumes that index is already loaded into memory.
*/
static void exactSearch(PatternComposer& _patsrc,
HitSink& _sink,
Ebwt<String<Dna> >& ebwt,
vector<String<Dna5> >& os)
{
exactSearch_patsrc = &_patsrc;
exactSearch_sink = &_sink;
exactSearch_ebwt = &ebwt;
exactSearch_os = &os;
assert(!ebwt.isInMemory());
{
// Load the rest of (vast majority of) the backward Ebwt into
// memory
Timer _t(cerr, "Time loading forward index: ", timing);
ebwt.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
}
BitPairReference *refs = NULL;
bool pair = mates1.size() > 0 || mates12.size() > 0;
if(color || (pair && mixedThresh < 0xffffffff)) {
Timer _t(cerr, "Time loading reference: ", timing);
refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
if(!refs->loaded()) throw 1;
}
exactSearch_refs = refs;
#ifdef WITH_TBB
AutoArray<std::thread*> threads(nthreads);
#else
AutoArray<tthread::thread*> threads(nthreads);
AutoArray<int> tids(nthreads);
#endif
#ifdef WITH_TBB
tbb::atomic<int> all_threads_done;
all_threads_done = 0;
#endif
CHUD_START();
{
Timer _t(cerr, "Time for 0-mismatch search: ", timing);
for(int i = 0; i < nthreads; i++) {
#ifdef WITH_TBB
thread_tracking_pair tp;
tp.tid = i;
tp.done = &all_threads_done;
if(stateful) {
threads[i] = new std::thread(exactSearchWorkerStateful, (void*) &tp);
} else {
threads[i] = new std::thread(exactSearchWorker, (void*) &tp);
}
threads[i]->detach();
SLEEP(10);
}
while(all_threads_done < nthreads) {
SLEEP(10);
}
#else
tids[i] = i;
if(stateful) {
threads[i] = new tthread::thread(exactSearchWorkerStateful, (void*)&tids[i]);
} else {
threads[i] = new tthread::thread(exactSearchWorker, (void*)&tids[i]);
}
}
for(int i = 0; i < nthreads; i++) {
threads[i]->join();
}
#endif
}
if(refs != NULL) delete refs;
}
/**
* Search through a pair of Ebwt indexes, one for the forward direction
* and one for the backward direction, for exact end-to-end hits and 1-
* mismatch end-to-end hits. In my experience, this is slightly faster
* than Maq (default) mode with the -n 1 option.
*
* Forward Ebwt (ebwtFw) is already loaded into memory and backward
* Ebwt (ebwtBw) is not loaded into memory.
*/
static PatternComposer* mismatchSearch_patsrc;
static HitSink* mismatchSearch_sink;
static Ebwt<String<Dna> >* mismatchSearch_ebwtFw;
static Ebwt<String<Dna> >* mismatchSearch_ebwtBw;
static vector<String<Dna5> >* mismatchSearch_os;
static SyncBitset* mismatchSearch_doneMask;
static SyncBitset* mismatchSearch_hitMask;
static BitPairReference* mismatchSearch_refs;
/**
* A statefulness-aware worker driver. Uses Unpaired/Paired1mmAlignerV1.
*/
#ifdef WITH_TBB
//void mismatchSearchWorkerFullStateful::operator()() const {
static void mismatchSearchWorkerFullStateful(void *vp) {
thread_tracking_pair *p = (thread_tracking_pair*) vp;
int tid = p->tid;
#else
static void mismatchSearchWorkerFullStateful(void *vp) {
int tid = *((int*)vp);
#endif
PatternComposer& _patsrc = *mismatchSearch_patsrc;
HitSink& _sink = *mismatchSearch_sink;
Ebwt<String<Dna> >& ebwtFw = *mismatchSearch_ebwtFw;
Ebwt<String<Dna> >& ebwtBw = *mismatchSearch_ebwtBw;
vector<String<Dna5> >& os = *mismatchSearch_os;
BitPairReference* refs = mismatchSearch_refs;
// Global initialization
PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid, readsPerBatch);
HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, tid);
ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
Unpaired1mmAlignerV1Factory alSEfact(
ebwtFw,
&ebwtBw,
!nofw,
!norc,
_sink,
*sinkFact,
NULL, //&cacheFw,
NULL, //&cacheBw,
cacheLimit,
pool,
refs,
os,
!noMaqRound,
!better,
strandFix,
rangeMode,
verbose,
quiet,
seed);
Paired1mmAlignerV1Factory alPEfact(
ebwtFw,
&ebwtBw,
color,
!nofw,
!norc,
useV1,
_sink,
*sinkFact,
mate1fw,
mate2fw,
minInsert,
maxInsert,
dontReconcileMates,
mhits, // for symCeiling
mixedThresh,
mixedAttemptLim,
NULL, //&cacheFw,
NULL, //&cacheBw,
cacheLimit,
pool,
refs, os,
reportSe,
!noMaqRound,
!better,
strandFix,
rangeMode,
verbose,
quiet,
seed);
{
MixedMultiAligner multi(
prefetchWidth,
qUpto,
alSEfact,
alPEfact,
*patsrcFact);
// Run that mother
multi.run(false, tid);
// MultiAligner must be destroyed before patsrcFact
}
#ifdef WITH_TBB
p->done->fetch_and_add(1);
#endif
delete patsrcFact;
delete sinkFact;
delete pool;
return;
}
#ifdef WITH_TBB
//void mismatchSearchWorkerFull::operator()() const {
static void mismatchSearchWorkerFull(void *vp){
thread_tracking_pair *p = (thread_tracking_pair*) vp;
int tid = p->tid;
#else
static void mismatchSearchWorkerFull(void *vp){
int tid = *((int*)vp);
#endif
PatternComposer& _patsrc = *mismatchSearch_patsrc;
HitSink& _sink = *mismatchSearch_sink;
Ebwt<String<Dna> >& ebwtFw = *mismatchSearch_ebwtFw;
Ebwt<String<Dna> >& ebwtBw = *mismatchSearch_ebwtBw;
vector<String<Dna5> >& os = *mismatchSearch_os;
const BitPairReference* refs = mismatchSearch_refs;
// Per-thread initialization
PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid, readsPerBatch);
PatternSourcePerThread* patsrc = patsrcFact->create();
HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, tid);
HitSinkPerThread* sink = sinkFact->create();
EbwtSearchParams<String<Dna> > params(
*sink, // HitSinkPerThread
os, // reference sequences
true, // read is forward
false); // index is mirror index
GreedyDFSRangeSource bt(
&ebwtFw, params,
refs, // reference sequence (for colorspace)
0xffffffff, // qualThresh
0xffffffff, // max backtracks (no max)
0, // reportPartials (don't)
true, // reportExacts
rangeMode, // reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os,
false); // considerQuals
bool skipped = false;
pair<bool, bool> get_read_ret = make_pair(false, false);
#ifdef PER_THREAD_TIMING
uint64_t ncpu_changeovers = 0;
uint64_t nnuma_changeovers = 0;
int current_cpu = 0, current_node = 0;
get_cpu_and_node(current_cpu, current_node);
std::stringstream ss;
std::string msg;
ss << "thread: " << tid << " time: ";
msg = ss.str();
{
Timer timer(std::cout, msg.c_str());
#endif
while(true) {
#ifdef PER_THREAD_TIMING
int cpu = 0, node = 0;
get_cpu_and_node(cpu, node);
if(cpu != current_cpu) {
ncpu_changeovers++;
current_cpu = cpu;
}
if(node != current_node) {
nnuma_changeovers++;
current_node = node;
}
#endif
FINISH_READ(patsrc);
GET_READ(patsrc);
uint32_t plen = (uint32_t)length(patFw);
uint32_t s = plen;
uint32_t s3 = s >> 1; // length of 3' half of seed
uint32_t s5 = (s >> 1) + (s & 1); // length of 5' half of seed
#define DONEMASK_SET(p)
#include "search_1mm_phase1.c"
#include "search_1mm_phase2.c"
#undef DONEMASK_SET
} // End read loop
FINISH_READ(patsrc);
#ifdef PER_THREAD_TIMING
ss.str("");
ss.clear();
ss << "thread: " << tid << " cpu_changeovers: " << ncpu_changeovers << std::endl
<< "thread: " << tid << " node_changeovers: " << nnuma_changeovers << std::endl;
std::cout << ss.str();
}
#endif
#ifdef WITH_TBB
p->done->fetch_and_add(1);
#endif
WORKER_EXIT();
}
/**
* Search through a single (forward) Ebwt index for exact end-to-end
* hits. Assumes that index is already loaded into memory.
*/
static void mismatchSearchFull(PatternComposer& _patsrc,
HitSink& _sink,
Ebwt<String<Dna> >& ebwtFw,
Ebwt<String<Dna> >& ebwtBw,
vector<String<Dna5> >& os)
{
mismatchSearch_patsrc = &_patsrc;
mismatchSearch_sink = &_sink;
mismatchSearch_ebwtFw = &ebwtFw;
mismatchSearch_ebwtBw = &ebwtBw;
mismatchSearch_doneMask = NULL;
mismatchSearch_hitMask = NULL;
mismatchSearch_os = &os;
assert(!ebwtFw.isInMemory());
assert(!ebwtBw.isInMemory());
{
// Load the other half of the index into memory
Timer _t(cerr, "Time loading forward index: ", timing);
ebwtFw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
}
{
// Load the other half of the index into memory
Timer _t(cerr, "Time loading mirror index: ", timing);
ebwtBw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
}
// Create range caches, which are shared among all aligners
BitPairReference *refs = NULL;
bool pair = mates1.size() > 0 || mates12.size() > 0;
if(color || (pair && mixedThresh < 0xffffffff)) {
Timer _t(cerr, "Time loading reference: ", timing);
refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
if(!refs->loaded()) throw 1;
}
mismatchSearch_refs = refs;
#ifdef WITH_TBB
AutoArray<std::thread*> threads(nthreads);
#else
AutoArray<tthread::thread*> threads(nthreads);
AutoArray<int> tids(nthreads);
#endif
#ifdef WITH_TBB
tbb::atomic<int> all_threads_done;
all_threads_done = 0;
#endif
CHUD_START();
{
Timer _t(cerr, "Time for 1-mismatch full-index search: ", timing);
for(int i = 0; i < nthreads; i++) {
#ifdef WITH_TBB
thread_tracking_pair tp;
tp.tid = i;
tp.done = &all_threads_done;
if(stateful) {
threads[i] = new std::thread(mismatchSearchWorkerFullStateful, (void*)&tp);
} else {
threads[i] = new std::thread(mismatchSearchWorkerFull, (void*)&tp);
}
threads[i]->detach();
SLEEP(10);
}
while(all_threads_done < nthreads) {
SLEEP(10);
}
#else
tids[i] = i;
if(stateful) {
threads[i] = new tthread::thread(mismatchSearchWorkerFullStateful, (void*)&tids[i]);
} else {
threads[i] = new tthread::thread(mismatchSearchWorkerFull, (void*)&tids[i]);
}
}
for(int i = 0; i < nthreads; i++) {
threads[i]->join();
}
#endif
}
if(refs != NULL) delete refs;
}
#define SWITCH_TO_FW_INDEX() { \
/* Evict the mirror index from memory if necessary */ \
if(ebwtBw.isInMemory()) ebwtBw.evictFromMemory(); \
assert(!ebwtBw.isInMemory()); \
/* Load the forward index into memory if necessary */ \
if(!ebwtFw.isInMemory()) { \
Timer _t(cerr, "Time loading forward index: ", timing); \
ebwtFw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose); \
} \
assert(ebwtFw.isInMemory()); \
_patsrc.reset(); /* rewind pattern source to first pattern */ \
}
#define SWITCH_TO_BW_INDEX() { \
/* Evict the forward index from memory if necessary */ \
if(ebwtFw.isInMemory()) ebwtFw.evictFromMemory(); \
assert(!ebwtFw.isInMemory()); \
/* Load the forward index into memory if necessary */ \
if(!ebwtBw.isInMemory()) { \
Timer _t(cerr, "Time loading mirror index: ", timing); \
ebwtBw.loadIntoMemory(color ? 1 : 0, !noRefNames, startVerbose); \
} \
assert(ebwtBw.isInMemory()); \
_patsrc.reset(); /* rewind pattern source to first pattern */ \
}
#define ASSERT_NO_HITS_FW(ebwtfw) \
if(sanityCheck && os.size() > 0) { \
vector<Hit> hits; \
uint32_t threeRevOff = (seedMms <= 3) ? s : 0; \
uint32_t twoRevOff = (seedMms <= 2) ? s : 0; \
uint32_t oneRevOff = (seedMms <= 1) ? s : 0; \
uint32_t unrevOff = (seedMms == 0) ? s : 0; \
if(hits.size() > 0) { \
/* Print offending hit obtained by oracle */ \
::printHit( \
os, \
hits[0], \
patFw, \
plen, \
unrevOff, \
oneRevOff, \
twoRevOff, \
threeRevOff, \
ebwtfw); /* ebwtFw */ \
} \
assert_eq(0, hits.size()); \
}
#define ASSERT_NO_HITS_RC(ebwtfw) \
if(sanityCheck && os.size() > 0) { \
vector<Hit> hits; \
uint32_t threeRevOff = (seedMms <= 3) ? s : 0; \
uint32_t twoRevOff = (seedMms <= 2) ? s : 0; \
uint32_t oneRevOff = (seedMms <= 1) ? s : 0; \
uint32_t unrevOff = (seedMms == 0) ? s : 0; \
if(hits.size() > 0) { \
/* Print offending hit obtained by oracle */ \
::printHit( \
os, \
hits[0], \
patRc, \
plen, \
unrevOff, \
oneRevOff, \
twoRevOff, \
threeRevOff, \
ebwtfw); /* ebwtFw */ \
} \
assert_eq(0, hits.size()); \
}
static PatternComposer* twoOrThreeMismatchSearch_patsrc;
static HitSink* twoOrThreeMismatchSearch_sink;
static Ebwt<String<Dna> >* twoOrThreeMismatchSearch_ebwtFw;
static Ebwt<String<Dna> >* twoOrThreeMismatchSearch_ebwtBw;
static vector<String<Dna5> >* twoOrThreeMismatchSearch_os;
static SyncBitset* twoOrThreeMismatchSearch_doneMask;
static SyncBitset* twoOrThreeMismatchSearch_hitMask;
static bool twoOrThreeMismatchSearch_two;
static BitPairReference* twoOrThreeMismatchSearch_refs;
/**
* A statefulness-aware worker driver. Uses UnpairedExactAlignerV1.
*/
#ifdef WITH_TBB
//void twoOrThreeMismatchSearchWorkerStateful::operator()() const {
static void twoOrThreeMismatchSearchWorkerStateful(void *vp) {
thread_tracking_pair *p = (thread_tracking_pair*) vp;
int tid = p->tid;
#else
static void twoOrThreeMismatchSearchWorkerStateful(void *vp) {
int tid = *((int*)vp);
#endif
PatternComposer& _patsrc = *twoOrThreeMismatchSearch_patsrc;
HitSink& _sink = *twoOrThreeMismatchSearch_sink;
Ebwt<String<Dna> >& ebwtFw = *twoOrThreeMismatchSearch_ebwtFw;
Ebwt<String<Dna> >& ebwtBw = *twoOrThreeMismatchSearch_ebwtBw;
vector<String<Dna5> >& os = *twoOrThreeMismatchSearch_os;
BitPairReference* refs = twoOrThreeMismatchSearch_refs;
static bool two = twoOrThreeMismatchSearch_two;
// Global initialization
PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid, readsPerBatch);
HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, tid);
ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
Unpaired23mmAlignerV1Factory alSEfact(
ebwtFw,
&ebwtBw,
two,
!nofw,
!norc,
_sink,
*sinkFact,
NULL, //&cacheFw,
NULL, //&cacheBw,
cacheLimit,
pool,
refs,
os,
!noMaqRound,
!better,
strandFix,
rangeMode,
verbose,
quiet,
seed);
Paired23mmAlignerV1Factory alPEfact(
ebwtFw,
&ebwtBw,
color,
!nofw,
!norc,
useV1,
two,
_sink,
*sinkFact,
mate1fw,
mate2fw,
minInsert,
maxInsert,
dontReconcileMates,
mhits, // for symCeiling
mixedThresh,
mixedAttemptLim,
NULL, //&cacheFw,
NULL, //&cacheBw,
cacheLimit,
pool,
refs, os,
reportSe,
!noMaqRound,
!better,
strandFix,
rangeMode,
verbose,
quiet,
seed);
{
MixedMultiAligner multi(
prefetchWidth,
qUpto,
alSEfact,
alPEfact,
*patsrcFact);
// Run that mother
multi.run(false, tid);
// MultiAligner must be destroyed before patsrcFact
}
#ifdef WITH_TBB
p->done->fetch_and_add(1);
#endif
delete patsrcFact;
delete sinkFact;
delete pool;
return;
}
#ifdef WITH_TBB
//void twoOrThreeMismatchSearchWorkerFull::operator()() const {
static void twoOrThreeMismatchSearchWorkerFull(void *vp) {
thread_tracking_pair *p = (thread_tracking_pair*) vp;
int tid = p->tid;
#else
static void twoOrThreeMismatchSearchWorkerFull(void *vp) {
int tid = *((int*)vp);
#endif
PatternComposer& _patsrc = *twoOrThreeMismatchSearch_patsrc;
HitSink& _sink = *twoOrThreeMismatchSearch_sink;
vector<String<Dna5> >& os = *twoOrThreeMismatchSearch_os;
bool two = twoOrThreeMismatchSearch_two;
PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid, readsPerBatch);
PatternSourcePerThread* patsrc = patsrcFact->create();
HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, tid);
HitSinkPerThread* sink = sinkFact->create();
/* Per-thread initialization */
EbwtSearchParams<String<Dna> > params(
*sink, /* HitSink */
os, /* reference sequences */
true, /* read is forward */
true); /* index is forward */
Ebwt<String<Dna> >& ebwtFw = *twoOrThreeMismatchSearch_ebwtFw;
Ebwt<String<Dna> >& ebwtBw = *twoOrThreeMismatchSearch_ebwtBw;
const BitPairReference* refs = twoOrThreeMismatchSearch_refs;
GreedyDFSRangeSource btr1(
&ebwtFw, params,
refs, // reference sequence (for colorspace)
0xffffffff, // qualThresh
// Do not impose maximums in 2/3-mismatch mode
0xffffffff, // max backtracks (no limit)
0, // reportPartials (don't)
true, // reportExacts
rangeMode, // reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os,
false); // considerQuals
GreedyDFSRangeSource bt2(
&ebwtBw, params,
refs, // reference sequence (for colorspace)
0xffffffff, // qualThresh
// Do not impose maximums in 2/3-mismatch mode
0xffffffff, // max backtracks (no limit)
0, // reportPartials (no)
true, // reportExacts
rangeMode, // reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os,
false); // considerQuals
GreedyDFSRangeSource bt3(
&ebwtFw, params,
refs, // reference sequence (for colorspace)
0xffffffff, // qualThresh (none)
// Do not impose maximums in 2/3-mismatch mode
0xffffffff, // max backtracks (no limit)
0, // reportPartials (don't)
true, // reportExacts
rangeMode, // reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os,
false); // considerQuals
GreedyDFSRangeSource bthh3(
&ebwtFw, params,
refs, // reference sequence (for colorspace)
0xffffffff, // qualThresh
// Do not impose maximums in 2/3-mismatch mode
0xffffffff, // max backtracks (no limit)
0, // reportPartials (don't)
true, // reportExacts
rangeMode, // reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os,
false, // considerQuals
true); // halfAndHalf
bool skipped = false;
pair<bool, bool> get_read_ret = make_pair(false, false);
#ifdef PER_THREAD_TIMING
uint64_t ncpu_changeovers = 0;
uint64_t nnuma_changeovers = 0;
int current_cpu = 0, current_node = 0;
get_cpu_and_node(current_cpu, current_node);
std::stringstream ss;
std::string msg;
ss << "thread: " << tid << " time: ";
msg = ss.str();
{
Timer timer(std::cout, msg.c_str());
#endif
while(true) { // Read read-in loop
#ifdef PER_THREAD_TIMING
int cpu = 0, node = 0;
get_cpu_and_node(cpu, node);
if(cpu != current_cpu) {
ncpu_changeovers++;
current_cpu = cpu;
}
if(node != current_node) {
nnuma_changeovers++;
current_node = node;
}
#endif
FINISH_READ(patsrc);
GET_READ(patsrc);
patid += 0; // kill unused variable warning
uint32_t plen = (uint32_t)length(patFw);
uint32_t s = plen;
uint32_t s3 = s >> 1; // length of 3' half of seed
uint32_t s5 = (s >> 1) + (s & 1); // length of 5' half of seed
#define DONEMASK_SET(p)
#include "search_23mm_phase1.c"
#include "search_23mm_phase2.c"
#include "search_23mm_phase3.c"
#undef DONEMASK_SET
}
FINISH_READ(patsrc);
#ifdef PER_THREAD_TIMING
ss.str("");
ss.clear();
ss << "thread: " << tid << " cpu_changeovers: " << ncpu_changeovers << std::endl
<< "thread: " << tid << " node_changeovers: " << nnuma_changeovers << std::endl;
std::cout << ss.str();
}
#endif
#ifdef WITH_TBB
p->done->fetch_and_add(1);
#endif
// Threads join at end of Phase 1
WORKER_EXIT();
}
template<typename TStr>
static void twoOrThreeMismatchSearchFull(
PatternComposer& _patsrc, /// pattern source
HitSink& _sink, /// hit sink
Ebwt<TStr>& ebwtFw, /// index of original text
Ebwt<TStr>& ebwtBw, /// index of mirror text
vector<String<Dna5> >& os, /// text strings, if available (empty otherwise)
bool two = true) /// true -> 2, false -> 3
{
// Global initialization
assert(!ebwtFw.isInMemory());
assert(!ebwtBw.isInMemory());
{
// Load the other half of the index into memory
Timer _t(cerr, "Time loading forward index: ", timing);
ebwtFw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
}
{
// Load the other half of the index into memory
Timer _t(cerr, "Time loading mirror index: ", timing);
ebwtBw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
}
// Create range caches, which are shared among all aligners
BitPairReference *refs = NULL;
bool pair = mates1.size() > 0 || mates12.size() > 0;
if(color || (pair && mixedThresh < 0xffffffff)) {
Timer _t(cerr, "Time loading reference: ", timing);
refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
if(!refs->loaded()) throw 1;
}
twoOrThreeMismatchSearch_refs = refs;
twoOrThreeMismatchSearch_patsrc = &_patsrc;
twoOrThreeMismatchSearch_sink = &_sink;
twoOrThreeMismatchSearch_ebwtFw = &ebwtFw;
twoOrThreeMismatchSearch_ebwtBw = &ebwtBw;
twoOrThreeMismatchSearch_os = &os;
twoOrThreeMismatchSearch_doneMask = NULL;
twoOrThreeMismatchSearch_hitMask = NULL;
twoOrThreeMismatchSearch_two = two;
#ifdef WITH_TBB
AutoArray<std::thread*> threads(nthreads);
#else
AutoArray<tthread::thread*> threads(nthreads);
AutoArray<int> tids(nthreads);
#endif
#ifdef WITH_TBB
tbb::atomic<int> all_threads_done;
all_threads_done = 0;
#endif
CHUD_START();
{
Timer _t(cerr, "End-to-end 2/3-mismatch full-index search: ", timing);
for(int i = 0; i < nthreads; i++) {
#ifdef WITH_TBB
thread_tracking_pair tp;
tp.tid = i;
tp.done = &all_threads_done;
if(stateful) {
threads[i] = new std::thread(twoOrThreeMismatchSearchWorkerStateful, (void*) &tp);
} else {
threads[i] = new std::thread(twoOrThreeMismatchSearchWorkerFull, (void*) &tp);
}
threads[i]->detach();
SLEEP(10);
}
while(all_threads_done < nthreads) {
SLEEP(10);
}
#else
tids[i] = i;
if(stateful) {
threads[i] = new tthread::thread(twoOrThreeMismatchSearchWorkerStateful, (void*)&tids[i]);
} else {
threads[i] = new tthread::thread(twoOrThreeMismatchSearchWorkerFull, (void*)&tids[i]);
}
}
for(int i = 0; i < nthreads; i++) {
threads[i]->join();
}
#endif
}
if(refs != NULL) delete refs;
return;
}
static PatternComposer* seededQualSearch_patsrc;
static HitSink* seededQualSearch_sink;
static Ebwt<String<Dna> >* seededQualSearch_ebwtFw;
static Ebwt<String<Dna> >* seededQualSearch_ebwtBw;
static vector<String<Dna5> >* seededQualSearch_os;
static SyncBitset* seededQualSearch_doneMask;
static SyncBitset* seededQualSearch_hitMask;
static PartialAlignmentManager* seededQualSearch_pamFw;
static PartialAlignmentManager* seededQualSearch_pamRc;
static int seededQualSearch_qualCutoff;
static BitPairReference* seededQualSearch_refs;
#ifdef WITH_TBB
//void seededQualSearchWorkerFull::operator()() const {
static void seededQualSearchWorkerFull(void *vp) {
thread_tracking_pair *p = (thread_tracking_pair*) vp;
int tid = p->tid;
#else
static void seededQualSearchWorkerFull(void *vp) {
int tid = *((int*)vp);
#endif
PatternComposer& _patsrc = *seededQualSearch_patsrc;
HitSink& _sink = *seededQualSearch_sink;
vector<String<Dna5> >& os = *seededQualSearch_os;
int qualCutoff = seededQualSearch_qualCutoff;
PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid, readsPerBatch);
PatternSourcePerThread* patsrc = patsrcFact->create();
HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, tid);
HitSinkPerThread* sink = sinkFact->create();
/* Per-thread initialization */
EbwtSearchParams<String<Dna> > params(
*sink, /* HitSink */
os, /* reference sequences */
true, /* read is forward */
true); /* index is forward */
Ebwt<String<Dna> >& ebwtFw = *seededQualSearch_ebwtFw;
Ebwt<String<Dna> >& ebwtBw = *seededQualSearch_ebwtBw;
PartialAlignmentManager * pamRc = NULL;
PartialAlignmentManager * pamFw = NULL;
if(seedMms > 0) {
pamRc = new PartialAlignmentManager(64);
pamFw = new PartialAlignmentManager(64);
}
vector<PartialAlignment> pals;
const BitPairReference* refs = seededQualSearch_refs;
// GreedyDFSRangeSource for finding exact hits for the forward-
// oriented read
GreedyDFSRangeSource btf1(
&ebwtFw, params,
refs, // reference sequence (for colorspace)
qualCutoff, // qualThresh
maxBtsBetter, // max backtracks
0, // reportPartials (don't)
true, // reportExacts
rangeMode, // reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os,
false); // considerQuals
GreedyDFSRangeSource bt1(
&ebwtFw, params,
refs, // reference sequence (for colorspace)
qualCutoff, // qualThresh
maxBtsBetter, // max backtracks
0, // reportPartials (don't)
true, // reportExacts
rangeMode, // reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os, // reference sequences
true, // considerQuals
false, !noMaqRound);
// GreedyDFSRangeSource to search for hits for cases 1F, 2F, 3F
GreedyDFSRangeSource btf2(
&ebwtBw, params,
refs, // reference sequence (for colorspace)
qualCutoff, // qualThresh
maxBtsBetter, // max backtracks
0, // reportPartials (no)
true, // reportExacts
rangeMode, // reportRanges
NULL, // partial alignment manager
NULL, // mutations
verbose, // verbose
&os, // reference sequences
true, // considerQuals
false, !noMaqRound);
// GreedyDFSRangeSource to search for partial alignments for case 4R
GreedyDFSRangeSource btr2(
&ebwtBw, params,
refs, // reference sequence (for colorspace)
qualCutoff, // qualThresh (none)
maxBtsBetter, // max backtracks
seedMms, // report partials (up to seedMms mms)
true, // reportExacts
rangeMode, // reportRanges
pamRc, // partial alignment manager
NULL, // mutations
verbose, // verbose
&os, // reference sequences
true, // considerQuals
false, !noMaqRound);
// GreedyDFSRangeSource to search for seedlings for case 4F
GreedyDFSRangeSource btf3(
&ebwtFw, params,
refs, // reference sequence (for colorspace)
qualCutoff, // qualThresh (none)
maxBtsBetter, // max backtracks
seedMms, // reportPartials (do)
true, // reportExacts
rangeMode, // reportRanges
pamFw, // seedlings
NULL, // mutations
verbose, // verbose
&os, // reference sequences
true, // considerQuals
false, !noMaqRound);
// GreedyDFSRangeSource to search for hits for case 4R by extending
// the partial alignments found in Phase 2
GreedyDFSRangeSource btr3(
&ebwtFw, params,
refs, // reference sequence (for colorspace)
qualCutoff, // qualThresh
maxBtsBetter, // max backtracks
0, // reportPartials (don't)
true, // reportExacts
rangeMode,// reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os, // reference sequences
true, // considerQuals
false, !noMaqRound);
// The half-and-half GreedyDFSRangeSource
GreedyDFSRangeSource btr23(
&ebwtFw, params,
refs, // reference sequence (for colorspace)
qualCutoff, // qualThresh
maxBtsBetter, // max backtracks
0, // reportPartials (don't)
true, // reportExacts
rangeMode,// reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os,
true, // considerQuals
true, // halfAndHalf
!noMaqRound);
// GreedyDFSRangeSource to search for hits for case 4F by extending
// the partial alignments found in Phase 3
GreedyDFSRangeSource btf4(
&ebwtBw, params,
refs, // reference sequence (for colorspace)
qualCutoff, // qualThresh
maxBtsBetter, // max backtracks
0, // reportPartials (don't)
true, // reportExacts
rangeMode,// reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os, // reference sequences
true, // considerQuals
false, !noMaqRound);
// Half-and-half GreedyDFSRangeSource for forward read
GreedyDFSRangeSource btf24(
&ebwtBw, params,
refs, // reference sequence (for colorspace)
qualCutoff, // qualThresh
maxBtsBetter, // max backtracks
0, // reportPartials (don't)
true, // reportExacts
rangeMode,// reportRanges
NULL, // seedlings
NULL, // mutations
verbose, // verbose
&os,
true, // considerQuals
true, // halfAndHalf
!noMaqRound);
String<QueryMutation> muts;
bool skipped = false;
pair<bool, bool> get_read_ret = make_pair(false, false);
#ifdef PER_THREAD_TIMING
uint64_t ncpu_changeovers = 0;
uint64_t nnuma_changeovers = 0;
int current_cpu = 0, current_node = 0;
get_cpu_and_node(current_cpu, current_node);
std::stringstream ss;
std::string msg;
ss << "thread: " << tid << " time: ";
msg = ss.str();
{
Timer timer(std::cout, msg.c_str());
#endif
while(true) {
#ifdef PER_THREAD_TIMING
int cpu = 0, node = 0;
get_cpu_and_node(cpu, node);
if(cpu != current_cpu) {
ncpu_changeovers++;
current_cpu = cpu;
}
if(node != current_node) {
nnuma_changeovers++;
current_node = node;
}
#endif
FINISH_READ(patsrc);
GET_READ(patsrc);
uint32_t plen = (uint32_t)length(patFw);
uint32_t s = seedLen;
uint32_t s3 = (s >> 1); /* length of 3' half of seed */
uint32_t s5 = (s >> 1) + (s & 1); /* length of 5' half of seed */
uint32_t qs = min<uint32_t>(plen, s);
uint32_t qs3 = qs >> 1;
uint32_t qs5 = (qs >> 1) + (qs & 1);
#define DONEMASK_SET(p)
#include "search_seeded_phase1.c"
#include "search_seeded_phase2.c"
#include "search_seeded_phase3.c"
#include "search_seeded_phase4.c"
#undef DONEMASK_SET
}
FINISH_READ(patsrc);
if(seedMms > 0) {
delete pamRc;
delete pamFw;
}
#ifdef PER_THREAD_TIMING
ss.str("");
ss.clear();
ss << "thread: " << tid << " cpu_changeovers: " << ncpu_changeovers << std::endl
<< "thread: " << tid << " node_changeovers: " << nnuma_changeovers << std::endl;
std::cout << ss.str();
}
#endif
#ifdef WITH_TBB
p->done->fetch_and_add(1);
#endif
WORKER_EXIT();
}
#ifdef WITH_TBB
//void seededQualSearchWorkerFullStateful::operator()() const {
static void seededQualSearchWorkerFullStateful(void *vp) {
thread_tracking_pair *p = (thread_tracking_pair*) vp;
int tid = p->tid;
#else
static void seededQualSearchWorkerFullStateful(void *vp) {
int tid = *((int*)vp);
#endif
PatternComposer& _patsrc = *seededQualSearch_patsrc;
HitSink& _sink = *seededQualSearch_sink;
Ebwt<String<Dna> >& ebwtFw = *seededQualSearch_ebwtFw;
Ebwt<String<Dna> >& ebwtBw = *seededQualSearch_ebwtBw;
vector<String<Dna5> >& os = *seededQualSearch_os;
int qualCutoff = seededQualSearch_qualCutoff;
BitPairReference* refs = seededQualSearch_refs;
// Global initialization
PatternSourcePerThreadFactory* patsrcFact = createPatsrcFactory(_patsrc, tid, readsPerBatch);
HitSinkPerThreadFactory* sinkFact = createSinkFactory(_sink, tid);
ChunkPool *pool = new ChunkPool(chunkSz * 1024, chunkPoolMegabytes * 1024 * 1024, chunkVerbose);
AlignerMetrics *metrics = NULL;
if(stats) {
metrics = new AlignerMetrics();
}
UnpairedSeedAlignerFactory alSEfact(
ebwtFw,
&ebwtBw,
!nofw,
!norc,
seedMms,
seedLen,
qualCutoff,
maxBts,
_sink,
*sinkFact,
NULL, //&cacheFw,
NULL, //&cacheBw,
cacheLimit,
pool,
refs,
os,
!noMaqRound,
!better,
strandFix,
rangeMode,
verbose,
quiet,
seed,
metrics);
PairedSeedAlignerFactory alPEfact(
ebwtFw,
&ebwtBw,
color,
useV1,
!nofw,
!norc,
seedMms,
seedLen,
qualCutoff,
maxBts,
_sink,
*sinkFact,
mate1fw,
mate2fw,
minInsert,
maxInsert,
dontReconcileMates,
mhits, // for symCeiling
mixedThresh,
mixedAttemptLim,
NULL, //&cacheFw,
NULL, //&cacheBw,
cacheLimit,
pool,
refs,
os,
reportSe,
!noMaqRound,
!better,
strandFix,
rangeMode,
verbose,
quiet,
seed);
{
MixedMultiAligner multi(
prefetchWidth,
qUpto,
alSEfact,
alPEfact,
*patsrcFact);
// Run that mother
multi.run(false, tid);
// MultiAligner must be destroyed before patsrcFact
}
if(metrics != NULL) {
metrics->printSummary();
delete metrics;
}
#ifdef WITH_TBB
p->done->fetch_and_add(1);
#endif
delete patsrcFact;
delete sinkFact;
delete pool;
return;
}
/**
* Search for a good alignments for each read using criteria that
* correspond somewhat faithfully to Maq's. Search is aided by a pair
* of Ebwt indexes, one for the original references, and one for the
* transpose of the references. Neither index should be loaded upon
* entry to this function.
*
* Like Maq, we treat the first 24 base pairs of the read (those
* closest to the 5' end) differently from the remainder of the read.
* We call the first 24 base pairs the "seed."
*/
template<typename TStr>
static void seededQualCutoffSearchFull(
int seedLen, /// length of seed (not a maq option)
int qualCutoff, /// maximum sum of mismatch qualities
/// like maq map's -e option
/// default: 70
int seedMms, /// max # mismatches allowed in seed
/// (like maq map's -n option)
/// Can only be 1 or 2, default: 1
PatternComposer& _patsrc, /// pattern source
HitSink& _sink, /// hit sink
Ebwt<TStr>& ebwtFw, /// index of original text
Ebwt<TStr>& ebwtBw, /// index of mirror text
vector<String<Dna5> >& os) /// text strings, if available (empty otherwise)
{
// Global intialization
assert_leq(seedMms, 3);
seededQualSearch_patsrc = &_patsrc;
seededQualSearch_sink = &_sink;
seededQualSearch_ebwtFw = &ebwtFw;
seededQualSearch_ebwtBw = &ebwtBw;
seededQualSearch_os = &os;
seededQualSearch_doneMask = NULL;
seededQualSearch_hitMask = NULL;
seededQualSearch_pamFw = NULL;
seededQualSearch_pamRc = NULL;
seededQualSearch_qualCutoff = qualCutoff;
// Create range caches, which are shared among all aligners
BitPairReference *refs = NULL;
bool pair = mates1.size() > 0 || mates12.size() > 0;
if(color || (pair && mixedThresh < 0xffffffff)) {
Timer _t(cerr, "Time loading reference: ", timing);
refs = new BitPairReference(adjustedEbwtFileBase, color, sanityCheck, NULL, &os, false, true, useMm, useShmem, mmSweep, verbose, startVerbose);
if(!refs->loaded()) throw 1;
}
seededQualSearch_refs = refs;
#ifdef WITH_TBB
//tbb::task_group tbb_grp;
AutoArray<std::thread*> threads(nthreads);
#else
AutoArray<tthread::thread*> threads(nthreads);
AutoArray<int> tids(nthreads);
#endif
#ifdef WITH_TBB
tbb::atomic<int> all_threads_done;
all_threads_done = 0;
#endif
SWITCH_TO_FW_INDEX();
assert(!ebwtBw.isInMemory());
{
// Load the other half of the index into memory
Timer _t(cerr, "Time loading mirror index: ", timing);
ebwtBw.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
}
CHUD_START();
{
// Phase 1: Consider cases 1R and 2R
Timer _t(cerr, "Seeded quality full-index search: ", timing);
for(int i = 0; i < nthreads; i++) {
#ifdef WITH_TBB
thread_tracking_pair tp;
tp.tid = i;
tp.done = &all_threads_done;
if(stateful) {
threads[i] = new std::thread(seededQualSearchWorkerFullStateful, (void*) &tp);
} else {
threads[i] = new std::thread(seededQualSearchWorkerFull, (void*) &tp);
}
threads[i]->detach();
SLEEP(10);
}
while(all_threads_done < nthreads) {
SLEEP(10);
}
#else
tids[i] = i;
if(stateful) {
threads[i] = new tthread::thread(seededQualSearchWorkerFullStateful, (void*)&tids[i]);
} else {
threads[i] = new tthread::thread(seededQualSearchWorkerFull, (void*)&tids[i]);
}
}
for(int i = 0; i < nthreads; i++) {
threads[i]->join();
}
#endif
}
if(refs != NULL) {
delete refs;
}
ebwtBw.evictFromMemory();
}
/**
* Return a new dynamically allocated PatternSource for the given
* format, using the given list of strings as the filenames to read
* from or as the sequences themselves (i.e. if -c was used).
*/
static PatternSource*
patsrcFromStrings(int format,
const vector<string>& reads,
const vector<string>* quals)
{
switch(format) {
case FASTA:
return new FastaPatternSource (reads, quals, color,
patDumpfile,
trim3, trim5,
solexaQuals, phred64Quals,
integerQuals);
case FASTA_CONT:
return new FastaContinuousPatternSource (
reads, fastaContLen,
fastaContFreq,
patDumpfile);
case RAW:
return new RawPatternSource (reads, color,
patDumpfile,
trim3, trim5);
case FASTQ:
return new FastqPatternSource (reads, color,
patDumpfile,
trim3, trim5,
solexaQuals, phred64Quals,
integerQuals);
case INTERLEAVED:
return new FastqPatternSource (reads, color,
patDumpfile,
trim3, trim5,
solexaQuals, phred64Quals,
integerQuals, true /* is interleaved */);
case TAB_MATE:
return new TabbedPatternSource(reads, false, color,
patDumpfile,
trim3, trim5);
case CMDLINE:
return new VectorPatternSource(reads, color,
patDumpfile,
trim3, trim5);
default: {
cerr << "Internal error; bad patsrc format: " << format << endl;
throw 1;
}
}
}
static string argstr;
template<typename TStr>
static void driver(const char * type,
const string& ebwtFileBase,
const string& query,
const vector<string>& queries,
const vector<string>& qualities,
const string& outfile)
{
if(verbose || startVerbose) {
cerr << "Entered driver(): "; logTime(cerr, true);
}
// Vector of the reference sequences; used for sanity-checking
vector<String<Dna5> > os;
// Read reference sequences from the command-line or from a FASTA file
if(!origString.empty()) {
// Determine if it's a file by looking at whether it has a FASTA-like
// extension
size_t len = origString.length();
if((len >= 6 && origString.substr(len-6) == ".fasta") ||
(len >= 4 && origString.substr(len-4) == ".mfa") ||
(len >= 4 && origString.substr(len-4) == ".fas") ||
(len >= 4 && origString.substr(len-4) == ".fna") ||
(len >= 3 && origString.substr(len-3) == ".fa"))
{
// Read fasta file
vector<string> origFiles;
tokenize(origString, ",", origFiles);
readSequenceFiles<String<Dna5>, Fasta>(origFiles, os);
} else {
// Read sequence
readSequenceString(origString, os);
}
}
// Adjust
adjustedEbwtFileBase = adjustEbwtBase(argv0, ebwtFileBase, verbose);
vector<PatternSource*> patsrcs_a;
vector<PatternSource*> patsrcs_b;
vector<PatternSource*> patsrcs_ab;
// If there were any first-mates specified, we will operate in
// stateful mode
bool paired = mates1.size() > 0 || mates12.size() > 0;
if(paired) stateful = true;
// Create list of pattern sources for paired reads appearing
// interleaved in a single file
if(verbose || startVerbose) {
cerr << "Creating paired-end patsrcs: "; logTime(cerr, true);
}
for(size_t i = 0; i < mates12.size(); i++) {
const vector<string>* qs = &mates12;
vector<string> tmp;
if(fileParallel) {
// Feed query files one to each PatternSource
qs = &tmp;
tmp.push_back(mates12[i]);
assert_eq(1, tmp.size());
}
patsrcs_ab.push_back(patsrcFromStrings(format, *qs, NULL));
if(!fileParallel) {
break;
}
}
// Create list of pattern sources for paired reads
for(size_t i = 0; i < mates1.size(); i++) {
const vector<string>* qs = &mates1;
const vector<string>* quals = &qualities1;
vector<string> tmpSeq;
vector<string> tmpQual;
if(fileParallel) {
// Feed query files one to each PatternSource
qs = &tmpSeq;
tmpSeq.push_back(mates1[i]);
quals = &tmpSeq;
tmpQual.push_back(qualities1[i]);
assert_eq(1, tmpSeq.size());
}
if(quals->empty()) quals = NULL;
patsrcs_a.push_back(patsrcFromStrings(format, *qs, quals));
if(!fileParallel) {
break;
}
}
// Create list of pattern sources for paired reads
for(size_t i = 0; i < mates2.size(); i++) {
const vector<string>* qs = &mates2;
const vector<string>* quals = &qualities2;
vector<string> tmpSeq;
vector<string> tmpQual;
if(fileParallel) {
// Feed query files one to each PatternSource
qs = &tmpSeq;
tmpSeq.push_back(mates2[i]);
quals = &tmpQual;
tmpQual.push_back(qualities2[i]);
assert_eq(1, tmpSeq.size());
}
if(quals->empty()) quals = NULL;
patsrcs_b.push_back(patsrcFromStrings(format, *qs, quals));
if(!fileParallel) {
break;
}
}
// All mates/mate files must be paired
assert_eq(patsrcs_a.size(), patsrcs_b.size());
// Create list of pattern sources for the unpaired reads
if(verbose || startVerbose) {
cerr << "Creating single-end patsrcs: "; logTime(cerr, true);
}
for(size_t i = 0; i < queries.size(); i++) {
const vector<string>* qs = &queries;
const vector<string>* quals = &qualities;
PatternSource* patsrc = NULL;
vector<string> tmpSeq;
vector<string> tmpQual;
if(fileParallel) {
// Feed query files one to each PatternSource
qs = &tmpSeq;
tmpSeq.push_back(queries[i]);
quals = &tmpQual;
tmpQual.push_back(qualities[i]);
assert_eq(1, tmpSeq.size());
}
if(quals->empty()) quals = NULL;
patsrc = patsrcFromStrings(format, *qs, quals);
assert(patsrc != NULL);
patsrcs_a.push_back(patsrc);
patsrcs_b.push_back(NULL);
if(!fileParallel) {
break;
}
}
if(verbose || startVerbose) {
cerr << "Creating PatternSource: "; logTime(cerr, true);
}
PatternComposer *patsrc = NULL;
if(mates12.size() > 0) {
patsrc = new SoloPatternComposer(patsrcs_ab);
} else {
patsrc = new DualPatternComposer(patsrcs_a, patsrcs_b);
}
// Open hit output file
if(verbose || startVerbose) {
cerr << "Opening hit output file: "; logTime(cerr, true);
}
OutFileBuf *fout;
if(!outfile.empty()) {
if(refOut) {
fout = NULL;
if(!quiet) {
cerr << "Warning: ignoring alignment output file " << outfile << " because --refout was specified" << endl;
}
} else {
fout = new OutFileBuf(outfile.c_str(), false);
}
} else {
fout = new OutFileBuf();
}
ReferenceMap* rmap = NULL;
if(refMapFile != NULL) {
if(verbose || startVerbose) {
cerr << "About to load in a reference map file with name "
<< refMapFile << ": "; logTime(cerr, true);
}
rmap = new ReferenceMap(refMapFile, !noRefNames);
}
AnnotationMap* amap = NULL;
if(annotMapFile != NULL) {
if(verbose || startVerbose) {
cerr << "About to load in an annotation map file with name "
<< annotMapFile << ": "; logTime(cerr, true);
}
amap = new AnnotationMap(annotMapFile);
}
// Initialize Ebwt object and read in header
if(verbose || startVerbose) {
cerr << "About to initialize fw Ebwt: "; logTime(cerr, true);
}
Ebwt<TStr> ebwt(adjustedEbwtFileBase,
color, // index is colorspace
-1, // don't care about entireReverse
true, // index is for the forward direction
/* overriding: */ offRate,
/* overriding: */ isaRate,
useMm, // whether to use memory-mapped files
useShmem, // whether to use shared memory
mmSweep, // sweep memory-mapped files
!noRefNames, // load names?
rmap, // reference map, or NULL if none is needed
verbose, // whether to be talkative
startVerbose, // talkative during initialization
false /*passMemExc*/,
sanityCheck);
Ebwt<TStr>* ebwtBw = NULL;
// We need the mirror index if mismatches are allowed
if(mismatches > 0 || maqLike) {
if(verbose || startVerbose) {
cerr << "About to initialize rev Ebwt: "; logTime(cerr, true);
}
ebwtBw = new Ebwt<TStr>(
adjustedEbwtFileBase + ".rev",
color, // index is colorspace
-1, // don't care about entireReverse
false, // index is for the reverse direction
/* overriding: */ offRate,
/* overriding: */ isaRate,
useMm, // whether to use memory-mapped files
useShmem, // whether to use shared memory
mmSweep, // sweep memory-mapped files
!noRefNames, // load names?
rmap, // reference map, or NULL if none is needed
verbose, // whether to be talkative
startVerbose, // talkative during initialization
false /*passMemExc*/,
sanityCheck);
}
if(!os.empty()) {
for(size_t i = 0; i < os.size(); i++) {
size_t olen = seqan::length(os[i]);
int longestStretch = 0;
int curStretch = 0;
for(size_t j = 0; j < olen; j++) {
if((int)os[i][j] < 4) {
curStretch++;
if(curStretch > longestStretch) longestStretch = curStretch;
} else {
curStretch = 0;
}
}
if(longestStretch < (color ? 2 : 1)) {
os.erase(os.begin() + i);
i--;
}
}
}
if(sanityCheck && !os.empty()) {
// Sanity check number of patterns and pattern lengths in Ebwt
// against original strings
assert_eq(os.size(), ebwt.nPat());
for(size_t i = 0; i < os.size(); i++) {
assert_eq(length(os[i]), ebwt.plen()[i] + (color ? 1 : 0));
}
ebwt.loadIntoMemory(color ? 1 : 0, -1, !noRefNames, startVerbose);
ebwt.checkOrigs(os, color, false);
ebwt.evictFromMemory();
}
{
Timer _t(cerr, "Time searching: ", timing);
if(verbose || startVerbose) {
cerr << "Creating HitSink: "; logTime(cerr, true);
}
// Set up hit sink; if sanityCheck && !os.empty() is true,
// then instruct the sink to "retain" hits in a vector in
// memory so that we can easily sanity check them later on
HitSink *sink;
vector<string>* refnames = &ebwt.refnames();
if(noRefNames) refnames = NULL;
switch(outType) {
case OUTPUT_FULL:
if(refOut) {
sink = new VerboseHitSink(
ebwt.nPat(), offBase,
colorSeq, colorQual, printCost,
suppressOuts, rmap, amap,
fullRef,
dumpAlBase,
dumpUnalBase,
dumpMaxBase,
format == TAB_MATE, sampleMax,
refnames, nthreads,
outBatchSz, partitionSz);
} else {
sink = new VerboseHitSink(
fout, offBase,
colorSeq, colorQual, printCost,
suppressOuts, rmap, amap,
fullRef,
dumpAlBase,
dumpUnalBase,
dumpMaxBase,
format == TAB_MATE, sampleMax,
refnames, nthreads,
outBatchSz, partitionSz);
}
break;
case OUTPUT_SAM:
if(refOut) {
throw 1;
} else {
SAMHitSink *sam = new SAMHitSink(
fout, 1, rmap, amap,
fullRef, samNoQnameTrunc,
dumpAlBase,
dumpUnalBase,
dumpMaxBase,
format == TAB_MATE,
sampleMax,
refnames,
nthreads,
outBatchSz);
if(!samNoHead) {
vector<string> refnames;
if(!samNoSQ) {
readEbwtRefnames(adjustedEbwtFileBase, refnames);
}
sam->appendHeaders(
sam->out(0),
ebwt.nPat(),
refnames, color, samNoSQ, rmap,
ebwt.plen(), fullRef,
samNoQnameTrunc,
argstr.c_str(),
rgs.empty() ? NULL : rgs.c_str());
}
sink = sam;
}
break;
case OUTPUT_CONCISE:
if(refOut) {
sink = new ConciseHitSink(
ebwt.nPat(),
offBase,
dumpAlBase,
dumpUnalBase,
dumpMaxBase,
format == TAB_MATE,
sampleMax,
refnames,
nthreads,
outBatchSz,
reportOpps);
} else {
sink = new ConciseHitSink(
fout,
offBase,
dumpAlBase,
dumpUnalBase,
dumpMaxBase,
format == TAB_MATE,
sampleMax,
refnames,
nthreads,
outBatchSz,
reportOpps);
}
break;
case OUTPUT_NONE:
sink = new StubHitSink();
break;
default:
cerr << "Invalid output type: " << outType << endl;
throw 1;
}
if(verbose || startVerbose) {
cerr << "Dispatching to search driver: "; logTime(cerr, true);
}
if(maqLike) {
seededQualCutoffSearchFull(seedLen,
qualThresh,
seedMms,
*patsrc,
*sink,
ebwt, // forward index
*ebwtBw, // mirror index (not optional)
os); // references, if available
}
else if(mismatches > 0) {
if(mismatches == 1) {
assert(ebwtBw != NULL);
mismatchSearchFull(*patsrc, *sink, ebwt, *ebwtBw, os);
} else if(mismatches == 2 || mismatches == 3) {
twoOrThreeMismatchSearchFull(*patsrc, *sink, ebwt, *ebwtBw, os, mismatches == 2);
} else {
cerr << "Error: " << mismatches << " is not a supported number of mismatches" << endl;
throw 1;
}
} else {
// Search without mismatches
// Note that --fast doesn't make a difference here because
// we're only loading half of the index anyway
exactSearch(*patsrc, *sink, ebwt, os);
}
// Evict any loaded indexes from memory
if(ebwt.isInMemory()) {
ebwt.evictFromMemory();
}
if(ebwtBw != NULL) {
delete ebwtBw;
}
sink->finish(hadoopOut); // end the hits section of the hit file
for(size_t i = 0; i < patsrcs_a.size(); i++) {
assert(patsrcs_a[i] != NULL);
delete patsrcs_a[i];
}
for(size_t i = 0; i < patsrcs_b.size(); i++) {
if(patsrcs_b[i] != NULL) {
delete patsrcs_b[i];
}
}
for(size_t i = 0; i < patsrcs_ab.size(); i++) {
if(patsrcs_ab[i] != NULL) {
delete patsrcs_ab[i];
}
}
delete patsrc;
delete sink;
delete amap;
delete rmap;
if(fout != NULL) delete fout;
}
}
// C++ name mangling is disabled for the bowtie() function to make it
// easier to use Bowtie as a library.
extern "C" {
/**
* Main bowtie entry function. Parses argc/argv style command-line
* options, sets global configuration variables, and calls the driver()
* function.
*/
int bowtie(int argc, const char **argv) {
try {
#ifdef WITH_TBB
#ifdef WITH_AFFINITY
//CWILKS: adjust this depending on # of hyperthreads per core
pinning_observer pinner( 2 /* the number of hyper threads on each core */ );
pinner.observe( true );
#endif
#endif
// Reset all global state, including getopt state
opterr = optind = 1;
resetOptions();
for(int i = 0; i < argc; i++) {
argstr += argv[i];
if(i < argc-1) argstr += " ";
}
string ebwtFile; // read serialized Ebwt from this file
string query; // read query string(s) from this file
vector<string> queries;
string outfile; // write query results to this file
if(startVerbose) { cerr << "Entered main(): "; logTime(cerr, true); }
parseOptions(argc, argv);
argv0 = argv[0];
if(showVersion) {
cout << argv0 << " version " << BOWTIE_VERSION << endl;
if(sizeof(void*) == 4) {
cout << "32-bit" << endl;
} else if(sizeof(void*) == 8) {
cout << "64-bit" << endl;
} else {
cout << "Neither 32- nor 64-bit: sizeof(void*) = " << sizeof(void*) << endl;
}
cout << "Built on " << BUILD_HOST << endl;
cout << BUILD_TIME << endl;
cout << "Compiler: " << COMPILER_VERSION << endl;
cout << "Options: " << COMPILER_OPTIONS << endl;
cout << "Sizeof {int, long, long long, void*, size_t, off_t}: {"
<< sizeof(int)
<< ", " << sizeof(long) << ", " << sizeof(long long)
<< ", " << sizeof(void *) << ", " << sizeof(size_t)
<< ", " << sizeof(off_t) << "}" << endl;
return 0;
}
#ifdef CHUD_PROFILING
chudInitialize();
chudAcquireRemoteAccess();
#endif
{
Timer _t(cerr, "Overall time: ", timing);
if(startVerbose) {
cerr << "Parsing index and read arguments: "; logTime(cerr, true);
}
// Get index basename
if(optind >= argc) {
cerr << "No index, query, or output file specified!" << endl;
printUsage(cerr);
return 1;
}
ebwtFile = argv[optind++];
// Get query filename
if(optind >= argc) {
if(mates1.size() > 0 || mates12.size() > 0) {
query = "";
} else {
cerr << "No query or output file specified!" << endl;
printUsage(cerr);
return 1;
}
} else if (mates1.size() == 0 && mates12.size() == 0) {
query = argv[optind++];
// Tokenize the list of query files
tokenize(query, ",", queries);
if(queries.size() < 1) {
cerr << "Tokenized query file list was empty!" << endl;
printUsage(cerr);
return 1;
}
}
// Get output filename
if(optind < argc) {
outfile = argv[optind++];
}
// Extra parametesr?
if(optind < argc) {
cerr << "Extra parameter(s) specified: ";
for(int i = optind; i < argc; i++) {
cerr << "\"" << argv[i] << "\"";
if(i < argc-1) cerr << ", ";
}
cerr << endl;
if(mates1.size() > 0) {
cerr << "Note that if <mates> files are specified using -1/-2, a <singles> file cannot" << endl
<< "also be specified. Please run bowtie separately for mates and singles." << endl;
}
throw 1;
}
// Optionally summarize
if(verbose) {
cout << "Input ebwt file: \"" << ebwtFile << "\"" << endl;
cout << "Query inputs (DNA, " << file_format_names[format] << "):" << endl;
for(size_t i = 0; i < queries.size(); i++) {
cout << " " << queries[i] << endl;
}
cout << "Quality inputs:" << endl;
for(size_t i = 0; i < qualities.size(); i++) {
cout << " " << qualities[i] << endl;
}
cout << "Output file: \"" << outfile << "\"" << endl;
cout << "Local endianness: " << (currentlyBigEndian()? "big":"little") << endl;
cout << "Sanity checking: " << (sanityCheck? "enabled":"disabled") << endl;
#ifdef NDEBUG
cout << "Assertions: disabled" << endl;
#else
cout << "Assertions: enabled" << endl;
#endif
}
if(ipause) {
cout << "Press key to continue..." << endl;
getchar();
}
driver<String<Dna, Alloc<> > >("DNA", ebwtFile, query, queries, qualities, outfile);
CHUD_STOP();
}
#ifdef CHUD_PROFILING
chudReleaseRemoteAccess();
#endif
#ifdef WITH_TBB
#ifdef WITH_AFFINITY
// Always disable observation before observers destruction
//tracker.observe( false );
pinner.observe( false );
#endif
#endif
return 0;
} catch(exception& e) {
cerr << "Command: ";
for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
cerr << endl;
return 1;
} catch(int e) {
if(e != 0) {
cerr << "Command: ";
for(int i = 0; i < argc; i++) cerr << argv[i] << " ";
cerr << endl;
}
return e;
}
} // bowtie()
} // extern "C"