Permalink
Please sign in to comment.
Browse files
Streamlining and slimming down the contents of the bowtie module in preparation for the 1.0 release
- Loading branch information...
Showing
with
419 additions
and 11,251 deletions.
- +0 −342 LVKernel.cpp
- +0 −115 LVKernel.h
- +18 −117 Makefile
- +0 −95 README
- +0 −52 TODO
- +6 −56 alphabet.h
- +0 −2 bitpack.h
- +0 −190 blockwise_sa.cpp
- +25 −43 blockwise_sa.h
- +0 −38 bwt.cpp
- +0 −6 ccnt_lut.h
- +0 −100 diff_covers.h
- +0 −243 diff_sample.cpp
- +98 −6 diff_sample.h
- +14 −8 ebwt.h
- +8 −16 ebwt_build.cpp
- +15 −476 ebwt_search.cpp
- +33 −0 ebwt_search_backtrack.h
- +0 −73 endian_swap.cpp
- +71 −6 endian_swap.h
- +21 −0 formats.h
- +0 −88 gen_lookup_tables.pl
- +0 −183 getusage/getusage.c
- +0 −107 inexact_extend.cpp
- +0 −306 inexact_extend.h
- +0 −241 lcp.cpp
- +1 −1 maq_convert/bowtie_convert.cpp
- +1 −2 maq_convert/read_bfq.h
- +0 −287 multikey_qsort.cpp
- +0 −129 pack_fasta.cpp
- +0 −35 packed_io.cpp
- +0 −324 packed_io.h
- BIN papers/CMSC858P_Report.pdf
- +0 −55 params.h
- +24 −135 pat.h
- +0 −50 pop.cpp
- +0 −255 prefetch_bench.cpp
- +0 −34 quals.cpp
- +0 −9 quals.h
- +3 −0 random_source.h
- +3 −5 ref_read.cpp
- +3 −3 ref_read.h
- +0 −98 rusage.cpp
- +0 −8 rusage.h
- +0 −29 seqan_helpers.h
- +0 −3 sequence_io.h
- +0 −454 simreads.cpp
- +0 −78 solexa.h
- +0 −8 tests/chr22/snippet1_qry.mfa
- +0 −8 tests/chr22/snippet1_ref.fa
- +0 −8 tests/chr22/snippet2_qry.mfa
- +0 −3 tests/chr22/snippet2_ref.fa
- +0 −70 tests/maqLike/queries.fq
- +0 −2 tests/maqLike/ref.fa
- +0 −5 tests/maqLike/run.sh
- +0 −49 tests/oneMismatch/queries.mfa
- +0 −2 tests/oneMismatch/ref.fa
- +0 −4 tests/oneMismatch/run.sh
- +0 −25 tests/paper_eval/README
- +0 −9 tests/paper_eval/bowtie_reads_mapped/README
- +0 −3 tests/paper_eval/bowtie_reads_mapped/TODO
- +0 −5 tests/paper_eval/bowtie_reads_mapped/clean.sh
- +0 −21 tests/paper_eval/bowtie_reads_mapped/plot.gpl
- +0 −87 tests/paper_eval/bowtie_reads_mapped/plot.pl
- +0 −38 tests/paper_eval/bowtie_reads_mapped/plot.sh
- +0 −620 tests/paper_eval/bowtie_reads_mapped/results.eps
- BIN tests/paper_eval/bowtie_reads_mapped/results.pdf
- +0 −99 tests/paper_eval/bowtie_reads_mapped/run.sh
- +0 −8 tests/paper_eval/builds/README
- +0 −1 tests/paper_eval/builds/TODO
- BIN tests/paper_eval/builds/builds.pdf
- +0 −42 tests/paper_eval/builds/builds.tex
- +0 −5 tests/paper_eval/builds/clean.sh
- +0 −18 tests/paper_eval/builds/headerinc.tex
- +0 −101 tests/paper_eval/builds/plot.pl
- +0 −24 tests/paper_eval/builds/plot.sh
- +0 −43 tests/paper_eval/builds/run.sh
- +0 −38 tests/paper_eval/builds/setup.sh
- +0 −13 tests/paper_eval/genome_varies/README
- +0 −3 tests/paper_eval/genome_varies/TODO
- +0 −41 tests/paper_eval/genome_varies/build.sh
- +0 −24 tests/paper_eval/genome_varies/plot.gpl
- +0 −76 tests/paper_eval/genome_varies/plot.sh
- +0 −53 tests/paper_eval/genome_varies/readbuild.sh
- +0 −790 tests/paper_eval/genome_varies/results.eps
- BIN tests/paper_eval/genome_varies/results.pdf
- +0 −156 tests/paper_eval/genome_varies/run.sh
- +0 −9 tests/paper_eval/genome_varies/setup.sh
- +0 −7 tests/paper_eval/genome_varies/wrap.sh
- +0 −19 tests/paper_eval/kg_competition/README
- +0 −3 tests/paper_eval/kg_competition/TODO
- +0 −87 tests/paper_eval/kg_competition/analyze_maps.sh
- BIN tests/paper_eval/kg_competition/both.pdf
- +0 −66 tests/paper_eval/kg_competition/both.tex
- +0 −7 tests/paper_eval/kg_competition/clean.sh
- +0 −10 tests/paper_eval/kg_competition/driver.sh
- +0 −107 tests/paper_eval/kg_competition/ebwt.sh
- +0 −18 tests/paper_eval/kg_competition/headerinc.tex
- BIN tests/paper_eval/kg_competition/maq.pdf
- +0 −124 tests/paper_eval/kg_competition/maq.sh
- +0 −67 tests/paper_eval/kg_competition/maq.tex
- BIN tests/paper_eval/kg_competition/maq_filt.pdf
- +0 −67 tests/paper_eval/kg_competition/maq_filt.tex
- +0 −124 tests/paper_eval/kg_competition/maq_n1.sh
- BIN tests/paper_eval/kg_competition/maq_reads.pdf
- +0 −32 tests/paper_eval/kg_competition/maq_reads.tex
- BIN tests/paper_eval/kg_competition/maq_reads_filt.pdf
- +0 −32 tests/paper_eval/kg_competition/maq_reads_filt.tex
- +0 −205 tests/paper_eval/kg_competition/plot.pl
- +0 −148 tests/paper_eval/kg_competition/plot.sh
- +0 −72 tests/paper_eval/kg_competition/plot_reads.pl
- +0 −63 tests/paper_eval/kg_competition/plot_reads.sh
- BIN tests/paper_eval/kg_competition/server.pdf
- +0 −38 tests/paper_eval/kg_competition/server.tex
- +0 −87 tests/paper_eval/kg_competition/setup.sh
- BIN tests/paper_eval/kg_competition/soap.pdf
- +0 −65 tests/paper_eval/kg_competition/soap.sh
- +0 −57 tests/paper_eval/kg_competition/soap.tex
- BIN tests/paper_eval/kg_competition/soap_reads.pdf
- +0 −32 tests/paper_eval/kg_competition/soap_reads.tex
- BIN tests/paper_eval/kg_competition/soap_server.pdf
- +0 −33 tests/paper_eval/kg_competition/soap_server.tex
- BIN tests/paper_eval/kg_competition/workstation.pdf
- +0 −36 tests/paper_eval/kg_competition/workstation.tex
- +0 −135 tests/paper_eval/polymorph.py
- +0 −70 tests/paper_eval/rand_reads.py
- +0 −75 tests/paper_eval/run_eval.sh
- +0 −11 tests/paper_eval/simreads_competition/README
- +0 −3 tests/paper_eval/simreads_competition/TODO
- +0 −126 tests/paper_eval/simreads_competition/analyze_maps.sh
- +0 −7 tests/paper_eval/simreads_competition/clean.sh
- +0 −9 tests/paper_eval/simreads_competition/driver.sh
- +0 −45 tests/paper_eval/simreads_competition/ebwt.sh
- +0 −45 tests/paper_eval/simreads_competition/ebwt_n1.sh
- +0 −17 tests/paper_eval/simreads_competition/headerinc.tex
- +0 −108 tests/paper_eval/simreads_competition/maq.sh
- +0 −108 tests/paper_eval/simreads_competition/maq_n1.sh
- +0 −244 tests/paper_eval/simreads_competition/plot.pl
- +0 −44 tests/paper_eval/simreads_competition/plot.sh
- BIN tests/paper_eval/simreads_competition/server.pdf
- +0 −40 tests/paper_eval/simreads_competition/server.tex
- +0 −78 tests/paper_eval/simreads_competition/setup.sh
- +0 −65 tests/paper_eval/simreads_competition/soap.sh
- BIN tests/paper_eval/simreads_competition/workstation.pdf
- +0 −40 tests/paper_eval/simreads_competition/workstation.tex
- +0 −7 tests/paper_eval/simreads_competition/wrap.sh
- +0 −100 tests/paper_eval/snp_eval.py
- +0 −192 tests/random_unique/gentestreads.py
- +0 −70 tests/random_unique/rand_reads.py
- +0 −189 tests/random_unique/reads_from_mers.py
- +0 −261 tests/random_unique/test_search.sh
- +1 −1 timer.h
- +0 −18 tokenize.cpp
- +15 −3 tokenize.h
- +0 −49 txt_to_fastq.cpp
- +0 −63 word_io.cpp
- +59 −8 word_io.h
342
LVKernel.cpp
| @@ -1,342 +0,0 @@ | ||
| -#include <vector> | ||
| -#include <algorithm> | ||
| -#include <stdlib.h> | ||
| -#include <ctype.h> | ||
| -#include <string.h> | ||
| -#include "LVKernel.h" | ||
| - | ||
| -#define OS_X | ||
| - | ||
| -using namespace std; | ||
| - | ||
| -LVPyramid::LVPyramid(unsigned int Max_Diffs) : _max_diffs(Max_Diffs) | ||
| -{ | ||
| - // TODO: profile and check that this constructor isn't popping up | ||
| - // there are better ways to store the DP pyramid, but this is a | ||
| - // clear way. | ||
| - | ||
| - _cells.resize(_max_diffs + 2); | ||
| - for (unsigned int i = 0; i < _max_diffs + 2; ++i) | ||
| - { | ||
| - _cells[i] = vector<LVCell>(2 * (_max_diffs + 1) + 1); | ||
| - } | ||
| -} | ||
| - | ||
| -LVPyramid::~LVPyramid() | ||
| -{ | ||
| - | ||
| -} | ||
| - | ||
| -LVCell& LVPyramid::cell(int diffs, int offset) | ||
| -{ | ||
| - return _cells[diffs][offset]; | ||
| - //return *(_cells + (_max_diffs * offset) + diffs); | ||
| -} | ||
| - | ||
| -/****************************************************************************/ | ||
| -/* Public Functions */ | ||
| -/****************************************************************************/ | ||
| - | ||
| -// Pilfered from googlecode: | ||
| -// Find the first (least significant) set bit in a 64-bit integer. The return | ||
| -// value ranges from 0 (for no bit set), to 1 (for the least significant bit | ||
| -// set), to 64 (for only the most significant bit set). | ||
| -int find_64_lsm(uint64_t n) | ||
| -{ | ||
| -#if defined(OS_X) || defined(WINDOWS) | ||
| - n &= -n; | ||
| - int shift = (uint64_t) n <= 0xFFFFFFFFULL ? 0 : 32; | ||
| -#endif | ||
| - | ||
| -#if defined(LINUX) | ||
| - return ffsll(n); | ||
| -#elif defined(OS_X) | ||
| - return ffs(n >> shift) + shift; | ||
| -#elif defined(WINDOWS) | ||
| - return find_32(n >> shift) + shift; | ||
| -#endif | ||
| -} | ||
| - | ||
| -// Find the most significant set bit in a 64-bit integer. The return value | ||
| -// is the zero-based index of that bit: 0 (for only the least significant bit | ||
| -// set) up to 63 (for the most significant bit set), or -1 if no bit is set. | ||
| -int find_64_msm(uint64_t n) | ||
| -{ | ||
| - | ||
| - int pos = 0; | ||
| - uint64_t tmp; | ||
| - tmp = n >> 32; | ||
| - if (tmp != 0) { n = tmp; pos = pos + 32; } | ||
| - tmp = n >> 16; | ||
| - if (tmp != 0) { n = tmp; pos = pos + 16; } | ||
| - tmp = n >> 8; | ||
| - if (tmp != 0) { n = tmp; pos = pos + 8; } | ||
| - tmp = n >> 4; | ||
| - if (tmp != 0) { n = tmp; pos = pos + 4; } | ||
| - tmp = n >> 2; | ||
| - if (tmp != 0) { n = tmp; pos = pos + 2; } | ||
| - tmp = n >> 1; | ||
| - if (tmp != 0) { n = tmp; pos = pos + 1; } | ||
| - return pos + n - 1; | ||
| -} | ||
| - | ||
| - | ||
| -enum Dna { A, C, G, T }; | ||
| - | ||
| -DnaWord::DnaWord(const char* s, bool pack_left) | ||
| -{ | ||
| - len = strlen(s); | ||
| - assert (len <= 32); | ||
| - word = pack_dna_string(s, pack_left); | ||
| -} | ||
| - | ||
| -bool DnaWord::operator==(const DnaWord& rhs) const | ||
| -{ | ||
| - return (this->word == rhs.word) && (this->len == rhs.len); | ||
| -} | ||
| - | ||
| - | ||
| -// Packs a dna string into a 64 bit unsigned int. The string dna must be | ||
| -// no more than 32 bp (excluding null terminator) | ||
| -uint64_t pack_dna_string(const char* dna, bool pack_left) | ||
| -{ | ||
| - char c; | ||
| - uint64_t w = 0; | ||
| - unsigned int len = 0; | ||
| - while ((c = *dna++)) | ||
| - { | ||
| - ++len; | ||
| - c = toupper(c); | ||
| - switch (c) | ||
| - { | ||
| - case 'A': c = A; break; | ||
| - case 'C': c = C; break; | ||
| - case 'G': c = G; break; | ||
| - case 'T': c = T; break; | ||
| - default: c = A; break; | ||
| - } | ||
| - w <<= 2; | ||
| - w |= c; | ||
| - } | ||
| - | ||
| - if (pack_left) | ||
| - { | ||
| - w <<= 64 - (len << 1); | ||
| - } | ||
| - return w; | ||
| -} | ||
| - | ||
| - | ||
| -int get_right_matching_chars(uint64_t w1, uint64_t w2) | ||
| -{ | ||
| - //find the least significant mismatching bit between w1 and w2 | ||
| - int mismatch_bit = find_64_lsm(w1 ^ w2); | ||
| - | ||
| - if (!mismatch_bit) | ||
| - return -1; | ||
| - | ||
| - mismatch_bit -= 1; | ||
| - mismatch_bit -= ((mismatch_bit) & 1); | ||
| - mismatch_bit >>= 1; | ||
| - | ||
| - return mismatch_bit; | ||
| -} | ||
| - | ||
| -// Given two left-packed 64 bit packed dna strings, | ||
| -// returns the number of bases that match between their | ||
| -// left ends, or -1 if the strings are equal | ||
| -int get_left_matching_chars(uint64_t w1, uint64_t w2) | ||
| -{ | ||
| - //find the most significant mismatching bit between w1 and w2 | ||
| - int mismatch_bit = find_64_msm(w1 ^ w2); | ||
| - | ||
| - if (mismatch_bit < 0) | ||
| - return -1; | ||
| - | ||
| - int matching_bits = 64 - mismatch_bit; | ||
| - matching_bits -= ((matching_bits - 1) & 1); | ||
| - | ||
| - matching_bits >>= 1; | ||
| - | ||
| - return matching_bits; | ||
| -} | ||
| - | ||
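| -// This routine assumes w1 != w2. It reports, through *row and *col, the cell | ||
| -// of the pyramid where the alignment terminated, and through *w1_remaining the | ||
| -// number of w1 characters left over at that point (-1 if the pyramid is | ||
| -// exhausted without the alignment terminating). | ||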
| -void compute_pyramid(LVPyramid& py, | ||
| - const DnaWord& w1, | ||
| - const DnaWord& w2, | ||
| - bool left_extend, | ||
| - int* row, | ||
| - int* col, | ||
| - int* w1_remaining) | ||
| -{ | ||
| - // diff_row tracks the number of edits so far in the alignment | ||
| - // since there are MAX_DIFFS + 1 rows, the number of actual edits is | ||
| - // diff_row - 1 at any point in the alignment. | ||
| - unsigned int diff_row = 0; | ||
| - unsigned int max_diffs = py.max_diffs(); | ||
| - // When shift_col changes, a gap is introduced. | ||
| - unsigned int shift_col = max_diffs + 1; | ||
| - | ||
| - LVCell& start = py.cell(diff_row,shift_col); | ||
| - if (left_extend) | ||
| - start.match_chars = get_left_matching_chars(w1.word, w2.word); | ||
| - else | ||
| - start.match_chars = get_right_matching_chars(w1.word, w2.word); | ||
| - | ||
| - if (start.match_chars == -1) | ||
| - start.match_chars = min(w2.len, w1.len); | ||
| - else | ||
| - { | ||
| - start.match_chars = min(w2.len, start.match_chars); | ||
| - start.match_chars = min(w1.len, start.match_chars); | ||
| - } | ||
| - start.w1_shift = start.w2_shift = (start.match_chars << 1); | ||
| - unsigned int row_start = shift_col - 1; | ||
| - ++diff_row; | ||
| - | ||
| - for (; diff_row <= max_diffs + 1; ++diff_row) | ||
| - { | ||
| - shift_col = row_start--; | ||
| - for (; shift_col <= diff_row + max_diffs + 1; ++shift_col) | ||
| - { | ||
| - int total_matched_chars = py.cell(diff_row - 1,shift_col).match_chars; | ||
| - int w1_shift = py.cell(diff_row - 1,shift_col).w1_shift + 2; | ||
| - int w2_shift = py.cell(diff_row - 1,shift_col).w2_shift + 2; | ||
| - | ||
| - TracePtr back_ptr = UP; | ||
| - if (total_matched_chars < py.cell(diff_row - 1,shift_col - 1).match_chars) | ||
| - { | ||
| - // Gap in w2 (insertion in w1) | ||
| - total_matched_chars = py.cell(diff_row - 1,shift_col - 1).match_chars; | ||
| - w1_shift = py.cell(diff_row - 1,shift_col - 1).w1_shift + 2; | ||
| - w2_shift = py.cell(diff_row - 1,shift_col - 1).w2_shift; | ||
| - back_ptr = LEFT; | ||
| - } | ||
| - if (total_matched_chars < py.cell(diff_row - 1,shift_col + 1).match_chars) | ||
| - { | ||
| - // Gap in w1 (insertion in w2) | ||
| - total_matched_chars = py.cell(diff_row - 1,shift_col + 1).match_chars; | ||
| - w1_shift = py.cell(diff_row - 1,shift_col + 1).w1_shift; | ||
| - w2_shift = py.cell(diff_row - 1,shift_col + 1).w2_shift + 2; | ||
| - back_ptr = RIGHT; | ||
| - } | ||
| - | ||
| - uint64_t shifted_w1; | ||
| - uint64_t shifted_w2; | ||
| - | ||
| - if (!left_extend) | ||
| - { | ||
| - shifted_w1 = w1.word >> w1_shift; | ||
| - shifted_w2 = w2.word >> w2_shift; | ||
| - } | ||
| - else | ||
| - { | ||
| - shifted_w1 = w1.word << w1_shift; | ||
| - shifted_w2 = w2.word << w2_shift; | ||
| - } | ||
| - int matching; | ||
| - int w2_chars_remaining = w2.len - (w2_shift >> 1); | ||
| - int w1_chars_remaining = w1.len - (w1_shift >> 1); | ||
| - | ||
| - if (w2_chars_remaining == 0 || w1_chars_remaining == 0) | ||
| - { | ||
| - matching = 0; | ||
| - } | ||
| - else if (shifted_w1 == shifted_w2 && | ||
| - w2_chars_remaining == w1_chars_remaining) | ||
| - { | ||
| - matching = w1.len - ((w1_shift >> (w1_shift & 1)) >> 1); | ||
| - } | ||
| - else | ||
| - { | ||
| - if (left_extend) | ||
| - matching = get_left_matching_chars(shifted_w1, shifted_w2); | ||
| - else | ||
| - matching = get_right_matching_chars(shifted_w1, shifted_w2); | ||
| - if (matching == -1) | ||
| - matching = min(w2_chars_remaining, w1_chars_remaining); | ||
| - else | ||
| - { | ||
| - matching = min(matching, w2_chars_remaining); | ||
| - matching = min(matching, w1_chars_remaining); | ||
| - } | ||
| - } | ||
| - | ||
| - total_matched_chars += matching; | ||
| - w1_shift += (matching << 1); | ||
| - w2_shift += (matching << 1); | ||
| - | ||
| - py.cell(diff_row,shift_col).match_chars = total_matched_chars; | ||
| - py.cell(diff_row,shift_col).w1_shift = w1_shift; | ||
| - py.cell(diff_row,shift_col).w2_shift = w2_shift; | ||
| - py.cell(diff_row,shift_col).back_ptr = back_ptr; | ||
| - | ||
| - w2_chars_remaining -= matching; | ||
| - w1_chars_remaining -= matching; | ||
| - | ||
| - if (w2_chars_remaining <= 0 /*&& w1_chars_remaining == 0*/ && | ||
| - total_matched_chars + (int)(diff_row) >= min(w1.len, w2.len)) | ||
| - { | ||
| - *row = diff_row; | ||
| - *col = shift_col; | ||
| - *w1_remaining = w1_chars_remaining; | ||
| - return; | ||
| - } | ||
| - } | ||
| - | ||
| - } | ||
| - | ||
| - *row = max_diffs + 1; | ||
| - *col = max_diffs + 1; | ||
| - *w1_remaining = -1; | ||
| -} | ||
| - | ||
| -void edit_distance(const DnaWord& w1, | ||
| - const DnaWord& w2, | ||
| - int max_diffs, | ||
| - bool left_extend, | ||
| - int* dist, | ||
| - int* w1_chars_remaining) | ||
| -{ | ||
| - if (w1 == w2) | ||
| - { | ||
| - *dist = 0; | ||
| - *w1_chars_remaining = 0; | ||
| - return; | ||
| - } | ||
| - | ||
| - // The edit distance between w1 and w2 is at least the difference in | ||
| - // their lengths | ||
| - if (abs(w1.len - w2.len) > max_diffs) | ||
| - { | ||
| - *dist = max_diffs + 1; | ||
| - *w1_chars_remaining = -1; | ||
| - return; | ||
| - } | ||
| - | ||
| - | ||
| - | ||
| - LVPyramid py(max_diffs); | ||
| - | ||
| - int col; | ||
| - *w1_chars_remaining = -1; | ||
| - | ||
| - compute_pyramid(py, w1, w2, left_extend, dist, &col, w1_chars_remaining); | ||
| - | ||
| - return; | ||
| -} | ||
| - | ||
| -void edit_distance(const char* s1, | ||
| - const char* s2, | ||
| - int max_diffs, | ||
| - bool left_extend, | ||
| - int* dist, | ||
| - int* w1_chars_remaining) | ||
| -{ | ||
| - DnaWord w1(s1, left_extend); | ||
| - DnaWord w2(s2, left_extend); | ||
| - | ||
| - edit_distance(w1, w2, max_diffs, left_extend, dist, w1_chars_remaining); | ||
| -} |
Oops, something went wrong.
0 comments on commit
36b3f11