Permalink
Browse files

Streamlining and slimming down the contents of the bowtie module in p…

…reparation for the 1.0 release
  • Loading branch information...
1 parent 050ace6 commit 36b3f11cc49ce138b37c79b367403a9b1193883c langmead committed Aug 13, 2008
Showing with 419 additions and 11,251 deletions.
  1. +0 −342 LVKernel.cpp
  2. +0 −115 LVKernel.h
  3. +18 −117 Makefile
  4. +0 −95 README
  5. +0 −52 TODO
  6. +6 −56 alphabet.h
  7. +0 −2 bitpack.h
  8. +0 −190 blockwise_sa.cpp
  9. +25 −43 blockwise_sa.h
  10. +0 −38 bwt.cpp
  11. +0 −6 ccnt_lut.h
  12. +0 −100 diff_covers.h
  13. +0 −243 diff_sample.cpp
  14. +98 −6 diff_sample.h
  15. +14 −8 ebwt.h
  16. +8 −16 ebwt_build.cpp
  17. +15 −476 ebwt_search.cpp
  18. +33 −0 ebwt_search_backtrack.h
  19. +0 −73 endian_swap.cpp
  20. +71 −6 endian_swap.h
  21. +21 −0 formats.h
  22. +0 −88 gen_lookup_tables.pl
  23. +0 −183 getusage/getusage.c
  24. +0 −107 inexact_extend.cpp
  25. +0 −306 inexact_extend.h
  26. +0 −241 lcp.cpp
  27. +1 −1 maq_convert/bowtie_convert.cpp
  28. +1 −2 maq_convert/read_bfq.h
  29. +0 −287 multikey_qsort.cpp
  30. +0 −129 pack_fasta.cpp
  31. +0 −35 packed_io.cpp
  32. +0 −324 packed_io.h
  33. BIN papers/CMSC858P_Report.pdf
  34. +0 −55 params.h
  35. +24 −135 pat.h
  36. +0 −50 pop.cpp
  37. +0 −255 prefetch_bench.cpp
  38. +0 −34 quals.cpp
  39. +0 −9 quals.h
  40. +3 −0 random_source.h
  41. +3 −5 ref_read.cpp
  42. +3 −3 ref_read.h
  43. +0 −98 rusage.cpp
  44. +0 −8 rusage.h
  45. +0 −29 seqan_helpers.h
  46. +0 −3 sequence_io.h
  47. +0 −454 simreads.cpp
  48. +0 −78 solexa.h
  49. +0 −8 tests/chr22/snippet1_qry.mfa
  50. +0 −8 tests/chr22/snippet1_ref.fa
  51. +0 −8 tests/chr22/snippet2_qry.mfa
  52. +0 −3 tests/chr22/snippet2_ref.fa
  53. +0 −70 tests/maqLike/queries.fq
  54. +0 −2 tests/maqLike/ref.fa
  55. +0 −5 tests/maqLike/run.sh
  56. +0 −49 tests/oneMismatch/queries.mfa
  57. +0 −2 tests/oneMismatch/ref.fa
  58. +0 −4 tests/oneMismatch/run.sh
  59. +0 −25 tests/paper_eval/README
  60. +0 −9 tests/paper_eval/bowtie_reads_mapped/README
  61. +0 −3 tests/paper_eval/bowtie_reads_mapped/TODO
  62. +0 −5 tests/paper_eval/bowtie_reads_mapped/clean.sh
  63. +0 −21 tests/paper_eval/bowtie_reads_mapped/plot.gpl
  64. +0 −87 tests/paper_eval/bowtie_reads_mapped/plot.pl
  65. +0 −38 tests/paper_eval/bowtie_reads_mapped/plot.sh
  66. +0 −620 tests/paper_eval/bowtie_reads_mapped/results.eps
  67. BIN tests/paper_eval/bowtie_reads_mapped/results.pdf
  68. +0 −99 tests/paper_eval/bowtie_reads_mapped/run.sh
  69. +0 −8 tests/paper_eval/builds/README
  70. +0 −1 tests/paper_eval/builds/TODO
  71. BIN tests/paper_eval/builds/builds.pdf
  72. +0 −42 tests/paper_eval/builds/builds.tex
  73. +0 −5 tests/paper_eval/builds/clean.sh
  74. +0 −18 tests/paper_eval/builds/headerinc.tex
  75. +0 −101 tests/paper_eval/builds/plot.pl
  76. +0 −24 tests/paper_eval/builds/plot.sh
  77. +0 −43 tests/paper_eval/builds/run.sh
  78. +0 −38 tests/paper_eval/builds/setup.sh
  79. +0 −13 tests/paper_eval/genome_varies/README
  80. +0 −3 tests/paper_eval/genome_varies/TODO
  81. +0 −41 tests/paper_eval/genome_varies/build.sh
  82. +0 −24 tests/paper_eval/genome_varies/plot.gpl
  83. +0 −76 tests/paper_eval/genome_varies/plot.sh
  84. +0 −53 tests/paper_eval/genome_varies/readbuild.sh
  85. +0 −790 tests/paper_eval/genome_varies/results.eps
  86. BIN tests/paper_eval/genome_varies/results.pdf
  87. +0 −156 tests/paper_eval/genome_varies/run.sh
  88. +0 −9 tests/paper_eval/genome_varies/setup.sh
  89. +0 −7 tests/paper_eval/genome_varies/wrap.sh
  90. +0 −19 tests/paper_eval/kg_competition/README
  91. +0 −3 tests/paper_eval/kg_competition/TODO
  92. +0 −87 tests/paper_eval/kg_competition/analyze_maps.sh
  93. BIN tests/paper_eval/kg_competition/both.pdf
  94. +0 −66 tests/paper_eval/kg_competition/both.tex
  95. +0 −7 tests/paper_eval/kg_competition/clean.sh
  96. +0 −10 tests/paper_eval/kg_competition/driver.sh
  97. +0 −107 tests/paper_eval/kg_competition/ebwt.sh
  98. +0 −18 tests/paper_eval/kg_competition/headerinc.tex
  99. BIN tests/paper_eval/kg_competition/maq.pdf
  100. +0 −124 tests/paper_eval/kg_competition/maq.sh
  101. +0 −67 tests/paper_eval/kg_competition/maq.tex
  102. BIN tests/paper_eval/kg_competition/maq_filt.pdf
  103. +0 −67 tests/paper_eval/kg_competition/maq_filt.tex
  104. +0 −124 tests/paper_eval/kg_competition/maq_n1.sh
  105. BIN tests/paper_eval/kg_competition/maq_reads.pdf
  106. +0 −32 tests/paper_eval/kg_competition/maq_reads.tex
  107. BIN tests/paper_eval/kg_competition/maq_reads_filt.pdf
  108. +0 −32 tests/paper_eval/kg_competition/maq_reads_filt.tex
  109. +0 −205 tests/paper_eval/kg_competition/plot.pl
  110. +0 −148 tests/paper_eval/kg_competition/plot.sh
  111. +0 −72 tests/paper_eval/kg_competition/plot_reads.pl
  112. +0 −63 tests/paper_eval/kg_competition/plot_reads.sh
  113. BIN tests/paper_eval/kg_competition/server.pdf
  114. +0 −38 tests/paper_eval/kg_competition/server.tex
  115. +0 −87 tests/paper_eval/kg_competition/setup.sh
  116. BIN tests/paper_eval/kg_competition/soap.pdf
  117. +0 −65 tests/paper_eval/kg_competition/soap.sh
  118. +0 −57 tests/paper_eval/kg_competition/soap.tex
  119. BIN tests/paper_eval/kg_competition/soap_reads.pdf
  120. +0 −32 tests/paper_eval/kg_competition/soap_reads.tex
  121. BIN tests/paper_eval/kg_competition/soap_server.pdf
  122. +0 −33 tests/paper_eval/kg_competition/soap_server.tex
  123. BIN tests/paper_eval/kg_competition/workstation.pdf
  124. +0 −36 tests/paper_eval/kg_competition/workstation.tex
  125. +0 −135 tests/paper_eval/polymorph.py
  126. +0 −70 tests/paper_eval/rand_reads.py
  127. +0 −75 tests/paper_eval/run_eval.sh
  128. +0 −11 tests/paper_eval/simreads_competition/README
  129. +0 −3 tests/paper_eval/simreads_competition/TODO
  130. +0 −126 tests/paper_eval/simreads_competition/analyze_maps.sh
  131. +0 −7 tests/paper_eval/simreads_competition/clean.sh
  132. +0 −9 tests/paper_eval/simreads_competition/driver.sh
  133. +0 −45 tests/paper_eval/simreads_competition/ebwt.sh
  134. +0 −45 tests/paper_eval/simreads_competition/ebwt_n1.sh
  135. +0 −17 tests/paper_eval/simreads_competition/headerinc.tex
  136. +0 −108 tests/paper_eval/simreads_competition/maq.sh
  137. +0 −108 tests/paper_eval/simreads_competition/maq_n1.sh
  138. +0 −244 tests/paper_eval/simreads_competition/plot.pl
  139. +0 −44 tests/paper_eval/simreads_competition/plot.sh
  140. BIN tests/paper_eval/simreads_competition/server.pdf
  141. +0 −40 tests/paper_eval/simreads_competition/server.tex
  142. +0 −78 tests/paper_eval/simreads_competition/setup.sh
  143. +0 −65 tests/paper_eval/simreads_competition/soap.sh
  144. BIN tests/paper_eval/simreads_competition/workstation.pdf
  145. +0 −40 tests/paper_eval/simreads_competition/workstation.tex
  146. +0 −7 tests/paper_eval/simreads_competition/wrap.sh
  147. +0 −100 tests/paper_eval/snp_eval.py
  148. +0 −192 tests/random_unique/gentestreads.py
  149. +0 −70 tests/random_unique/rand_reads.py
  150. +0 −189 tests/random_unique/reads_from_mers.py
  151. +0 −261 tests/random_unique/test_search.sh
  152. +1 −1 timer.h
  153. +0 −18 tokenize.cpp
  154. +15 −3 tokenize.h
  155. +0 −49 txt_to_fastq.cpp
  156. +0 −63 word_io.cpp
  157. +59 −8 word_io.h
View
@@ -1,342 +0,0 @@
-#include <vector>
-#include <algorithm>
-#include <stdlib.h>
-#include <ctype.h>
-#include <string.h>
-#include "LVKernel.h"
-
-#define OS_X
-
-using namespace std;
-
-LVPyramid::LVPyramid(unsigned int Max_Diffs) : _max_diffs(Max_Diffs)
-{
- // TODO: profile and check that this constructor isn't popping up
- // there are better ways to store the DP pyramid, but this is a
- // clear way.
-
- _cells.resize(_max_diffs + 2);
- for (unsigned int i = 0; i < _max_diffs + 2; ++i)
- {
- _cells[i] = vector<LVCell>(2 * (_max_diffs + 1) + 1);
- }
-}
-
-LVPyramid::~LVPyramid()
-{
-
-}
-
-LVCell& LVPyramid::cell(int diffs, int offset)
-{
- return _cells[diffs][offset];
- //return *(_cells + (_max_diffs * offset) + diffs);
-}
-
-/****************************************************************************/
-/* Public Functions */
-/****************************************************************************/
-
-// Pilfered from googlecode:
-// Find the first (least significant) set bit in a 64-bit integer. The return
-// value ranges from 0 (for no bit set), to 1 (for the least significant bit
-// set), to 64 (for only the most significant bit set).
-int find_64_lsm(uint64_t n)
-{
-#if defined(OS_X) || defined(WINDOWS)
- n &= -n;
- int shift = (uint64_t) n <= 0xFFFFFFFFULL ? 0 : 32;
-#endif
-
-#if defined(LINUX)
- return ffsll(n);
-#elif defined(OS_X)
- return ffs(n >> shift) + shift;
-#elif defined(WINDOWS)
- return find_32(n >> shift) + shift;
-#endif
-}
-
-// Find the first (most significant) set bit in a 64-bit integer. The return
-// value ranges from 0 (for no bit set), to 1 (for the least significant bit
-// set), to 64 (for only the most significant bit set).
-int find_64_msm(uint64_t n)
-{
-
- int pos = 0;
- uint64_t tmp;
- tmp = n >> 32;
- if (tmp != 0) { n = tmp; pos = pos + 32; }
- tmp = n >> 16;
- if (tmp != 0) { n = tmp; pos = pos + 16; }
- tmp = n >> 8;
- if (tmp != 0) { n = tmp; pos = pos + 8; }
- tmp = n >> 4;
- if (tmp != 0) { n = tmp; pos = pos + 4; }
- tmp = n >> 2;
- if (tmp != 0) { n = tmp; pos = pos + 2; }
- tmp = n >> 1;
- if (tmp != 0) { n = tmp; pos = pos + 1; }
- return pos + n - 1;
-}
-
-
-enum Dna { A, C, G, T };
-
-DnaWord::DnaWord(const char* s, bool pack_left)
-{
- len = strlen(s);
- assert (len <= 32);
- word = pack_dna_string(s, pack_left);
-}
-
-bool DnaWord::operator==(const DnaWord& rhs) const
-{
- return (this->word == rhs.word) && (this->len == rhs.len);
-}
-
-
-// Packs a dna string into a 64 bit unsigned int. The string dna must be
-// no more than 32 bp (excluding null terminator)
-uint64_t pack_dna_string(const char* dna, bool pack_left)
-{
- char c;
- uint64_t w = 0;
- unsigned int len = 0;
- while ((c = *dna++))
- {
- ++len;
- c = toupper(c);
- switch (c)
- {
- case 'A': c = A; break;
- case 'C': c = C; break;
- case 'G': c = G; break;
- case 'T': c = T; break;
- default: c = A; break;
- }
- w <<= 2;
- w |= c;
- }
-
- if (pack_left)
- {
- w <<= 64 - (len << 1);
- }
- return w;
-}
-
-
-int get_right_matching_chars(uint64_t w1, uint64_t w2)
-{
- //find the least significant mismatching bit between w1 and w2
- int mismatch_bit = find_64_lsm(w1 ^ w2);
-
- if (!mismatch_bit)
- return -1;
-
- mismatch_bit -= 1;
- mismatch_bit -= ((mismatch_bit) & 1);
- mismatch_bit >>= 1;
-
- return mismatch_bit;
-}
-
-// Given two left-packed 64 bit packed dna strings,
-// returns the number of bases that match between their
-// left ends, or -1 if the strings are equal
-int get_left_matching_chars(uint64_t w1, uint64_t w2)
-{
- //find the most significant mismatching bit between w1 and w2
- int mismatch_bit = find_64_msm(w1 ^ w2);
-
- if (mismatch_bit < 0)
- return -1;
-
- int matching_bits = 64 - mismatch_bit;
- matching_bits -= ((matching_bits - 1) & 1);
-
- matching_bits >>= 1;
-
- return matching_bits;
-}
-
-// This routine assumes w1 != w2, and returns a pair (row, col) in the
-// pyramid where the alignment terminated.
-void compute_pyramid(LVPyramid& py,
- const DnaWord& w1,
- const DnaWord& w2,
- bool left_extend,
- int* row,
- int* col,
- int* w1_remaining)
-{
- // diff_row tracks the number of edits so far in the alignment
- // since there are MAX_DIFFS + 1 rows, the number of actual edits is
- // diff_row - 1 at any point in the alignment.
- unsigned int diff_row = 0;
- unsigned int max_diffs = py.max_diffs();
- // When shift_row changes, a gap is introduced.
- unsigned int shift_col = max_diffs + 1;
-
- LVCell& start = py.cell(diff_row,shift_col);
- if (left_extend)
- start.match_chars = get_left_matching_chars(w1.word, w2.word);
- else
- start.match_chars = get_right_matching_chars(w1.word, w2.word);
-
- if (start.match_chars == -1)
- start.match_chars = min(w2.len, w1.len);
- else
- {
- start.match_chars = min(w2.len, start.match_chars);
- start.match_chars = min(w1.len, start.match_chars);
- }
- start.w1_shift = start.w2_shift = (start.match_chars << 1);
- unsigned int row_start = shift_col - 1;
- ++diff_row;
-
- for (; diff_row <= max_diffs + 1; ++diff_row)
- {
- shift_col = row_start--;
- for (; shift_col <= diff_row + max_diffs + 1; ++shift_col)
- {
- int total_matched_chars = py.cell(diff_row - 1,shift_col).match_chars;
- int w1_shift = py.cell(diff_row - 1,shift_col).w1_shift + 2;
- int w2_shift = py.cell(diff_row - 1,shift_col).w2_shift + 2;
-
- TracePtr back_ptr = UP;
- if (total_matched_chars < py.cell(diff_row - 1,shift_col - 1).match_chars)
- {
- // Gap in w2 (insertion in w1)
- total_matched_chars = py.cell(diff_row - 1,shift_col - 1).match_chars;
- w1_shift = py.cell(diff_row - 1,shift_col - 1).w1_shift + 2;
- w2_shift = py.cell(diff_row - 1,shift_col - 1).w2_shift;
- back_ptr = LEFT;
- }
- if (total_matched_chars < py.cell(diff_row - 1,shift_col + 1).match_chars)
- {
- // Gap in w1 (insertion in w2)
- total_matched_chars = py.cell(diff_row - 1,shift_col + 1).match_chars;
- w1_shift = py.cell(diff_row - 1,shift_col + 1).w1_shift;
- w2_shift = py.cell(diff_row - 1,shift_col + 1).w2_shift + 2;
- back_ptr = RIGHT;
- }
-
- uint64_t shifted_w1;
- uint64_t shifted_w2;
-
- if (!left_extend)
- {
- shifted_w1 = w1.word >> w1_shift;
- shifted_w2 = w2.word >> w2_shift;
- }
- else
- {
- shifted_w1 = w1.word << w1_shift;
- shifted_w2 = w2.word << w2_shift;
- }
- int matching;
- int w2_chars_remaining = w2.len - (w2_shift >> 1);
- int w1_chars_remaining = w1.len - (w1_shift >> 1);
-
- if (w2_chars_remaining == 0 || w1_chars_remaining == 0)
- {
- matching = 0;
- }
- else if (shifted_w1 == shifted_w2 &&
- w2_chars_remaining == w1_chars_remaining)
- {
- matching = w1.len - ((w1_shift >> (w1_shift & 1)) >> 1);
- }
- else
- {
- if (left_extend)
- matching = get_left_matching_chars(shifted_w1, shifted_w2);
- else
- matching = get_right_matching_chars(shifted_w1, shifted_w2);
- if (matching == -1)
- matching = min(w2_chars_remaining, w1_chars_remaining);
- else
- {
- matching = min(matching, w2_chars_remaining);
- matching = min(matching, w1_chars_remaining);
- }
- }
-
- total_matched_chars += matching;
- w1_shift += (matching << 1);
- w2_shift += (matching << 1);
-
- py.cell(diff_row,shift_col).match_chars = total_matched_chars;
- py.cell(diff_row,shift_col).w1_shift = w1_shift;
- py.cell(diff_row,shift_col).w2_shift = w2_shift;
- py.cell(diff_row,shift_col).back_ptr = back_ptr;
-
- w2_chars_remaining -= matching;
- w1_chars_remaining -= matching;
-
- if (w2_chars_remaining <= 0 /*&& w1_chars_remaining == 0*/ &&
- total_matched_chars + (int)(diff_row) >= min(w1.len, w2.len))
- {
- *row = diff_row;
- *col = shift_col;
- *w1_remaining = w1_chars_remaining;
- return;
- }
- }
-
- }
-
- *row = max_diffs + 1;
- *col = max_diffs + 1;
- *w1_remaining = -1;
-}
-
-void edit_distance(const DnaWord& w1,
- const DnaWord& w2,
- int max_diffs,
- bool left_extend,
- int* dist,
- int* w1_chars_remaining)
-{
- if (w1 == w2)
- {
- *dist = 0;
- *w1_chars_remaining = 0;
- return;
- }
-
- // The edit distance between w1 and w2 is at least the difference in
- // their lengths
- if (abs(w1.len - w2.len) > max_diffs)
- {
- *dist = max_diffs + 1;
- *w1_chars_remaining = -1;
- return;
- }
-
-
-
- LVPyramid py(max_diffs);
-
- int col;
- *w1_chars_remaining = -1;
-
- compute_pyramid(py, w1, w2, left_extend, dist, &col, w1_chars_remaining);
-
- return;
-}
-
-void edit_distance(const char* s1,
- const char* s2,
- int max_diffs,
- bool left_extend,
- int* dist,
- int* w1_chars_remaining)
-{
- DnaWord w1(s1, left_extend);
- DnaWord w2(s2, left_extend);
-
- edit_distance(w1, w2, max_diffs, left_extend, dist, w1_chars_remaining);
-}
Oops, something went wrong.

0 comments on commit 36b3f11

Please sign in to comment.