Permalink
Browse files

*** empty log message ***

  • Loading branch information...
1 parent 8419e77 commit 2d5153c05f32b644969c8ebc4f6c2f4864d466fa langmead committed May 5, 2009
Showing with 356 additions and 25 deletions.
  1. +3 −13 pat.cpp
  2. +1 −9 pat.h
  3. +34 −0 qual.cpp
  4. +19 −3 qual.h
  5. +238 −0 scripts/gen_occ_lookup.pl
  6. +61 −0 scripts/gen_solqual_lookup.pl
View
16 pat.cpp
@@ -6,24 +6,14 @@
using namespace std;
using namespace seqan;
-void wrongQualityScale() {
- cerr << "Encounterd negative quality value, but Phred qualities can't be negative."<<endl
- << "These qualities appear to use the Solexa scale." << endl
- << "Please re-run Bowtie with the --solexa-quals option.";
-}
-
void wrongQualityFormat() {
cerr << "Encounterd space-separated qualities"<<endl
- << "This appears to be an FASTQ-int file" << endl
- << "Please re-run Bowtie with the --integer-quals option.";
+ << "This appears to be an FASTQ-int file" << endl
+ << "Please re-run Bowtie with the --integer-quals option.";
}
void tooFewQualities(const String<char>& read_name) {
- string s;
- for(size_t i = 0; i < seqan::length(read_name); i++) {
- s.push_back(read_name[i]);
- }
- cerr << "Too few quality values for read: " << s << endl
+ cerr << "Too few quality values for read: " << read_name << endl
<< "\tare you sure this is a FASTQ-int file?" << endl;
}
View
10 pat.h
@@ -1582,7 +1582,6 @@ class FastaPatternSource : public BufferedFilePatternSource {
int policy_;
};
-extern void wrongQualityScale();
extern void wrongQualityFormat();
extern void tooFewQualities(const String<char>& read_name);
@@ -2031,13 +2030,7 @@ class FastqPatternSource : public BufferedFilePatternSource {
solQuals_(solexa_quals),
phred64Quals_(phred64Quals),
intQuals_(integer_quals)
- {
- for (int l = 0; l != 128; ++l) {
- table_[l] = (int)(10.0 * log(1.0 + pow(10.0, (l - 64) / 10.0)) / log(10.0) + .499);
- if (table_[l] >= 63) table_[l] = 63;
- if (table_[l] == 0) table_[l] = 1;
- }
- }
+ { }
virtual void reset() {
first_ = true;
BufferedFilePatternSource::reset();
@@ -2302,7 +2295,6 @@ class FastqPatternSource : public BufferedFilePatternSource {
bool phred64Quals_;
bool intQuals_;
int policy_;
- int table_[128];
};
/**
View
@@ -30,3 +30,37 @@ unsigned char qualRounds[] = {
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 245 - 254
30 // 255
};
+
+/**
+ * Lookup table for converting from Solexa-scaled (log-odds) quality
+ * values to Phred-scaled quality values.
+ *
+ * Entry i holds the Phred value for Solexa quality (i - 10): the
+ * first row covers Solexa -10 through -1, and the comment at the
+ * start of each row gives the Solexa quality of that row's first
+ * entry.  The table has 266 entries, covering Solexa -10 through 255.
+ * From Solexa 10 upward each entry equals its Solexa quality; only
+ * the entries below that differ from it.
+ *
+ * NOTE(review): presumably produced by scripts/gen_solqual_lookup.pl,
+ * which is added alongside this table - confirm before hand-editing.
+ */
+unsigned char solToPhred[] = {
+ /* -10 */ 0, 1, 1, 1, 1, 1, 1, 2, 2, 3,
+ /* 0 */ 3, 4, 4, 5, 5, 6, 7, 8, 9, 10,
+ /* 10 */ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ /* 20 */ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ /* 30 */ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ /* 40 */ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ /* 50 */ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ /* 60 */ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ /* 70 */ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ /* 80 */ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ /* 90 */ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+ /* 100 */ 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+ /* 110 */ 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ /* 120 */ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+ /* 130 */ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
+ /* 140 */ 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ /* 150 */ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+ /* 160 */ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
+ /* 170 */ 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ /* 180 */ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
+ /* 190 */ 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
+ /* 200 */ 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ /* 210 */ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
+ /* 220 */ 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
+ /* 230 */ 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ /* 240 */ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
+ /* 250 */ 250, 251, 252, 253, 254, 255
+};
View
22 qual.h
@@ -2,12 +2,29 @@
#define QUAL_H_
extern unsigned char qualRounds[];
+extern unsigned char solToPhred[];
/// Translate a Phred-encoded ASCII character into a Phred quality
static inline uint8_t phredCharToPhredQual(char c) {
return ((uint8_t)c >= 33 ? ((uint8_t)c - 33) : 0);
}
+/**
+ * Convert a Solexa-scaled quality value into a Phred-scale quality
+ * value by looking it up in the solToPhred table.
+ *
+ * p = probability that base is miscalled
+ * Qphred = -10 * log10 (p)
+ * Qsolexa = -10 * log10 (p / (1 - p))
+ * See: http://en.wikipedia.org/wiki/FASTQ_format
+ *
+ * @param sol Solexa-scaled quality; callers are expected to pass a
+ *            value below 256 (checked by assert_lt).
+ * @return the Phred-scaled quality.  Inputs below -10 yield 0; inputs
+ *         above 255 yield the table entry for 255, so the table is
+ *         never indexed out of range.
+ */
+static inline uint8_t solexaToPhred(int sol) {
+ assert_lt(sol, 256);
+ if(sol < -10) return 0;
+ // sol can originate from parsed input, and the assertion above may
+ // be compiled out (TODO confirm for release builds), so clamp here
+ // rather than read past the last table entry.
+ if(sol > 255) sol = 255;
+ // The table starts at Solexa quality -10, hence the +10 offset.
+ return solToPhred[sol+10];
+}
+
class SimplePhredPenalty {
public:
static uint8_t mmPenalty (uint8_t qual) {
@@ -71,8 +88,7 @@ inline static char charToPhred33(char c, bool solQuals, bool phred64Quals) {
if (solQuals) {
// Convert solexa-scaled chars to phred
// http://maq.sourceforge.net/fastq.shtml
- int pQ = (int)(10.0 * log(1.0 + pow(10.0, ((int)c - 64) / 10.0)) / log(10.0) + .499) + 33;
- char cc = (char)(pQ);
+ char cc = solexaToPhred((int)c - 64) + 33;
if (cc < 33) {
cerr << "Saw ASCII character "
<< ((int)c)
@@ -119,7 +135,7 @@ inline static char intToPhred33(int iQ, bool solQuals) {
// Convert from solexa quality to phred
// quality and translate to ASCII
// http://maq.sourceforge.net/qual.shtml
- pQ = (int)(10.0 * log(1.0 + pow(10.0, (iQ) / 10.0)) / log(10.0) + .499) + 33;
+ pQ = solexaToPhred((int)iQ) + 33;
} else {
// Keep the phred quality and translate
// to ASCII
View
@@ -0,0 +1,238 @@
+#!/usr/bin/perl -w
+
+#
+# Generate lookup table that, given a packed DNA byte (four bases) and
+# a character (A, C, G or T), returns how many times that character
+# occurs in that packed byte. Useful for quickly counting character
+# occurrences in long strings.
+#
+# The emitted C array is cCntLUT_4[4][4][256].  The first index picks
+# which bit pairs of the byte are counted, in the order the groups are
+# printed: [0] all four bit pairs, [1] the least significant bit pair
+# only, [2] the least significant two, [3] the least significant
+# three.  The second index is the character (0-3 = A, C, G, T) and the
+# third is the byte (0-255).
+#
+# Larger lookup tables are also possible, though they seem
+# counterproductive. E.g., looking up eight bases at a time yields a
+# 256K LUT, which doesn't fit in L1. A four-base LUT is 1KB, easily
+# fitting in L1.
+#
+# See ebwt.h.
+#
+
+use strict;
+
+# Number of table entries printed per output line
+my $entsPerLine = 16;
+
+# Comment labels for the four characters, indexed by 2-bit code
+my @charLabels = ('As', 'Cs', 'Gs', 'Ts');
+
+# Table groups in emission order: [number of low bit pairs counted,
+# comment label].  The 4-pair group comes first, followed by 1, 2, 3.
+my @groups = (
+    [4, 'All 4 bit pairs'],
+    [1, 'Least significant 1 bit pair'],
+    [2, 'Least significant 2 bit pairs'],
+    [3, 'Least significant 3 bit pairs'],
+);
+
+# Count how many of the $npairs least significant bit pairs of $byte
+# are equal to the 2-bit character code $code.
+sub countChar {
+    my ($byte, $code, $npairs) = @_;
+    my $cnt = 0;
+    for my $pair (0 .. $npairs - 1) {
+        $cnt++ if (($byte >> (2 * $pair)) & 3) == $code;
+    }
+    return $cnt;
+}
+
+print "#include <stdint.h>\n\n";
+# NOTE(review): the emitted header names gen_lookup_tables.pl while the
+# commit lists this script as gen_occ_lookup.pl; the text is kept as-is
+# so the generated output does not change - confirm which is intended.
+print "/* Generated by gen_lookup_tables.pl */\n\n";
+
+print "uint8_t cCntLUT_4[4][4][256] = {\n";
+
+for my $g (0 .. $#groups) {
+    my ($npairs, $label) = @{$groups[$g]};
+    print "\t/* $label */ {\n";
+    for my $code (0 .. 3) {
+        print "\t\t/* $charLabels[$code] */ {\n";
+        for my $byte (0 .. 255) {
+            print "\t\t\t" if(($byte % $entsPerLine) == 0);
+            print countChar($byte, $code, $npairs), ", ";
+            print "\n" if(($byte % $entsPerLine) == ($entsPerLine-1));
+        }
+        # Every per-character array but the last in a group is followed
+        # by a comma
+        print "\t\t}", ($code < 3 ? "," : ""), "\n";
+    }
+    # Likewise every group but the last
+    print "\t}", ($g < $#groups ? "," : ""), "\n";
+}
+
+print "};\n";
Oops, something went wrong.

0 comments on commit 2d5153c

Please sign in to comment.