diff --git a/pat.cpp b/pat.cpp index 8b9f9cf..29c15d1 100644 --- a/pat.cpp +++ b/pat.cpp @@ -6,24 +6,14 @@ using namespace std; using namespace seqan; -void wrongQualityScale() { - cerr << "Encounterd negative quality value, but Phred qualities can't be negative."<& read_name) { - string s; - for(size_t i = 0; i < seqan::length(read_name); i++) { - s.push_back(read_name[i]); - } - cerr << "Too few quality values for read: " << s << endl + cerr << "Too few quality values for read: " << read_name << endl << "\tare you sure this is a FASTQ-int file?" << endl; } diff --git a/pat.h b/pat.h index 1f7c005..def7752 100644 --- a/pat.h +++ b/pat.h @@ -1582,7 +1582,6 @@ class FastaPatternSource : public BufferedFilePatternSource { int policy_; }; -extern void wrongQualityScale(); extern void wrongQualityFormat(); extern void tooFewQualities(const String& read_name); @@ -2031,13 +2030,7 @@ class FastqPatternSource : public BufferedFilePatternSource { solQuals_(solexa_quals), phred64Quals_(phred64Quals), intQuals_(integer_quals) - { - for (int l = 0; l != 128; ++l) { - table_[l] = (int)(10.0 * log(1.0 + pow(10.0, (l - 64) / 10.0)) / log(10.0) + .499); - if (table_[l] >= 63) table_[l] = 63; - if (table_[l] == 0) table_[l] = 1; - } - } + { } virtual void reset() { first_ = true; BufferedFilePatternSource::reset(); @@ -2302,7 +2295,6 @@ class FastqPatternSource : public BufferedFilePatternSource { bool phred64Quals_; bool intQuals_; int policy_; - int table_[128]; }; /** diff --git a/qual.cpp b/qual.cpp index bb2020a..283ca3f 100644 --- a/qual.cpp +++ b/qual.cpp @@ -30,3 +30,37 @@ unsigned char qualRounds[] = { 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 245 - 254 30 // 255 }; + +/** + * Lookup table for converting from Solexa-scaled (log-odds) quality + * values to Phred-scaled quality values. 
+ */ +unsigned char solToPhred[] = { + /* -10 */ 0, 1, 1, 1, 1, 1, 1, 2, 2, 3, + /* 0 */ 3, 4, 4, 5, 5, 6, 7, 8, 9, 10, + /* 10 */ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + /* 20 */ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + /* 30 */ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, + /* 40 */ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, + /* 50 */ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + /* 60 */ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, + /* 70 */ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + /* 80 */ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + /* 90 */ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, + /* 100 */ 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, + /* 110 */ 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + /* 120 */ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, + /* 130 */ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, + /* 140 */ 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + /* 150 */ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + /* 160 */ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + /* 170 */ 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + /* 180 */ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + /* 190 */ 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, + /* 200 */ 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + /* 210 */ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, + /* 220 */ 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, + /* 230 */ 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + /* 240 */ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, + /* 250 */ 250, 251, 252, 253, 254, 255 +}; diff --git a/qual.h b/qual.h index 5e190de..cd9e5e1 100644 --- a/qual.h +++ b/qual.h @@ -2,12 +2,29 @@ #define QUAL_H_ extern unsigned char qualRounds[]; +extern unsigned char solToPhred[]; /// Translate a Phred-encoded ASCII character into a Phred quality static inline uint8_t phredCharToPhredQual(char c) { return ((uint8_t)c >= 33 ? 
((uint8_t)c - 33) : 0); } +/** + * Convert a Solexa-scaled quality value into a Phred-scale quality + * value. + * + * p = probability that base is miscalled + * Qphred = -10 * log10 (p) + * Qsolexa = -10 * log10 (p / (1 - p)) + * See: http://en.wikipedia.org/wiki/FASTQ_format + * + */ +static inline uint8_t solexaToPhred(int sol) { + assert_lt(sol, 256); + if(sol < -10) return 0; + return solToPhred[sol+10]; +} + class SimplePhredPenalty { public: static uint8_t mmPenalty (uint8_t qual) { @@ -71,8 +88,7 @@ inline static char charToPhred33(char c, bool solQuals, bool phred64Quals) { if (solQuals) { // Convert solexa-scaled chars to phred // http://maq.sourceforge.net/fastq.shtml - int pQ = (int)(10.0 * log(1.0 + pow(10.0, ((int)c - 64) / 10.0)) / log(10.0) + .499) + 33; - char cc = (char)(pQ); + char cc = solexaToPhred((int)c - 64) + 33; if (cc < 33) { cerr << "Saw ASCII character " << ((int)c) @@ -119,7 +135,7 @@ inline static char intToPhred33(int iQ, bool solQuals) { // Convert from solexa quality to phred // quality and translate to ASCII // http://maq.sourceforge.net/qual.shtml - pQ = (int)(10.0 * log(1.0 + pow(10.0, (iQ) / 10.0)) / log(10.0) + .499) + 33; + pQ = solexaToPhred((int)iQ) + 33; } else { // Keep the phred quality and translate // to ASCII diff --git a/scripts/gen_occ_lookup.pl b/scripts/gen_occ_lookup.pl new file mode 100755 index 0000000..e0f7626 --- /dev/null +++ b/scripts/gen_occ_lookup.pl @@ -0,0 +1,238 @@ +#!/usr/bin/perl -w + +# +# Generate lookup table that, given a packed DNA byte (four bases) and +# a character (A, C, G or T), returns how many times that character +# occurs in that packed byte. Useful for quickly counting character +# occurrences in long strings. The LUT is indexed first by character +# (0-3) then by byte (0-255). +# +# Larger lookup tables are also possible, though they seem +# counterproductive. E.g., looking up eight bases at a time yields a +# 256K LUT, which doesn't fit in L1. 
A four-base LUT is 1KB, easily +# fitting in L1. +# +# See ebwt.h. +# + +my @as4 = (), @as3 = (), @as2 = (), @as1 = (); +my @cs4 = (), @cs3 = (), @cs2 = (), @cs1 = (); +my @gs4 = (), @gs3 = (), @gs2 = (), @gs1 = (); +my @ts4 = (), @ts3 = (), @ts2 = (), @ts1 = (); + +# For each byte value 0-255, tally how many of its 2-bit fields hold each base code (A=0, C=1, G=2, T=3), over the low 4, 3, 2 and 1 fields +my $i; +for($i = 0; $i < 256; $i++) { + my $b01 = ($i >> 0) & 3; + my $b23 = ($i >> 2) & 3; + my $b45 = ($i >> 4) & 3; + my $b67 = ($i >> 6) & 3; + + my $a4 = ($b01 == 0) + ($b23 == 0) + ($b45 == 0) + ($b67 == 0); + my $c4 = ($b01 == 1) + ($b23 == 1) + ($b45 == 1) + ($b67 == 1); + my $g4 = ($b01 == 2) + ($b23 == 2) + ($b45 == 2) + ($b67 == 2); + my $t4 = ($b01 == 3) + ($b23 == 3) + ($b45 == 3) + ($b67 == 3); + + push @as4, $a4; + push @cs4, $c4; + push @gs4, $g4; + push @ts4, $t4; + + my $a3 = ($b01 == 0) + ($b23 == 0) + ($b45 == 0); + my $c3 = ($b01 == 1) + ($b23 == 1) + ($b45 == 1); + my $g3 = ($b01 == 2) + ($b23 == 2) + ($b45 == 2); + my $t3 = ($b01 == 3) + ($b23 == 3) + ($b45 == 3); + + push @as3, $a3; + push @cs3, $c3; + push @gs3, $g3; + push @ts3, $t3; + + my $a2 = ($b01 == 0) + ($b23 == 0); + my $c2 = ($b01 == 1) + ($b23 == 1); + my $g2 = ($b01 == 2) + ($b23 == 2); + my $t2 = ($b01 == 3) + ($b23 == 3); + + push @as2, $a2; + push @cs2, $c2; + push @gs2, $g2; + push @ts2, $t2; + + my $a1 = ($b01 == 0) + 0; + my $c1 = ($b01 == 1) + 0; + my $g1 = ($b01 == 2) + 0; + my $t1 = ($b01 == 3) + 0; + + push @as1, $a1; + push @cs1, $c1; + push @gs1, $g1; + push @ts1, $t1; +} + +my $entsPerLine = 16; + +print "#include <stdint.h>\n\n"; +print "/* Generated by gen_occ_lookup.pl */\n\n"; + +# Count occurrences in all 4 bit pairs + +print "uint8_t cCntLUT_4[4][4][256] = {\n"; +print "\t/* All 4 bit pairs */ {\n"; + +# Print As array +print "\t\t/* As */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$as4[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Cs array +print "\t\t/* Cs */ {\n"; +for($i = 
0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$cs4[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Gs array +print "\t\t/* Gs */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$gs4[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Ts array +print "\t\t/* Ts */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$ts4[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t}\n\t},\n"; + +# Count occurrences in low 1 bit pair + +print "\t/* Least significant 1 bit pair */ {\n"; + +# Print As array +print "\t\t/* As */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$as1[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Cs array +print "\t\t/* Cs */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$cs1[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Gs array +print "\t\t/* Gs */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$gs1[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Ts array +print "\t\t/* Ts */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$ts1[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t}\n\t},\n"; + +# Count occurrences in low 2 bit pairs + +print "\t/* Least significant 2 bit pairs */ {\n"; + +# Print As array +print "\t\t/* As */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$as2[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Cs array +print "\t\t/* Cs */ 
{\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$cs2[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Gs array +print "\t\t/* Gs */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$gs2[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Ts array +print "\t\t/* Ts */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$ts2[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t}\n\t},\n"; + +# Count occurrences in low 3 bit pairs + +print "\t/* Least significant 3 bit pairs */ {\n"; + +# Print As array +print "\t\t/* As */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$as3[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Cs array +print "\t\t/* Cs */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$cs3[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Gs array +print "\t\t/* Gs */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$gs3[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t},\n"; + +# Print Ts array +print "\t\t/* Ts */ {\n"; +for($i = 0; $i < 256; $i++) { + print "\t\t\t" if(($i % $entsPerLine) == 0); + print "$ts3[$i], "; + print "\n" if(($i % $entsPerLine) == ($entsPerLine-1)); +} +print "\t\t}\n\t}\n"; + +print "};\n"; diff --git a/scripts/gen_solqual_lookup.pl b/scripts/gen_solqual_lookup.pl new file mode 100644 index 0000000..a076edb --- /dev/null +++ b/scripts/gen_solqual_lookup.pl @@ -0,0 +1,61 @@ +#!/usr/bin/perl -w + +use warnings; +use strict; + +sub log10($) { + return log(shift) / log(10.0); +} + +sub round { + my($number) = shift; + 
return int($number + .5 * ($number <=> 0)); +} + +# Convert from Phred qual to probability of miscall: p = 10^(-Q/10) +sub phredToP($) { + my $phred = shift; + my $p = (10.0 ** (($phred) / -10.0)); + ($p >= 0.0 && $p <= 1.0) || die "Bad prob: $p, from phred $phred"; + return $p; +} + +# Convert from Solexa qual to probability of miscall: odds x = 10^(-Q/10), then p = x/(1+x) +sub solToP($) { + my $sol = shift; + my $x = (10.0 ** (($sol) / -10.0)); + my $p = $x / (1.0 + $x); + ($p >= 0.0 && $p <= 1.0) || die "Bad prob: $p, from x $x, sol $sol"; + return $p; +} + +# Convert from probability of miscall to Phred qual, rounded to the nearest integer +sub pToPhred($) { + my $p = shift; + ($p >= 0.0 && $p <= 1.0) || die "Bad prob: $p"; + return round(-10.0 * log10($p)); +} + +# Convert from probability of miscall to Solexa qual, rounded to the nearest integer (p == 1 maps to 0 to avoid dividing by zero) +sub pToSol($) { + my $p = shift; + ($p >= 0.0 && $p <= 1.0) || die "Bad prob: $p"; + return 0 if($p == 1.0); + return round(-10.0 * log10($p / (1.0 - $p))); +} + +# Print conversion table from Solexa to Phred for Solexa quals -10 through 255, $cols entries per row +print "unsigned char solToPhred[] = {"; +my $cols = 10; +my $cnt = 0; +for(my $i = -10; $i < 256; $i++) { + # Solexa qual = $i + my $p = solToP($i); + my $ph = pToPhred($p); + print "\n\t/* $i */ " if($cnt == 0); + $cnt++; + $cnt = 0 if($cnt == $cols); + print "$ph"; + print ", " if($i < 255); +} +print "\n};\n";