From 64af9e583d19d33f981c7371654f89718442d44e Mon Sep 17 00:00:00 2001 From: BenLangmead Date: Mon, 13 Mar 2017 17:28:40 -0400 Subject: [PATCH] bringing over some bowtie2 alphabet tables in support of sstring.h --- alphabet.cpp | 126 +++++++++++++++++++++++++++++++++++++++++++++++------------ alphabet.h | 28 +++++++++++++ 2 files changed, 128 insertions(+), 26 deletions(-) diff --git a/alphabet.cpp b/alphabet.cpp index d75c2b1..fe94493 100644 --- a/alphabet.cpp +++ b/alphabet.cpp @@ -74,6 +74,60 @@ uint8_t rcCharToDna5[] = { /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; +/** + * Mapping from ASCII characters for ambiguous nucleotides into masks: + */ +uint8_t asc2dnamask[] = { + /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 64 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0, + /* A B C D G H K M N */ + /* 80 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0, + /* R S T V W Y */ + /* 96 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0, + /* a b c d g h k m n */ + /* 112 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0, + /* r s t v w y */ + /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/// For converting from ASCII to the Dna5 code where A=0, C=1, G=2, +/// T=3, N=4 +/// According to the manual, all other characters, including +/// IUPAC codes, are converted to N +uint8_t asc2dna[] = { + /* 0 */ 4, 4, 4, 4, 4, 4, 4, 4, 
4, 4, 4, 4, 4, 4, 4, 4, + /* 16 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 32 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* - */ + /* 48 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 64 */ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + /* A B C D G H K M N */ + /* 80 */ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* R S T U V W Y */ + /* 96 */ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + /* a b c d g h k m n */ + /* 112 */ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* r s t u v w y */ + /* 128 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 144 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 160 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 176 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 192 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 208 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 224 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* 240 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +}; + /// For converting from ASCII to the Dna5 code where A=0, C=1, G=2, /// T=3, N=4 uint8_t asc2col[] = { @@ -133,32 +187,6 @@ uint8_t asc2dnacat[] = { /** * Mapping from ASCII characters for ambiguous nucleotides into masks: */ -uint8_t asc2dnamask[] = { - /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 64 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0, - /* A B C D G H K M N */ - /* 80 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0, - /* R S T V W Y */ - /* 96 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0, - /* a b c d g h k m n */ - /* 112 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0, - /* r s t v w y */ - /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 160 */ 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -/** - * Mapping from ASCII characters for ambiguous nucleotides into masks: - */ char asc2dnacomp[] = { /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -237,3 +265,49 @@ uint8_t dinuc2color[5][5] = { /* T */ {3, 2, 1, 0, 4}, /* N */ {4, 4, 4, 4, 4} }; + +/// Convert bit encoded DNA char to its complement +int dnacomp[5] = { + 3, 2, 1, 0, 4 +}; + +const char *iupacs = "!ACMGRSVTWYHKDBN!acmgrsvtwyhkdbn"; + +char mask2iupac[16] = { + -1, + 'A', // 0001 + 'C', // 0010 + 'M', // 0011 + 'G', // 0100 + 'R', // 0101 + 'S', // 0110 + 'V', // 0111 + 'T', // 1000 + 'W', // 1001 + 'Y', // 1010 + 'H', // 1011 + 'K', // 1100 + 'D', // 1101 + 'B', // 1110 + 'N', // 1111 +}; + +int maskcomp[16] = { + 0, // 0000 (!) -> 0000 (!) 
+ 8, // 0001 (A) -> 1000 (T) + 4, // 0010 (C) -> 0100 (G) + 12, // 0011 (M) -> 1100 (K) + 2, // 0100 (G) -> 0010 (C) + 10, // 0101 (R) -> 1010 (Y) + 6, // 0110 (S) -> 0110 (S) + 14, // 0111 (V) -> 1110 (B) + 1, // 1000 (T) -> 0001 (A) + 9, // 1001 (W) -> 1001 (W) + 5, // 1010 (Y) -> 0101 (R) + 13, // 1011 (H) -> 1101 (D) + 3, // 1100 (K) -> 0011 (M) + 11, // 1101 (D) -> 1011 (H) + 7, // 1110 (B) -> 0111 (V) + 15, // 1111 (N) -> 1111 (N) +}; + diff --git a/alphabet.h b/alphabet.h index d2b5af9..fc03779 100644 --- a/alphabet.h +++ b/alphabet.h @@ -197,6 +197,11 @@ static inline char comp(char c) { extern uint8_t dna4Cat[]; extern uint8_t charToDna5[]; + +/// Convert an ascii char to a 2-bit base: 0=A, 1=C, 2=G, 3=T, 4=N +extern uint8_t asc2dna[]; + +/// Convert an ascii char to a 2-bit base: 0=A, 1=C, 2=G, 3=T, 4=N extern uint8_t asc2col[]; extern uint8_t rcCharToDna5[]; @@ -218,6 +223,12 @@ extern uint8_t asc2colcat[]; /// corresponding 2-bit nucleotide extern uint8_t nuccol2nuc[5][5]; +/// Convert ambiguous ASCII nucleotide to mask +extern uint8_t asc2dnamask[]; + +/// Convert a 4-bit mask into an IUPAC code +extern char mask2iupac[16]; + /** * Return true iff c is an unambiguous Dna character. */ @@ -246,7 +257,24 @@ static inline bool isColor(char c) { return asc2colcat[(int)c] > 0; } +/// Convert bit encoded DNA char to its complement +extern int dnacomp[5]; + +/// String of all DNA and IUPAC characters +extern const char *iupacs; + +/** + * Return the complement of a bit-encoded nucleotide. + */ +static inline int compDna(int c) { + assert_leq(c, 4); + return dnacomp[c]; +} + /// Convert a pair of 2-bit (and 4=N) encoded DNA bases to a color extern uint8_t dinuc2color[5][5]; +/// Map from masks to their reverse-complement masks +extern int maskcomp[16]; + #endif /*ALPHABETS_H_*/