Permalink
Browse files

bringing over some bowtie2 alphabet tables in support of sstring.h

  • Loading branch information...
1 parent 0d5a573 commit 64af9e583d19d33f981c7371654f89718442d44e @BenLangmead committed Mar 13, 2017
Showing with 128 additions and 26 deletions.
  1. +100 −26 alphabet.cpp
  2. +28 −0 alphabet.h
View
@@ -74,6 +74,60 @@ uint8_t rcCharToDna5[] = {
/* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
+/**
+ * Maps ASCII nucleotide characters (IUPAC codes, either case) to 4-bit masks: A=1, C=2, G=4, T=8; an ambiguity code is the OR of its bases (N=15); all other characters map to 0.
+ */
+uint8_t asc2dnamask[] = {
+ /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 64 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0,
+ /* A B C D G H K M N */
+ /* 80 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0,
+ /* R S T V W Y */
+ /* 96 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0,
+ /* a b c d g h k m n */
+ /* 112 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0,
+ /* r s t v w y */
+ /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/// Converts an ASCII character to the Dna5 code: A=0, C=1, G=2, T=3
+/// (upper or lower case) and N=4.
+/// Per the manual, every other character, including the ambiguous
+/// IUPAC codes, is converted to N (4); note that U/u also maps to 4.
+uint8_t asc2dna[] = {
+ /* 0 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 16 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 32 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* - */
+ /* 48 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 64 */ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* A B C D G H K M N */
+ /* 80 */ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* R S T U V W Y */
+ /* 96 */ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* a b c d g h k m n */
+ /* 112 */ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* r s t u v w y */
+ /* 128 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 144 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 160 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 176 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 192 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 208 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 224 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ /* 240 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+};
+
/// For converting from ASCII to the Dna5 code where A=0, C=1, G=2,
/// T=3, N=4
uint8_t asc2col[] = {
@@ -133,32 +187,6 @@ uint8_t asc2dnacat[] = {
/**
* Mapping from ASCII characters for ambiguous nucleotides into masks:
*/
-uint8_t asc2dnamask[] = {
- /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 48 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 64 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0,
- /* A B C D G H K M N */
- /* 80 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0,
- /* R S T V W Y */
- /* 96 */ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0,
- /* a b c d g h k m n */
- /* 112 */ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0,
- /* r s t v w y */
- /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-/**
- * Mapping from ASCII characters for ambiguous nucleotides into masks:
- */
char asc2dnacomp[] = {
/* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -237,3 +265,49 @@ uint8_t dinuc2color[5][5] = {
/* T */ {3, 2, 1, 0, 4},
/* N */ {4, 4, 4, 4, 4}
};
+
+/// Complement of a Dna5-encoded base, indexed by base code: A<->T (0<->3), C<->G (1<->2), N (4) stays N
+int dnacomp[5] = {
+ 3, 2, 1, 0, 4
+};
+
+const char *iupacs = "!ACMGRSVTWYHKDBN!acmgrsvtwyhkdbn"; // IUPAC characters listed in 4-bit mask order ('!' in the mask-0 slot), upper case first, then lower case
+
+char mask2iupac[16] = { // upper-case IUPAC character for each 4-bit mask (bit 0 = A, bit 1 = C, bit 2 = G, bit 3 = T)
+ -1, // 0000: no base set; -1 is presumably a "no code" sentinel. NOTE(review): -1 is only representable when plain char is signed -- confirm on unsigned-char targets
+ 'A', // 0001
+ 'C', // 0010
+ 'M', // 0011
+ 'G', // 0100
+ 'R', // 0101
+ 'S', // 0110
+ 'V', // 0111
+ 'T', // 1000
+ 'W', // 1001
+ 'Y', // 1010
+ 'H', // 1011
+ 'K', // 1100
+ 'D', // 1101
+ 'B', // 1110
+ 'N', // 1111
+};
+
+int maskcomp[16] = { // complement of each 4-bit mask: the A and T bits swap and the C and G bits swap, i.e. the 4 bits are reversed
+ 0, // 0000 (!) -> 0000 (!)
+ 8, // 0001 (A) -> 1000 (T)
+ 4, // 0010 (C) -> 0100 (G)
+ 12, // 0011 (M) -> 1100 (K)
+ 2, // 0100 (G) -> 0010 (C)
+ 10, // 0101 (R) -> 1010 (Y)
+ 6, // 0110 (S) -> 0110 (S)
+ 14, // 0111 (V) -> 1110 (B)
+ 1, // 1000 (T) -> 0001 (A)
+ 9, // 1001 (W) -> 1001 (W)
+ 5, // 1010 (Y) -> 0101 (R)
+ 13, // 1011 (H) -> 1101 (D)
+ 3, // 1100 (K) -> 0011 (M)
+ 11, // 1101 (D) -> 1011 (H)
+ 7, // 1110 (B) -> 0111 (V)
+ 15, // 1111 (N) -> 1111 (N)
+};
+
View
@@ -197,6 +197,11 @@ static inline char comp(char c) {
extern uint8_t dna4Cat[];
extern uint8_t charToDna5[];
+
+/// Convert an ASCII char to a Dna5 base code: 0=A, 1=C, 2=G, 3=T, 4=N (any other character)
+extern uint8_t asc2dna[];
+
+/// Convert an ascii char to a 2-bit base: 0=A, 1=C, 2=G, 3=T, 4=N
extern uint8_t asc2col[];
extern uint8_t rcCharToDna5[];
@@ -218,6 +223,12 @@ extern uint8_t asc2colcat[];
/// corresponding 2-bit nucleotide
extern uint8_t nuccol2nuc[5][5];
+/// Convert ambiguous ASCII nucleotide to mask
+extern uint8_t asc2dnamask[];
+
+/// Convert a 4-bit mask into an IUPAC code
+extern char mask2iupac[16];
+
/**
* Return true iff c is an unambiguous Dna character.
*/
@@ -246,7 +257,24 @@ static inline bool isColor(char c) {
return asc2colcat[(int)c] > 0;
}
+/// Convert bit encoded DNA char to its complement
+extern int dnacomp[5];
+
+/// String of all DNA and IUPAC characters
+extern const char *iupacs;
+
+/**
+ * Return the complement of a single bit-encoded nucleotide, looked up in dnacomp (5 entries, so c must be in [0, 4]).
+ */
+static inline int compDna(int c) {
+ assert_leq(c, 4); // only the upper bound is checked here; a negative c is not caught
+ return dnacomp[c];
+}
+
/// Convert a pair of 2-bit (and 4=N) encoded DNA bases to a color
extern uint8_t dinuc2color[5][5];
+/// Map from masks to their reverse-complement masks
+extern int maskcomp[16];
+
#endif /*ALPHABETS_H_*/

0 comments on commit 64af9e5

Please sign in to comment.