-
Notifications
You must be signed in to change notification settings - Fork 14
/
nucleicacid.jl
491 lines (386 loc) · 12.9 KB
/
nucleicacid.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
# Nucleic Acid
# ============
#
# DNA and RNA types.
#
# This file is a part of the BioJulia ecosystem.
# License is MIT: https://github.com/BioJulia/NucleicAcids.jl/blob/master/LICENSE.md
# NucleicAcid Encoding
# -------------------
#
# Unambiguous nucleotides are represented in one-hot encoding as follows:
#
# | NucleicAcid | Bits |
# | ----------- | ---- |
# | A | 0001 |
# | C | 0010 |
# | G | 0100 |
# | T/U | 1000 |
#
# Ambiguous nucleotides are bitwise OR of these four nucleotides. For example, R
# , A or G, is represented as 0101 (= A: 0001 | G: 0100). The gap symbol is
# always 0000. The meaningful four bits are stored in the least significant
# bits of a byte.
"""
An abstract nucleic acid type.
"""
abstract type NucleicAcid <: BioSymbol end
"""
A deoxyribonucleic acid type.
"""
primitive type DNA <: NucleicAcid 8 end
"""
A ribonucleic acid type.
"""
primitive type RNA <: NucleicAcid 8 end
prefix(::DNA) = "DNA"
prefix(::RNA) = "RNA"
type_text(x::NucleicAcid) = prefix(x)
isterm(symbol::NucleicAcid) = false
encoded_data_eltype(::Type{DNA}) = UInt8
encoded_data_eltype(::Type{RNA}) = UInt8
# Conversion from/to compatible types
# ---------------------------
Base.convert(::Type{DNA}, nt::RNA) = reinterpret(DNA, nt)
DNA(nt::RNA) = convert(DNA, nt)
Base.convert(::Type{RNA}, nt::DNA) = reinterpret(RNA, nt)
RNA(nt::DNA) = convert(RNA, nt)
# Conversion from/to characters
# -----------------------------
function Base.convert(::Type{DNA}, c::Char)
if c > '\uff'
throw(InexactError(:convert, DNA, repr(c)))
end
@inbounds dna = char_to_dna[convert(Int, c) + 1]
if !isvalid(DNA, dna)
throw(InexactError(:convert, DNA, repr(c)))
end
return encode(DNA, dna)
end
DNA(c::Char) = convert(DNA, c)
function Base.convert(::Type{RNA}, c::Char)
if c > '\uff'
throw(InexactError(:convert, RNA, repr(c)))
end
@inbounds rna = char_to_rna[convert(Int, c) + 1]
if !isvalid(RNA, rna)
throw(InexactError(:convert, RNA, repr(c)))
end
return encode(RNA, rna)
end
RNA(c::Char) = convert(RNA, c)
function Base.convert(::Type{Char}, nt::DNA)
return dna_to_char[encoded_data(nt) + 1]
end
Char(nt::DNA) = convert(Char, nt)
function Base.convert(::Type{Char}, nt::RNA)
return rna_to_char[encoded_data(nt) + 1]
end
Char(nt::RNA) = convert(Char, nt)
# Encoding of DNA and RNA NucleicAcids
# ------------------------------------
"""
Lookup table used for converting characters to DNA symbol values
The provided `convert` method should be used rather than this table, but you can
use it if you insist and know what your are doing.
!!! note
The array is indexed by converting a character to an integer. When indexed, it
returns a UInt8 with the bit pattern on the corresponding nucleic acid.
The `convert(DNA, x)` method does this for you.
!!! warning
If you index this array with a character that is greater than '\uff', then
you will get a bounds error. The `convert(DNA, x)` method checks such things
to avoid this for you.
!!! warning
If you index this array with a character that does not have a corresonding
DNA symbol, then you get a byte with the bit pattern `0x80`, which is an
invalid DNA symbol and will be of no use to you. The `convert(DNA, x)`
checks such things for you and throws an exception gracefully if such a
situation arises.
"""
const char_to_dna = [0x80 for _ in 0x00:0xff]
"""
Lookup table for converting DNA symbol values to characters
The provided `convert` method should be used rather than this table, but you can
use it if you insist and know what your are doing.
!!! note
The array is indexed by reinterpreting a DNA symbol value as an UInt8.
When indexed, it returns the character corresponding to the symbol.
The `convert(Char, x::DNA)` method does this for you.
!!! warning
If you index this array with an invalid DNA symbol, then you will hit a
bounds error. If you construct DNA symbols properly, then this scenario
should never occur.
"""
dna_to_char
# Derived from "The DDBJ/ENA/GenBank Feature Table Definition"
# §7.4.1 Nucleotide base code (IUPAC)
# http://www.insdc.org/documents/feature_table.html#7.4.1
const dna_to_char = let
chararray = Vector{Char}(undef, 16)
for (char, doc, bits) in [
('-', "DNA Gap", 0b0000),
('A', "DNA Adenine", 0b0001),
('C', "DNA Cytosine", 0b0010),
('G', "DNA Guanine", 0b0100),
('T', "DNA Thymine", 0b1000),
('M', "DNA Adenine or Cytosine", 0b0011),
('R', "DNA Adenine or Guanine", 0b0101),
('W', "DNA Adenine or Thymine", 0b1001),
('S', "DNA Cytosine or Guanine", 0b0110),
('Y', "DNA Cytosine or Thymine", 0b1010),
('K', "DNA Guanine or Thymine", 0b1100),
('V', "DNA Adenine, Cytosine or Guanine", 0b0111),
('H', "DNA Adenine, Cytosine or Thymine", 0b1011),
('D', "DNA Adenine, Guanine or Thymine", 0b1101),
('B', "DNA Cytosine, Guanine or Thymine", 0b1110),
('N', "DNA Adenine, Cytosine, Guanine or Thymine", 0b1111)]
var = Symbol("DNA_", char != '-' ? char : "Gap")
@eval begin
@doc $(doc) const $(var) = encode(DNA, $(bits))
char_to_dna[$(convert(Int, char) + 1)] = char_to_dna[$(convert(Int, lowercase(char)) + 1)] = $(bits)
$(chararray)[$(convert(Int, bits) + 1)] = $(char)
end
end
Tuple(chararray)
end
@eval function alphabet(::Type{DNA})
return $(tuple([encode(DNA, x) for x in 0b0000:0b1111]...))
end
"""
alphabet(DNA)
Get all symbols of `DNA` in sorted order.
Examples
--------
```jldoctest
julia> alphabet(DNA)
(DNA_Gap, DNA_A, DNA_C, DNA_M, DNA_G, DNA_R, DNA_S, DNA_V, DNA_T, DNA_W, DNA_Y, DNA_H, DNA_K, DNA_D, DNA_B, DNA_N)
julia> issorted(alphabet(DNA))
true
```
"""
alphabet(::Type{DNA})
"""
ACGT
Unambiguous DNA.
Examples
--------
```jldoctest
julia> ACGT
(DNA_A, DNA_C, DNA_G, DNA_T)
```
"""
const ACGT = (DNA_A, DNA_C, DNA_G, DNA_T)
"""
ACGTN
Unambiguous DNA and `DNA_N`.
Examples
--------
```jldoctest
julia> ACGTN
(DNA_A, DNA_C, DNA_G, DNA_T, DNA_N)
```
"""
const ACGTN = (DNA_A, DNA_C, DNA_G, DNA_T, DNA_N)
"""
Lookup table used for converting characters to RNA symbol values
The provided `convert` method should be used rather than this table, but you can
use it if you insist and know what your are doing.
!!! note
The array is indexed by converting a character to an integer. When indexed, it
returns a UInt8 with the bit pattern on the corresponding nucleic acid.
The `convert(RNA, x)` method does this for you.
!!! warning
If you index this array with a character that is greater than '\uff', then
you will get a bounds error. The `convert(RNA, x)` method checks such things
to avoid this for you.
!!! warning
If you index this array with a character that does not have a corresonding
RNA symbol, then you get a byte with the bit pattern `0x80`, which is an
invalid RNA symbol and will be of no use to you. The `convert(RNA, x)`
checks such things for you and throws an exception gracefully if such a
situation arises.
"""
const char_to_rna = [0x80 for _ in 0x00:0xff]
"""
Lookup table for converting RNA symbol values to characters
The provided `convert` method should be used rather than this table, but you can
use it if you insist and know what your are doing.
!!! note
The array is indexed by reinterpreting a RNA symbol value as an UInt8.
When indexed, it returns the character corresponding to the symbol.
The `convert(Char, x::RNA)` method does this for you.
!!! warning
If you index this array with an invalid RNA symbol, then you will hit a
bounds error. If you construct RNA symbols properly, then this scenario
should never occur.
"""
rna_to_char
const rna_to_char = let
chararray = Vector{Char}(undef, 16)
for (char, doc, dna) in [
('-', "RNA Gap", DNA_Gap),
('A', "RNA Adenine", DNA_A ),
('C', "RNA Cytosine", DNA_C ),
('G', "RNA Guanine", DNA_G ),
('U', "RNA Uracil", DNA_T ),
('M', "RNA Adenine or Cytosine", DNA_M ),
('R', "RNA Adenine or Guanine", DNA_R ),
('W', "RNA Adenine or Uracil", DNA_W ),
('S', "RNA Cytosine or Guanine", DNA_S ),
('Y', "RNA Cytosine or Uracil", DNA_Y ),
('K', "RNA Guanine or Uracil", DNA_K ),
('V', "RNA Adenine, Cytosine or Guanine", DNA_V ),
('H', "RNA Adenine, Cytosine or Uracil", DNA_H ),
('D', "RNA Adenine, Guanine or Uracil", DNA_D ),
('B', "RNA Cytosine, Guanine or Uracil", DNA_B ),
('N', "RNA Adenine, Cytosine, Guanine or Uracil", DNA_N )]
var = Symbol("RNA_", char != '-' ? char : "Gap")
@eval begin
@doc $(doc) const $(var) = reinterpret(RNA, $(dna))
char_to_rna[$(convert(Int, char) + 1)] = char_to_rna[$(convert(Int, lowercase(char) + 1))] = reinterpret(UInt8, $(dna))
$(chararray)[$(encoded_data(dna) + 1)] = $(char)
end
end
Tuple(chararray)
end
@eval function alphabet(::Type{RNA})
return $(tuple([encode(RNA, x) for x in 0b0000:0b1111]...))
end
"""
alphabet(RNA)
Get all symbols of `RNA` in sorted order.
Examples
--------
```jldoctest
julia> alphabet(RNA)
(RNA_Gap, RNA_A, RNA_C, RNA_M, RNA_G, RNA_R, RNA_S, RNA_V, RNA_U, RNA_W, RNA_Y, RNA_H, RNA_K, RNA_D, RNA_B, RNA_N)
julia> issorted(alphabet(RNA))
true
```
"""
alphabet(::Type{RNA})
"""
ACGU
Unambiguous RNA.
Examples
--------
```jldoctest
julia> ACGU
(RNA_A, RNA_C, RNA_G, RNA_U)
```
"""
const ACGU = (RNA_A, RNA_C, RNA_G, RNA_U)
"""
ACGUN
Unambiguous RNA and `RNA_N`.
Examples
--------
```jldoctest
julia> ACGUN
(RNA_A, RNA_C, RNA_G, RNA_U, RNA_N)
```
"""
const ACGUN = (RNA_A, RNA_C, RNA_G, RNA_U, RNA_N)
gap(::Type{DNA}) = DNA_Gap
gap(::Type{RNA}) = RNA_Gap
"""
isGC(nt::NucleicAcid)
Test if `nt` is surely either guanine or cytosine.
"""
function isGC(nt::NucleicAcid)
bits = encoded_data(nt)
return bits != 0 && (bits & 0b1001) == 0
end
"""
ispurine(nt::NucleicAcid)
Test if `nt` is surely a purine.
"""
@inline function ispurine(nt::NucleicAcid)
bits = encoded_data(nt)
return bits != 0 && (bits & 0b1010) == 0
end
"""
ispyrimidine(nt::NucleicAcid)
Test if `nt` is surely a pyrimidine.
"""
@inline function ispyrimidine(nt::NucleicAcid)
bits = encoded_data(nt)
return bits != 0 && (bits & 0b0101) == 0
end
"""
isambiguous(nt::NucleicAcid)
Test if `nt` is an ambiguous nucleotide.
"""
@inline function isambiguous(nt::NucleicAcid)
return count_ones(nt) > 1
end
"""
iscertain(nt::NucleicAcid)
Test if `nt` is a non-ambiguous nucleotide e.g. ACGT.
"""
@inline function iscertain(nt::NucleicAcid)
return count_ones(nt) == 1
end
"""
complement(nt::NucleicAcid)
Return the complementary nucleotide of `nt`.
This function returns the union of all possible complementary nucleotides.
Examples
--------
```jldoctest
julia> complement(DNA_A)
DNA_T
julia> complement(DNA_N)
DNA_N
julia> complement(RNA_U)
RNA_A
```
"""
function complement(nt::Union{DNA, RNA})
# This is essentially a lookup table of 16 x 4 bits.
# It's the concatenation of the bitpatterns of the nucleotides,
# in order, complemented.
u64 = 0xf7b3d591e6a2c480 >>> ((4 * encoded_data(nt)) & 63)
reinterpret(typeof(nt), (u64 % UInt8) & 0x0f)
end
function Base.isvalid(::Type{T}, x::Integer) where T <: NucleicAcid
return 0 ≤ x < 16
end
function Base.isvalid(nt::NucleicAcid)
return encoded_data(nt) ≤ 0b1111
end
function Base.:~(x::N) where N <: NucleicAcid
return encode(N, encoded_data(x) ⊻ 0b1111)
end
function Base.:|(x::N, y::N) where N <: NucleicAcid
return encode(N, encoded_data(x) | encoded_data(y))
end
function Base.:&(x::N, y::N) where N <: NucleicAcid
return encode(N, encoded_data(x) & encoded_data(y))
end
"""
compatbits(nt::NucleicAcid)
Return the compatibility bits of `nt` as `UInt8`.
The resulting `UInt8` has the lower four bits set
if `nt` is compatible with `A`, `C`, `G` and `T/U`, respectively.
Hence, `RNA_Gap` is `0x00` (not compatible with any nucleotide),
and `DNA_W` is `0x09` (compatible with `A` and `T`)
Examples
--------
```jldoctest
julia> compatbits(DNA_A)
0x01
julia> compatbits(DNA_C)
0x02
julia> compatbits(DNA_N)
0x0f
julia> compatbits(DNA_W)
0x09
julia> compatbits(RNA_Gap)
0x00
```
"""
@inline function compatbits(nt::NucleicAcid)
return encoded_data(nt)
end