# DNA Handling Examples

This notebook provides a brief overview of some included utilities for working with DNA sequences and FASTQ files.

In [1]:
import bootstrap

In [2]:
import gzip
from common import dna, fastq

---
## Load a FASTQ File

In [3]:
path = "/home/shared/prism-data/Nachusa Sequences/nachusa-2020-soil16S-sequences/Wesley001-WH-051220_S140_L001_R1_001.fastq.gz"

In [4]:
# Decompress the gzipped FASTQ file on the fly and parse it with the project's FASTQ reader.
# gzip.open defaults to binary mode ('rb'), so fastq.read is handed a binary file object.
# The parsed entries are kept in memory as `sample` and indexed by position below.
with gzip.open(path) as f:
    sample = fastq.read(f)

In [5]:
len(sample)

17164

In [6]:
sample[0]

FastqEntry:
  @MN01227:252:000H3FWK5:1:11101:5434:1073 1:N:0:TGCTACATCA
  GTGCCAGCAGCAGCGGTAATACGGGGGGAGCAAGCGTTGTTCGGATTTACTGGGCGTAAAGGGCGCGTAGGCGGTCAGCACAAGTCAGTTGTGAAATCTCCGAGCTNAACTCGGAANGGTCAACTGAAACTGTGCGACTAGAGTGCGGAAGGG
  +
  FFAFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFAFFFFFFFF/FF/FAAFFFA/A/FFFFAFFFFFFFF#/FFFFAFFF#FF/F//FFFAAAAFFFFFF/AFFFAFFFFAF/FFFA

---
## Sequence Encoding/Decoding

The following cells demonstrate encoding DNA sequences into vector representations.

- A = 0
- C = 1
- G = 2
- T = 3
- N = 4

In [7]:
sample[0].sequence

'GTGCCAGCAGCAGCGGTAATACGGGGGGAGCAAGCGTTGTTCGGATTTACTGGGCGTAAAGGGCGCGTAGGCGGTCAGCACAAGTCAGTTGTGAAATCTCCGAGCTNAACTCGGAANGGTCAACTGAAACTGTGCGACTAGAGTGCGGAAGGG'

In [8]:
len(sample[0].sequence)

153

In [9]:
# Convert the first entry's base string into an array of integer base codes
# (A=0, C=1, G=2, T=3, N=4); the result shown below is a uint8 array with one code per base.
encoded = dna.encode_sequence(sample[0].sequence)
encoded

array([2, 3, 2, 1, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 2, 2, 3, 0, 0, 3, 0, 1,
       2, 2, 2, 2, 2, 2, 0, 2, 1, 0, 0, 2, 1, 2, 3, 3, 2, 3, 3, 1, 2, 2,
       0, 3, 3, 3, 0, 1, 3, 2, 2, 2, 1, 2, 3, 0, 0, 0, 2, 2, 2, 1, 2, 1,
       2, 3, 0, 2, 2, 1, 2, 2, 3, 1, 0, 2, 1, 0, 1, 0, 0, 2, 3, 1, 0, 2,
       3, 3, 2, 3, 2, 0, 0, 0, 3, 1, 3, 1, 1, 2, 0, 2, 1, 3, 4, 0, 0, 1,
       3, 1, 2, 2, 0, 0, 4, 2, 2, 3, 1, 0, 0, 1, 3, 2, 0, 0, 0, 1, 3, 2,
       3, 2, 1, 2, 0, 1, 3, 0, 2, 0, 2, 3, 2, 1, 2, 2, 0, 0, 2, 2, 2],
      dtype=uint8)

In [10]:
dna.decode_sequence(encoded)

'GTGCCAGCAGCAGCGGTAATACGGGGGGAGCAAGCGTTGTTCGGATTTACTGGGCGTAAAGGGCGCGTAGGCGGTCAGCACAAGTCAGTTGTGAAATCTCCGAGCTNAACTCGGAANGGTCAACTGAAACTGTGCGACTAGAGTGCGGAAGGG'

---
## k-mer Sequence Encoding/Decoding

3-mer example:
`to_kmers('ACTCG') = ['ACT', 'CTC', 'TCG']`

In [11]:
# Turn the per-base codes into overlapping 3-mers, one integer per window.
# In the output below each value equals the base-5 number formed by the window's base codes,
# e.g. the first window [2, 3, 2] (GTG) -> 2*25 + 3*5 + 2 = 67.
# A 153-base sequence yields 153 - 3 + 1 = 151 k-mers.
kmer_sequence = dna.encode_kmers(encoded, kmer=3)
kmer_sequence

array([ 67,  86,  56,  30,  27,  11,  55,  27,  11,  55,  27,  11,  57,
        37,  63,  65,  75,   3,  15,  76,   7,  37,  62,  62,  62,  62,
        60,  52,  11,  55,  25,   2,  11,  57,  38,  68,  92,  88,  68,
        91,  82,  37,  60,  53,  18,  93,  90,  76,   8,  42,  87,  62,
        61,  57,  38,  65,  75,   0,   2,  12,  62,  61,  57,  36,  57,
        38,  65,  77,  12,  61,  57,  37,  63,  66,  80,  27,  11,  55,
        26,   5,  25,   2,  13,  66,  80,  27,  13,  68,  92,  88,  67,
        85,  50,   0,   3,  16,  83,  41,  81,  32,  35,  52,  11,  58,
        44,  95, 100,   1,   8,  41,  82,  37,  60,  50,   4,  22, 112,
        63,  66,  80,  25,   1,   8,  42,  85,  50,   0,   1,   8,  42,
        88,  67,  86,  57,  35,  51,   8,  40,  77,  10,  52,  13,  67,
        86,  57,  37,  60,  50,   2,  12,  62])

In [12]:
len(kmer_sequence)

151

In [13]:
# Inverse of the k-mer encoding: the output below reproduces the 153 per-base codes held in `encoded`.
dna.decode_kmers(kmer_sequence, kmer=3)

array([2, 3, 2, 1, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 2, 2, 3, 0, 0, 3, 0, 1,
       2, 2, 2, 2, 2, 2, 0, 2, 1, 0, 0, 2, 1, 2, 3, 3, 2, 3, 3, 1, 2, 2,
       0, 3, 3, 3, 0, 1, 3, 2, 2, 2, 1, 2, 3, 0, 0, 0, 2, 2, 2, 1, 2, 1,
       2, 3, 0, 2, 2, 1, 2, 2, 3, 1, 0, 2, 1, 0, 1, 0, 0, 2, 3, 1, 0, 2,
       3, 3, 2, 3, 2, 0, 0, 0, 3, 1, 3, 1, 1, 2, 0, 2, 1, 3, 4, 0, 0, 1,
       3, 1, 2, 2, 0, 0, 4, 2, 2, 3, 1, 0, 0, 1, 3, 2, 0, 0, 0, 1, 3, 2,
       3, 2, 1, 2, 0, 1, 3, 0, 2, 0, 2, 3, 2, 1, 2, 2, 0, 0, 2, 2, 2])

---
## Quality Scores

The quality scores in a FASTQ file are typically PHRED-33 encoded. These scores are decoded to obtain the probabilities that the bases are incorrect.

The encoded score $Q$ (the ASCII code of the quality character) can be computed from the error probability $P$ like so:

$Q = -10\log_{10}{P} + 33$

Likewise, the error probability can be recovered by rearranging the equation:

$P = 10^{\frac{Q - 33}{-10}}$

In [14]:
# Take the quality characters for the first 10 bases of the first entry (one character per base).
scores = sample[0].quality_scores[:10]
scores

'FFAFFFFFFF'

In [15]:
# Decode each quality character into a per-base error probability.
# Example from the output below: 'F' is ASCII 70, so P = 10 ** ((70 - 33) / -10) ~= 0.0002.
error_probs = dna.decode_phred(scores)
error_probs

array([0.00019953, 0.00019953, 0.00063096, 0.00019953, 0.00019953,
       0.00019953, 0.00019953, 0.00019953, 0.00019953, 0.00019953])

In [16]:
dna.encode_phred(error_probs)

'FFAFFFFFFF'

---
## Store Sample Using LMDB

As these sample files can contain many sequences, it is beneficial to read them from disk as needed,
rather than trying to load them all into memory at once. A good way to accomplish this is to utilize
[LMDB](http://www.lmdb.tech/doc/). The following cells provide a quick demonstration on how to
convert a FASTQ file into an LMDB data file.

In [17]:
from lmdbm import Lmdb
import numpy as np

In [18]:
# Persist the encoded sample to an LMDB store at /tmp/test.
# NOTE(review): the 'c' flag presumably means "open read/write, create if missing" (dbm-style) — confirm against lmdbm.
# fastq.to_encoded_dict(sample) supplies the key/value pairs; the next cell reads key '0' back as raw bytes.
with Lmdb.open("/tmp/test", 'c') as store:
    store.update(fastq.to_encoded_dict(sample))

In [19]:
# Re-open the store with the default flag and fetch a single entry by key instead of loading the whole sample.
with Lmdb.open("/tmp/test") as store:
    sequence_bytes = store['0']
    # Values come back as raw bytes; reinterpret them as one uint8 base code per byte (no copy is made).
    sequence = np.frombuffer(sequence_bytes, dtype=np.uint8)
sequence

array([2, 3, 2, 1, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 2, 2, 3, 0, 0, 3, 0, 1,
       2, 2, 2, 2, 2, 2, 0, 2, 1, 0, 0, 2, 1, 2, 3, 3, 2, 3, 3, 1, 2, 2,
       0, 3, 3, 3, 0, 1, 3, 2, 2, 2, 1, 2, 3, 0, 0, 0, 2, 2, 2, 1, 2, 1,
       2, 3, 0, 2, 2, 1, 2, 2, 3, 1, 0, 2, 1, 0, 1, 0, 0, 2, 3, 1, 0, 2,
       3, 3, 2, 3, 2, 0, 0, 0, 3, 1, 3, 1, 1, 2, 0, 2, 1, 3, 4, 0, 0, 1,
       3, 1, 2, 2, 0, 0, 4, 2, 2, 3, 1, 0, 0, 1, 3, 2, 0, 0, 0, 1, 3, 2,
       3, 2, 1, 2, 0, 1, 3, 0, 2, 0, 2, 3, 2, 1, 2, 2, 0, 0, 2, 2, 2],
      dtype=uint8)

## Keras Data Generators

Using the LMDB data files created above, these next cells provide a brief overview on using the
DNA sequence/sample generators to create datasets for Keras models.

In [21]:
from common.data import DnaLabelType, DnaSequenceGenerator

In [22]:
# Build a batch generator over the LMDB store written above.
# With these settings the first batch shown below has x of shape (32, 148) = (batch_size, sequence_length - kmer + 1)
# and y of shape (32, 150) = (batch_size, sequence_length).
dataset = DnaSequenceGenerator(
    samples=["/tmp/test"],     # paths to LMDB files.
    sequence_length=150,       # length to trim down to
    kmer=3,                    # k-mer output
    batch_size=32,
    batches_per_epoch=100,
    augment=True,              # Enable sequence augmentation via sliding
    balance=False,             # Shrink samples to match the smallest size
    labels=DnaLabelType.OneMer # SampleIds, OneMer, or KMer
)

In [24]:
x, y = dataset[0] # 1st batch

In [26]:
print(x.shape)
x

(32, 148)


array([[67., 86., 56., ..., 60., 50.,  2.],
       [86., 56., 30., ..., 60., 52., 10.],
       [30., 27., 11., ..., 10., 52., 12.],
       ...,
       [56., 30., 27., ..., 92., 85., 52.],
       [56., 30., 27., ...,  2., 10., 52.],
       [86., 56., 30., ..., 60., 52., 10.]])

In [27]:
print(y.shape)
y

(32, 150)


array([[2, 3, 2, ..., 0, 0, 2],
       [3, 2, 1, ..., 0, 2, 0],
       [1, 1, 0, ..., 0, 2, 2],
       ...,
       [2, 1, 1, ..., 2, 0, 2],
       [2, 1, 1, ..., 2, 0, 2],
       [3, 2, 1, ..., 0, 2, 0]], dtype=int32)

This dataset can be given to a model easily for training by passing it into the `fit` method directly.

```py
model.fit(dataset, ...)
```

Similar to the sequence generator, one can also generate subsamples (sets of sequences) using the
`DnaSampleGenerator` data generator.

In [28]:
from common.data import DnaSampleGenerator

In [38]:
# Build a generator that yields whole subsamples (sets of sequences) rather than single sequences.
# With these settings the first batch shown below has x of shape (16, 1000, 150)
# = (batch_size, subsample_length, sequence_length) and y of shape (16,), one label per subsample.
# Only one LMDB file is supplied here, and every label in the output below is 0.
dataset = DnaSampleGenerator(
    samples=["/tmp/test"],
    subsample_length=1000,
    sequence_length=150,
    kmer=1,
    batch_size=16,
    batches_per_epoch=100,
    augment=True,
    balance=False,
    labels=DnaLabelType.SampleIds
)

In [39]:
x, y = dataset[0] # 1st batch

In [40]:
print(x.shape)
x

(16, 1000, 150)


array([[[1., 1., 0., ..., 0., 2., 2.],
        [2., 1., 1., ..., 2., 0., 2.],
        [1., 1., 0., ..., 2., 2., 2.],
        ...,
        [1., 1., 0., ..., 0., 2., 2.],
        [1., 1., 0., ..., 0., 2., 2.],
        [1., 1., 0., ..., 0., 2., 2.]],

       [[2., 3., 2., ..., 0., 2., 0.],
        [3., 2., 2., ..., 3., 2., 0.],
        [2., 1., 1., ..., 0., 0., 2.],
        ...,
        [1., 1., 0., ..., 2., 2., 2.],
        [1., 1., 0., ..., 0., 3., 3.],
        [1., 1., 0., ..., 2., 2., 2.]],

       [[3., 2., 1., ..., 0., 2., 0.],
        [2., 1., 1., ..., 2., 0., 2.],
        [1., 1., 0., ..., 0., 2., 2.],
        ...,
        [3., 2., 1., ..., 1., 0., 2.],
        [2., 1., 1., ..., 2., 0., 2.],
        [3., 2., 1., ..., 0., 2., 2.]],

       ...,

       [[1., 1., 0., ..., 2., 2., 2.],
        [2., 1., 1., ..., 2., 0., 2.],
        [2., 3., 2., ..., 0., 0., 2.],
        ...,
        [1., 0., 2., ..., 2., 0., 2.],
        [1., 1., 0., ..., 0., 2., 3.],
        [3., 2., 1., ..., 2., 2.

In [41]:
print(y.shape)
y

(16,)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

---
## FASTQ Sequence ID

Example sequence ID: `@MN01227:252:000H3FWK5:1:11101:5434:1073 1:N:0:TGCTACATCA`

@\<instrument\>:\<run number\>:\<flowcell ID\>:\<lane\>:\<tile\>:\<pos_x\>:\<pos_y\> \<read type\>:\<is filtered (N|Y)\>:\<control number\>:\<sequence index\>


In [20]:
sequence_id = sample[0].sequence_id
sequence_id

@MN01227:252:000H3FWK5:1:11101:5434:1073 1:N:0:TGCTACATCA

In [21]:
sequence_id.instrument

'MN01227'

In [22]:
sequence_id.run_number

252

In [23]:
sequence_id.flowcell_id

'000H3FWK5'

In [24]:
sequence_id.lane

1

In [25]:
sequence_id.tile

11101

In [26]:
sequence_id.pos

(5434, 1073)

In [27]:
sequence_id.read_type

1

In [28]:
sequence_id.is_filtered

False

In [29]:
sequence_id.control_number

0

In [30]:
sequence_id.sequence_index

'TGCTACATCA'