In [1]:
from dnadb import fasta, taxonomy
import deepctx as dcs
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import wandb

from deepdna.nn import data_generators as dg
from deepdna.nn.models import load_model

In [2]:
# Only use GPU 0.
# The call's return value is the cell output; presumably it lists the devices
# left visible to TensorFlow — TODO confirm against the deepctx docs.
dcs.tf.devices.use(gpus=[0])

2023-11-29 17:01:22.682764: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-29 17:01:22.683032: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-29 17:01:22.690670: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-29 17:01:22.690886: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-29 17:01:22.691053: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from S

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Converting Sequences and Taxonomies to DB Format

### SILVA

In [43]:
# Import the SILVA 138 515f/806r sequences into a FASTA DB
# (input file name suggests they are dereplicated/unique — confirm with the data owner).
!dnadb fasta import \
    /home/data2/deepdna/data/silva/silva-138-seqs-515f-806r-derep-uniq.fasta \
    ./silva-515f-806r.fasta.db

Importing FASTA...
573111it [00:04, 118177.34it/s]
Done. Imported 573,111 sequences. Skipped 0 sequences.


In [53]:
# Import the SILVA taxonomy TSV into a taxonomy DB tied to the FASTA DB created above.
# NOTE(review): --depth 6 presumably keeps six ranks (the labels shown later run d__ .. g__) — confirm.
!dnadb taxonomy import \
    --depth 6 \
    --fasta-db ./silva-515f-806r.fasta.db \
    /home/data2/deepdna/data/silva/silva-138-tax-515f-806r-derep-uniq.tsv \
    ./silva-515f-806r.tax.fasta.db

Importing taxonomy TSV...
573111it [00:07, 73399.27it/s]
Writing labels disk...: 100%|██████████████| 8434/8434 [00:09<00:00, 907.70it/s]
Done. Imported 573,111 sequences. Skipped 0 sequences.


### Hopland

The following imports multiple FASTQ files into a multiplexed FASTA DB with sample mappings.

In [39]:
# Import forward reads (files matching Ur*_R1_001.fastq) from FASTQs.
# Writes two outputs: the sequence DB and a separate sample-mapping DB.
!dnadb fasta import-multiplexed \
    --output-sequences-path ./hopland.fasta.db \
    --output-mapping-path ./hopland.fasta.mapping.db \
    /home/shared/hopland/fastq/Ur*_R1_001.fastq

Importing FASTA...
Gathering Sequences from 128 file(s)...
128it [00:05, 21.93it/s]
Creating FASTA DB...
100%|█████████████████████████████| 4792854/4792854 [00:09<00:00, 512917.82it/s]
Creating FASTA Mapping DB...
Writing Sample Mappings...
100%|█████████████████████████████| 4792854/4792854 [00:24<00:00, 191807.80it/s]
Done. Imported 4,792,854 sequences. Skipped 0 sequences.


## Loading Sequences and Taxonomies

### SILVA

In [3]:
# Open the SILVA FASTA DB and its taxonomy DB (paths match the import commands earlier in this notebook)
silva_sequences = fasta.FastaDb("./silva-515f-806r.fasta.db")
silva_taxonomies = taxonomy.TaxonomyDb("./silva-515f-806r.tax.fasta.db")

In [4]:
# Number of entries in the SILVA FASTA DB
print(f"Total sequences: {len(silva_sequences):,}")

Total sequences: 573,111


In [5]:
# Number of sequences that have a taxonomy entry
print(f"Total taxonomy sequences: {silva_taxonomies.num_sequences:,}")

Total taxonomy sequences: 573,111


In [6]:
# Number of labels stored in the taxonomy DB (presumably distinct labels — confirm)
print(f"Total taxonomy labels: {silva_taxonomies.num_labels:,}")

Total taxonomy labels: 8,434


### Hopland

In [7]:
# Load samples: open the Hopland sequence DB, then attach the sample-mapping DB
# written by the import-multiplexed command.
hopland_sequences = fasta.FastaDb("./hopland.fasta.db")
hopland_samples = hopland_sequences.mappings("./hopland.fasta.mapping.db")

In [8]:
# Number of entries in the Hopland sequence DB.
# NOTE(review): the import log reported 4,792,854 sequences while this prints 1,027,593;
# presumably the DB stores only unique sequences — confirm.
print(f"Total sequences: {len(hopland_sequences):,}")

Total sequences: 1,027,593


In [9]:
# Number of samples loaded (the import log reported 128 input files,
# so presumably one sample per FASTQ file — confirm).
len(hopland_samples)

128

## Getting Sequences and Taxonomies

In [68]:
# Index using numeric indices; the result exposes the sequence, identifier and extra header text
silva_sequences[0]

FastaEntry(sequence='TACGGAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCGATTTAAGTCAGAGGTGAAAGCCCGGGGCTCAACCCCGGAATAGCCTTTGAGACTGGATTGCTTGAATCCGGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGGATCACTGGACCGGCATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGG', identifier='AB000106.1.1343', extra='Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingobium;Sphingomonas sp.')

In [70]:
# The taxonomy DB accepts the same numeric indices as the FASTA DB;
# the result carries the sequence ID and its taxonomy label
silva_taxonomies[0]

TaxonomyDbEntry(sequence_id='AB000106.1.1343', label='d__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Sphingomonadaceae; g__Sphingobium')

In [72]:
# FASTA indices correspond to taxonomy indices.
# Verify that instead of checking index 0 alone: compare DB sizes, then spot-check
# the first, middle and last entries so a misaligned or truncated taxonomy DB is
# caught here rather than silently yielding wrong ground-truth labels later.
num_silva_sequences = len(silva_sequences)
assert num_silva_sequences == silva_taxonomies.num_sequences, (
    f"FASTA DB has {num_silva_sequences:,} sequences but taxonomy DB has "
    f"{silva_taxonomies.num_sequences:,}"
)
for index in sorted({0, num_silva_sequences // 2, max(num_silva_sequences - 1, 0)}):
    fasta_id = silva_sequences[index].identifier
    taxonomy_id = silva_taxonomies[index].sequence_id
    assert fasta_id == taxonomy_id, (
        f"ID mismatch at index {index}: {fasta_id!r} != {taxonomy_id!r}"
    )
print("Sequence IDs are the same")

Sequence IDs are the same


## Generating Samples for the Model

The following creates a data generator pipeline to encode sequences for the model, while also returning useful metadata for us to view.

### SILVA

In [14]:
# Stages of the SILVA data pipeline, listed in the order they are passed to the generator.
silva_pipeline = [
    dg.random_samples(silva_sequences),     # The source to choose from (uniformly)
    dg.random_sequence_entries(1000),       # Draw 1000 random FASTA entries from the chosen source
    dg.sequences(150),                      # Pull the sequences out of the entries, trimmed to length 150
    dg.encode_sequences(),                  # Encode bases as integers
    dg.encode_kmers(3),                     # Encode the integers as 3-mers
    dg.taxonomy_entries(silva_taxonomies),  # Fetch the matching taxonomy entries
    # Final stage: choose what each batch returns
    lambda encoded_kmer_sequences, taxonomy_entries: (encoded_kmer_sequences, taxonomy_entries),
]
silva = dg.BatchGenerator(batch_size=1, batches_per_epoch=1, pipeline=silva_pipeline)

In [15]:
# Grab a batch: (encoded k-mer sequences, ground-truth taxonomy entries).
# The tuple order is the one returned by the pipeline's final lambda.
encoded_kmer_sequences, taxonomies = silva[0]

In [16]:
# Expected to match the pipeline settings: batch_size=1, 1000 sampled entries,
# and 150 - 3 + 1 = 148 3-mers per length-150 sequence.
encoded_kmer_sequences.shape

(1, 1000, 148)

In [17]:
# Ground-truth taxonomy entry for the first sequence of the first (only) batch element
taxonomies[0][0]

TaxonomyDbEntry(sequence_id='JQ226360.1.1605', label='d__Eukaryota; p__Protalveolata; c__Syndiniales; o__Syndiniales; f__Syndiniales_Group_II; g__Syndiniales_Group_II')

### Hopland

In [18]:
# Stages of the Hopland data pipeline, listed in the order they are passed to the generator.
hopland_pipeline = [
    dg.random_samples(hopland_samples),  # The samples to choose from (uniformly)
    dg.random_sequence_entries(1000),    # Draw 1000 random FASTA entries from each chosen sample
    dg.sequences(150),                   # Pull the sequences out of the entries, trimmed to length 150
    dg.encode_sequences(),               # Encode bases as integers
    dg.encode_kmers(3),                  # Encode the integers as 3-mers
    # Final stage: choose what each batch returns
    lambda samples, encoded_kmer_sequences: (samples, encoded_kmer_sequences),
]
hopland = dg.BatchGenerator(batch_size=8, batches_per_epoch=1, pipeline=hopland_pipeline)

In [19]:
# Grab a batch: (chosen samples, encoded k-mer sequences).
# NOTE(review): this rebinds `encoded_kmer_sequences`, which previously held the SILVA batch.
samples, encoded_kmer_sequences = hopland[0]

In [20]:
# Expected to match the pipeline settings: batch_size=8, 1000 sampled entries,
# and 150 - 3 + 1 = 148 3-mers per length-150 sequence.
encoded_kmer_sequences.shape

(8, 1000, 148)

In [21]:
# One chosen sample per batch element, so this should equal batch_size (8)
len(samples)

8

In [22]:
# Show the name of the sample behind each batch element, one per line
sample_names = [entry.name for entry in samples]
for sample_name in sample_names:
    print(sample_name)

Ur61-B-16S_S240_L001_R1_001
Ur64-R-16S_S196_L001_R1_001
Ur2-B-16S_S206_L001_R1_001
Ur64-R-16S_S196_L001_R1_001
Ur5-B-16S_S233_L001_R1_001
Ur25-R-16S_S136_L001_R1_001
Ur9-R-16S_S134_L001_R1_001
Ur7-B-16S_S249_L001_R1_001


In [10]:
# Weights & Biases API client, used below to fetch the model artifacts
api = wandb.Api()

In [11]:
# Fetch the DNABERT taxonomy model artifact, then load the model from the downloaded location
dnabert_artifact = api.artifact("sirdavidludwig/dnabert-taxonomy/dnabert-taxonomy-topdown-64d-150l:v0")
path = dnabert_artifact.download()
dnabert_tax_model = load_model(path)

[34m[1mwandb[0m:   4 of 4 files downloaded.  
2023-11-29 17:01:29.127399: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-29 17:01:29.129998: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-29 17:01:29.130243: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-29 17:01:29.130413: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), b

In [12]:
# Fetch the SetBERT taxonomy model artifact, then load the model from the downloaded location
setbert_artifact = api.artifact("sirdavidludwig/model-registry/setbert-taxonomy-topdown-64d-150l:v0")
path = setbert_artifact.download()
setbert_tax_model = load_model(path)

# Sequence encoding chunk size
setbert_tax_model.base.chunk_size = 256

[34m[1mwandb[0m: Downloading large artifact setbert-taxonomy-topdown-64d-150l:v0, 90.01MB. 4 files... 
[34m[1mwandb[0m:   4 of 4 files downloaded.  
Done. 0:0:0.3


## DNABERT Top-down SILVA Classification

In [165]:
# Draw a SILVA batch for classification: (encoded k-mer sequences, ground-truth taxonomy entries).
# The pipeline samples randomly, so this is not the batch inspected earlier.
sequences, taxonomies = silva[0]

In [166]:
# sequences[0] selects the single batch element (batch_size=1), so the model
# receives the set of encoded sequences without the leading batch axis.
predicted_labels = dnabert_tax_model.classify(sequences[0])



In [167]:
# First five predicted taxonomy labels (one label string per input sequence)
predicted_labels[:5]

array(['d__Bacteria; p__Actinobacteriota; c__Coriobacteriia; o__Coriobacteriales; f__Coriobacteriaceae; g__Collinsella',
       'd__Eukaryota; p__Arthropoda; c__Insecta; o__Coleoptera; f__Coleoptera; g__Coleoptera',
       'd__Bacteria; p__Planctomycetota; c__Phycisphaerae; o__Phycisphaerales; f__AKAU3564_sediment_group; g__AKAU3564_sediment_group',
       'd__Archaea; p__Crenarchaeota; c__Nitrososphaeria; o__Nitrosopumilales; f__Nitrosopumilaceae; g__Nitrosarchaeum',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; f__Endozoicomonadaceae; g__Endozoicomonas'],
      dtype='<U181')

In [170]:
# Ground-truth labels of the first five entries in the batch element
first_five_entries = taxonomies[0][:5]
for entry in first_five_entries:
    print(entry.label)

d__Bacteria; p__Actinobacteriota; c__Coriobacteriia; o__Coriobacteriales; f__Coriobacteriaceae; g__Collinsella
d__Eukaryota; p__Arthropoda; c__Insecta; o__Coleoptera; f__Coleoptera; g__Coleoptera
d__Bacteria; p__Planctomycetota; c__Phycisphaerae; o__Phycisphaerales; f__AKAU3564_sediment_group; g__AKAU3564_sediment_group
d__Archaea; p__Crenarchaeota; c__Nitrososphaeria; o__Nitrosopumilales; f__Nitrosopumilaceae; g__Nitrosarchaeum
d__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Oceanospirillales; f__Endozoicomonadaceae; g__Endozoicomonas


## SetBERT Hopland Classification

In [23]:
# Draw a Hopland batch for classification: (chosen samples, encoded k-mer sequences).
# NOTE(review): rebinds `sequences`, which previously held the SILVA batch.
samples, sequences = hopland[0]

In [25]:
# Classify the whole batch at once; unlike the DNABERT call, the batch axis is kept.
# NOTE(review): rebinds `predicted_labels` from the DNABERT section.
predicted_labels = setbert_tax_model.classify(sequences)

2023-11-29 17:02:34.306140: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.




In [26]:
# Nested array of predicted taxonomy labels (presumably one row per sample in the batch — confirm)
predicted_labels

array([['d__Bacteria; p__Verrucomicrobiota; c__Verrucomicrobiae; o__Chthoniobacterales; f__Chthoniobacteraceae; g__Candidatus_Udaeobacter',
        'd__Bacteria; p__Actinobacteriota; c__Thermoleophilia; o__Solirubrobacterales; f__67-14; g__67-14',
        'd__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Chitinophagales; f__Chitinophagaceae; g__Flavisolibacter',
        ...,
        'd__Bacteria; p__Acidobacteriota; c__Vicinamibacteria; o__Vicinamibacterales; f__; g__',
        'd__Bacteria; p__Actinobacteriota; c__Actinobacteria; o__Streptomycetales; f__Streptomycetaceae; g__Streptomyces',
        'd__Bacteria; p__Actinobacteriota; c__Thermoleophilia; o__Solirubrobacterales; f__Solirubrobacteraceae; g__Conexibacter'],
       ['d__Bacteria; p__Chloroflexi; c__Dehalococcoidia; o__SAR202_clade; f__SAR202_clade; g__SAR202_clade',
        'd__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Bacillaceae; g__Bacillus',
        'd__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o_