In [1]:
from collections import defaultdict
from dataclasses import replace
from dnadb import dna, fasta, fastq, sample, taxonomy
from dnadb.datasets import Silva
from functools import cache
import itertools
import numpy as np
from pathlib import Path
import tensorflow as tf
import tf_utilities as tfu
import time
from tqdm.auto import tqdm
import wandb

from deepdna.nn.models import load_model
from deepdna.nn.models.taxonomy import NaiveTaxonomyClassificationModel

In [2]:
tfu.devices.select_gpu(0)

2023-08-21 23:16:20.087097: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-21 23:16:20.087327: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-21 23:16:20.110016: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-21 23:16:20.110312: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-21 23:16:20.110546: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from S

([PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')],
 [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')])

In [3]:
rng = np.random.default_rng()

## Synthetic Dataset Factory

In [4]:
dataset_root = Path("~/work/Datasets/synthetic").expanduser()
dataset_root.mkdir(exist_ok=True)
dataset_root

PosixPath('/home/dwl2x/work/Datasets/synthetic')

### SILVA Sequences and Taxonomy

Our synthetic samples will be created by mapping to the SILVA dataset.

In [5]:
from dataclasses import replace

In [5]:
# silva = Silva()
# with fasta.FastaDbFactory(dataset_root / "synthetic.fasta.db") as db:
#     db.write_entries(silva.sequences())
# with taxonomy.TaxonomyDbFactory(dataset_root / "synthetic.tax.tsv.db") as db:
#     for entry in silva.taxonomies():
#         db.write_entry(replace(entry, label=entry.label.replace("uncultured", "")))

In [6]:
fasta_db = fasta.FastaDb(dataset_root / "synthetic.fasta.db")
# tax_db = taxonomy.TaxonomyDb(dataset_root / "synthetic.tax.tsv.db")
tax_db = taxonomy.TaxonomyDb("/home/dwl2x/work/Datasets/Silva/Silva_138.1.tax.tsv.db")

In [7]:
tax_to_fasta_ids = {}
for label in tax_db:
    tax_to_fasta_ids[label] = list(tax_db.fasta_ids_with_label(tax_db.label_to_index(label)))

### Sequence ID Mapping

In [8]:
# with fasta.FastaIndexDbFactory(dataset_root / "synthetic.fasta.index.db") as db:
#     db.write_entries(fasta_db)

In [9]:
fasta_index = fasta.FastaIndexDb(dataset_root / "synthetic.fasta.index.db")

## Classification Model

In [10]:
model = load_model("./logs/models/dnabert_taxonomy_naive", NaiveTaxonomyClassificationModel)

2023-08-19 17:38:11.503465: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-19 17:38:11.505994: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-19 17:38:11.506240: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-19 17:38:11.506413: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so retur

## Datasets

In [11]:
def trim_and_encode(entry: fasta.FastaEntry|fastq.FastqEntry, length: int = 150):
    # Trim the sequence
    offset = rng.integers(0, len(entry) - length + 1)
    sequence = entry.sequence[offset:offset+length]
    assert len(sequence) == length
    return dna.encode_sequence(sequence)

### Nachusa Sequences

In [12]:
sample_folders = [
    "/home/shared/prism-data/Nachusa Sequences/nachusa-2015-soil16S-sequences",
    "/home/shared/prism-data/Nachusa Sequences/nachusa-2016-soil16S-sequences",
    "/home/shared/prism-data/Nachusa Sequences/nachusa-2017-soil16S-sequences",
    "/home/shared/prism-data/Nachusa Sequences/nachusa-2018-soil16S-sequences",
    # "/home/shared/prism-data/Nachusa Sequences/nachusa-2019-soil16S-sequences", # missing
    "/home/shared/prism-data/Nachusa Sequences/nachusa-2020-soil16S-sequences",
]

In [13]:
# Sample Mapping
fasta_mapping_factory = sample.SampleMappingDbFactory(dataset_root / "synthetic.fasta.mapping.db")

In [15]:
for folder in sample_folders:
    for f in Path(folder).iterdir():
        print(f"{f.name}\r\n", end="")
        mapping_entry = sample.SampleMappingEntryFactory(f.name, fasta_index)
        sequences = np.array(list(map(trim_and_encode, fastq.entries(f))))
        sequences = dna.encode_kmers(sequences, 3)
        tax_ids = []
        for i in range(0, len(sequences), 32):
            tax_ids.append(model(sequences[i:i+32]).numpy())
        tax_ids = np.concatenate(tax_ids, axis=0)
        # tax_ids = model.predict(sequences)
        taxa = [model.taxonomy_id_map[i] for i in np.argmax(tax_ids, axis=1)]
        for i, tax in enumerate(taxa):
            print(f"\rGathering FASTA IDs: {i+1}/{len(taxa)}", end="")
            fasta_id = rng.choice(tax_to_fasta_ids[tax])
            mapping_entry.add_fasta_id(fasta_id)
        fasta_mapping_factory.write_entry(mapping_entry.build())

WS-AG-May2015_S65_L001_R1_001.fastq
Gathering FASTA IDs: 71553/71553WS-HF-Sep2015_S83_L001_R1_001.fastq
Gathering FASTA IDs: 75468/75468WS-TC-May2015_S41_L001_R1_001.fastq
Gathering FASTA IDs: 69199/69199WS-HLP-Jul2015_S90_L001_R1_001.fastq
Gathering FASTA IDs: 66324/66324WS-SOY-Sep2015_S68_L001_R1_001.fastq
Gathering FASTA IDs: 74557/74557WS-HPN-May2015_S17_L001_R1_001.fastq
Gathering FASTA IDs: 71234/71234WS-HPW-May2015_S25_L001_R1_001.fastq
Gathering FASTA IDs: 72589/72589WS-AG-Sep2015_S44_L001_R1_001.fastq
Gathering FASTA IDs: 73181/73181WS-SF-Jul2015_S10_L001_R1_001.fastq
Gathering FASTA IDs: 70539/70539WS-HPN-Jul2015_S50_L001_R1_001.fastq
Gathering FASTA IDs: 71178/71178WS-L-May2015_S176_L001_R1_001.fastq
Gathering FASTA IDs: 78462/78462WS-TCR-Sep2015_S52_L001_R1_001.fastq
Gathering FASTA IDs: 68377/68377WS-MR-Sep2015_S67_L001_R1_001.fastq
Gathering FASTA IDs: 74652/74652WS-HPW-Jul2015_S58_L001_R1_001.fastq
Gathering FASTA IDs: 67807/67807WS-CCE-Jul2015_S19_L001_R1_001.fastq
Gath

In [16]:
fasta_mapping_factory.close()

In [5]:
synthetic_samples = sample.load_multiplexed_fasta(dataset_root / "synthetic.fasta.db", dataset_root / "synthetic.fasta.mapping.db")
len(synthetic_samples)

210

In [None]:
names = set([s.name for s in synthetic_samples])
len(names)

In [None]:
n = 0
for folder in sample_folders:
    for f in Path(folder).iterdir():
        # print(f)
        n += 1
n

In [None]:
for d in sample_folders:
    print(len(list(Path(d).iterdir())))

In [None]:
1

### DNABERT Classification

In [24]:
import wandb

In [25]:
api = wandb.Api()

In [26]:
path = "./logs/models/dnabert_taxonomy_naive/"
dnabert_tax = load_model(path)

2023-08-21 22:51:22.892292: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-21 22:51:22.908856: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-21 22:51:22.909210: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-21 22:51:22.909485: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so retur

In [28]:
from deepdna.data.samplers import SampleSampler, SequenceSampler
from deepdna.nn.data_generators import _encode_sequences, BatchGenerator
import numpy.typing as npt
from typing import Any, cast, Iterable

class SequenceTaxonomyGenerator(BatchGenerator):
    def __init__(
        self,
        fasta_taxonomy_pairs: Iterable[tuple[sample.FastaSample, taxonomy.TaxonomyDb]],
        taxonomy_id_map: taxonomy.TaxonomyIdMap,
        sequence_length: int,
        kmer: int = 1,
        subsample_size: int|None = None,
        batch_size: int = 32,
        batches_per_epoch: int = 100,
        augment_slide: bool = True,
        augment_ambiguous_bases: bool = True,
        balance: bool = False,
        shuffle: bool = True,
        rng: np.random.Generator = np.random.default_rng()
    ):
        super().__init__(
            batch_size=batch_size,
            batches_per_epoch=batches_per_epoch,
            shuffle=shuffle,
            rng=rng
        )
        fasta_samples, taxonomy_dbs = zip(*fasta_taxonomy_pairs)
        self.sample_sampler = SampleSampler(cast(tuple[sample.FastaSample, ...], fasta_samples))
        self.sequence_sampler = SequenceSampler(sequence_length, augment_slide)
        self.taxonomy_dbs: tuple[taxonomy.TaxonomyDb, ...] = cast(Any, taxonomy_dbs)
        self.kmer = kmer
        self.taxonomy_id_map = taxonomy_id_map
        self.subsample_size = subsample_size
        self.augment_ambiguous_bases = augment_ambiguous_bases
        self.balance = balance

    @property
    def sequence_length(self) -> int:
        return self.sequence_sampler.sequence_length

    def generate_batch(
        self,
        rng: np.random.Generator
    ) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.int32]]:
        subsample_size = self.subsample_size or 1
        sequences = np.empty((self.batch_size, subsample_size), dtype=f"<U{self.sequence_length}")
        sample_ids = np.empty(self.batch_size, dtype=np.int32)
        sequence_ids = [None] * self.batch_size
        label_ids = np.empty((self.batch_size, subsample_size), dtype=np.int32)
        samples = self.sample_sampler.sample_with_ids(self.batch_size, self.balance, rng)
        for i, (sample_id, sample) in enumerate(samples):
            tax_db = self.taxonomy_dbs[sample_id]
            sequence_info = tuple(self.sequence_sampler.sample_with_ids(sample, subsample_size, rng))
            sequence_ids[i], sequences[i] = zip(*sequence_info)
            sample_ids[i] = sample_id
            label_ids[i] = [self.taxonomy_id_map[tax_db.fasta_id_to_label(fasta_id)] for fasta_id in sequence_ids[i]]
        sequences = _encode_sequences(sequences, self.augment_ambiguous_bases, self.rng)
        if self.subsample_size is None:
            sequences = np.squeeze(sequences, axis=1)
        sequences = sequences.astype(np.int32)
        if self.kmer > 1:
            sequences = dna.encode_kmers(sequences, self.kmer, not self.augment_ambiguous_bases).astype(np.int32) # type: ignore
        return sample_ids, sequence_ids, sequences, label_ids

    def reduce_batch(self, batch):
        # remove sample IDs and sequence IDs
        return batch[2:]

In [14]:
synthetic_samples = sample.load_multiplexed_fasta(dataset_root / "synthetic.fasta.db", dataset_root / "synthetic.fasta.mapping.db")
len(synthetic_samples)

210

In [52]:
tax_db = taxonomy.TaxonomyDb(dataset_root / "synthetic2.tax.tsv.db")

In [54]:
gen = SequenceTaxonomyGenerator(
    zip(synthetic_samples, itertools.repeat(tax_db)),
    dnabert_tax.taxonomy_id_map,
    150,
    3,
    1000,
    batch_size=16
)

In [80]:
for batch in gen:
    x, y = batch
    x = x.reshape((-1, 148))
    y_pred = []
    for i in range(0, len(x), 256):
        print(f"\r{i}/{len(x)}", end="")
        y_pred = np.concatenate((y_pred, np.argmax(dnabert_tax(x[i:i+256]), axis=1)), axis=0)
    y_pred = np.array(y_pred).flatten()
    print(f"\r{np.sum(y.flatten() == y_pred) / len(y_pred):<20}")
    break

0.72975             


Exception ignored in: <function DbWrapper.__del__ at 0x7fd73453e320>
Traceback (most recent call last):
  File "/home/dwl2x/Libs/dnadb/src/dnadb/db.py", line 89, in __del__
    self.close()
  File "/home/dwl2x/Libs/dnadb/src/dnadb/db.py", line 69, in close
    if self.__is_closed:
AttributeError: 'TaxonomyDb' object has no attribute '_DbWrapper__is_closed'
