In [1]:
from collections import defaultdict
from dataclasses import replace
from dnadb import dna, fasta, fastq, sample, taxonomy
from dnadb.datasets import Silva
from functools import cache
import itertools
import numpy as np
from pathlib import Path
import tensorflow as tf
import tf_utilities as tfu
import time
from tqdm.auto import tqdm
import wandb

from deepdna.nn.models import load_model
from deepdna.nn.models.taxonomy import NaiveTaxonomyClassificationModel

In [2]:
# Select GPU index 0 through tf_utilities before any model is loaded.
# The cell output shows the CPU and GPU device lists that the call returns.
tfu.devices.select_gpu(0)

2023-08-10 15:46:30.433798: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-10 15:46:30.434100: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-10 15:46:30.517698: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-10 15:46:30.518040: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-10 15:46:30.518331: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from S

([PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')],
 [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')])

In [3]:
# Seed for every random draw in this notebook (sequence trimming offsets and
# FASTA ID selection). Fixing it makes a Restart & Run All reproducible.
SEED = 42
rng = np.random.default_rng(SEED)

## Synthetic Dataset Factory

In [4]:
# Root directory for every synthetic-dataset artifact created by this notebook.
dataset_root = Path("~/work/Datasets/synthetic").expanduser()
# parents=True creates missing intermediate directories, so a fresh machine
# without ~/work/Datasets does not fail with FileNotFoundError.
dataset_root.mkdir(parents=True, exist_ok=True)
dataset_root

PosixPath('/home/dwl2x/work/Datasets/synthetic')

### SILVA Sequences and Taxonomy

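Each synthetic sample is built by mapping the reads of a real sample onto SILVA reference sequences, using the SILVA taxonomy to choose a matching sequence for every read.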

In [5]:
# One-time setup, kept for reference: writes the SILVA sequences and taxonomies
# into synthetic.fasta.db and synthetic.tax.tsv.db under dataset_root.
# Uncomment to regenerate those database files.
# silva = Silva()
# with fasta.FastaDbFactory(dataset_root / "synthetic.fasta.db") as db:
#     db.write_entries(silva.sequences())
# with taxonomy.TaxonomyDbFactory(dataset_root / "synthetic.tax.tsv.db") as db:
#     db.write_entries(silva.taxonomies())

In [6]:
# Open the FASTA sequence database stored under dataset_root.
fasta_db = fasta.FastaDb(dataset_root / "synthetic.fasta.db")
# NOTE(review): the taxonomy DB is read from an absolute, user-specific SILVA
# path rather than the dataset_root copy referenced in the commented-out line.
# Confirm this is intentional; it will not resolve on another machine.
# tax_db = taxonomy.TaxonomyDb(dataset_root / "synthetic.tax.tsv.db")
tax_db = taxonomy.TaxonomyDb("/home/dwl2x/work/Datasets/Silva/Silva_138.1.tax.tsv.db")

In [7]:
# Lookup table: taxonomy label -> list of FASTA IDs carrying that label.
# Materialized once here so the sampling loop can pick an ID per read.
tax_to_fasta_ids = {}
for label in tax_db:
    label_index = tax_db.label_to_index(label)
    matching_ids = tax_db.fasta_ids_with_label(label_index)
    tax_to_fasta_ids[label] = list(matching_ids)

### Sequence ID Mapping

In [8]:
# One-time setup, kept for reference: writes the index for fasta_db into
# synthetic.fasta.index.db. Uncomment to regenerate that file.
# with fasta.FastaIndexDbFactory(dataset_root / "synthetic.fasta.index.db") as db:
#     db.write_entries(fasta_db)

In [9]:
# Open the FASTA index database; it is passed to each SampleMappingEntryFactory
# when the sample mappings are built.
fasta_index = fasta.FastaIndexDb(dataset_root / "synthetic.fasta.index.db")

## Classification Model

In [10]:
# Load the saved naive taxonomy classifier from the local logs directory.
# It is used later to predict a taxonomy for each trimmed read.
# NOTE(review): the path is relative to the notebook's working directory.
model = load_model("./logs/models/dnabert_taxonomy_naive", NaiveTaxonomyClassificationModel)

2023-08-10 15:46:55.198545: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-10 15:46:55.202584: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-10 15:46:55.203082: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-10 15:46:55.203442: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so retur

## Datasets

In [11]:
def trim_and_encode(entry: "fasta.FastaEntry | fastq.FastqEntry", length: int = 150):
    """
    Cut a random window of `length` bases out of an entry's sequence and encode it.

    The window start is drawn from the notebook-level `rng`, so repeated calls
    on the same entry generally yield different windows.

    Args:
        entry: A FASTA or FASTQ entry; only its `sequence` attribute is read.
        length: Number of bases to keep.

    Returns:
        The result of `dna.encode_sequence` applied to the trimmed sequence.

    Raises:
        ValueError: If the sequence is shorter than `length`.
    """
    sequence = entry.sequence
    # Fail with an actionable message instead of the opaque "low >= high"
    # error that rng.integers raises when no valid offset exists.
    if len(sequence) < length:
        raise ValueError(
            f"Cannot trim a sequence of length {len(sequence)} to {length} bases"
        )
    # The upper bound is exclusive, so the last valid start (len - length) is
    # included. Measuring the sequence itself keeps the offset range and the
    # slice below in agreement, which guarantees a window of exactly `length`.
    offset = rng.integers(0, len(sequence) - length + 1)
    return dna.encode_sequence(sequence[offset:offset + length])

### Nachusa Sequences

In [12]:
# Nachusa soil 16S sequencing folders, one per sampling year.
# 2019 is deliberately left out: that folder is missing.
sample_folders = [
    f"/home/shared/prism-data/Nachusa Sequences/nachusa-{year}-soil16S-sequences"
    for year in (2015, 2016, 2017, 2018, 2020)
]

In [13]:
# Sample Mapping
# Factory for the sample mapping database under dataset_root. One entry per
# sample file is written to it by the loop in the next cell, and it is closed
# in a later cell once all entries are written.
fasta_mapping_factory = sample.SampleMappingDbFactory(dataset_root / "synthetic.fasta.mapping.db")

In [None]:
# Number of sequences sent through the classifier per forward pass.
CLASSIFY_BATCH_SIZE = 32

# For every FASTQ file: trim and encode each read, predict its taxonomy with
# the classifier, then map the read to a randomly chosen FASTA ID that carries
# the predicted taxonomy label. One mapping entry is written per file.
for folder in sample_folders:
    # Path.iterdir() yields entries in arbitrary order; sorting makes the
    # processing (and write) order stable across runs.
    for f in sorted(Path(folder).iterdir()):
        print(f.name)
        mapping_entry = sample.SampleMappingEntryFactory(f.name, fasta_index)
        encoded = [trim_and_encode(entry) for entry in fastq.entries(f)]
        if not encoded:
            # An empty sample would otherwise crash at np.concatenate([]).
            # Skip it explicitly so the remaining files are still processed.
            print(f"Skipping {f.name}: no sequences found")
            continue
        sequences = dna.encode_kmers(np.array(encoded), 3)
        # Run the model in fixed-size batches and stitch the outputs together.
        predictions = np.concatenate(
            [
                model(sequences[start:start + CLASSIFY_BATCH_SIZE]).numpy()
                for start in range(0, len(sequences), CLASSIFY_BATCH_SIZE)
            ],
            axis=0,
        )
        # The highest-scoring column index per read is looked up in the
        # model's id -> taxonomy map.
        taxa = [model.id_to_taxonomy_map[i] for i in np.argmax(predictions, axis=1)]
        # tqdm replaces the per-read print so the output is not flooded.
        for tax in tqdm(taxa, desc="Gathering FASTA IDs", leave=False):
            fasta_id = rng.choice(tax_to_fasta_ids[tax])
            mapping_entry.add_fasta_id(fasta_id)
        fasta_mapping_factory.write_entry(mapping_entry.build())

WS-AG-May2015_S65_L001_R1_001.fastq


2023-08-10 15:47:10.432073: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [15]:
# Close the mapping factory once the sample entries have been written; the
# mapping database is opened for reading in the next cell.
fasta_mapping_factory.close()

In [16]:
# Load the synthetic samples from the FASTA database together with the mapping
# database, then display how many samples were loaded.
synthetic_samples = sample.load_multiplexed_fasta(dataset_root / "synthetic.fasta.db", dataset_root / "synthetic.fasta.mapping.db")
len(synthetic_samples)

210

In [17]:
# Sanity check: the number of distinct sample names should equal the number of
# loaded samples, i.e. no two samples share a name.
names = {loaded.name for loaded in synthetic_samples}
len(names)

210

In [18]:
# Total number of directory entries across all sample folders, for comparison
# with the number of synthetic samples loaded above.
n = sum(len(list(Path(folder).iterdir())) for folder in sample_folders)
n

210

In [19]:
# Per-folder breakdown of the total above: one count per sample folder.
for d in sample_folders:
    entry_count = sum(1 for _ in Path(d).iterdir())
    print(entry_count)

50
51
53
0
56


In [20]:
# NOTE(review): leftover scratch cell; it only echoes 1 and can be deleted.
1

1