# Taxonomy Evaluation

This notebook evaluates all of the trained taxonomy models, producing the plots for the manuscript.

In [1]:
from dnadb import fasta, sample, taxonomy
from pathlib import Path
import subprocess
import wandb

In [2]:
# Home directory of the current user; the dataset and classifier paths
# defined in this notebook are built relative to it.
HOME = Path.home()

In [3]:
# Root directory of the synthetic datasets. evaluate() reads its input FASTA
# files from <synthetic_root>/<dataset>/<synthetic classifier>/test.
synthetic_root = HOME / "work/Datasets/Synthetic"

In [4]:
# Root directory for prediction outputs. evaluate() creates one sub-directory
# per model/dataset/classifier beneath it, so make sure it exists up front
# (including any missing parent directories).
output_root = Path("../logs/taxonomy_classification")
output_root.mkdir(parents=True, exist_ok=True)

In [5]:
# Names of the synthetic datasets to evaluate on
datasets = [
    "Nachusa",
    "Hopland",
    "Wetland",
    "SFD",
]

In [6]:
# Synthetic classifier variants. Together with the dataset name, each entry
# selects the input directory that evaluate() reads from.
synthetic_classifiers = [
    "Naive",
    "Bertax",
    "Topdown",
]

---

## Predictions

In [7]:
# Used to enable/disable evaluation cells in this notebook.
# This is primarily used as a safety precaution to prevent
# overwriting data. evaluate() asserts that this flag is set.
COMPUTE_PREDICTIONS = True
# When True, FASTA files that already have a matching .tax.tsv file in the
# output directory are left out of the run.
SKIP_EXISTING = False

In [8]:
# Extra command-line options appended to the DNABERT/SetBERT evaluation calls.
# evaluate() converts every argument with str(), so the integer is fine here.
tf_arguments = ["--gpu-ids", 0]

In [9]:
def fastas_to_process(input_path, output_path, skip_existing):
    """
    Find the FASTA files in a directory that still need to be processed.

    Args:
        input_path: Directory (``pathlib.Path``) to scan. Only its direct
            children with the ``.fasta`` suffix are considered.
        output_path: Directory (``pathlib.Path``) holding previously written
            results. Only consulted when ``skip_existing`` is true; it may
            not exist yet, in which case nothing is skipped.
        skip_existing: When true, omit every FASTA file for which a file
            named ``<stem>.tax.tsv`` already exists in ``output_path``.

    Yields:
        The ``.fasta`` paths to process, in sorted order.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist. Because this is
            a generator, the error surfaces on first iteration.
    """
    existing_tax_files = set()
    # A missing output directory simply means no results have been written
    # yet, so there is nothing to skip.
    if skip_existing and output_path.is_dir():
        existing_tax_files.update(f.name for f in output_path.iterdir())
    # iterdir() yields entries in arbitrary order; sort so that repeated runs
    # see the files in the same order.
    for f in sorted(input_path.iterdir()):
        if f.suffix != ".fasta":
            continue
        # The set is empty unless skip_existing is set, so no extra flag
        # check is needed here.
        if f.with_suffix(".tax.tsv").name in existing_tax_files:
            continue
        yield f

In [10]:
def evaluate(name, script, datasets, synthetic_classifiers, python_path="python3", args=None):
    """
    Run an evaluation script once per (dataset, synthetic classifier) pair.

    For every combination, the FASTA files found in
    ``synthetic_root / dataset / synthetic_type / "test"`` are collected with
    ``fastas_to_process`` (honouring ``SKIP_EXISTING``) and their paths are
    written, one per line, to the standard input of the command
    ``python_path script output_path *args``, where ``output_path`` is
    ``output_root / name / dataset / synthetic_type``.

    Args:
        name: Sub-directory of ``output_root`` that receives this model's
            results.
        script: Path of the evaluation script to execute.
        datasets: Dataset names to iterate over.
        synthetic_classifiers: Synthetic classifier names to iterate over.
        python_path: Interpreter used to launch ``script``.
        args: Extra command-line arguments placed after the output path.
            Each one is converted with ``str()``.

    Raises:
        AssertionError: If ``COMPUTE_PREDICTIONS`` is false or ``output_root``
            is not an existing directory.
        subprocess.CalledProcessError: If the evaluation script exits with a
            non-zero status. Remaining combinations are not run.
    """
    assert COMPUTE_PREDICTIONS, "Evaluation disabled."
    assert output_root.is_dir(), "Output directory does not exist."
    args = args or []
    for dataset in datasets:
        for synthetic_type in synthetic_classifiers:
            path = synthetic_root / dataset / synthetic_type / "test"
            output_path = output_root / name / dataset / synthetic_type
            output_path.mkdir(exist_ok=True, parents=True)
            fastas = list(fastas_to_process(path, output_path, SKIP_EXISTING))
            # subprocess expects a sequence of strings; build a real list
            # instead of handing it a one-shot iterator.
            command = [str(part) for part in (python_path, script, output_path, *args)]
            subprocess.run(
                command,
                universal_newlines=True,
                input='\n'.join(map(str, fastas)),
                # Surface script failures instead of silently moving on to
                # the next combination with missing predictions.
                check=True
            )

### Qiime2

In [147]:
# Passed to the QIIME evaluation script as --qiime-classifier-path.
q2_classifier_path = HOME / "work/qiime-classifier/classifier"

In [None]:
# Passed to the QIIME evaluation script as --workers.
max_workers = 48

In [150]:
# QIIME 2 naive Bayes baseline, launched with the interpreter from the
# qiime2-2022.8 conda environment rather than the default python3.
evaluate(
    name="q2_naive_bayes",
    script="../scripts/taxonomy_eval_qiime.py",
    datasets=datasets,
    synthetic_classifiers=synthetic_classifiers,
    python_path="/opt/conda/envs/qiime2-2022.8/bin/python3",
    args=["--qiime-classifier-path", q2_classifier_path, "--workers", max_workers],
)

Writing to: ../logs/qiime_tax_labels/Nachusa/Naive


0it [00:00, ?it/s]


### DNABERT (Naive)

In [32]:
# Model artifact for the DNABERT (Naive) evaluation; the next cell passes it
# to the evaluation script via --model-artifact.
# NOTE(review): the middle path component here is `dnabert-taxonomy-naive`,
# whereas the other DNABERT artifacts in this notebook use `dnabert-taxonomy`
# — confirm this is intended.
model_artifact = "sirdavidludwig/dnabert-taxonomy-naive/dnabert-taxonomy-naive-64d-150l:v0"

In [33]:
# Evaluate the DNABERT naive taxonomy model using whatever `model_artifact`
# currently holds.
evaluate(
    name="dnabert_naive",
    script="../scripts/taxonomy_eval_dnabert.py",
    datasets=datasets,
    synthetic_classifiers=synthetic_classifiers,
    args=["--model-artifact", model_artifact, *tf_arguments],
)

wandb:   4 of 4 files downloaded.  


Loading model...


  0%|          | 0/2100 [00:02<?, ?it/s]


### DNABERT (BERTax)

In [19]:
# Model artifact for the DNABERT (BERTax) evaluation.
# NOTE: `model_artifact` is reassigned in every model section, so run this
# cell immediately before the evaluate() call that follows it.
model_artifact = "sirdavidludwig/dnabert-taxonomy/dnabert-taxonomy-bertax-64d-150l:v0"

In [21]:
# Evaluate the DNABERT BERTax taxonomy model using whatever `model_artifact`
# currently holds.
evaluate(
    name="dnabert_bertax",
    script="../scripts/taxonomy_eval_dnabert.py",
    datasets=datasets,
    synthetic_classifiers=synthetic_classifiers,
    args=["--model-artifact", model_artifact, *tf_arguments],
)

wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.9


Loading model...


0it [00:02, ?it/s]


### DNABERT (Top-down)

In [15]:
# Model artifact for the DNABERT (Top-down) evaluation.
# NOTE: `model_artifact` is reassigned in every model section, so run this
# cell immediately before the evaluate() call that follows it.
model_artifact = "sirdavidludwig/dnabert-taxonomy/dnabert-taxonomy-topdown-64d-150l:v0"

In [18]:
# Evaluate the DNABERT top-down taxonomy model using whatever `model_artifact`
# currently holds.
evaluate(
    name="dnabert_topdown",
    script="../scripts/taxonomy_eval_dnabert.py",
    datasets=datasets,
    synthetic_classifiers=synthetic_classifiers,
    args=["--model-artifact", model_artifact, *tf_arguments],
)

wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:01, ?it/s]


### DNABERT (Deep Top-down)

In [29]:
# Model artifact for the DNABERT (Deep Top-down) evaluation.
# NOTE: `model_artifact` is reassigned in every model section, so run this
# cell immediately before the evaluate() call that follows it.
model_artifact = "sirdavidludwig/dnabert-taxonomy/dnabert-taxonomy-topdown-deep-64d-150l:v0"

In [30]:
# Evaluate the deep DNABERT top-down taxonomy model using whatever
# `model_artifact` currently holds.
evaluate(
    name="dnabert_topdown_deep",
    script="../scripts/taxonomy_eval_dnabert.py",
    datasets=datasets,
    synthetic_classifiers=synthetic_classifiers,
    args=["--model-artifact", model_artifact, *tf_arguments],
)

wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.3


Loading model...


  0%|          | 0/2100 [00:03<?, ?it/s]


### SetBERT Top-down (Single-sequence)

In [22]:
# Model artifact for the SetBERT top-down evaluation in single-sequence mode.
# NOTE: `model_artifact` is reassigned in every model section, so run this
# cell immediately before the evaluate() call that follows it.
model_artifact = "sirdavidludwig/setbert-taxonomy/setbert-taxonomy-topdown-all-64d-150l:v0"

In [24]:
# Evaluate the SetBERT top-down model with the --single-sequence flag, using
# whatever `model_artifact` currently holds.
evaluate(
    name="setbert_topdown_uniform",
    script="../scripts/taxonomy_eval_setbert.py",
    datasets=datasets,
    synthetic_classifiers=synthetic_classifiers,
    args=["--model-artifact", model_artifact, "--single-sequence", *tf_arguments],
)

wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


  1%|          | 24/2049 [00:20<25:36,  1.32it/s] 

KeyboardInterrupt: 

### SetBERT (Naive)

### SetBERT (BERTax)

N/A

### SetBERT (Top-down)

In [11]:
# The fine-tuned SetBERT top-down taxonomy model. This is the same artifact
# string as in the single-sequence section; it is reassigned here so that this
# section does not depend on which `model_artifact` cell was run last.
model_artifact = "sirdavidludwig/setbert-taxonomy/setbert-taxonomy-topdown-all-64d-150l:v0"

In [13]:
# Evaluate the SetBERT top-down taxonomy model using whatever `model_artifact`
# currently holds.
evaluate(
    name="setbert_topdown",
    script="../scripts/taxonomy_eval_setbert.py",
    datasets=datasets,
    synthetic_classifiers=synthetic_classifiers,
    args=["--model-artifact", model_artifact, *tf_arguments],
)

wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


0it [00:02, ?it/s]
