# Taxonomy Evaluation

This notebook evaluates all of the trained taxonomy models, producing the plots for the manuscript.

In [1]:
from dnadb import fasta, sample, taxonomy
from pathlib import Path
import subprocess
import wandb

In [2]:
HOME = Path.home()

In [3]:
# The path to the synthetic datasets
synthetic_root = HOME / "work/Datasets/Synthetic"

In [4]:
output_root = Path("../logs/taxonomy_classification")
output_root.mkdir(exist_ok=True, parents=True)

In [5]:
# The synthetic datasets to use
datasets = ["Nachusa", "Hopland", "Wetland", "SFD"]

In [6]:
synthetic_classifiers = ["Naive", "Bertax", "Topdown"]

---

## Predictions

In [27]:
# Used to enable/disable evaluation cells in this notebook.
# This is primarily used as a safety precaution to prevent
# overwriting data.
COMPUTE_PREDICTIONS = True
# When True, FASTA files that already have a matching `.tax.tsv` entry in the
# output directory are left out of the run (see fastas_to_process).
SKIP_EXISTING = True

In [8]:
# Extra command-line arguments appended to the DNABERT / SetBERT evaluation
# calls; evaluate() converts every argument to a string before launching.
tf_arguments = ["--gpu-ids", 0]

In [9]:
def fastas_to_process(input_path, output_path, skip_existing):
    """
    Find FASTA files to process.

    Yields every entry directly inside ``input_path`` whose suffix is
    ``.fasta``. When ``skip_existing`` is true, a FASTA file is left out if
    ``output_path`` already contains an entry named like the FASTA file with
    its suffix replaced by ``.tax.tsv`` (``foo.fasta`` -> ``foo.tax.tsv``).

    Args:
        input_path: Directory to scan (not recursive). A ``str`` or ``Path``.
        output_path: Directory holding previously written ``.tax.tsv`` files.
            A ``str`` or ``Path``. It may not exist yet, in which case no
            FASTA file is considered already processed.
        skip_existing: Whether to leave out FASTA files that already have a
            matching ``.tax.tsv`` entry in ``output_path``.

    Yields:
        pathlib.Path: The FASTA files still to process, in sorted path order
        so the result is reproducible between runs.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
    """
    input_path, output_path = Path(input_path), Path(output_path)
    existing_tax_files = set()
    # A missing output directory just means nothing has been written yet;
    # calling iterdir() on it would raise FileNotFoundError.
    if skip_existing and output_path.is_dir():
        existing_tax_files = {f.name for f in output_path.iterdir()}
    # iterdir() yields entries in arbitrary order, so sort for determinism.
    for f in sorted(input_path.iterdir()):
        if f.suffix != ".fasta":
            continue
        # existing_tax_files stays empty when skip_existing is false, so this
        # check then never filters anything out.
        if f.with_suffix(".tax.tsv").name in existing_tax_files:
            continue
        yield f

In [33]:
def evaluate(name, script, datasets, synthetic_classifiers, python_path="python3", args=None):
    """
    Run an evaluation script over every dataset / synthetic-type combination.

    For each pair, FASTA files are looked up in
    ``synthetic_root / dataset / synthetic_type / "test"`` and the output
    directory ``output_root / name / dataset / synthetic_type`` is created if
    needed. The script is launched as
    ``python_path script output_path *args`` and receives the FASTA paths to
    process on stdin, one per line. Pairs with no FASTA files left to process
    (see ``SKIP_EXISTING``) are reported as skipped.

    A non-zero exit status from the script is reported with a printed
    message; the remaining combinations are still run.

    Args:
        name: Sub-directory of ``output_root`` that receives this run's output.
        script: Path of the evaluation script to execute.
        datasets: Iterable of dataset names.
        synthetic_classifiers: Iterable of synthetic-type names.
        python_path: Interpreter used to run ``script``.
        args: Optional extra command-line arguments; each one is converted
            with ``str()`` before the script is launched.

    Raises:
        AssertionError: If ``COMPUTE_PREDICTIONS`` is false or ``output_root``
            is not an existing directory.
    """
    assert COMPUTE_PREDICTIONS, "Evaluation disabled."
    assert output_root.is_dir(), "Output directory does not exist."
    extra_args = list(args) if args else []
    for dataset in datasets:
        for synthetic_type in synthetic_classifiers:
            path = synthetic_root / dataset / synthetic_type / "test"
            output_path = output_root / name / dataset / synthetic_type
            output_path.mkdir(exist_ok=True, parents=True)
            fastas = list(fastas_to_process(path, output_path, SKIP_EXISTING))
            if not fastas:
                print("Skipping:", dataset, synthetic_type)
                continue
            # Build a concrete argv list (not a one-shot iterator) so it can
            # be inspected or reused; Paths and ints are stringified here.
            command = [str(part) for part in (python_path, script, output_path, *extra_args)]
            result = subprocess.run(
                command,
                universal_newlines=True,
                input='\n'.join(map(str, fastas))
            )
            # Surface failures instead of silently moving on; keep going so a
            # single bad combination does not abort a long batch.
            if result.returncode != 0:
                print(
                    "Failed:", dataset, synthetic_type,
                    "(exit status {})".format(result.returncode)
                )

### Qiime2

In [11]:
q2_classifier_path = HOME / "work/qiime-classifier/classifier"

In [12]:
max_workers = 48

In [13]:
# Run the Qiime2 evaluation script with the interpreter from the
# qiime2-2022.8 conda environment instead of the default "python3"
# (presumably because Qiime2 is only installed in that environment -- confirm).
# Output goes beneath output_root / "q2_naive_bayes" / <dataset> / <synthetic type>.
evaluate(
    "q2_naive_bayes",
    "../scripts/taxonomy_eval_qiime.py",
    datasets,
    synthetic_classifiers,
    python_path="/opt/conda/envs/qiime2-2022.8/bin/python3",
    args=[
        "--qiime-classifier-path", q2_classifier_path,
        "--workers", max_workers
    ]
)

Writing to: ../logs/taxonomy_classification/q2_naive_bayes/Nachusa/Naive


100%|██████████| 2100/2100 [11:35<00:00,  3.02it/s]


Writing to: ../logs/taxonomy_classification/q2_naive_bayes/Nachusa/Bertax


100%|██████████| 2100/2100 [11:31<00:00,  3.04it/s]


Writing to: ../logs/taxonomy_classification/q2_naive_bayes/Nachusa/Topdown


100%|██████████| 2100/2100 [11:31<00:00,  3.04it/s]


Writing to: ../logs/taxonomy_classification/q2_naive_bayes/Hopland/Naive


100%|██████████| 1280/1280 [07:08<00:00,  2.99it/s]


Writing to: ../logs/taxonomy_classification/q2_naive_bayes/Hopland/Bertax


100%|██████████| 1280/1280 [07:07<00:00,  2.99it/s]


Writing to: ../logs/taxonomy_classification/q2_naive_bayes/Hopland/Topdown


100%|██████████| 1280/1280 [07:07<00:00,  2.99it/s]


Writing to: ../logs/taxonomy_classification/q2_naive_bayes/Wetland/Naive


100%|██████████| 7680/7680 [41:42<00:00,  3.07it/s]  


Writing to: ../logs/taxonomy_classification/q2_naive_bayes/Wetland/Bertax


100%|██████████| 7680/7680 [41:37<00:00,  3.07it/s]  


Writing to: ../logs/taxonomy_classification/q2_naive_bayes/Wetland/Topdown


100%|██████████| 7680/7680 [41:38<00:00,  3.07it/s]  


Writing to: ../logs/taxonomy_classification/q2_naive_bayes/SFD/Naive


100%|██████████| 8870/8870 [48:04<00:00,  3.08it/s]  


Writing to: ../logs/taxonomy_classification/q2_naive_bayes/SFD/Bertax


100%|██████████| 8870/8870 [48:06<00:00,  3.07it/s]  


Writing to: ../logs/taxonomy_classification/q2_naive_bayes/SFD/Topdown


100%|██████████| 8870/8870 [47:59<00:00,  3.08it/s]  


### DNABERT (Naive)

In [23]:
model_artifact = "sirdavidludwig/dnabert-taxonomy-naive/dnabert-taxonomy-naive-64d-150l:v0"

In [24]:
# Output goes beneath output_root / "dnabert_naive" / <dataset> / <synthetic type>.
# NOTE: `model_artifact` is reassigned in several sections of this notebook;
# run the cell directly above first so the intended artifact is evaluated.
evaluate(
    "dnabert_naive",
    "../scripts/taxonomy_eval_dnabert.py",
    datasets,
    synthetic_classifiers,
    args=[
        "--model-artifact", model_artifact,
    ] + tf_arguments
)

wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:00, ?it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:00, ?it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:00, ?it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:00, ?it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:00, ?it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:00, ?it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:00, ?it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:00, ?it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:00, ?it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:00, ?it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


0it [00:00, ?it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


100%|██████████| 3926/3926 [25:06<00:00,  2.61it/s]


### DNABERT (BERTax)

In [20]:
model_artifact = "sirdavidludwig/dnabert-taxonomy/dnabert-taxonomy-bertax-64d-150l:v0"

In [21]:
# Output goes beneath output_root / "dnabert_bertax" / <dataset> / <synthetic type>.
# NOTE: `model_artifact` is reassigned in several sections of this notebook;
# run the cell directly above first so the intended artifact is evaluated.
evaluate(
    "dnabert_bertax",
    "../scripts/taxonomy_eval_dnabert.py",
    datasets,
    synthetic_classifiers,
    args=[
        "--model-artifact", model_artifact,
    ] + tf_arguments
)

wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.9


Loading model...


100%|██████████| 2100/2100 [15:45<00:00,  2.22it/s]
wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.8


Loading model...


100%|██████████| 2100/2100 [15:48<00:00,  2.21it/s]
wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:2.0


Loading model...


100%|██████████| 2100/2100 [15:46<00:00,  2.22it/s]
wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.8


Loading model...


100%|██████████| 1280/1280 [09:37<00:00,  2.22it/s]
wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.9


Loading model...


100%|██████████| 1280/1280 [09:38<00:00,  2.21it/s]
wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.8


Loading model...


100%|██████████| 1280/1280 [09:41<00:00,  2.20it/s]
wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.8


Loading model...


100%|██████████| 7680/7680 [57:42<00:00,  2.22it/s] 
wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.8


Loading model...


100%|██████████| 7680/7680 [57:53<00:00,  2.21it/s] 
wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.8


Loading model...


100%|██████████| 7680/7680 [57:38<00:00,  2.22it/s] 
wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.8


Loading model...


100%|██████████| 8870/8870 [1:07:17<00:00,  2.20it/s]
wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.8


Loading model...


100%|██████████| 8870/8870 [1:07:03<00:00,  2.20it/s]
wandb: Downloading large artifact dnabert-taxonomy-bertax-64d-150l:v0, 1232.67MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:1.9


Loading model...


100%|██████████| 8870/8870 [1:07:10<00:00,  2.20it/s]


### DNABERT (Top-down)

In [34]:
model_artifact = "sirdavidludwig/dnabert-taxonomy/dnabert-taxonomy-topdown-64d-150l:v0"

In [None]:
# Output goes beneath output_root / "dnabert_topdown" / <dataset> / <synthetic type>.
# Combinations with no FASTA files left to process (SKIP_EXISTING) are only
# reported as "Skipping: <dataset> <synthetic type>" and launch no subprocess.
# NOTE: `model_artifact` is reassigned in several sections of this notebook;
# run the cell directly above first so the intended artifact is evaluated.
evaluate(
    "dnabert_topdown",
    "../scripts/taxonomy_eval_dnabert.py",
    datasets,
    synthetic_classifiers,
    args=[
        "--model-artifact", model_artifact,
    ] + tf_arguments
)

Skipping: Nachusa Naive
Skipping: Nachusa Bertax
Skipping: Nachusa Topdown
Skipping: Hopland Naive
Skipping: Hopland Bertax
Skipping: Hopland Topdown
Skipping: Wetland Naive
Skipping: Wetland Bertax
Skipping: Wetland Topdown
Skipping: SFD Naive


wandb:   4 of 4 files downloaded.  


Loading model...


100%|██████████| 2237/2237 [14:22<00:00,  2.59it/s]
wandb:   4 of 4 files downloaded.  


Loading model...


 20%|█▉        | 1739/8870 [11:10<45:34,  2.61it/s]

### DNABERT (Deep Top-down)

In [18]:
model_artifact = "sirdavidludwig/dnabert-taxonomy/dnabert-taxonomy-topdown-deep-64d-150l:v0"

In [19]:
# Output goes beneath output_root / "dnabert_topdown_deep" / <dataset> / <synthetic type>.
# NOTE: `model_artifact` is reassigned in several sections of this notebook;
# run the cell directly above first so the intended artifact is evaluated.
evaluate(
    "dnabert_topdown_deep",
    "../scripts/taxonomy_eval_dnabert.py",
    datasets,
    synthetic_classifiers,
    args=[
        "--model-artifact", model_artifact,
    ] + tf_arguments
)

wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 2100/2100 [23:00<00:00,  1.52it/s]
wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 2100/2100 [23:00<00:00,  1.52it/s]
wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.3


Loading model...


100%|██████████| 2100/2100 [22:59<00:00,  1.52it/s]
wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 1280/1280 [14:02<00:00,  1.52it/s]
wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 1280/1280 [13:59<00:00,  1.53it/s]
wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 1280/1280 [14:04<00:00,  1.52it/s]
wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 7680/7680 [1:25:45<00:00,  1.49it/s]
wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 7680/7680 [1:25:41<00:00,  1.49it/s]
wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 7680/7680 [1:25:38<00:00,  1.49it/s]
wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 8870/8870 [1:39:44<00:00,  1.48it/s]
wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 8870/8870 [1:39:02<00:00,  1.49it/s]
wandb: Downloading large artifact dnabert-taxonomy-topdown-deep-64d-150l:v0, 56.39MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 8870/8870 [1:39:15<00:00,  1.49it/s]


### SetBERT Top-down (Single-sequence)

In [16]:
model_artifact = "sirdavidludwig/setbert-taxonomy/setbert-taxonomy-topdown-all-64d-150l:v0"

In [17]:
# Output goes beneath output_root / "setbert_topdown_uniform" / <dataset> / <synthetic type>.
# The artifact assigned in the cell above is the same string as the one used
# in the "SetBERT (Top-down)" section; this call differs in its output name
# and in passing "--single-sequence" (the flag's meaning is defined by
# taxonomy_eval_setbert.py, which is not shown in this notebook).
# NOTE: `model_artifact` is reassigned in several sections of this notebook;
# run the cell directly above first so the intended artifact is evaluated.
evaluate(
    "setbert_topdown_uniform",
    "../scripts/taxonomy_eval_setbert.py",
    datasets,
    synthetic_classifiers,
    args=[
        "--model-artifact", model_artifact,
        "--single-sequence"
    ] + tf_arguments
)

wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.3


Loading model...


100%|██████████| 2100/2100 [12:34<00:00,  2.78it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 2100/2100 [12:32<00:00,  2.79it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 2100/2100 [12:32<00:00,  2.79it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.3


Loading model...


100%|██████████| 1280/1280 [07:39<00:00,  2.78it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 1280/1280 [07:39<00:00,  2.78it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 1280/1280 [07:39<00:00,  2.78it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 7680/7680 [45:54<00:00,  2.79it/s]  
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 7680/7680 [46:05<00:00,  2.78it/s]  
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 7680/7680 [45:56<00:00,  2.79it/s]  
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.3


Loading model...


100%|██████████| 8870/8870 [53:13<00:00,  2.78it/s]  
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 8870/8870 [53:06<00:00,  2.78it/s]  
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.3


Loading model...


100%|██████████| 8870/8870 [53:05<00:00,  2.78it/s]  


### SetBERT (Naive)

### SetBERT (BERTax)

N/A

### SetBERT (Top-down)

In [14]:
# The fine-tuned SetBERT top-down taxonomy model
model_artifact = "sirdavidludwig/setbert-taxonomy/setbert-taxonomy-topdown-all-64d-150l:v0"

In [15]:
# Output goes beneath output_root / "setbert_topdown" / <dataset> / <synthetic type>.
# NOTE: `model_artifact` is reassigned in several sections of this notebook;
# run the cell directly above first so the intended artifact is evaluated.
evaluate(
    "setbert_topdown",
    "../scripts/taxonomy_eval_setbert.py",
    datasets,
    synthetic_classifiers,
    args=[
        "--model-artifact", model_artifact,
    ] + tf_arguments
)

wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.3


Loading model...


100%|██████████| 2100/2100 [12:35<00:00,  2.78it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.5


Loading model...


100%|██████████| 2100/2100 [12:30<00:00,  2.80it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 2100/2100 [12:29<00:00,  2.80it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 1280/1280 [07:42<00:00,  2.77it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 1280/1280 [07:38<00:00,  2.79it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.3


Loading model...


100%|██████████| 1280/1280 [07:39<00:00,  2.79it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 7680/7680 [45:48<00:00,  2.79it/s]  
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.3


Loading model...


100%|██████████| 7680/7680 [45:50<00:00,  2.79it/s]  
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.3


Loading model...


100%|██████████| 7680/7680 [45:44<00:00,  2.80it/s]
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.2


Loading model...


100%|██████████| 8870/8870 [52:44<00:00,  2.80it/s]  
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.4


Loading model...


100%|██████████| 8870/8870 [53:03<00:00,  2.79it/s]  
wandb: Downloading large artifact setbert-taxonomy-topdown-all-64d-150l:v0, 90.01MB. 4 files... 
wandb:   4 of 4 files downloaded.  
Done. 0:0:0.3


Loading model...


100%|██████████| 8870/8870 [52:46<00:00,  2.80it/s]  
