In [13]:
import sys
sys.path.append("../")

In [14]:
from collections import defaultdict
import gzip
import itertools
import joblib
import numpy as np
import os
from pathlib import Path
import subprocess
from typing import Iterable, Union

In [15]:
def qiime(*args: Union[str, Path]):
    """Invoke the ``qiime`` command-line tool and return what it printed.

    Args:
        *args: Command-line arguments placed after the ``qiime`` executable name.

    Returns:
        The value returned by ``subprocess.check_output`` for the assembled command.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    command = ["qiime"]
    command.extend(args)
    return subprocess.check_output(command)

## Find the Data Files

In [16]:
# Root directory of the dataset; the train/ and test/ splits sit directly beneath it.
data_path = Path("/tmp/GreengenesSilva/")
train_path = data_path.joinpath("train")
test_path = data_path.joinpath("test")

In [17]:
# Every entry directly inside train_path, in sorted order.
files = sorted(train_path.iterdir())
files

[PosixPath('/tmp/GreengenesSilva/train/0.fasta'),
 PosixPath('/tmp/GreengenesSilva/train/0.fasta.db'),
 PosixPath('/tmp/GreengenesSilva/train/0.sequences.qza'),
 PosixPath('/tmp/GreengenesSilva/train/0.taxonomy.qza'),
 PosixPath('/tmp/GreengenesSilva/train/0_taxonomy.tsv'),
 PosixPath('/tmp/GreengenesSilva/train/0_taxonomy.tsv.db')]

In [18]:
# Keep only the entries whose file name ends in ".fasta".
fasta_files = [entry for entry in files if entry.name.endswith(".fasta")]
fasta_files

[PosixPath('/tmp/GreengenesSilva/train/0.fasta')]

In [19]:
# Keep only the entries whose file name ends in "_taxonomy.tsv".
taxonomy_files = [entry for entry in files if entry.name.endswith("_taxonomy.tsv")]
taxonomy_files

[PosixPath('/tmp/GreengenesSilva/train/0_taxonomy.tsv')]

## Create the Qiime Artifacts

### Sequences Artifact

In [20]:
# Import source: the first FASTA file found; destination: the sequences artifact in train_path.
fasta_file = fasta_files[0]
sequences_artifact = train_path.joinpath("0.sequences.qza")

In [21]:
# Run `qiime tools import` on fasta_file, writing a FeatureData[Sequence] artifact
# to sequences_artifact. The cell displays the command's captured output.
qiime("tools", "import",
      "--input-path", fasta_file,
      "--output-path", sequences_artifact,
      "--type", "FeatureData[Sequence]"
)

b'Imported /tmp/GreengenesSilva/train/0.fasta as DNASequencesDirectoryFormat to /tmp/GreengenesSilva/train/0.sequences.qza\n'

### Taxonomy Artifact

In [22]:
# Import source: the first taxonomy TSV found; destination: the taxonomy artifact in train_path.
taxonomy_file = taxonomy_files[0]
taxonomy_artifact = train_path.joinpath("0.taxonomy.qza")

In [23]:
# Run `qiime tools import` on taxonomy_file (passed as HeaderlessTSVTaxonomyFormat),
# writing a FeatureData[Taxonomy] artifact to taxonomy_artifact.
qiime("tools", "import",
    "--input-path", taxonomy_file,
    "--output-path", taxonomy_artifact,
    "--input-format", "HeaderlessTSVTaxonomyFormat",
    "--type", "FeatureData[Taxonomy]"
)

b'Imported /tmp/GreengenesSilva/train/0_taxonomy.tsv as HeaderlessTSVTaxonomyFormat to /tmp/GreengenesSilva/train/0.taxonomy.qza\n'

In [4]:
# Fit a naive Bayes classifier from the reference reads and taxonomy artifacts and save it
# to /tmp/qiime-artifacts/classifier.qza.
# NOTE(review): these inputs live under /tmp/qiime-artifacts, whereas the import cells in this
# notebook write their artifacts under /tmp/GreengenesSilva/train — confirm how the files got here.
!qiime feature-classifier fit-classifier-naive-bayes \
    --i-reference-reads /tmp/qiime-artifacts/train.0.qza \
    --i-reference-taxonomy /tmp/qiime-artifacts/train.0.taxonomy.qza \
    --o-classifier /tmp/qiime-artifacts/classifier.qza

[32mSaved TaxonomicClassifier to: /tmp/qiime-artifacts/classifier.qza[0m
[0m

In [1]:
# Export the saved classifier artifact into the /tmp/qiime-artifacts/classifier directory.
!qiime tools export \
    --input-path /tmp/qiime-artifacts/classifier.qza \
    --output-path /tmp/qiime-artifacts/classifier

[32mExported /tmp/qiime-artifacts/classifier.qza as TaxonomicClassiferTemporaryPickleDirFmt to directory /tmp/qiime-artifactsclassifier[0m
[0m

In [193]:
# Total number of base symbols (complete plus incomplete).
# NOTE(review): BASES and INCOMPLETE_BASES are only assigned in a later cell of this notebook,
# so this cell fails with NameError when the notebook is run top to bottom on a fresh kernel.
len(BASES + INCOMPLETE_BASES)

15

---

In [45]:
# The four unambiguous bases, followed by the symbols used for incompletely specified bases.
BASES = "ACGT"
INCOMPLETE_BASES = "MRWSYKVHDBN" # https://iubmb.qmul.ac.uk/misc/naseq.html

# Character -> integer code (position in BASES + INCOMPLETE_BASES), and the inverse lookup.
BASE_MAP = dict(zip(BASES + INCOMPLETE_BASES, itertools.count()))
BASE_MAP_REV = dict(enumerate(BASES + INCOMPLETE_BASES))
# Incomplete symbol -> tuple of the bases it may stand for: the symbols are paired, in order,
# with every 2-, 3- and 4-element combination of BASES.
INCOMPLETE_BASE_MAP = dict(zip(
    INCOMPLETE_BASES,
    itertools.chain.from_iterable(itertools.combinations(BASES, size) for size in range(2, 5)),
))
# The same mapping expressed in integer codes.
ENC_INCOMPLETE_BASE_MAP = {
    BASE_MAP[symbol]: tuple(map(BASE_MAP.__getitem__, members))
    for symbol, members in INCOMPLETE_BASE_MAP.items()
}

In [185]:
class GreengenesFasta:
    """In-memory store of integer-encoded sequences read from a FASTA file.

    The file is read as consecutive pairs of lines: a header line whose text after the
    first character parses as an integer identifier, followed by one line of sequence
    characters. Files whose path ends in ".gz" are opened with gzip.
    """

    def __init__(self, path, id_to_label_map, expanded_label_map, tax_map):
        """Store the label lookups and load every sequence found at ``path``.

        Args:
            path: Location of the FASTA file (``str`` or ``pathlib.Path``).
            id_to_label_map: Lookup indexed with ``tax_map[identifier]`` while loading.
            expanded_label_map: Array-like reduced with ``np.max(..., axis=0)`` in ``num_labels``.
            tax_map: Lookup keyed by the integer sequence identifier.
        """
        self.id_to_label_map = id_to_label_map
        self.expanded_label_map = expanded_label_map
        self.tax_map = tax_map
        # identifier -> np.uint8 array of BASE_MAP codes
        self.sequences: dict = {}
        # identifier -> positions in the sequence whose code is >= 4
        self.augmentable_indices: dict = {}
        # one list of identifiers per distinct label, in first-seen order
        self.label_groups: list = []
        self.__load(path)

    def __load(self, path):
        """Read the file at ``path`` and populate sequences, augmentable_indices and label_groups.

        Blank lines where a header is expected are skipped.

        Raises:
            ValueError: If a header line does not contain an integer after its first character.
            KeyError: If a sequence character is missing from BASE_MAP, or an identifier
                is missing from ``tax_map`` (when ``tax_map`` is a dict).
        """
        self.sequences = {}
        self.augmentable_indices = {}
        label_groups = {}
        # str() so that pathlib.Path objects are accepted as well as plain strings.
        open_method = gzip.open if str(path).endswith(".gz") else open
        with open_method(path, 'rb') as f:
            lines = iter(f)
            for raw_header in lines:
                header = raw_header.decode()
                if not header.strip():
                    # A blank line (e.g. at the end of the file) is not a record header.
                    continue
                identifier = int(header[1:])
                sequence_text = next(lines, b"").decode().rstrip()
                encoded = np.array([BASE_MAP[c] for c in sequence_text], dtype=np.uint8)
                self.sequences[identifier] = encoded
                # Codes 0-3 are the first four BASE_MAP entries; anything from 4 upward is
                # recorded here so callers can find those positions later.
                self.augmentable_indices[identifier] = np.where(encoded >= 4)[0]

                label = self.id_to_label_map[self.tax_map[identifier]]
                label_groups.setdefault(label, []).append(identifier)
        self.label_groups = list(label_groups.values())

    def num_labels(self):
        """Return the column-wise maximum of ``expanded_label_map`` plus one."""
        return np.max(self.expanded_label_map, axis=0) + 1

    def random_sequence(self, rng):
        """Draw one stored sequence by first drawing a label group, then a member of it.

        Args:
            rng: Object providing ``integers`` and ``choice`` (e.g. a NumPy ``Generator``).

        Returns:
            Tuple ``(sequence, indices, label)``: the encoded sequence, its augmentable
            positions, and the entry looked up in ``expanded_label_map``.
        """
        label = rng.integers(len(self.label_groups))
        sequence_id = rng.choice(self.label_groups[label])
        sequence = self.sequences[sequence_id]
        indices = self.augmentable_indices[sequence_id]
        # NOTE(review): loading indexes id_to_label_map with tax_map[identifier], but this
        # lookup indexes it with the sequence identifier directly — confirm which is intended.
        label = self.expanded_label_map[self.id_to_label_map[sequence_id]]
        return sequence, indices, label

In [2]:
# Directory holding the classifier export and validation data used by the cells below.
DATA_PATH = "/tmp/qiime-artifacts"

### Load label data

In [206]:
def parse_taxonomy_file(taxonomy_file):
    """Parse a taxonomy file with one ``identifier<TAB>label`` record per line.

    The file may be gzip-compressed (detected from its leading magic bytes) or plain
    text. Blank lines are skipped.

    Args:
        taxonomy_file: Path of the taxonomy file (``str`` or path-like).

    Returns:
        A 4-tuple ``(label_to_id_map, id_to_label_map, tax_map, tax_groups)``:
            label_to_id_map: dict mapping each label string to an index assigned in
                first-seen order.
            id_to_label_map: list of label strings; position equals the label index.
            tax_map: dict mapping each integer identifier to its label index.
            tax_groups: ``defaultdict(set)`` mapping each label index to the set of
                identifiers carrying that label.

    Raises:
        ValueError: If a non-blank line does not split into exactly two tab-separated
            fields, or its identifier is not an integer.
    """
    label_to_id_map = {}
    id_to_label_map = []
    tax_map = {}
    tax_groups = defaultdict(set)
    # Sniff the two-byte gzip signature so uncompressed files can be read as well.
    with open(taxonomy_file, 'rb') as probe:
        is_gzip = probe.read(2) == b"\x1f\x8b"
    open_method = gzip.open if is_gzip else open
    with open_method(taxonomy_file, 'rb') as f:
        for line in f:
            record = line.decode().rstrip()
            if not record:
                # A blank line carries no record; unpacking it below would raise.
                continue
            identifier, label = record.split('\t')
            identifier = int(identifier)
            if label not in label_to_id_map:
                id_to_label_map.append(label)
                label_to_id_map[label] = len(label_to_id_map)
            label_id = label_to_id_map[label]
            tax_map[identifier] = label_id
            tax_groups[label_id].add(identifier)
    return label_to_id_map, id_to_label_map, tax_map, tax_groups

In [207]:
# Parse the full taxonomy file into the label lookups used by the cells below.
# NOTE(review): joined with DATA_PATH = "/tmp/qiime-artifacts", the "../../" prefix points
# two directories above DATA_PATH — confirm that is where the taxonomy file lives.
label_to_id_map, id_to_label_map, tax_map, tax_groups = parse_taxonomy_file(os.path.join(DATA_PATH, "../../gg_13_5_taxonomy.txt.gz"))

### Load Sequences into Memory

In [208]:
# Load and encode every validation sequence from val.fasta.
# NOTE(review): expanded_label_map is not assigned in any visible cell of this notebook;
# a fresh top-to-bottom run would raise NameError here — confirm where it should come from.
val_data = GreengenesFasta(os.path.join(DATA_PATH, "val.fasta"), id_to_label_map, expanded_label_map, tax_map)

### Load Model

In [7]:
# Prerequisite: the exported classifier archive must already be extracted so that
# classifier/sklearn_pipeline.pkl exists under DATA_PATH.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load classifiers from a trusted source.
model = joblib.load(os.path.join(DATA_PATH, "classifier/sklearn_pipeline.pkl"))

In [15]:
# Read the test split's TSV into a dict: first tab-separated column -> second column.
# NOTE(review): label_map is reassigned to an empty dict in a later cell of this notebook,
# so this mapping appears to be unused afterwards — confirm whether the cell is still needed.
label_map = {}
with open("/tmp/silva-splits/test.0.tsv") as f:
    for line in f:
        # Each line must contain exactly one tab; otherwise the unpacking raises ValueError.
        identifier, label = line.rstrip().split('\t')
        label_map[identifier] = label

In [None]:
# Project-local import; presumably resolved through the "../" entry appended to sys.path
# at the top of the notebook. Nothing else happens in this cell: opening the test FASTA
# here without keeping or closing the handle would only leak file descriptors.
from common.datasets import fasta_with_taxonomy_iterator

In [210]:
# One name -> index vocabulary per taxonomy level (six levels); the empty name maps to -1.
level_maps = [{'': -1} for _ in range(6)]
# Full label string -> position in model.classes_.
label_map = {}
expanded_labels = []
for label in model.classes_:
    label_map[label] = len(label_map)
    # Pre-fill with -1 so that levels absent from a shorter label are not left holding
    # uninitialized memory (zip below stops at the shorter of the two sequences).
    expanded_label = np.full(len(level_maps), -1, dtype=int)
    for i, (level_map, part) in enumerate(zip(level_maps, label.split('; '))):
        part = part[3:]  # drop the first three characters (presumably a rank prefix such as "k__")
        if part not in level_map:
            # The '' -> -1 entry occupies one slot, so new names are numbered from 0.
            level_map[part] = len(level_map) - 1
        expanded_label[i] = level_map[part]
    expanded_labels.append(expanded_label)
# Row r holds the per-level indices of model.classes_[r].
expanded_labels = np.array(expanded_labels)

In [211]:
# Per-level label indices for every validation sequence, using the vocabularies in level_maps.
val_expanded_labels = []
for identifier in val_data.sequences:
    label = id_to_label_map[tax_map[identifier]]
    # Pre-fill with -1 so that levels absent from a shorter label are not left holding
    # uninitialized memory (zip below stops at the shorter of the two sequences).
    expanded_label = np.full(len(level_maps), -1, dtype=int)
    for i, (level_map, part) in enumerate(zip(level_maps, label.split('; '))):
        part = part[3:]  # drop the first three characters (presumably a rank prefix such as "k__")
        # Names missing from the level vocabulary are encoded as -1.
        expanded_label[i] = level_map[part] if part in level_map else -1
    val_expanded_labels.append(expanded_label)
val_expanded_labels = np.array(val_expanded_labels)

In [212]:
import bisect

In [213]:
rng = np.random.default_rng(1)

# Length of the window cut out of every validation sequence.
read_length = 150

sequences = []
y_true = []
for identifier, sequence in val_data.sequences.items():
    indices = val_data.augmentable_indices[identifier]
    # Random window start. The max(..., 0) clamp makes sequences shorter than the window
    # start at 0 (and be used whole) instead of raising inside rng.integers.
    offset = rng.integers(max(len(sequence) - read_length, 0) + 1)
    # First augmentable position at or after the window start (indices is sorted ascending).
    i = bisect.bisect_left(indices, offset)
    augmented_sequence = sequence[offset:offset+read_length].copy() # allow write
    # Replace every incomplete base inside the window with one of the bases it may stand for.
    while i < len(indices) and indices[i] - offset < len(augmented_sequence):
        bases = ENC_INCOMPLETE_BASE_MAP[augmented_sequence[indices[i] - offset]]
        augmented_sequence[indices[i] - offset] = rng.choice(bases)
        i += 1
    # label
    label = id_to_label_map[tax_map[identifier]]
    # Pre-fill with -1 so that levels absent from a shorter label are not left holding
    # uninitialized memory (zip below stops at the shorter of the two sequences).
    expanded_label = np.full(len(level_maps), -1, dtype=int)
    for i, (level_map, part) in enumerate(zip(level_maps, label.split('; '))):
        part = part[3:]  # drop the first three characters (presumably a rank prefix such as "k__")
        # Names missing from the level vocabulary are encoded as -1.
        expanded_label[i] = level_map[part] if part in level_map else -1
    # Decode the integer codes back into a string of base characters.
    sequences.append("".join([BASE_MAP_REV[c] for c in augmented_sequence]))
    y_true.append(expanded_label)
sequences = np.array(sequences)
# Transposed so that each row holds one taxonomy level across all sequences.
y_true = np.array(y_true).T

In [214]:
# Quick check: run the classifier on just the first sampled read.
model.predict(sequences[:1])

In [215]:
# For each read, the column index with the highest predicted probability.
# NOTE(review): presumably the probability columns follow model.classes_ order, which is
# what the expanded_labels rows are built from — confirm.
predictions = np.argmax(model.predict_proba(sequences), axis=1)

In [216]:
# Per-level label indices of the predicted class for every read (one row per read).
expanded_labels[predictions]

array([[  1,  20, 108, 148,  -1,  -1],
       [  1,  29, 119, 166, 231, 409],
       [  1,   6,  34,  37,  -1,  -1],
       ...,
       [  1,  13,  67,  71, 126, 201],
       [  1,  29, 120, 169, 245, 467],
       [  1,  -1,  -1,  -1,  -1,  -1]])

In [217]:
# Accuracy per taxonomy level, computed only over the reads whose true index at that
# level is not -1.
accuracies = []
for level_true, level_pred in zip(y_true, expanded_labels[predictions].T):
    indices = np.nonzero(level_true != -1)
    accuracies.append((level_true[indices] == level_pred[indices]).mean())

In [220]:
# Print a heading for this run, then display the per-level accuracies as the cell output.
print("Qiime Classifier Seed 0")
accuracies

Qiime Classifier Seed 0


[0.9995722806155991,
 0.952974899594413,
 0.9302736381039162,
 0.8948988348681388,
 0.795334428494946,
 0.7338880718893873]