In [13]:
import pandas as pd
from pathlib import Path

src = Path("blast/blast.accession")
out = Path("../tables/accession_taxid.parquet")  # put in repo if it's small enough

# Read the tab-separated accession table; column 0 becomes the index.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed inferred types.
ia = pd.read_table(
    src,
    index_col=0,
    names=["name", "length", "taxid"],
    low_memory=False,
)

# clean accession index: drop missing labels, force str, keep first of duplicates.
# Each filter takes an explicit copy so the column assignments below write to an
# independent frame rather than to a slice of the previous one.
ia = ia.loc[~ia.index.isna()].copy()
ia.index = ia.index.astype(str)
ia = ia.loc[~ia.index.duplicated(keep="first")].copy()

# normalize taxid to first component (text before the first ";")
ia["taxid1"] = ia["taxid"].astype(str).str.split(";").str[0]
# keep only rows whose first component is purely digits; missing taxids become
# the string "nan" after astype(str) and are therefore dropped here
ia = ia.loc[ia["taxid1"].str.fullmatch(r"\d+")].copy()
ia["taxid1"] = ia["taxid1"].astype("int32")

acc = ia[["name", "taxid1", "taxid"]].copy()
# make sure the destination directory exists before writing
out.parent.mkdir(parents=True, exist_ok=True)
acc.to_parquet(out)

print("saved", out, "rows:", acc.shape[0])


  ia = pd.read_table(


saved ../tables/accession_taxid.parquet rows: 1962798


# Build domain results table

In [None]:
%load_ext autoreload
%autoreload 2
import pickle, gzip
import sys
# Append the current working directory to the module search path before the
# import below; presumably run_modelling_cleaned.py sits next to this notebook
# (TODO confirm). "./" is relative, so it follows the kernel's working directory.
sys.path.append("./")

# Project-local helper module, used in later cells through the `rm` alias.
import run_modelling_cleaned as rm


In [None]:

# Reload the accession -> taxid table from the parquet file written earlier,
# so this section can start from the cached table.
ia = pd.read_parquet("../tables/accession_taxid.parquet")

print("accession_taxid rows:", len(ia))

accession_taxid rows: 1962798


In [None]:
DOMAINS = ["bacteria", "archaea", "viruses"]
RANKS = ["species", "genus", "family", "order", "class", "phylum"]

# domain name -> {"counts_taxid", "meta", "rank_tables"}
domain_results = {}

for DOMAIN in DOMAINS:
    counts_taxid, meta, use_files = rm.build_counts_for_domain(DOMAIN, ia)

    # short per-domain report
    print(f"\n=== {DOMAIN} ===")
    print("Files used:", len(use_files))
    print("Taxid matrix:", counts_taxid.shape)
    print(meta["group"].value_counts())

    # taxid-level counts go to a tab-separated file in the working directory
    taxid_path = f"just_{DOMAIN}_counts.taxid.txt"
    counts_taxid.to_csv(taxid_path, sep="\t")

    # one collapsed table per rank, written to disk and kept in memory
    rank_tables = {}
    for rank in RANKS:
        xr = rm.collapse_to_rank(counts_taxid, rank)
        rank_path = f"just_{DOMAIN}_counts_{rank}.txt"
        xr.to_csv(rank_path, sep="\t")
        rank_tables[rank] = xr
        print(rank, xr.shape)

    domain_results[DOMAIN] = dict(
        counts_taxid=counts_taxid,
        meta=meta,
        rank_tables=rank_tables,
    )


In [None]:


def save_domain_results_pickle(domain_results, out_path="domain_results.pkl.gz"):
    """Write ``domain_results`` to ``out_path`` as a gzip-compressed pickle.

    The directory part of ``out_path`` is created if it does not exist; a bare
    file name is written to the current working directory.

    Parameters
    ----------
    domain_results
        Any picklable object (in this notebook, the per-domain results dict).
    out_path
        Destination file path.

    Notes
    -----
    The file is a pickle: only load it back from a trusted location.
    """
    # Imported here because the notebook's import cells do not import ``os``;
    # without it the makedirs call below raises NameError on a fresh kernel.
    import os

    # dirname is "" for a bare file name, so fall back to "."
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
    with gzip.open(out_path, "wb") as f:
        pickle.dump(domain_results, f, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the in-memory per-domain results under ../tables.
save_domain_results_pickle(
    domain_results,
    out_path="../tables/domain_results.pkl.gz",
)