# Exploring the set of L-asparaginase k-mers

In [1]:
import os, sys
import time
import json
import numpy as np
import matplotlib.pyplot as plt
from numpy.typing import NDArray
from Bio import SeqIO
import tqdm.notebook as tqdm

from humprot.helpers import sym2int
from humprot.helpers import get_kmers_from_sequences
from humprot.helpers import count_kmers_in_seqs, build_i2s_s2i_maps
from humprot.trees import MultiKmerTree
from humprot.constants import AA_LIST_STD20

In [2]:
# Directory layout. The paths are relative, so they resolve against the
# kernel's current working directory.
DATDIR = "../data"
OUTDIR = "../out/asparaginase_kmer_exploration"
IMGDIR = os.path.join(OUTDIR, "images")

# Create the output locations up front; exist_ok makes re-running the cell safe.
for _outdir in (OUTDIR, IMGDIR):
    os.makedirs(_outdir, exist_ok=True)

# Input files consumed by the cells below.
mktree_fpath = os.path.join(DATDIR, "asp_trees", "asp_P0A962_mktree_25.npz")
fasta_fpath = os.path.join(DATDIR, "asp_seqs", "P0A962.fasta")

# Fail fast, naming the first missing input, rather than erroring later
# in the middle of loading.
for _required_fpath in (mktree_fpath, fasta_fpath):
    if not os.path.isfile(_required_fpath):
        raise FileNotFoundError(_required_fpath)


In [3]:
# Alphabet used to encode sequences in this notebook.
# NOTE(review): AA_LIST_STD20 is presumably the 20 standard amino acids — confirm in humprot.constants.
AA_LIST = AA_LIST_STD20
# The mask value is set to the alphabet length.
# NOTE(review): presumably the first integer not used by a real symbol — confirm against build_i2s_s2i_maps.
MASK = len(AA_LIST)

# Lookup tables between symbols and integers; sym2int_map is what the loading cell passes to sym2int.
int2sym_map, sym2int_map = build_i2s_s2i_maps(AA_LIST, mask=MASK)

In [4]:
# Load the previously saved MultiKmerTree from the .npz file checked for above.
# NOTE(review): how the tree was built (k range, source sequences) is not visible here — confirm upstream.
tree = MultiKmerTree.load(mktree_fpath)
print(f"Loaded tree from file: {mktree_fpath}")

Loaded tree from file: ../data/asp_trees/asp_P0A962_mktree_25.npz


In [5]:
# Read every FASTA record and store its encoded sequence under the record id.
# A later record with the same id replaces the earlier one.
id2seq = {}
for record in SeqIO.parse(fasta_fpath, "fasta"):
    # sym2int converts the record's residues using sym2int_map
    # (presumably to integer codes — confirm in humprot.helpers).
    seqid, seq = record.id, sym2int(record.seq, sym2int_map)
    id2seq[seqid] = seq

print(f"Loaded {len(id2seq)} protein sequences")

Loaded 1 protein sequences


In [23]:
# Run map_bfs with a callback that ignores its argument and returns None,
# then report the size of what comes back (the recorded run printed 873).
# NOTE(review): presumably one entry per node visited breadth-first down to
# depth 4 — confirm against MultiKmerTree.map_bfs.
x = tree.map_bfs(lambda _node: None, depth=4)
print(len(x))

873
