# testing pyhmmer work

In [1]:
# system dependecies
import logging
import os
import sys
import tempfile
import time
from typing import Union

# library dependencies
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import duckdb as ddb
from joblib import Parallel, delayed
import pandas as pd
import pyhmmer

# local dependencies
import learn2therm.utils

In [12]:
HMM_PATH = '../data/HMM/'  # ./Pfam-A.hmm
PRESS_PATH = './data/HMM'

## testing ground for loading funciton

In [4]:
# stolen from Evan's t1.0 scripts
with tempfile.TemporaryDirectory(dir='./') as tmpdir:
    # Establishing a connection with Duck DB
    conn = ddb.connect(tmpdir+'proteins.db', read_only=False)
    # Making a SQL table
    conn.execute("CREATE TABLE proteins AS SELECT * FROM read_parquet('../data/proteins/uniprot_chunk_0.parquet')")
    # Committing DB
    conn.commit()

     # get some test proteins to run resource test alignment on
    # considering the max protein length
    query_proteins = conn.execute(
        f"""SELECT pid, protein_seq, LENGTH(protein_seq) AS len FROM proteins 
        WHERE len<=250
        ORDER BY RANDOM()
        LIMIT 1000
        """).df()
    subject_proteins = conn.execute(
        f"""SELECT pid, protein_seq, LENGTH(protein_seq) AS len FROM proteins 
        WHERE len<=250
        ORDER BY RANDOM()
        LIMIT 1000
        """).df()

    # get some metadata about taxa protein join
    # protein total counts per organism was tracked in s0.3
    # but lets recompute that data considering a max protien length
    protein_counts = conn.execute(
        f"""SELECT taxid, COUNT(*) AS n_proteins
        FROM proteins
        WHERE LENGTH(protein_seq)<=250
        GROUP BY taxid""").df()

In [5]:
query_proteins

Unnamed: 0,pid,protein_seq,len
0,A0A1M4MKW1,MPDALIALLKDWHSVPVATTGADGMPNVAAKSVMVRDPETIVWGEL...,137
1,A0A495QVR1,MSTIRVTDEVKERLRDLKRDDESFNDLLDRLSRSEKDVEEMAGFLS...,74
2,A0A830EDH1,MNLGFGLLLFAMVPFLLDRPVLLVAVGGHIVAVAVAVVSLAAMATL...,66
3,L0AN55,MDPNDVEERIESELEDAEATVTHARDEHDDDHLAATVVSPAFEGLS...,81
4,O57991,MREVKLVTFDVWNTLLDLNIMLDEFSHQLAKISGLHIKDVANAVIE...,235
...,...,...,...
995,A0A830G254,MKEYERKGLLERIGRESATVGATIPDEIALDGEPFPLREFVFETSG...,161
996,L0JL80,MTQNPFTAVFDAQRTALEQSQSFAHEALEAQQTSISAIADVVETSG...,161
997,D3SR76,MDVDRKDLRILKAIADEETRSPERIAELTNVPLSTVHYRIDSLQER...,166
998,L0JIY1,MQYLVGTDSIHTTAAICDYLEERASGDDAVTVVAVAPADDPTARRD...,127


In [5]:
subject_proteins

Unnamed: 0,pid,protein_seq,len
0,A0A1I0FQE4,MDVTQIGLGATLLVIGTLTLIGPATLATGPLAYLVTGSTLVVTVAA...,58
1,A0A1I0M298,MGFGSYDESEQQEQTTSDEDVEAVNVHENDHEGKLSFESDLSTDEL...,60
2,A0A0Q2QPZ4,MYKIKDEWGEFLVRLARRAIEEYVRNGRTIKPPEDTPPQLWERMGV...,204
3,A0A0F8FH29,MTRFIKYHPRSNTYVIEKRAFLEEDLTLDGNVIVGPEVKFWKNLTV...,142
4,B8D531,MNRVYVTVLATLLIIVGLLLGASMWLDILRANTYVDTGELDWEIVE...,225
...,...,...,...
995,A0A1H9YYW1,MNNIEEMIQKAVELQANGLVTGQIANELNVSRETVTWLLTRSKKDV...,203
996,L0JKK2,MGTHVLVPLDGSSQAWAAFDHAVSNHDGGRITTLHVVDPMAGVYSD...,146
997,L0JZ77,MCVSSRMPITVVQLTLSALAAVFAFITAVYGRLNYADGELDRQKKI...,195
998,L0JZQ8,MTVFALLRVTPITDDDATEDVAAAIDALEEYDVEYETTPLATTLEA...,104


So, in theory, I could sample the .parquet files and create a df out of them for loading

In [None]:
# goal to make this function
def load_protein_data(database, **kwargs):
    """
    Load the protein sequences and associated taxonomic information from the input CSV files
    for DB version 1.0, or from the input Parquet files for DB version 1.1.

    Parameters
    ----------
    database : _type_
        _description_
    """
    # TODO
    pass

## Putting the pieces

In [7]:
# stolen from Evan's t1.0 scripts
with tempfile.TemporaryDirectory(dir='../tmp/') as tmpdir:
    # Establishing a connection with Duck DB
    conn = ddb.connect(tmpdir+'proteins.db', read_only=False)
    # Making a SQL table
    conn.execute(f"CREATE TABLE proteins AS SELECT * FROM read_parquet('../data/proteins/uniprot_chunk_0.parquet')")
    # Committing DB
    conn.commit()

    # get some test proteins to run resource test alignment on
    # considering the max protein length
    query_proteins = conn.execute(
        """SELECT pid, protein_seq, LENGTH(protein_seq) AS len FROM proteins 
        WHERE len<=250
        ORDER BY RANDOM()
        LIMIT 1000
        """).df()

    # get some metadata about taxa protein join
    # protein total counts per organism was tracked in s0.3
    # but lets recompute that data considering a max protien length
    protein_counts = conn.execute(
        """SELECT taxid, COUNT(*) AS n_proteins
        FROM proteins
        WHERE LENGTH(protein_seq)<=250
        GROUP BY taxid""").df()

In [8]:
query_proteins

Unnamed: 0,pid,protein_seq,len
0,O74089,MIKKYLCKEFIEEIDRLERSVVELEEQIIELKTQLKMKNEEITTLA...,92
1,Q5UZR0,MEDVDGEEMPGAIVEAFLEREEGVRALLEELEKLTIEGRHEEVRDR...,217
2,A0A076LHC6,MRYVAYKIYPEEFLNNEVIDNALIIEGRKVRRVRILGKVEDINVGN...,195
3,O59042,MAILQPLPLLNFFLNSRISSLIFLIGSSLTFFFACSFIWSFFKYFS...,125
4,A0A1I0PU40,MDSARLVEGEWELKAGSYSVEYRFSSDDSYHRVTISEIEDLESEDV...,92
...,...,...,...
995,L0JY03,MQLDSRPTVSIPEEFDSAQAKLIYLYLREWPNASADEICTALGIEK...,72
996,A0A2H4VB07,MFGMTKKQFLKKSKNCLKDTGIQLILIRKIFIRESGGEIDIDETYR...,137
997,L0ALR6,MTTHEMDLRGLDCPQPTLEIGVKASELDDGEVIVAEADCSTFPDDV...,74
998,A0A1M4ML41,MKNVDMAVEKNILTIRVDLAKEFGASKSGKSITIASTEGNVPVPGH...,59


The lesson here is that I shouldn't do maby parquets w/o Hyak. Need to figure out to use vscode with Hyak.

In [6]:
def prefetch_targets(hmms_path: str):
    """
    Prefetch HMM profiles from a given HMM database.
    Parameters
    ----------
    hmms_path : str
        Path to the HMM database.
    Returns
    -------
    targets : pyhmmer.plan7.OptimizedProfileBlock
        The HMM profiles loaded from the database.
    """
    # amino acid alphabet and prefetched inputs
    amino_acids = pyhmmer.easel.Alphabet.amino()
    optimized_profiles = list(pyhmmer.plan7.HMMPressedFile(hmms_path))
    targets = pyhmmer.plan7.OptimizedProfileBlock(
        amino_acids, optimized_profiles)
    return targets

In [7]:
def save_sequences_to_fasta(
        sequences: pd.core.frame.DataFrame,
        inputname: str = "input"):
    """
    Returns a list of SeqRecord objects and creates a corresponding input Fasta of them
    Parameters:
    ------------
    sequences : pandas.core.frame.DataFrame
        a dataframe with string amino acid sequences in a 'protein_seq' column
    input name : str, default = 'input'
        a name for the input fasta file
    Returns:
    ------------
    file : TextIOWrapper
        the input fasta file created from the list of SeqRecord objects
    Raises
    -------
    ValueError :
        if the input dataframe is empty
    AttributeError :
        if any of the sequences are invalid
    """
    # ensure input file has .fasta extension
    if not inputname.endswith('.fasta'):
        inputname = f"{os.path.splitext(inputname)[0]}.fasta"

    # check if input is empty
    if sequences.empty:
        raise ValueError("Input dataframe is empty")

    # check if sequences are valid
    for seq in sequences['protein_seq']:
        try:
            Seq(seq)
        except BaseException as exc:
            raise AttributeError("Invalid sequence") from exc

    # function
    records = []
    for index, seq in sequences.itertuples():
        try:
            record = SeqRecord(Seq(seq), id=str(index))
            records.append(record)
        except AttributeError as exc:
            raise AttributeError(f"Invalid sequence: {seq}") from exc

    # raise error if seq not valid
    if not records:
        raise AttributeError("No valid sequences found in input")

    with open(inputname, "w", encoding="utf-8") as file:
        SeqIO.write(records, file, "fasta")
    return file

In [8]:
def run_pyhmmer(
        input_file: str,
        hmms_path: str,
        prefetch: bool = False,
        output_file: str = None,
        cpu: int = 4,
        eval_con: float = 1e-10):
    """
    Run HMMER's hmmscan program on a set of input sequences using with HMMs from a database.
    Parameters
    ----------
    input_file : str
        Path to the input sequence file.
    hmms_path : str
        Path to the HMM database.
    prefetch : bool, optional
        Specifies how the HMM are stored in meomry.
    output_file : str, optional
        Path to the output file if the users wants to write the file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.
    Returns
    -------
    all_hits : pyhmmer.plan7.TopHits or domtblout file
        If the output_file has a name, it will be written to a domtblout file.
        Otherwise, the user will get a list of pyhmmeer TopHits objects.
    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences
    using HMMs from a given database.
    The function supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are pressed and stored in a directory before execution.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # ensure input file has .fasta extension
    if not input_file.endswith('.fasta'):
        input_file = f"{os.path.splitext(input_file)[0]}.fasta"

    # ensure output_file has .domtblout extension
    if not output_file.endswith('.domtblout'):
        output_file = f"{os.path.splitext(output_file)[0]}.domtblout"

    # HMM profile modes
    if prefetch:
        targets = prefetch_targets(hmms_path)
    else:
        targets = pyhmmer.plan7.HMMFile("../data/pfam/.h3m")

    # HMMscan execution with or without saving output to file
    with pyhmmer.easel.SequenceFile(input_file, digital=True) as seqs:
        all_hits = list(pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con))
        # check if we should save the output
        if output_file is not None:
            with open(output_file, "wb") as dst:
                for i, hits in enumerate(all_hits):
                    hits.write(dst, format="domains", header=i == 0)

    return all_hits

In [9]:
def parse_pyhmmer(all_hits):
    """
    Parses the TopHit pyhmmer object getting the query and accession IDs and saves to a DataFrame
    Parameters
    ----------
    all_hits : list
        A list of TopHit objects from pyhmmer.
    Returns
    -------
    pandas.DataFrame
        A dataframe containing the query and accession IDs.
    """
    # initialize an empty dictionary to store the data
    parsed_hits = {}

    # iterate over each protein hit
    for top_hits in all_hits:
        for hit in top_hits:
            # extract the query and accession IDs and decode the query ID
            query_id = hit.hits.query_name.decode('utf-8')
            accession_id = hit.accession.decode('utf-8')

            # if the query_id already exists in the dictionary, append the accession_id
            # to the existing value
            if query_id in parsed_hits:
                parsed_hits[query_id].append(accession_id)
            # otherwise, create a new key-value pair in the dictionary
            else:
                parsed_hits[query_id] = [accession_id]

    # create the DataFrame from the dictionary and convert list of accession IDs to string
    df = pd.DataFrame(parsed_hits.items(), columns=["query_id", "accession_id"])
    df["accession_id"] = df["accession_id"].apply(lambda x: ';'.join(x))

    return df

In [None]:
def worker_function(chunk_index, sequences, which, wakeup=None):
    """
    A wrapping function that runs and parses pyhmmer in chunks
    Parameters
    ----------
    chunk_index : int
        number of sequences chunks
    sequences : str
        a list of dataframe containing protein sequences
    """
    # we want to wait for execution to see if this worker is actually being used
    # or if it is in the process of being killed
    if wakeup is not None:
        time.sleep(wakeup)
    
    # define paths for input and output files
    input_file_path = f"./results/{which}_input_{chunk_index}"
    output_file_path = f"./results/{which}_output_{chunk_index}"

    # convert sequences to FASTA files
    save_sequences_to_fasta(sequences, input_file_path)

    # run HMMER via pyhmmer
    hits = run_pyhmmer(
        input_file=input_file_path,
        hmms_path=PFAM_PATH,
        prefetch=True,
        output_file=output_file_path,
        cpu=1,
        eval_con=1e-5)

    # Parse pyhmmer output and save to CSV file
    accessions_parsed = parse_pyhmmer(all_hits=hits)
    accessions_parsed.to_csv(
        f"./results/{which}_result_{chunk_index}.csv",
        index=False)

### Faster sequence for pyhmmer

In [23]:
list1 = []

amino_acids = pyhmmer.easel.Alphabet.amino()

for _, row in query_proteins.iterrows():
    pid = bytes(row['pid'], encoding='utf-8')
    seq_str = row['protein_seq']
    test = pyhmmer.easel.TextSequence(name=pid, sequence= seq_str)
    test = test.digitize(amino_acids)
    list1.append(test)
    
test2 = pyhmmer.easel.DigitalSequenceBlock(amino_acids, list1)

  test = test.digitize(amino_acids)


In [17]:
list1

[<pyhmmer.easel.DigitalSequence at 0x1129081c0>,
 <pyhmmer.easel.DigitalSequence at 0x112908ac0>,
 <pyhmmer.easel.DigitalSequence at 0x112960b40>,
 <pyhmmer.easel.DigitalSequence at 0x112960c80>,
 <pyhmmer.easel.DigitalSequence at 0x1128ebac0>,
 <pyhmmer.easel.DigitalSequence at 0x1128e9e80>,
 <pyhmmer.easel.DigitalSequence at 0x1128e9000>,
 <pyhmmer.easel.DigitalSequence at 0x1128e8d80>,
 <pyhmmer.easel.DigitalSequence at 0x1128e9fc0>,
 <pyhmmer.easel.DigitalSequence at 0x1128ebc00>,
 <pyhmmer.easel.DigitalSequence at 0x1128e1180>,
 <pyhmmer.easel.DigitalSequence at 0x1128e08c0>,
 <pyhmmer.easel.DigitalSequence at 0x1128e2640>,
 <pyhmmer.easel.DigitalSequence at 0x1128e1c00>,
 <pyhmmer.easel.DigitalSequence at 0x112dd6e00>,
 <pyhmmer.easel.DigitalSequence at 0x112dd72c0>,
 <pyhmmer.easel.DigitalSequence at 0x112dd6740>,
 <pyhmmer.easel.DigitalSequence at 0x112dd69c0>,
 <pyhmmer.easel.DigitalSequence at 0x112dd7e80>,
 <pyhmmer.easel.DigitalSequence at 0x112dd7480>,
 <pyhmmer.easel.Digi

In [18]:
list1[0].name

b'M0ATD1'

In [25]:
test2

DigitalSequenceBlock(pyhmmer.easel.Alphabet.amino(), [<pyhmmer.easel.DigitalSequence object at 0x11251f540>, <pyhmmer.easel.DigitalSequence object at 0x1136eba00>, <pyhmmer.easel.DigitalSequence object at 0x11361aa80>, <pyhmmer.easel.DigitalSequence object at 0x1136f53c0>, <pyhmmer.easel.DigitalSequence object at 0x113450900>, <pyhmmer.easel.DigitalSequence object at 0x113450580>, <pyhmmer.easel.DigitalSequence object at 0x1129541c0>, <pyhmmer.easel.DigitalSequence object at 0x113451c80>, <pyhmmer.easel.DigitalSequence object at 0x113450380>, <pyhmmer.easel.DigitalSequence object at 0x113451380>, <pyhmmer.easel.DigitalSequence object at 0x113450200>, <pyhmmer.easel.DigitalSequence object at 0x113450840>, <pyhmmer.easel.DigitalSequence object at 0x113451ac0>, <pyhmmer.easel.DigitalSequence object at 0x113451700>, <pyhmmer.easel.DigitalSequence object at 0x113450e00>, <pyhmmer.easel.DigitalSequence object at 0x113451bc0>, <pyhmmer.easel.DigitalSequence object at 0x1134507c0>, <pyhmmer.ea

YAAAAS!

### testing some stuff for script 

Below is wrong, but I am just testing out stuff:

In [9]:
def load_protein_data():
    """
    TODO:
    check s1.3 for inspiration
    """
    # stolen from Evan's t1.0 scripts
    with tempfile.TemporaryDirectory(dir='./tmp') as tmpdir:
      # Establishing a connection with Duck DB
      conn = ddb.connect(tmpdir+'proteins.db', read_only=False)
      # Making a SQL table
      conn.execute("CREATE TABLE proteins AS SELECT taxid AS taxid, pid AS pid, protein_seq AS sequence FROM read_parquet('../data/proteins/uniprot_chunk_0.parquet')")
      # Committing DB
      conn.commit()

      # Create table of taxa pairs
      conn.execute("CREATE TEMP TABLE pair_labels AS SELECT * FROM read_parquet('../data/taxa_pairs/pair_labels/*.parquet')")
      conn.execute("CREATE TEMP TABLE pair_scores AS SELECT * FROM read_parquet('../data/taxa_pairs/alignment/*.parquet')")
      conn.execute("CREATE TABLE pairs AS SELECT * FROM pair_labels INNER JOIN pair_scores ON (pair_labels.__index_level_0__ = pair_scores.__index_level_0__) WHERE pair_labels.is_pair=True")
      conn.commit()
      conn.execute("CREATE INDEX meso_index ON pairs (subject_id)")
      conn.execute("CREATE INDEX thermo_index ON pairs (query_id)")
      conn.commit()
      conn.close()

    return tmpdir, tmpdir+'/proteins.db'

# setup the database and get some pairs to run
tmpdir_database, db_path = load_protein_data()

Need Ryan's help

In [None]:
def load_protein_data():
    """
    TODO:
    check s1.3 for inspiration
    """
    # stolen from Evan's t1.0 scripts
    with tempfile.TemporaryDirectory(dir='./tmp') as tmpdir:
      # Establishing a connection with Duck DB
      conn = ddb.connect(tmpdir+'proteins.db', read_only=False)
      # Making a SQL table
      conn.execute("CREATE TABLE proteins AS SELECT taxid AS taxid, pid AS pid, protein_seq AS sequence FROM read_parquet('../data/proteins/uniprot_chunk_0.parquet')")
      # Committing DB
      conn.commit()

      # Create table of taxa pairs /gscratch/stf/halanzi/git_repos/learn2thermDB/data/protein_pairs/align_taxa_128-406548.parquet
      conn.execute("CREATE TEMP TABLE protein_pairs AS SELECT * FROM read_parquet('../data/protein_pairs/pair_labels/*.parquet')")
      conn.execute("CREATE TABLE pairs AS SELECT * FROM protein_pairs INNER JOIN proteins ON (protein_pairs.__index_level_0__ = pair_scores.__index_level_0__) WHERE pair_labels.is_pair=True")
      cmd = """CREATE TABLE proteins_in_pairs AS
               SELECT pid,
               protein_seq
               FROM proteins
               WHERE pid IN (SELECT meso_pid FROM protein_pairs) OR
               pid IN (SELECT thermo_pid FROM protein_pairs)"""
      conn.execute(cmd)
      conn.commit()
      # conn.execute("CREATE INDEX meso_pid ON pairs (subject_id)")
      # conn.execute("CREATE INDEX thermo_pid ON pairs (query_id)")
      # conn.commit()
      # conn.close()

    return tmpdir, tmpdir+'/proteins.db'

# setup the database and get some pairs to run
tmpdir_database, db_path = load_protein_data()

In [3]:
with tempfile.TemporaryDirectory(dir='./tmp') as tmpdir:
        conn = ddb.connect(tmpdir+'proteins.db', read_only=False)
        # Making a SQL table
        conn.execute("CREATE TABLE proteins AS SELECT taxid AS taxid, pid AS pid, protein_seq AS sequence FROM read_parquet('../data/proteins/uniprot_chunk_0.parquet')")
        # Committing DB
        conn.commit()

        # Create table of taxa pairs
        conn.execute("CREATE TEMP TABLE protein_pairs AS SELECT * FROM read_parquet('../data/protein_pairs/pair_labels/*.parquet')")
        cmd = """CREATE TABLE proteins_in_pairs AS
                SELECT pid,
                protein_seq
                FROM proteins
                WHERE pid IN (SELECT meso_pid FROM protein_pairs) OR
                pid IN (SELECT thermo_pid FROM protein_pairs)"""
        conn.execute(cmd)
        conn.commit()

IOException: IO Error: No files found that match the pattern "../data/protein_pairs/pair_labels/*.parquet"

In [4]:
def load_protein_data():
    """
    TODO:
    check s1.3 for inspiration
    """
    # stolen from Evan's t1.0 scripts
    with tempfile.TemporaryDirectory(dir='../tmp/') as tmpdir:
      # Establishing a connection with Duck DB
      conn = ddb.connect(tmpdir+'proteins.db', read_only=False)
      # Making a SQL table of protein
      conn.execute("CREATE TABLE proteins AS SELECT pid AS pid, protein_seq AS sequence FROM read_parquet('../data/proteins/uniprot_chunk_0.parquet')")
      # Committing DB
      conn.commit()

      # Create table of protein_in_pairs
      conn.execute("CREATE OR REPLACE TEMP TABLE protein_pairs AS SELECT meso_pid As meso_pid, thermo_pid as thermo_pid FROM read_parquet('../data/protein_pairs/*.parquet')")
      cmd = """
      CREATE OR REPLACE TABLE proteins_in_pairs 
      AS SELECT pid, protein_seq FROM proteins 
      WHERE pid in (SELECT meso_pid FROM protein_pairs) OR (SELECT thermo_pid from protein_pairs)
            """
      conn.execute(cmd)
      conn.commit()
      conn.close()

    return tmpdir, tmpdir+'/proteins.db'

# setup the database and get some pairs to run
tmpdir_database, db_path = load_protein_data()

pids = conn.execute("""SELECT * FROM proteins_in_pairs LIMIT 10""").df()

: 

: 

## Does pyhmmer run?

In [13]:
def prefetch_targets(hmms_path: str):
    """
    Prefetch HMM profiles from a given HMM database.
    Parameters
    ----------
    hmms_path : str
        Path to the HMM database.
    Returns
    -------
    targets : pyhmmer.plan7.OptimizedProfileBlock
        The HMM profiles loaded from the database.
    """
    # amino acid alphabet and prefetched inputs
    amino_acids = pyhmmer.easel.Alphabet.amino()
    optimized_profiles = list(pyhmmer.plan7.HMMPressedFile(hmms_path))
    targets = pyhmmer.plan7.OptimizedProfileBlock(
        amino_acids, optimized_profiles)
    return targets

def save_to_digital_sequences(dataframe: pd.DataFrame):
    """
    Save protein sequences from a DataFrame to a digital sequence block.
    Parameters
    ----------
    dataframe : pd.DataFrame
        DataFrame containing PIDs (Protein IDs) and sequences.
    Returns
    -------
    DigitalSequenceBlock
        A digital sequence block containing the converted sequences.
    """
    # Create empty list
    seqlist = []

    # Establish pyhmmer alphabet
    amino_acids = pyhmmer.easel.Alphabet.amino()

    # Convert proteins in dataframe to suitable format
    for _, row in dataframe.iterrows():
        pid = bytes(row['pid'], encoding='utf-8')
        seq_str = row['protein_seq']
        sequences = pyhmmer.easel.TextSequence(name=pid, sequence= seq_str)
        sequences = sequences.digitize(amino_acids)
        seqlist.append(sequences)
    
    # Convert so SequenceBlocks
    seqblock = pyhmmer.easel.DigitalSequenceBlock(amino_acids, seqlist)

    return seqblock

def run_pyhmmer(
        seqs: pyhmmer.easel.DigitalSequenceBlock,
        hmms_path: str,
        prefetch: bool = False,
        output_file: str = None,
        cpu: int = 4,
        eval_con: float = 1e-10):
    """
    Run HMMER's hmmscan program on a set of input sequences using with HMMs from a database.
    Parameters
    ----------
    seqs : pyhmmer.easel.DigitalSequenceBlock
        Path to the input sequence file.
    hmms_path : str
        Path to the HMM database.
    prefetch : bool, optional
        Specifies how the HMM are stored in meomry.
    output_file : str, optional
        Path to the output file if the users wants to write the file.
    cpu : int, optional
        The number of CPUs to use. Default is 4.
    eval_con : float, optional
        E-value threshold for domain reporting. Default is 1e-10.
    Returns
    -------
    all_hits : pyhmmer.plan7.TopHits or domtblout file
        If the output_file has a name, it will be written to a domtblout file.
        Otherwise, the user will get a list of pyhmmeer TopHits objects.
    Notes
    -----
    This function runs HMMER's hmmscan program on a set of input sequences
    using HMMs from a given database.
    The function supports two modes: normal mode and prefetching mode.
    In normal mode, the HMMs are pressed and stored in a directory before execution.
    In prefetching mode, the HMMs are kept in memory for faster search.
    """
    # ensure output_file has .domtblout extension
    if not output_file.endswith('.domtblout'):
        output_file = f"{os.path.splitext(output_file)[0]}.domtblout"

    # HMM profile modes
    if prefetch:
        targets = prefetch_targets(hmms_path)
    else:
        targets = pyhmmer.plan7.HMMFile(PRESS_PATH)

    # HMMscan execution with or without saving output to file
    all_hits = list(pyhmmer.hmmer.hmmscan(seqs, targets, cpus=cpu, E=eval_con))
    # check if we should save the output
    if output_file is not None:
        with open(output_file, "wb") as dst:
            for i, hits in enumerate(all_hits):
                hits.write(dst, format="domains", header=i == 0)
    return all_hits

In [14]:
# execution
test_input = save_to_digital_sequences(query_proteins)

run_pyhmmer(
        seqs=test_input,
        hmms_path=HMM_PATH,
        prefetch=True,
        output_file="test",
        cpu=1,
        eval_con=1e-5)

[<pyhmmer.plan7.TopHits at 0x55a14852b680>,
 <pyhmmer.plan7.TopHits at 0x55a148500a20>,
 <pyhmmer.plan7.TopHits at 0x55a1485202f0>,
 <pyhmmer.plan7.TopHits at 0x55a14852c2f0>,
 <pyhmmer.plan7.TopHits at 0x55a14852cd90>,
 <pyhmmer.plan7.TopHits at 0x55a148701690>,
 <pyhmmer.plan7.TopHits at 0x55a18d4c2960>,
 <pyhmmer.plan7.TopHits at 0x55a18d4dd320>,
 <pyhmmer.plan7.TopHits at 0x55a18d4dd560>,
 <pyhmmer.plan7.TopHits at 0x55a18d66a220>,
 <pyhmmer.plan7.TopHits at 0x55a18d675540>,
 <pyhmmer.plan7.TopHits at 0x55a18d67f7a0>,
 <pyhmmer.plan7.TopHits at 0x55a18d689a00>,
 <pyhmmer.plan7.TopHits at 0x55a18d4dd800>,
 <pyhmmer.plan7.TopHits at 0x55a18d4de250>,
 <pyhmmer.plan7.TopHits at 0x55a18d4deca0>,
 <pyhmmer.plan7.TopHits at 0x55a18d4df6f0>,
 <pyhmmer.plan7.TopHits at 0x55a18d6bd030>,
 <pyhmmer.plan7.TopHits at 0x55a18d6c8180>,
 <pyhmmer.plan7.TopHits at 0x55a18d6d30f0>,
 <pyhmmer.plan7.TopHits at 0x55a148700ee0>,
 <pyhmmer.plan7.TopHits at 0x55a148701120>,
 <pyhmmer.plan7.TopHits at 0x55a

I see, so it is the same error as my script. At least we isolated the needed pieces now.

It was the HMM_PATH....