In [7]:
# imports 
from collections import defaultdict
import pandas as pd
from Bio import SearchIO
import glob

In [4]:
# Names of the per-hit attributes to collect from the parsed HMMER results.
attribs = [
    'accession',
    'bias',
    'bitscore',
    'description',
    'cluster_num',
    'domain_exp_num',
    'domain_included_num',
    'domain_obs_num',
    'domain_reported_num',
    'env_num',
    'evalue',
    'id',
    'overlap_num',
    'region_num',
]

In [18]:
# Collect every HMMER tabular output file in ./hmm_out.
# glob returns paths in arbitrary (filesystem-dependent) order, so sort them
# to make the row order of the combined hit table reproducible between runs.
hmm_out = sorted(glob.glob("./hmm_out/*.tab.out"))

In [20]:
# Column-oriented accumulator: attribute name -> list of values, one entry per hit.
hits = defaultdict(list)

# Parse every HMMER tabular output and record the selected attributes of each hit.
for inf in hmm_out:
    with open(inf) as handle:
        for queryresult in SearchIO.parse(handle, 'hmmer3-tab'):
            for hit in queryresult.hits:
                # Store the query id in the 'accession' field so that the
                # 'accession' column of the table identifies the query.
                # Done once per hit; it does not depend on the attribute loop.
                hit.accession = queryresult.id
                for attrib in attribs:
                    hits[attrib].append(getattr(hit, attrib))

# Build the DataFrame once, after all files have been parsed.
# Passing columns=attribs keeps the expected columns (in the same order)
# even when no input files or no hits were found.
hitdf = pd.DataFrame(dict(hits), columns=attribs)


In [24]:
# For each accession keep only the row with the lowest e-value.
# idxmin yields the index label of that row within every accession group.
evalue_by_accession = hitdf.groupby('accession')['evalue']
idx_min_evalue = evalue_by_accession.idxmin()
filtered_df = hitdf.loc[idx_min_evalue]

# NOTE: the final cell of this notebook writes to this same path, so this
# per-accession table is replaced once that cell is run.
filtered_df.to_csv('RdRp_hmmr.csv')

In [26]:
# Derive the contig name from each accession by dropping everything after
# the last underscore (the accession is the predicted-protein id).
accession_ids = filtered_df['accession']
filtered_df = filtered_df.assign(
    contig=accession_ids.str.rsplit('_', n=1).str[0]
)

# A contig can appear more than once when several of its proteins matched;
# keep only the row with the lowest e-value for each contig.
idx_min_evalue = filtered_df.groupby('contig')['evalue'].idxmin()
df = filtered_df.loc[idx_min_evalue]

In [None]:
# Save the per-contig best hits as CSV (the row index is written as well,
# which is the pandas default).
output_csv = 'RdRp_hmmr.csv'
df.to_csv(output_csv)