In [1]:
import pyhmmer
import time
import numpy as np

## Press into database for speed

In [2]:
hmms = pyhmmer.plan7.HMMFile("../data/Pfam-A.hmm")
pyhmmer.hmmer.hmmpress(hmms, '../data/pfam')

19632

## Searching for ~12k sequences against pfam, use 5 cpus

In [2]:
with pyhmmer.plan7.HMMFile("../data/pfam.h3m") as hmms:
    with pyhmmer.easel.SequenceFile("../data/test.fasta", digital=True) as seqs:
        t1 = time.time()
        hits = list(pyhmmer.hmmer.hmmscan(seqs, hmms, cpus=5, E=1e-10))
        total = len(hits)
        print(f"- hmmscan found a total of {total} hits in {time.time() - t1:.3} seconds")

- hmmscan found a total of 16555 hits in 4.07e+02 seconds


In [4]:
type(hits[0])

pyhmmer.plan7.TopHits

Collection of hits: https://pyhmmer.readthedocs.io/en/stable/api/plan7.html?highlight=Hit#tophits

looks like they organize by original query

In [6]:
tophits = hits[0]

In [11]:
tophits.query_name

b'0'

In [12]:
len(tophits)

4

Got 4 hits

In [14]:
dir(tophits[0])

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_cython__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__setstate_cython__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'accession',
 'best_domain',
 'bias',
 'description',
 'domains',
 'dropped',
 'duplicate',
 'evalue',
 'hits',
 'included',
 'name',
 'new',
 'pre_score',
 'pvalue',
 'reported',
 'score',
 'sum_score']

In [17]:
tophits[0].accession

b'PF00501.31'

In [19]:
tophits[0].evalue

2.7230017339031424e-209

In [21]:
tophits[0].name

b'AMP-binding'

In [27]:
tophits[0].best_domain

<pyhmmer.plan7.Domain at 0x7f6a1b15ad00>

Contains some info about the domain

In [29]:
dir(tophits[0].best_domain)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_cython__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__setstate_cython__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'alignment',
 'bias',
 'c_evalue',
 'correction',
 'env_from',
 'env_to',
 'envelope_score',
 'hit',
 'i_evalue',
 'included',
 'pvalue',
 'reported',
 'score']

In [31]:
dir(tophits[0].best_domain.alignment)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_cython__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__setstate_cython__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'domain',
 'hmm_accession',
 'hmm_from',
 'hmm_name',
 'hmm_sequence',
 'hmm_to',
 'identity_sequence',
 'target_from',
 'target_name',
 'target_sequence',
 'target_to']

In [32]:
tophits[0].best_domain.alignment.target_sequence

'FRRQVAQSADAPAVLF--EGETLSYRELNERANRLARFLVDSGVRAEDRVALALPRAPELVVAMLAVLKAGAAYVPVDPDYPAERIAYILDDSRPAAVV---TARSTDAAVG------DHASRRVVLDEIA--DD--LARLD---AAD----LDETPDPHQAAYVIYTSGSTGAPKGVVVSHAGTAALAATQ--AERLRVGPGDRVLQFASPSFDAAF-WETVMALLSGAALVVAPSERLRPGPDLAALAAEYDVTHLTLPPSALAALPDGGLP----------EQATLVSAGEALTPDLMRRWSAG-R-RMINAYGPTETTVCAT-MSDPLTDRSAPLIGRPVDDARVYVLDAC-LRPVPPGVTGELYVAGAGLARGYVNRPDLTAERFVAapwgaPgERMYRTGDLVRWTEDGALEFVGRADDQVKVR'

In [37]:
tophits[0].best_domain.alignment.identity_sequence

'+++q+a+  d++a+ +  eg+tl+Yrel+er+nrlA++L   Gv+ +drVa++lp+ pe+vva+la+lkaGa++vp++p+++ae++ayil+ds++ +++   + +   +++       ++ +  +vld+ +  +   l   +   +++     +e+++p++ ay+iYTSG+TG PKGV++sh+ ++a +a++   +r +++++d+vl++++ ++++++ +e+ +allsGa++v+++ +  ++   l++l +e+ vt+++++P++l al + g +             +++++ge+l+p+l+r+++a  + +++++YG+TEtt+++t + ++  +++++ +G+p+ +++v+++D   ++pvp+g++Gel+v g+g+++gY ++p++tae+f++     + +r++rTGDl+r++edG le+vgR++d++k+r'

In [38]:
tophits[0].best_domain.alignment.hmm_sequence

'leeqaaktpdkvalvdsgegqtlTYreldervnrlAagLrslGvkkgdrVaillpnspewvvallailkaGavvvplnptltaeelayiledseakvlitatvdselleklleaaakleavklvivldadellkeeelkeeeeseekaekpeeeeelkpddlayiiYTSGTTGkPKGVmlshrnlvasvaslkletrfklkeddvvlsilplahifglilelllallsGatvvllekdaaldpklllelieelkvtviagvPrvleallesgalkaakkakdlsslrlvllggeplspellrffralfgeevlegYGlTEttgvvtvlpleedkkklgsvGkplpgvevkivDeetgepvpdgeeGellvrgpgvmkgYlkdpeataeafds.....e.grwlrTGDlgrldedGyleivgRkkdliklr'