### imports

In [1]:
import sys
import os
import time
from tables import *
from datetime import datetime
import importlib


# Get the current working directory. The relative 'HogProf/...' paths below are
# resolved against it, so the notebook is assumed to be started from the
# project working directory that contains the HogProf checkout.
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")
### change the current working directory if necessary
#os.chdir('..')
#current_directory = os.getcwd()
#print(f"Current working directory: {current_directory}")


def add_to_sys_path(directory):
    """Make `directory` importable without duplicating sys.path entries.

    The directory is converted to an absolute path (relative to the current
    working directory) and appended to ``sys.path`` only when it is not
    already listed, so re-running this cell does not grow ``sys.path``.

    Returns the absolute path that is (now) on ``sys.path``.
    """
    absolute_path = os.path.abspath(directory)
    if absolute_path not in sys.path:
        sys.path.append(absolute_path)
    return absolute_path


# Add the package directory 'HogProf/src/HogProf' to sys.path (for importing lshbuilder and profiler)
add_to_sys_path('HogProf/src/HogProf')

# Import lshbuilder from the package directory; reload so that edits made to the
# module on disk are picked up when the cell is re-run in a live kernel.
try:
    import lshbuilder
    importlib.reload(lshbuilder)
    print("Successfully imported lshbuilder.")
except ImportError as e:
    print(f"Error importing lshbuilder: {e}")
# Import profiler from the package directory
try:
    import profiler
    importlib.reload(profiler)
    print("Successfully imported profiler.")
except ImportError as e:
    print(f"Error importing profiler: {e}")

# Add the utils directory 'HogProf/src/HogProf/utils' to sys.path (for importing pyhamutils and hashutils)
add_to_sys_path('HogProf/src/HogProf/utils')

# Import pyhamutils and hashutils from the utils directory
try:
    import pyhamutils
    importlib.reload(pyhamutils)
    import hashutils
    importlib.reload(hashutils)
    print("Successfully imported pyhamutils and hashutils.")
except ImportError as e:
    print(f"Error importing pyhamutils or hashutils: {e}")


Current working directory: /work/FAC/FBM/DBC/cdessim2/default/agavriilidou
Successfully imported lshbuilder.
Successfully imported profiler.
Successfully imported pyhamutils and hashutils.


### lshbuilder main function

In [2]:
def lshbuilder_main(dbname, orthoglob=None, nperm=256, omafile=None, taxfilter=None, taxmask=None, weights=None,
                    mastertree=None, lossonly=False, duplonly=False, taxcodes=False, verbose=False,
                    reformat_names=False, threads=4, dbtype=None):
    '''
    Notebook counterpart of the lshbuilder command-line entry point: configure
    an ``lshbuilder.LSHBuilder`` and run its pipeline.

    Parameters (descriptions follow the original command-line help):
        dbname          name (path prefix) of the db; passed as ``saving_name``
        orthoglob       a glob expression for orthoxml files
        nperm           number of hash functions to use when constructing profiles
        omafile         path of an OMA HDF5 file; when given it is opened read-only
                        and handed to the builder, otherwise ``h5_oma`` is None
        taxfilter       remove these taxa
        taxmask         consider only one branch
        weights         path prefix of a saved keras model ('<prefix>.json' and
                        '<prefix>.h5'); its first weight array (plus a small
                        epsilon) is passed as ``treeweights``
        mastertree      master taxonomic tree; nodes should correspond to orthoxml
        lossonly        only compile loss events
        duplonly        only compile duplication events
        taxcodes        use taxid info in HOGs
        verbose         print verbose output
        reformat_names  try to correct broken species trees by replacing all
                        names with numbers
        threads         number of threads handed to ``run_pipeline``
        dbtype          preconfigured taxonomic range; when set, it overrides
                        both ``taxfilter`` and ``taxmask``

    Raises:
        KeyError: if ``dbtype`` is not one of the preconfigured ranges.

    Prints the elapsed wall-clock time in seconds and 'DONE'; returns None.
    '''
    ### the following dictionary may be outdated or does not work with reformat_names (14.10.2024)
    dbdict = {
        'all': {'taxfilter': None, 'taxmask': None},
        'plants': {'taxfilter': None, 'taxmask': 33090},
        'archaea': {'taxfilter': None, 'taxmask': 2157},
        'bacteria': {'taxfilter': None, 'taxmask': 2},
        'eukarya': {'taxfilter': None, 'taxmask': 2759},
        'protists': {'taxfilter': [2, 2157, 33090, 4751, 33208], 'taxmask': None},
        'fungi': {'taxfilter': None, 'taxmask': 4751},
        'metazoa': {'taxfilter': None, 'taxmask': 33208},
        'vertebrates': {'taxfilter': None, 'taxmask': 7742},
    }

    if dbtype:
        if dbtype not in dbdict:
            # Still a KeyError (as the bare lookup would raise), but with a usable message.
            raise KeyError(f"unknown dbtype {dbtype!r}; expected one of {sorted(dbdict)}")
        taxfilter = dbdict[dbtype]['taxfilter']
        taxmask = dbdict[dbtype]['taxmask']
        print('using dbtype', dbtype, 'with taxfilter', taxfilter, 'and taxmask', taxmask)

    if weights:
        # Imported lazily so keras is only required when a weights model is requested.
        from keras.models import model_from_json
        # The model location comes from this function's own `weights` argument,
        # not from a notebook-level `args` dictionary.
        with open(weights + '.json', 'r') as json_file:
            loaded_model_json = json_file.read()
        model = model_from_json(loaded_model_json)
        # load weights into new model
        model.load_weights(weights + ".h5")
        print("Loaded model from disk")
        weights = model.get_weights()[0]
        # Small offset; presumably keeps every weight strictly non-zero -- TODO confirm.
        weights += 10 ** -10

    # Keyword arguments shared by both construction paths below.
    builder_kwargs = dict(
        fileglob=orthoglob,
        saving_name=dbname,
        numperm=nperm,
        treeweights=weights,
        taxfilter=taxfilter,
        taxmask=taxmask,
        masterTree=mastertree,
        lossonly=lossonly,
        duplonly=duplonly,
        use_taxcodes=taxcodes,
        reformat_names=reformat_names,
        verbose=verbose,
    )

    start = time.time()
    if omafile:
        # The HDF5 handle must stay open for the whole pipeline run.
        with open_file(omafile, mode="r") as h5_oma:
            lsh_builder = lshbuilder.LSHBuilder(h5_oma=h5_oma, **builder_kwargs)
            # TODO: maybe load_one and saver are needed here instead of run_pipeline
            lsh_builder.run_pipeline(threads)
    else:
        lsh_builder = lshbuilder.LSHBuilder(h5_oma=None, **builder_kwargs)
        lsh_builder.run_pipeline(threads)
    print(time.time() - start)
    print('DONE')

### testing

In [3]:
today = datetime.now().strftime("%y%m%d")

# Single place to change when the project moves; every path below derives from it.
WORK_DIR = '/work/FAC/FBM/DBC/cdessim2/default/agavriilidou'
OMA_DIR = f'{WORK_DIR}/oma_downloads'

### WARNING: with 1 thread the Toxicofera taxmask needs more than 2 hours - do not let it run from here
args = {
    # output path prefix, stamped with today's date (yymmdd)
    'dbname': f'{WORK_DIR}/venom_project/2a_hogprof_testing/test_{today}_',
    #'orthoglob': None,
    'omafile': f'{OMA_DIR}/OmaServer.h5',
    #'taxweights': None,
    'taxmask': 'Toxicofera',            # 'Toxicofera', #4115 taxa after taxmask 
    #'taxfilter': None,
    #'dbtype': 'vertebrates',   ### does not seem to work, taxmask seems to be ignored later in the pipeline
    #'nperm': 256,
    'mastertree': f'{OMA_DIR}/speciestree.nwk',
    'threads': 1,
    #'lossonly': False,
    #'duplonly': False,
    #'taxcodes': True,        ### does not seem to work due to bacteria and archaea not being in the taxcodes
    'verbose': True,
    'reformat_names': True    ### seems to be necessary, otherwise broken newick file errors appear
}
lshbuilder_main(**args)

initializing LSHBuilder
reformatted tree
taxmask Toxicofera
changed tree string (3844:1,(3904:1,3905:1)3845:1,(((4005:1,4006:1)3956:1,(4007:1,4008:1)3957:1,(4009:1,4010:1)3958:1)3906:1,3907:1)3846:1)3778:1;
Number of nodes before change: 4115
making tree weights w n taxa = : 17

configuring pyham functions
swap ids False
reformat names True
use phyloxml False
use taxcodes False
reading oma hdf5 with n groups: 1040435
done

run w n threads: 1
start workers


0it [00:00, ?it/s]

generating dataframes
worker init 0creating dataset

filtered at taxonomic level: NoFilter_Mask3778
{}
saver init 0


1it [00:00,  1.58it/s]

first ortholog group Metazoa
highest populated group Toxicofera
No top level hogs found for family 790033.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4010" NCBITaxId="103944" taxonId="1352">
    <database name="Protobothrops mucrosquamatus from Refseq" version="Refseq; P.Mucros_1.0; GCF_001527695.2; 23-MAY-2019; Protobothrops mucrosquamatus Annotation Release 101">
      <genes>
        <gene id="15787594" protId="PROMU09585" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15537855" protId="ANOCA07078" />
      </genes>
    </database>
  </species>
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15669188" p

2it [00:01,  1.60it/s]

<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4010" NCBITaxId="103944" taxonId="1352">
    <database name="Protobothrops mucrosquamatus from Refseq" version="Refseq; P.Mucros_1.0; GCF_001527695.2; 23-MAY-2019; Protobothrops mucrosquamatus Annotation Release 101">
      <genes>
        <gene id="15783432" protId="PROMU05423" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15537189" protId="ANOCA06412" />
      </genes>
    </database>
  </species>
  <species name="4005" NCBITaxId="94885" taxonId="1345">
    <database name="Pantherophis guttatus from Refseq" version="Refseq; CU_Pguttatus_1; GCF_029531705.1; 03-NOV-2023; GCF_029531705.1-RS_2023_10">
      <genes>
        <gene id="15609648" protId="PANGU17152" />
      </genes>
    </database>
  </species>
  <species

3it [00:01,  1.58it/s]

first ortholog group Eukaryota
highest populated group Toxicofera
No top level hogs found for family 802267.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15533079" protId="ANOCA02302" />
      </genes>
    </database>
  </species>
  <species name="3905" NCBITaxId="8520" taxonId="1342">
    <database name="Sceloporus undulatus from Refseq" version="Refseq; SceUnd_v1.1; GCF_019175285.1; 19-JUL-2021; Sceloporus undulatus Annotation Release 100">
      <genes>
        <gene id="15562066" protId="SCEUN13260" />
      </genes>
    </database>
  </species>
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15507424" protId="VARKO05828" />


4it [00:02,  1.64it/s]

 Eukaryota
highest populated group Toxicofera
No top level hogs found for family 812742.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15701152" protId="NAJNA04769" />
        <gene id="15726309" protId="NAJNA29926" />
      </genes>
    </database>
  </species>
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 05-FEB-2021; Crotalus tigris Annotation Release 100">
      <genes>
        <gene id="15728687" protId="CROTI02339" />
        <gene id="15731473" protId="CROTI05125" />
      </genes>
    </database>
  </species>
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2

5it [00:03,  1.63it/s]

Sarcopterygii
highest populated group Serpentes
No top level hogs found for family 738056.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamnophis elegans Annotation Release 100">
      <genes>
        <gene id="15664463" protId="THAEL26911" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15540059" protId="ANOCA09282" />
      </genes>
    </database>
  </species>
  <species name="4010" NCBITaxId="103944" taxonId="1352">
    <database name="Protobothrops mucrosquamatus from Refseq" version="Refseq; P.Mucros_1.0; GCF_001527695.2; 23-MAY-2019; Protobothrops mucrosquamatus Annotation Release 101">
      <genes>
    

6it [00:03,  1.70it/s]

<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15713807" protId="NAJNA17424" />
        <gene id="15711678" protId="NAJNA15295" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15538580" protId="ANOCA07803" />
      </genes>
    </database>
  </species>
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamnophis elegans Annotation Release 100">
      <genes>
        <gene id="15655059" protId="THAEL17507" />
        <gene id="15662498" protId="THAEL24946" />
      </genes>
    </da

7it [00:04,  1.71it/s]

first ortholog group LUCA
highest populated group Toxicofera
No top level hogs found for family 1037131.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4005" NCBITaxId="94885" taxonId="1345">
    <database name="Pantherophis guttatus from Refseq" version="Refseq; CU_Pguttatus_1; GCF_029531705.1; 03-NOV-2023; GCF_029531705.1-RS_2023_10">
      <genes>
        <gene id="15621002" protId="PANGU28506" />
      </genes>
    </database>
  </species>
  <species name="4010" NCBITaxId="103944" taxonId="1352">
    <database name="Protobothrops mucrosquamatus from Refseq" version="Refseq; P.Mucros_1.0; GCF_001527695.2; 23-MAY-2019; Protobothrops mucrosquamatus Annotation Release 101">
      <genes>
        <gene id="15797219" protId="PROMU19210" />
      </genes>
    </database>
  </species>
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamn

8it [00:04,  1.73it/s]

<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 05-FEB-2021; Crotalus tigris Annotation Release 100">
      <genes>
        <gene id="15755860" protId="CROTI29512" />
      </genes>
    </database>
  </species>
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15691935" protId="PSETE23507" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15534561" protId="ANOCA03784" />
      </genes>
    </database>
  </species>
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <databa

9it [00:05,  1.79it/s]

Bilateria
highest populated group Toxicofera
No top level hogs found for family 761064.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamnophis elegans Annotation Release 100">
      <genes>
        <gene id="15643198" protId="THAEL05646" />
      </genes>
    </database>
  </species>
  <species name="4005" NCBITaxId="94885" taxonId="1345">
    <database name="Pantherophis guttatus from Refseq" version="Refseq; CU_Pguttatus_1; GCF_029531705.1; 03-NOV-2023; GCF_029531705.1-RS_2023_10">
      <genes>
        <gene id="15604622" protId="PANGU12126" />
      </genes>
    </database>
  </species>
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annota

10it [00:05,  1.88it/s]

first ortholog group Sauria
highest populated group Toxicofera
No top level hogs found for family 712253.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamnophis elegans Annotation Release 100">
      <genes>
        <gene id="15661495" protId="THAEL23943" />
      </genes>
    </database>
  </species>
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15517640" protId="VARKO16044" />
      </genes>
    </database>
  </species>
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 05-FEB-2021; Crotalus tigris Annotation Release 100">
      <genes>
        <

11it [00:06,  1.74it/s]

No top level hogs found for family 712537.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15524973" protId="VARKO23377" />
      </genes>
    </database>
  </species>
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15724712" protId="NAJNA28329" />
        <gene id="15724708" protId="NAJNA28325" />
      </genes>
    </database>
  </species>
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15693526" protId="PSETE25098" />
        <gene id="15693527" protId="PSETE25099" />
      </genes>

12it [00:07,  1.66it/s]


<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15814872" protId="PYTBI13512" />
      </genes>
    </database>
  </species>
  <species name="3905" NCBITaxId="8520" taxonId="1342">
    <database name="Sceloporus undulatus from Refseq" version="Refseq; SceUnd_v1.1; GCF_019175285.1; 19-JUL-2021; Sceloporus undulatus Annotation Release 100">
      <genes>
        <gene id="15568699" protId="SCEUN19893" />
      </genes>
    </database>
  </species>
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15673547" protId="PSETE05119" />
      </genes>
    </database>
 

13it [00:07,  1.65it/s]

Bilateria
highest populated group Toxicofera
No top level hogs found for family 763390.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamnophis elegans Annotation Release 100">
      <genes>
        <gene id="15663678" protId="THAEL26126" />
        <gene id="15649087" protId="THAEL11535" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15543224" protId="ANOCA12447" />
        <gene id="15548130" protId="ANOCA17353" />
      </genes>
    </database>
  </species>
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 

14it [00:08,  1.60it/s]

highest populated group Toxicofera
No top level hogs found for family 793084.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15671827" protId="PSETE03399" />
      </genes>
    </database>
  </species>
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15502885" protId="VARKO01289" />
      </genes>
    </database>
  </species>
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15802426" protId="PYTBI01066" />
   

15it [00:09,  1.48it/s]

first ortholog group Opisthokonta
highest populated group Iguania
No top level hogs found for family 793066.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3905" NCBITaxId="8520" taxonId="1342">
    <database name="Sceloporus undulatus from Refseq" version="Refseq; SceUnd_v1.1; GCF_019175285.1; 19-JUL-2021; Sceloporus undulatus Annotation Release 100">
      <genes>
        <gene id="15583820" protId="SCEUN35014" />
        <gene id="15583102" protId="SCEUN34296" />
        <gene id="15566860" protId="SCEUN18054" />
        <gene id="15583320" protId="SCEUN34514" />
        <gene id="15583097" protId="SCEUN34291" />
        <gene id="15583911" protId="SCEUN35105" />
        <gene id="15582600" protId="SCEUN33794" />
        <gene id="15576803" protId="SCEUN27997" />
        <gene id="15583096" protId="SCEUN34290" />
        <gene id="15562565" protId="SCEUN13759" />
        <gene id="15583098" protId="SCEUN34292" />
        <gene id="15574910" protId="S

16it [00:09,  1.42it/s]

 Tetrapoda
highest populated group Toxicofera
No top level hogs found for family 736663.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15684484" protId="PSETE16056" />
      </genes>
    </database>
  </species>
  <species name="4005" NCBITaxId="94885" taxonId="1345">
    <database name="Pantherophis guttatus from Refseq" version="Refseq; CU_Pguttatus_1; GCF_029531705.1; 03-NOV-2023; GCF_029531705.1-RS_2023_10">
      <genes>
        <gene id="15619914" protId="PANGU27418" />
      </genes>
    </database>
  </species>
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15523674" protId="VARKO22078" />
      </genes>
    <

17it [00:10,  1.51it/s]

No top level hogs found for family 738932.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15819279" protId="PYTBI17919" />
      </genes>
    </database>
  </species>
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 05-FEB-2021; Crotalus tigris Annotation Release 100">
      <genes>
        <gene id="15738453" protId="CROTI12105" />
      </genes>
    </database>
  </species>
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15672214" protId="PSETE03786" /

18it [00:11,  1.57it/s]

<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3905" NCBITaxId="8520" taxonId="1342">
    <database name="Sceloporus undulatus from Refseq" version="Refseq; SceUnd_v1.1; GCF_019175285.1; 19-JUL-2021; Sceloporus undulatus Annotation Release 100">
      <genes>
        <gene id="15577262" protId="SCEUN28456" />
      </genes>
    </database>
  </species>
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 05-FEB-2021; Crotalus tigris Annotation Release 100">
      <genes>
        <gene id="15735701" protId="CROTI09353" />
      </genes>
    </database>
  </species>
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15510946" protId="VARKO09350" />
      </genes>
    </database>
  </species>
  <species name="4

19it [00:11,  1.54it/s]

Chordata
highest populated group Iguania
No top level hogs found for family 756503.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3905" NCBITaxId="8520" taxonId="1342">
    <database name="Sceloporus undulatus from Refseq" version="Refseq; SceUnd_v1.1; GCF_019175285.1; 19-JUL-2021; Sceloporus undulatus Annotation Release 100">
      <genes>
        <gene id="15576864" protId="SCEUN28058" />
      </genes>
    </database>
  </species>
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 05-FEB-2021; Crotalus tigris Annotation Release 100">
      <genes>
        <gene id="15747813" protId="CROTI21465" />
        <gene id="15743794" protId="CROTI17446" />
        <gene id="15743793" protId="CROTI17445" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL

20it [00:12,  1.56it/s]

Eukaryota
highest populated group Toxicofera
No top level hogs found for family 811441.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamnophis elegans Annotation Release 100">
      <genes>
        <gene id="15653597" protId="THAEL16045" />
      </genes>
    </database>
  </species>
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15687162" protId="PSETE18734" />
      </genes>
    </database>
  </species>
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15709760" protId="NAJNA13377" />
      </genes>
    </data

21it [00:12,  1.66it/s]

Vertebrata
highest populated group Toxicofera
No top level hogs found for family 752445.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4010" NCBITaxId="103944" taxonId="1352">
    <database name="Protobothrops mucrosquamatus from Refseq" version="Refseq; P.Mucros_1.0; GCF_001527695.2; 23-MAY-2019; Protobothrops mucrosquamatus Annotation Release 101">
      <genes>
        <gene id="15778851" protId="PROMU00842" />
      </genes>
    </database>
  </species>
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamnophis elegans Annotation Release 100">
      <genes>
        <gene id="15641936" protId="THAEL04384" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
      

22it [00:13,  1.78it/s]

 Eumetazoa
highest populated group Colubroidea
No top level hogs found for family 780027.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4010" NCBITaxId="103944" taxonId="1352">
    <database name="Protobothrops mucrosquamatus from Refseq" version="Refseq; P.Mucros_1.0; GCF_001527695.2; 23-MAY-2019; Protobothrops mucrosquamatus Annotation Release 101">
      <genes>
        <gene id="15783679" protId="PROMU05670" />
      </genes>
    </database>
  </species>
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15700677" protId="NAJNA04294" />
      </genes>
    </database>
  </species>
  <groups>
    <orthologGroup taxonId="1536">
      <property name="TaxRange" value="Eumetazoa" />
      <paralogGroup>
        <orthologGroup taxonId="1534">
          <property name="TaxRange" value="Deuterostomia" />
          <orthologG

23it [00:14,  1.68it/s]


highest populated group Toxicofera
No top level hogs found for family 747190.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15676806" protId="PSETE08378" />
      </genes>
    </database>
  </species>
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 05-FEB-2021; Crotalus tigris Annotation Release 100">
      <genes>
        <gene id="15727958" protId="CROTI01610" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15533675" protId="ANOCA02898" />
      </genes>
    </database>
  

24it [00:14,  1.65it/s]

first ortholog group Chordata
No top level hogs found for family 756931.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15715820" protId="NAJNA19437" />
      </genes>
    </database>
  </species>
  <groups>
    <orthologGroup taxonId="1533">
      <property name="TaxRange" value="Chordata" />
      <paralogGroup>
        <orthologGroup taxonId="1530">
          <property name="TaxRange" value="Euteleostomi" />
          <paralogGroup>
            <geneRef id="12225106" />
            <geneRef id="12227335" />
            <orthologGroup taxonId="1325">
              <property name="TaxRange" value="Clupeocephala" />
              <orthologGroup taxonId="1324">
                <property name="TaxRange" value="Euteleosteomorpha" />
                </orthologGroup>
            </orthologGro

25it [00:15,  1.72it/s]

highest populated group Toxicofera
No top level hogs found for family 1027243.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamnophis elegans Annotation Release 100">
      <genes>
        <gene id="15641065" protId="THAEL03513" />
      </genes>
    </database>
  </species>
  <species name="3905" NCBITaxId="8520" taxonId="1342">
    <database name="Sceloporus undulatus from Refseq" version="Refseq; SceUnd_v1.1; GCF_019175285.1; 19-JUL-2021; Sceloporus undulatus Annotation Release 100">
      <genes>
        <gene id="15560759" protId="SCEUN11953" />
      </genes>
    </database>
  </species>
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15707735" protId="NAJNA1

26it [00:15,  1.62it/s]

Chordata
highest populated group Sauria
No top level hogs found for family 757613.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15536847" protId="ANOCA06070" />
      </genes>
    </database>
  </species>
  <groups>
    <orthologGroup taxonId="1533">
      <property name="TaxRange" value="Chordata" />
      <paralogGroup>
        <geneRef id="11643054" />
        <orthologGroup taxonId="1527">
          <property name="TaxRange" value="Amniota" />
          <paralogGroup>
            <orthologGroup taxonId="1362">
              <property name="TaxRange" value="Testudinoidea" />
              </orthologGroup>
            <orthologGroup taxonId="1394">
              <property name="TaxRange" value="Sauria" />
              <orthologGroup taxonId="1393">
                <property n

27it [00:16,  1.66it/s]

 LUCA
highest populated group Toxicofera
No top level hogs found for family 1027256.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3905" NCBITaxId="8520" taxonId="1342">
    <database name="Sceloporus undulatus from Refseq" version="Refseq; SceUnd_v1.1; GCF_019175285.1; 19-JUL-2021; Sceloporus undulatus Annotation Release 100">
      <genes>
        <gene id="15573794" protId="SCEUN24988" />
      </genes>
    </database>
  </species>
  <species name="4005" NCBITaxId="94885" taxonId="1345">
    <database name="Pantherophis guttatus from Refseq" version="Refseq; CU_Pguttatus_1; GCF_029531705.1; 03-NOV-2023; GCF_029531705.1-RS_2023_10">
      <genes>
        <gene id="15625358" protId="PANGU32862" />
      </genes>
    </database>
  </species>
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotat

28it [00:17,  1.71it/s]


<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamnophis elegans Annotation Release 100">
      <genes>
        <gene id="15643282" protId="THAEL05730" />
      </genes>
    </database>
  </species>
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15513174" protId="VARKO11578" />
      </genes>
    </database>
  </species>
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15812004" protId="PYTBI10644" />
      </genes>
    </database>
  </spec

29it [00:17,  1.63it/s]

<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15833527" protId="PYTBI32167" />
        <gene id="15816356" protId="PYTBI14996" />
      </genes>
    </database>
  </species>
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 05-FEB-2021; Crotalus tigris Annotation Release 100">
      <genes>
        <gene id="15766856" protId="CROTI40508" />
      </genes>
    </database>
  </species>
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15725038" protId="NAJNA28655" />
        <

30it [00:18,  1.56it/s]

Lepidosauria
highest populated group Toxicofera
No top level hogs found for family 708287.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamnophis elegans Annotation Release 100">
      <genes>
        <gene id="15646800" protId="THAEL09248" />
      </genes>
    </database>
  </species>
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15827400" protId="PYTBI26040" />
      </genes>
    </database>
  </species>
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>


31it [00:18,  1.62it/s]

Eukaryota
highest populated group Toxicofera
No top level hogs found for family 813514.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15698238" protId="NAJNA01855" />
      </genes>
    </database>
  </species>
  <species name="3905" NCBITaxId="8520" taxonId="1342">
    <database name="Sceloporus undulatus from Refseq" version="Refseq; SceUnd_v1.1; GCF_019175285.1; 19-JUL-2021; Sceloporus undulatus Annotation Release 100">
      <genes>
        <gene id="15554414" protId="SCEUN05608" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15531096" protId="ANOCA00319" />
      </genes>
    </database>
  </

32it [00:19,  1.49it/s]

highest populated group Toxicofera
No top level hogs found for family 780078.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15541709" protId="ANOCA10932" />
      </genes>
    </database>
  </species>
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15525700" protId="VARKO24104" />
      </genes>
    </database>
  </species>
  <groups>
    <orthologGroup taxonId="1536">
      <property name="TaxRange" value="Eumetazoa" />
      <paralogGroup>
        <orthologGroup taxonId="1535">
          <property name="TaxRange" value="Bilateria" />
          <paralogGroup>
            <orthologGroup taxonId="1356" id="HOG:E0780078">
          

33it [00:20,  1.60it/s]

Eukaryota
highest populated group Toxicofera
No top level hogs found for family 801520.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4005" NCBITaxId="94885" taxonId="1345">
    <database name="Pantherophis guttatus from Refseq" version="Refseq; CU_Pguttatus_1; GCF_029531705.1; 03-NOV-2023; GCF_029531705.1-RS_2023_10">
      <genes>
        <gene id="15600017" protId="PANGU07521" />
      </genes>
    </database>
  </species>
  <species name="4010" NCBITaxId="103944" taxonId="1352">
    <database name="Protobothrops mucrosquamatus from Refseq" version="Refseq; P.Mucros_1.0; GCF_001527695.2; 23-MAY-2019; Protobothrops mucrosquamatus Annotation Release 101">
      <genes>
        <gene id="15778053" protId="PROMU00044" />
      </genes>
    </database>
  </species>
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="157115

34it [00:20,  1.58it/s]

Gnathostomata
highest populated group Toxicofera
No top level hogs found for family 748362.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15697817" protId="NAJNA01434" />
      </genes>
    </database>
  </species>
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15528327" protId="VARKO26731" />
      </genes>
    </database>
  </species>
  <species name="4006" NCBITaxId="35005" taxonId="1346">
    <database name="Thamnophis elegans from Refseq" version="Refseq; rThaEle1.pri; GCF_009769535.1; 31-JAN-2020; Thamnophis elegans Annotation Release 100">
      <genes>
        <gene id="15638549" protId="THAEL00997" />
      </genes>
    </datab

35it [00:21,  1.60it/s]

first ortholog group Gnathostomata
highest populated group Toxicofera
No top level hogs found for family 745501.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 05-FEB-2021; Crotalus tigris Annotation Release 100">
      <genes>
        <gene id="15765917" protId="CROTI39569" />
      </genes>
    </database>
  </species>
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15811420" protId="PYTBI10060" />
      </genes>
    </database>
  </species>
  <species name="3905" NCBITaxId="8520" taxonId="1342">
    <database name="Sceloporus undulatus from Refseq" version="Refseq; SceUnd_v1.1; GCF_019175285.1; 19-JUL-20

36it [00:22,  1.66it/s]

Bilateria
highest populated group Toxicofera
No top level hogs found for family 766784.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 05-FEB-2021; Crotalus tigris Annotation Release 100">
      <genes>
        <gene id="15751532" protId="CROTI25184" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15530839" protId="ANOCA00062" />
      </genes>
    </database>
  </species>
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15688762" protId="PSETE20334" />
      </genes>
    </dat

37it [00:22,  1.71it/s]

Gnathostomata
highest populated group Toxicofera
No top level hogs found for family 746219.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15682476" protId="PSETE14048" />
      </genes>
    </database>
  </species>
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15519276" protId="VARKO17680" />
      </genes>
    </database>
  </species>
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15831591" protId="PYTB

38it [00:23,  1.80it/s]

Metazoa
highest populated group Toxicofera
No top level hogs found for family 790284.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15516700" protId="VARKO15104" />
      </genes>
    </database>
  </species>
  <species name="3905" NCBITaxId="8520" taxonId="1342">
    <database name="Sceloporus undulatus from Refseq" version="Refseq; SceUnd_v1.1; GCF_019175285.1; 19-JUL-2021; Sceloporus undulatus Annotation Release 100">
      <genes>
        <gene id="15567432" protId="SCEUN18626" />
        <gene id="15580889" protId="SCEUN32083" />
      </genes>
    </database>
  </species>
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15713136" pr

39it [00:23,  1.89it/s]

Chordata
highest populated group Serpentes
No top level hogs found for family 755477.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15523225" protId="VARKO21629" />
      </genes>
    </database>
  </species>
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15809595" protId="PYTBI08235" />
      </genes>
    </database>
  </species>
  <species name="4005" NCBITaxId="94885" taxonId="1345">
    <database name="Pantherophis guttatus from Refseq" version="Refseq; CU_Pguttatus_1; GCF_029531705.1; 03-NOV-2023; GCF_029531705.1-RS_2023_10">
      <genes>
        <gene i

40it [00:24,  1.89it/s]


<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4007" NCBITaxId="8673" taxonId="1348">
    <database name="Pseudonaja textilis from Ensembl Vertebrates 51" version="Ensembl Vertebrates 51; EBS10Xv2-PRI">
      <genes>
        <gene id="15677551" protId="PSETE09123" />
      </genes>
    </database>
  </species>
  <species name="3904" NCBITaxId="28377" taxonId="1341">
    <database name="Anolis carolinensis from ENSEMBL v70" version="Ensembl 70; AnoCar2.0; 11-DEC-2012">
      <genes>
        <gene id="15544551" protId="ANOCA13774" />
      </genes>
    </database>
  </species>
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15820355" protId="PYTBI18995" />
      </genes>
    </database>
  </species>
  <species name="4010" NCBITaxId="103944" t

41it [00:24,  1.92it/s]


<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4005" NCBITaxId="94885" taxonId="1345">
    <database name="Pantherophis guttatus from Refseq" version="Refseq; CU_Pguttatus_1; GCF_029531705.1; 03-NOV-2023; GCF_029531705.1-RS_2023_10">
      <genes>
        <gene id="15633792" protId="PANGU41296" />
      </genes>
    </database>
  </species>
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15811709" protId="PYTBI10349" />
      </genes>
    </database>
  </species>
  <species name="4009" NCBITaxId="88082" taxonId="1351">
    <database name="Crotalus tigris from Refseq" version="Refseq; ASM1654583v1; GCF_016545835.1; 05-FEB-2021; Crotalus tigris Annotation Release 100">
      <genes>
        <gene id="15776513" protId="CROTI50165" />
      </g

42it [00:25,  1.75it/s]

Eukaryota
highest populated group Toxicofera
No top level hogs found for family 815629.
<orthoXML origin="OMA" originVersion="Jun 2024" version="0.5">
  <species name="4008" NCBITaxId="35670" taxonId="1349">
    <database name="Naja naja from Ensembl 111" version="Ensembl 111; Nana_v5; GCA_009733165.1">
      <genes>
        <gene id="15716222" protId="NAJNA19839" />
      </genes>
    </database>
  </species>
  <species name="3844" NCBITaxId="61221" taxonId="1340">
    <database name="Varanus komodoensis from Ensembl 107" version="Ensembl 107; ASM479886v1; GCA_004798865.1">
      <genes>
        <gene id="15503292" protId="VARKO01696" />
      </genes>
    </database>
  </species>
  <species name="3907" NCBITaxId="176946" taxonId="1344">
    <database name="Python bivittatus from Refseq" version="Refseq; Python_molurus_bivittatus-5.0.2; GCF_000186305.1; 24-MAY-2018; Python bivittatus Annotation Release 102">
      <genes>
        <gene id="15823159" protId="PYTBI21799" />
      </gene

43it [00:25,  1.65it/s]Process Process-2:
Traceback (most recent call last):
Process Process-1:
  File "/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/miniconda3/envs/hogprof/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):

  File "/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/miniconda3/envs/hogprof/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/HogProf/src/HogProf/lshbuilder.py", line 360, in saver
    this_dataframe = retq.get()
                     ^^^^^^^^^^
  File "/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/miniconda3/envs/hogprof/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/miniconda3/envs/hogprof/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._k

KeyboardInterrupt: 

### check results

In [4]:
hogprof_test_dir = '/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/venom_project/2a_hogprof_testing'
outfileprefix = '241017_toxicofera_newpyham_'
### get all output files
# os.listdir returns entries in arbitrary order; sort so that the "first match
# wins" selection below picks the same file on every run.
outfiles = [os.path.join(hogprof_test_dir, f) for f in sorted(os.listdir(hogprof_test_dir)) if f.startswith(outfileprefix)]
### tidy up and print outfiles
print("Found output files:")
outfiles_dict = {}
filetypes = 'errors,hashes,idmapper,newlshforest,reformatted_tree,taxaIndex,wmg'.split(',')
# The loop variable is deliberately not called `type`: at notebook top level it
# would rebind the builtin for every later cell in the kernel session.
for filetype in filetypes:
    for f in outfiles:
        # substring match on the file name; keep only the first hit per type
        if filetype in os.path.basename(f):
            outfiles_dict[filetype] = f
            print(filetype, os.path.basename(f))
            break
### fail early, with a readable message, if a file the profiler needs is absent
missing_filetypes = [ft for ft in ('newlshforest', 'hashes', 'reformatted_tree') if ft not in outfiles_dict]
if missing_filetypes:
    raise FileNotFoundError(
        f"No output file with prefix {outfileprefix!r} found in {hogprof_test_dir!r} "
        f"for: {', '.join(missing_filetypes)}"
    )
### check profiler object
print("\nLoading profiler object:")
p = profiler.Profiler(lshforestpath=outfiles_dict['newlshforest'], 
                      hashes_h5=outfiles_dict['hashes'], 
                      mat_path=os.path.join(hogprof_test_dir, outfileprefix),
                      oma='/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/oma_downloads/OmaServer.h5',
                      nsamples=256,
                      mastertree=outfiles_dict['reformatted_tree'])

# Reminder for the reader: the dataset shape appears in the loading output above
# (presumably printed by the Profiler constructor). A second dimension of 0
# would suggest that no hashes were written - TODO confirm.
print("\nCheck the shape of the first dataset in the HDF5 file!")

### function to grab HOGs by protein ID
def grabHog(ID, verbose = True):
    """Look up the database entry for an identifier via the global profiler ``p``.

    ``ID`` is resolved to an entry number with ``p.db_obj.id_resolver.resolve``
    and the matching record is fetched with ``p.db_obj.entry_by_entry_nr``.

    Args:
        ID: identifier handed to the id resolver (e.g. a protein ID).
        verbose: when true, print the reason a lookup failed.

    Returns:
        The entry record on success, otherwise the tuple ``(nan, nan)``.
    """
    try:
        return p.db_obj.entry_by_entry_nr(p.db_obj.id_resolver.resolve(ID))
    except Exception as err:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit still stop a long-running loop over many IDs.
        if verbose:
            print(f"Could not fetch entry for {ID!r}: {err!r}")
        # float('nan') is the same value as numpy's nan but needs no numpy
        # import, which the notebook's import cell does not provide.
        nan = float("nan")
        return nan, nan


Found output files:
errors 241017_toxicofera_newpyham_errors.txt
hashes 241017_toxicofera_newpyham_hashes.h5
idmapper 241017_toxicofera_newpyham_idmapper.pkl
newlshforest 241017_toxicofera_newpyham_newlshforest.pkl
reformatted_tree 241017_toxicofera_newpyham_reformatted_tree.nwk
taxaIndex 241017_toxicofera_newpyham_taxaIndex.pkl
wmg 241017_toxicofera_newpyham_wmg.pkl

Loading profiler object:
loading lsh
indexing lsh
h5 <HDF5 file "241017_toxicofera_newpyham_hashes.h5" (mode r)> <KeysViewHDF5 ['NoFilter_Mask3778']>
first dataset in h5 file has shape (100, 0)
using newick
making tree weights w n taxa = : 2929
DONE

Check the shape of the first dataset in the HDF5 file!


In [5]:
### testing from PhylogeneticGraphAnalysis.ipynb
# Keep the first field of every record returned for the Toxicofera level.
hog_id_toxicofera = []
for hog_record in p.db_obj.get_all_hogs_at_level('Toxicofera'):
    hog_id_toxicofera.append(hog_record[0])
print(f"Number of HOGs in Toxicofera: {len(hog_id_toxicofera)}")
# Use the first HOG of the list as the query example.
example_hog_id = hog_id_toxicofera[0]
print("Example HOG:", example_hog_id)
### generate hits for a HOG 
hits = p.hog_query(hog_id=example_hog_id, k=10)
print(f"Number of hits in the HOG: {len(hits)}")

Number of HOGs in Toxicofera: 31744
Example HOG: 706758
no dataset specified, using first dataset in the hdf5 file


IndexError: Index (1032346) out of range for (0-99)