### imports

In [1]:
import sys
import os
import time
from tables import *

# Report the current working directory. The relative paths passed to
# os.path.abspath() below are resolved against it, so this cell only finds the
# HogProf sources when the notebook runs from the directory containing 'HogProf/'.
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

# Make the package directory 'HogProf/src/HogProf' importable (needed for lshbuilder).
# The path is appended, so entries already on sys.path take precedence.
sys.path.append(os.path.abspath('HogProf/src/HogProf'))

# Import the lshbuilder module from the directory added above.
# NOTE: an ImportError is only printed, not re-raised; if the import fails,
# later cells that reference `lshbuilder` will fail with a NameError.
try:
    import lshbuilder
    print("Successfully imported lshbuilder.")
except ImportError as e:
    print(f"Error importing lshbuilder: {e}")

# Make the utils directory 'HogProf/src/HogProf/utils' importable
# (needed for pyhamutils and hashutils).
sys.path.append(os.path.abspath('HogProf/src/HogProf/utils'))

# Import pyhamutils and hashutils from the utils directory.
# As above, a failure is reported on stdout but does not stop the notebook.
try:
    import pyhamutils
    import hashutils
    print("Successfully imported pyhamutils and hashutils.")
except ImportError as e:
    print(f"Error importing pyhamutils or hashutils: {e}")


Current working directory: /work/FAC/FBM/DBC/cdessim2/default/agavriilidou
Successfully imported lshbuilder.
Successfully imported pyhamutils and hashutils.


### lshbuilder main function

In [2]:
# Preconfigured taxonomic ranges selectable through the ``dbtype`` argument.
# NOTE: this table may be outdated or may not work with reformat_names
# (observation dated 14.10.2024).
_DBTYPE_PRESETS = {
    'all': {'taxfilter': None, 'taxmask': None},
    'plants': {'taxfilter': None, 'taxmask': 33090},
    'archaea': {'taxfilter': None, 'taxmask': 2157},
    'bacteria': {'taxfilter': None, 'taxmask': 2},
    'eukarya': {'taxfilter': None, 'taxmask': 2759},
    'protists': {'taxfilter': [2, 2157, 33090, 4751, 33208], 'taxmask': None},
    'fungi': {'taxfilter': None, 'taxmask': 4751},
    'metazoa': {'taxfilter': None, 'taxmask': 33208},
    'vertebrates': {'taxfilter': None, 'taxmask': 7742},
}


def lshbuilder_main(dbname, orthoglob=None, nperm=256, omafile=None, taxfilter=None, taxmask=None, weights=None,
                    mastertree=None, lossonly=False, duplonly=False, taxcodes=False, verbose=False, reformat_names=False, threads=4,
                    dbtype=None):
    """Build a HogProf LSH database by configuring and running ``lshbuilder.LSHBuilder``.

    Notebook-friendly replacement for the argparse entry point: every former
    command-line option is a keyword argument.

    Parameters
    ----------
    dbname : str
        Name of the database; forwarded to ``LSHBuilder`` as ``saving_name``.
    orthoglob : str, optional
        Glob expression for orthoxml files; forwarded as ``fileglob``.
    nperm : int, default 256
        Number of hash functions used when constructing profiles.
    omafile : str, optional
        Path to an OMA HDF5 file. When given, it is opened read-only with
        ``open_file`` and kept open for the duration of the pipeline.
    taxfilter : optional
        Taxa to remove. Overridden when ``dbtype`` is given.
    taxmask : optional
        Consider only this branch. Overridden when ``dbtype`` is given.
    weights : str, optional
        Path prefix of a saved keras model. ``<weights>.json`` (architecture)
        and ``<weights>.h5`` (weights) are loaded and the first weight array
        of the model is forwarded as ``treeweights``.
    mastertree : str, optional
        Master taxonomic tree; forwarded as ``masterTree``.
    lossonly, duplonly : bool
        Only compile loss / duplication events.
    taxcodes : bool
        Use taxid info in HOGs; forwarded as ``use_taxcodes``.
    verbose : bool
        Print verbose output.
    reformat_names : bool
        Try to correct broken species trees by replacing all names with numbers.
    threads : int, default 4
        Passed to ``LSHBuilder.run_pipeline``.
    dbtype : str, optional
        Key of a preconfigured taxonomic range (see ``_DBTYPE_PRESETS``).

    Raises
    ------
    KeyError
        If ``dbtype`` is not one of the preconfigured ranges.

    Returns
    -------
    None
        Progress, the elapsed time in seconds and ``DONE`` are printed.
    """
    if dbtype:
        if dbtype not in _DBTYPE_PRESETS:
            raise KeyError(
                f"unknown dbtype {dbtype!r}; expected one of {sorted(_DBTYPE_PRESETS)}"
            )
        # A preset replaces any explicitly passed taxfilter / taxmask.
        preset = _DBTYPE_PRESETS[dbtype]
        taxfilter = preset['taxfilter']
        taxmask = preset['taxmask']
        print('using dbtype', dbtype, 'with taxfilter', taxfilter, 'and taxmask', taxmask)

    if weights:
        # Imported lazily so keras is only required when weights are requested.
        from keras.models import model_from_json

        # The path prefix comes from the ``weights`` argument. The previous code
        # read a global ``args['taxweights']`` that is not defined in this scope.
        with open(f"{weights}.json", 'r') as json_file:
            model = model_from_json(json_file.read())
        # load weights into new model
        model.load_weights(f"{weights}.h5")
        print("Loaded model from disk")
        weights = model.get_weights()[0]
        # Small offset added to every weight (presumably to avoid exact zeros; confirm).
        weights += 10 ** -10

    # Arguments shared by both construction paths; only ``h5_oma`` differs.
    builder_kwargs = dict(
        fileglob=orthoglob,
        saving_name=dbname,
        numperm=nperm,
        treeweights=weights,
        taxfilter=taxfilter,
        taxmask=taxmask,
        masterTree=mastertree,
        lossonly=lossonly,
        duplonly=duplonly,
        use_taxcodes=taxcodes,
        reformat_names=reformat_names,
        verbose=verbose,
    )

    start = time.time()
    if omafile:
        # Keep the HDF5 handle open while the pipeline runs.
        with open_file(omafile, mode="r") as h5_oma:
            lsh_builder = lshbuilder.LSHBuilder(h5_oma=h5_oma, **builder_kwargs)
            # TODO: load_one and saver may be needed here instead of run_pipeline.
            lsh_builder.run_pipeline(threads)
    else:
        lsh_builder = lshbuilder.LSHBuilder(h5_oma=None, **builder_kwargs)
        lsh_builder.run_pipeline(threads)
    print(time.time() - start)
    print('DONE')

### testing

In [None]:
# Keyword arguments for a single lshbuilder_main run. Options that are
# commented out fall back to the defaults in the function signature.
args = dict(
    dbname='/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/venom_project/2a_hogprof_testing',
    # orthoglob=None,
    omafile='/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/oma_downloads/OmaServer.h5',
    # weights=None,            # listed as 'taxweights' before; the function parameter is 'weights'
    taxmask='Toxicofera',
    # taxfilter=None,
    # dbtype='vertebrates',    # does not seem to work: taxmask seems to be ignored later in the pipeline
    # nperm=256,
    mastertree='/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/oma_downloads/speciestree.nwk',
    threads=1,
    # lossonly=False,
    # duplonly=False,
    # taxcodes=True,           # does not seem to work: bacteria and archaea are not in the taxcodes
    verbose=True,
    reformat_names=True,       # seems to be necessary, otherwise broken-newick errors appear
)
lshbuilder_main(**args)

reformatted tree
taxmask Toxicofera
making tree weights w n taxa = : 4115
configuring pyham functions
swap ids False
reformat names True
use phyloxml False
use taxcodes False
reading oma hdf5 with n groups: 1040435
done

run w n threads: 2
start workers


0it [00:00, ?it/s]

worker init 0worker init 1
creating dataset

filtered at taxonomic level: NoFilter_Mask3778
{}
saver init 0


2it [00:01,  1.24it/s]

error species name '3626' maps to an ancestral name, not a leaf of the taxonomy


Process Process-2:
Traceback (most recent call last):
  File "/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/HogProf/src/HogProf/utils/pyhamutils.py", line 108, in get_ham_treemap_from_row
    ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml" , tree_format = format  , use_internal_name=use_internal_name, orthoXML_as_string=orthoXML_as_string )
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/miniconda3/envs/hogprof/lib/python3.12/site-packages/pyham/ham.py", line 259, in __init__
    self.top_level_hogs, self.extant_gene_map, self.external_id_mapper = self._build_hogs_and_genes(orthoxml_file, filter_object=self.filter_obj)
                                                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/work/FAC/FBM/DBC/cdessim2/default/aga

error

TypeError: species name '3561' maps to an ancestral name, not a leaf of the taxonomy


 species name '3321' maps to an ancestral name, not a leaf of the taxonomy


Process Process-1:
Traceback (most recent call last):
  File "/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/HogProf/src/HogProf/utils/pyhamutils.py", line 108, in get_ham_treemap_from_row
    ham_obj = pyham.Ham(tree, orthoxml, type_hog_file="orthoxml" , tree_format = format  , use_internal_name=use_internal_name, orthoXML_as_string=orthoXML_as_string )
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/work/FAC/FBM/DBC/cdessim2/default/agavriilidou/miniconda3/envs/hogprof/lib/python3.12/site-packages/pyham/ham.py", line 259, in __init__
    self.top_level_hogs, self.extant_gene_map, self.external_id_mapper = self._build_hogs_and_genes(orthoxml_file, filter_object=self.filter_obj)
                                                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/work/FAC/FBM/DBC/cdessim2/default/aga