In [1]:
# == NATIVE MODULES
import re
import os
# == INSTALLED MODULES
import pandas as pd
from Bio import SeqIO
# == PROJECT MODULES
from py.attach_taxid2fasta import import_taxdump

## CONSTANT FILE PATHS

In [2]:
# Root of the project checkout. Defaults to the original author's location;
# export LIGT_PROJECT_ROOT to run this notebook from another machine.
PROJECT_ROOT = os.environ.get(
    "LIGT_PROJECT_ROOT", "/Users/bellieny/projects/nomburg_j-ligT_phylogeny"
).rstrip("/")
# Directory tree that is walked for alignment FASTA files (trailing slash kept).
ROOTDIR_ALN_FASTA = f"{PROJECT_ROOT}/dump/"
# Regex matched against each full file path; selects the merged, taxid-annotated
# alignment found under a ".../clustalo_*/merged/" directory.
FILE_PATTERN = re.compile(r".*ligT_phyrec.*/clustalo_.*/merged/db-nr_query_merged-txid\.msa\.fasta")
# Lineage dump file passed to import_taxdump.
TAXID_FILE = f"{PROJECT_ROOT}/input_data/taxidlineage.dmp"
# Output location. It is a relative path, so it resolves against the current
# working directory rather than PROJECT_ROOT.
OUTPUT_ROOT_DIRECTORY = "input_data/sequences_for_pdes/ligT/sequence_reps"

## CONSTANT INTERNAL VARIABLES

In [11]:
# Top-level taxon ids mapped to the hex colour used for their records.
# attach_txid_to_read_fasta keeps the first matching key per record, so the
# insertion order of this dict is significant.
HIGH_TAXA_COLOR_ASSIGNMENT = {
    2759: "#2F00EA",   # Eukaryota
    2: "#C24154",      # Bacteria
    10239: "#06AD2D",  # Viruses
    2157: "#EACC2B",   # Archaea
}
# Finer-grained viral taxon ids mapped to the hex colour used for their records.
# attach_txid_to_read_fasta keeps the first matching key per record, so the
# insertion order of this dict is significant.
# NOTE(review): "#0AEBBF", "#E19F56" and "#9003A3" are each assigned to three
# different taxa below -- confirm the sharing is intentional.
LOW_TAXA_COLOR_ASSIGNMENT = {
    10912: "#795599",    # Rotavirus
    11155: "#CAE56F",    # Torovirus
    694002: "#84C6EB",   # Betacorona
    10486: "#EF6F6F",    # Iridoviridae
    944644: "#EA046A",   # Marseilleviridae
    10240: "#F0D699",    # Poxviridae
    2946170: "#0AEBBF",  # Straboviridae
    549779: "#E19F56",   # Mimiviridae
    39729: "#9003A3",    # Potyviridae
    137992: "#0AEBBF",   # Asfarviridae
    548681: "#9003A3",   # Herpesvirales
    10780: "#E19F56",    # Parvoviridae
    11308: "#E19F56",    # Orthomyxoviridae
    11270: "#9003A3",    # Rhabdoviridae
    10442: "#0AEBBF",    # Baculoviridae
}

## DEFINE FUNCTIONS

In [34]:
def catch_txids_with_underscores(record_id, txid_dictionary, txid_to_color):
    """Recover a colour for a record whose ID ends in ``_<taxid>``.

    IDs that contain ``|`` are not handled here. Otherwise the text after the
    last underscore is read as an integer taxid, its lineage is looked up in
    ``txid_dictionary``, and the lineage is scanned in order for the first
    entry that is a key of ``txid_to_color``.

    Args:
        record_id: Sequence identifier, e.g.
            ``"NS1__YP_009227291__Rat_bocavirus__1788315"``.
        txid_dictionary: Mapping from integer taxid to an iterable of lineage
            taxids. Entries may be strings or integers.
        txid_to_color: Mapping from integer taxid to a colour value.

    Returns:
        ``(corrected_id, colour)``, where ``corrected_id`` is ``record_id``
        followed by ``|`` and the ``|``-joined lineage. ``(None, None)`` is
        returned when the ID contains ``|``, has no integer suffix, the taxid
        is missing from ``txid_dictionary``, or no lineage entry has a colour.
    """
    if "|" in record_id:
        return None, None

    # Only the piece after the final underscore can be the taxid.
    suffix = record_id.rsplit("_", 1)[-1]
    try:
        taxid = int(suffix)
    except ValueError:
        return None, None

    try:
        lineage = txid_dictionary[taxid]
    except KeyError:
        return None, None

    # Normalise to strings so that a lineage of integers joins cleanly instead
    # of raising TypeError.
    lineage_parts = [str(entry) for entry in lineage]
    corrected_id = f"{record_id}|{'|'.join(lineage_parts)}"

    for entry in lineage_parts:
        try:
            return corrected_id, txid_to_color[int(entry)]
        except (KeyError, ValueError):
            # Non-numeric lineage entry, or a taxon without a colour: keep scanning.
            continue
    return None, None

In [35]:
def _first_lineage_color(record_id, color_to_taxa_dictionary):
    """Return ``(True, colour)`` for the first key that occurs as ``|key|`` in ``record_id``, else ``(False, None)``."""
    for txid, color_hex in color_to_taxa_dictionary.items():
        # Plain substring test: the taxid must be delimited by pipes on both sides.
        if f"|{txid}|" in record_id:
            return True, color_hex
    return False, None


def attach_txid_to_read_fasta(root_dir, filename_pattern, color_to_taxa_dictionary, txid_dictionary):
    """Assign a colour to every FASTA record found under ``root_dir``.

    Walks ``root_dir`` and parses each file whose full path matches
    ``filename_pattern``. Records whose ID contains ``|`` are coloured by the
    first key of ``color_to_taxa_dictionary`` (in dict order) that appears as
    ``|key|`` in the ID. All other records fall back to
    ``catch_txids_with_underscores``.

    Args:
        root_dir: Directory that is walked recursively.
        filename_pattern: Compiled regex matched against ``f"{root}/{file}"``.
        color_to_taxa_dictionary: Mapping from taxid to colour value. Its
            iteration order decides which colour wins.
        txid_dictionary: Taxid-to-lineage mapping, forwarded to
            ``catch_txids_with_underscores``.

    Returns:
        ``{prefix: {"color_hex": {record_id: colour}}}``. ``prefix`` is the
        text captured between ``clustalo_`` and ``/merged`` in the path, with
        ``"cluster"`` replaced by ``"cl_"``. ``record_id`` is the record ID
        with trailing ``(``, ``+``, ``-`` and ``)`` characters stripped. The
        first colour stored for a record ID is kept.
    """
    color_assigned_dictionary = {}
    for root, _subdirs, files in os.walk(root_dir):
        for file in files:
            filepath = f"{root}/{file}"
            if not filename_pattern.match(filepath):
                continue
            file_prefix = re.search(r".*clustalo_(\S+)\/merged\S+", filepath)
            if file_prefix is None:
                # A caller-supplied pattern may accept paths that have no
                # "clustalo_<name>/merged" part; no prefix can be derived, so skip.
                continue
            clean_prefix = file_prefix.group(1).replace("cluster", "cl_")
            with open(filepath, "r") as fasta_aln_handle:
                for record in SeqIO.parse(fasta_aln_handle, "fasta"):
                    if "|" in record.id:
                        # Lineage is already embedded in the ID.
                        found, color_hex = _first_lineage_color(record.id, color_to_taxa_dictionary)
                    else:
                        # The fallback does not depend on the individual colour
                        # key, so it is evaluated once per record.
                        recovered_record, color_hex = catch_txids_with_underscores(
                            record.id, txid_dictionary, color_to_taxa_dictionary
                        )
                        found = recovered_record is not None
                    if not found:
                        continue
                    clean_record_id = record.id.rstrip("(+-)")
                    per_cluster = color_assigned_dictionary.setdefault(clean_prefix, {})
                    per_cluster.setdefault("color_hex", {}).setdefault(clean_record_id, color_hex)
    return color_assigned_dictionary

In [44]:
def cleanup_underscores(id_list):
    """Return a new list with trailing ``_`` and ``-`` characters removed from each ID."""
    return [identifier.rstrip("_-") for identifier in id_list]

## PROCESS CORE VARIABLES

In [ ]:
# Load the taxid -> lineage lookup from TAXID_FILE via the project helper.
# NOTE(review): catch_txids_with_underscores indexes this object with an int
# taxid and joins/iterates the value, so it is presumably
# {int taxid: [lineage taxid, ...]} -- confirm against py/attach_taxid2fasta.
txid_lineage_dictionary = import_taxdump(TAXID_FILE)

In [37]:
# Colour the aligned records twice over the same files: once with the
# top-level taxa palette and once with the finer-grained viral taxa palette.
super_kingdom_colors = attach_txid_to_read_fasta(
    root_dir=ROOTDIR_ALN_FASTA,
    filename_pattern=FILE_PATTERN,
    color_to_taxa_dictionary=HIGH_TAXA_COLOR_ASSIGNMENT,
    txid_dictionary=txid_lineage_dictionary,
)
viral_taxa_colors = attach_txid_to_read_fasta(
    root_dir=ROOTDIR_ALN_FASTA,
    filename_pattern=FILE_PATTERN,
    color_to_taxa_dictionary=LOW_TAXA_COLOR_ASSIGNMENT,
    txid_dictionary=txid_lineage_dictionary,
)

## IMPORT SEQUENCE IDs 
### **SORTED ACCORDING TO PHYLOGENY BRANCH ORDER**

In [30]:
cl_55 = cleanup_underscores(["KAB8670392|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3502|3514|12989|176857", "KAI4270596|131567|2759|33154|4751|451864|4890|716545|147538|716546|147547|1520881|388435|107465|1302176|88646|1301143|88741|2633068|2903222", "KAI4129189|131567|2759|33154|4751|451864|4890|716545|147538|716546|147547|1520881|388435|107465|1302176|88646|1301213|1301585|1301586", "TVY93121|131567|2759|33154|4751|451864|4890|716545|147538|716546|715989|147548|5178|2589077|47830|215461", "OMJ30233|131567|2759|33154|4751|112252|1913638|451828|2219693|61421|4883|4888|133412", "CRL00856|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33392|7147|7148|43786|41828|7149|43808|315556|568069", "CAI5787563|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|8457|32561|8504|8509|1329961|1329950|1329912|1329976|1329975|8522|162266|42163|74358", "XP_037758730|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|8457|32561|1329799|2841271|8459|8464|1579337|1579336|27791|8465|8468|8469", "KAF0408355|131567|2759|33154|4751|112252|1913637|214504|214506|214509|36753|4873|4874", "putative_superfamily_II_RNA_helicase__YP_009052108__Aureococcus_anophagefferens_virus__1474867", "YP_009173671|10239|2732004|2732005|2732007|2732523|2732524|10501|455363|455364|dsDNA", "hypothetical_protein_ATCV1_Z596R__YP_001427077__Acanthocystis_turfacea_chlorella_virus_1__322019", "MAP87588|131567|2|2323|1783234|265317|2894089|2026781", "GJQ12604|131567|2759|2763|2797|3039357|3039359|83373|83374", "XP_059813464|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|7777|7778|117893|117851|117839|30469|2055258|79690", "CAG8510740|131567|2759|33154|4751|112252|1913637|214504|214506|214508|144536|144537|144539", 
"KAI7855348|131567|2759|33154|4751|112252|1913637|451507|2212703|4827|499202|101102|101103", "CEJ00218|131567|2759|33154|4751|112252|1913637|451507|2212703|4827|1344963|1344955|4842|58291", "KAG2228637|131567|2759|33154|4751|112252|1913637|451507|2212703|4827|1344963|34489|101141|101142", "GAN09375|131567|2759|33154|4751|112252|1913637|451507|2212703|4827|1344963|34489|4830|91626", "QFZ79300|10239|2559587|2732396|2732408|2732507|2732550|39729|12195|31743|ssRNA___", "CI__YP_007969888__Donkey_orchid_virus_A__1198144", "NP_734343|10239|2559587|2732396|2732408|2732507|2732550|39729|12195|12211|ssRNA___", "BCY04559|10239|2732004|2732005|2732007|2732525|2732526|137992|697905|2839893|dsDNA", "B962L__YP_009703129__African_swine_fever_virus__10497", "P0C9A4|10239|2732004|2732005|2732007|2732525|2732526|137992|39743|10497|561445|dsDNA", "HrpA-like_helicase__YP_007354542__Acanthamoeba_polyphaga_moumouvirus__1269028", "AXN90925|10239|2732004|2732005|2732007|2732523|2732554|549779|985780|1302812|dsDNA", "KAJ6240145|131567|2759|2611341|2683626|2055135|1746091", "XP_008223753|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3744|3745|171637|721805|3754|102107", "XP_003673275|131567|2759|33154|4751|451864|4890|716545|147537|4891|4892|4893|278028|27288|1064592", "XP_013753845|131567|2759|554296|2925400|172820|877559|529818|461836", "XP_058776481|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|72025|3803|3814|2231393|2231382|2233838|2233839|163743|3904|3911", "XP_017031957|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33392|7147|7203|43733|480118|480117|43738|43741|43746|7214|43845|46877|7215|32341|32346|32352|30033", "PWA95452|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71274|91882|4209|4210|102804|102810|886714|4219|35608", 
"KAG6417796|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71274|91888|4143|4136|216706|216718|2836339|21880|2026555|2026556|180675", "KAK0582910|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91836|41937|23672|1977916|1977919|4022|4024", "WJX61645|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|72025|3803|3814|2231393|2231382|2233838|2233839|163742|3898|3899", "KAF1869792|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|72025|3803|3814|2231393|2231384|2231385|163729|3869|3870", "KAI8914517|131567|2759|33154|4751|112252|4761|2683659|451435|451442|1395787|1395788|382530", "KAG2018726|131567|2759|33154|4751|451864|5204|5302|155619|452333|5338|2982305|184208|184431|5346|1132390", "RAR01829|131567|2759|33154|4751|451864|4890|716545|147538|716546|715962|147541|451868|92860|715340|28556|95729|183478", "KXT11234|131567|2759|33154|4751|451864|4890|716545|147538|716546|715962|147541|451867|2726947|93133|131324|113226", "OQO28001|131567|2759|33154|4751|451864|4890|716545|147538|716546|715962|147541|451867|2726946|452563|470018|2685422|1974281", "KAI4858062|131567|2759|33154|4751|451864|4890|716545|147538|716546|715962|147541|451867|5014|1570301|5579|2647315|2894280"
])

In [38]:
cl_56 = cleanup_underscores(["AOX47651|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1507401|1507402|1911555|ssDNA",
"NS1__YP_009508797__Miniopterus_schreibersii_bat_bocavirus__1911554",
"NS1__YP_009315891__Rhinolophus_sinicus_bocaparvovirus__1911555",
"NS2__YP_009508798__Miniopterus_schreibersii_bat_bocavirus__1911554",
"NS2__YP_009315890__Rhinolophus_sinicus_bocaparvovirus__1911555",
"NS1__YP_009553047__Rhinolophus_pusillus_bocaparvovirus_2__2053080",
"YP_009552695|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1507401|1507402|2053079|ssDNA",
"non-structural_protein_1__YP_009272913__Mink_bocavirus_1__2259808",
"NS1__YP_008802580__Feline_bocaparvovirus_2__1417756",
"NS1__YP_009508781__Feline_bocaparvovirus_3__2259807",
"NS1__YP_006272944__Feline_bocavirus__1174530",
"NS1__YP_007518454__Canine_bocavirus_1__1511885",
"NS1__YP_009507354__California_sea_lion_bocavirus_3__1073961",
"NS1__YP_009507350__California_sea_lion_bocavirus_1__1073959",
"NS1_protein__YP_009229908__Bat_bocavirus__1329649",
"nonstructural_protein__YP_009508793__Bat_bocavirus_XM30__2259811",
"nonstructural_protein__YP_009508789__Bat_bocavirus_WM40__2259810",
"nonstructural_protein_1__YP_009215301__Lagomorph_bocaparvovirus_1__2169774",
"NS1__YP_009551693__Lupine_bocavirus__2017714",
"NS1__YP_009046822__Porcine_bocavirus_1_pig_ZJD_China_2006__795694",
"NS1__YP_009553050__Rousettus_leschenaultii_bocaparvovirus_1__2053082",
"NS1__YP_010086823__Murine_bocavirus__2171381",
"NS1__YP_009507358__Bocavirus_pig_SX_China_2010__912967",
"NS1__YP_009227291__Rat_bocavirus__1788315",
"YP_009227290|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1507401|3052043|1788315|ssDNA",
"NS1__YP_009508785__Myotis_myotis_bocavirus_1__1195367",
"NS1__YP_010087787__Primate_bocaparvovirus_3__2654204",
"NS1__YP_002808454__Human_bocavirus_3__638313",
"non_structural_protein_1__YP_003799996__Bocavirus_gorilla_GBoV1_2009__864686",
"ASS83769|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1507401|3052040|ssDNA",
"NS1__YP_002586773__Human_bocavirus_2c_PK__1511882",
"non_structural_protein_NS2__YP_002916059__Human_bocavirus_4_NI__1511883",
"non_structural_protein_NS1__YP_002916060__Human_bocavirus_4_NI__1511883",
"NS1__YP_009389292__Dromedary_camel_bocaparvovirus_1__2014603",
"NS1_gene_product__YP_005086948__Porcine_bocavirus_5_JS677__1131622",
"AHK26969|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1507401|3052049|1084715|ssDNA",
"NS1__YP_009010974__Porcine_bocavirus__1165907",
"NS1__YP_004869645__Porcine_bocavirus_4-1__1084717",
"NS1__YP_004869641__Porcine_bocavirus_3__1084715",
"nonstructural_protein__YP_009259490__Ungulate_bocaparvovirus_6__1864484",
"NS1__YP_009389289__Dromedary_camel_bocaparvovirus_2__2014604",
"ASC49396|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1507401|3052051|2014604|ssDNA",
"YP_010802414|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1511864|2137543|2137544|ssDNA",
"nonstructural_protein_1__YP_009552825__Red-crowned_crane_parvovirus__2079601",
"nonstructural_protein_1__YP_009552126__Red-crowned_crane_parvovirus__2079601",
"NS__YP_010086570__Seal_parvovirus__1427158",
"NS1__YP_009507375__Chipmunk_parvovirus__56820",
"NS1__YP_009507368__Simian_parvovirus__31598",
"non-structural_protein_NS1__YP_004928144__Human_parvovirus_B19__10798",
"NS1_protein__NP_694863__Human_erythrovirus_V9__72197",
"non-structural_protein__YP_009112878__Porcine_parvovirus_2__1126383",
"QBA84539|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1511911|3052773|1126383|ssDNA",
"nonstructural_protein__YP_009507388__Parvovirus_YX-2010_CHN__754189",
"gp1_protein__YP_009389278__Ungulate_tetraparvovirus_3__1511916",
"unnamed_protein_product__YP_005090504__Eidolon_helvum_-bat-_parvovirus__1131483",
"YP_010802410|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1511911|1908803|2137541|ssDNA",
"ORF1__YP_238482__Human_parvovirus_4_G1__1511919",
"nonstructural_protein_1__YP_008439207__Porcine_partetravirus__1208310",
"NS1__YP_009315886__Tetraparvovirus_sp.__1908804",
"nonstructural_protein__YP_009175068__Ungulate_tetraparvovirus_1__1511914",
"non-structural_protein__YP_009507390__Ovine_hokovirus__1096142",
"QKE54972|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|1940570|ssDNA",
"YP_005097851|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|2733235|3051981|1131485|ssDNA",
"QJI54729|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1511888|2555546|2731271|ssDNA",
"nonstructural_protein__YP_009325417__Bosavirus_MS-2016a__1917013",
"QYW06833|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1511888|2555546|2839030|ssDNA",
"nonstructural_protein__YP_009552775__Equine_parvovirus_H__2079554",
"NS1__YP_009116876__Sesavirus_CSL10538__1519097",
"nonstructural_protein__YP_009021566__Porcine_parvovirus_6__1472911",
"replicase__YP_004013956__Porcine_parvovirus_4__707546",
"replicase__YP_008888533__Porcine_parvovirus_5__1241957",
"AWO81967|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1506574|1506575|1241957|ssDNA",
"NS1__YP_009111339__Slow_loris_parvovirus_1__1581151",
"YP_009154712|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|10803|3052180|1670662|ssDNA",
"QKN88777|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|10803|341670|2052559|ssDNA",
"Rep__YP_010086821__Murine_adeno-associated_virus_2__2171378",
"Rep__YP_010086819__Murine_adeno-associated_virus_1__2171377",
"rep_protein__YP_009552823__Adeno-associated_virus__272636",
"REP_protein__YP_068410__Muscovy_duck_parvovirus__37325",
"REP__NP_043514__Goose_parvovirus__38251",
"AQT18929|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|10803|1511895|341673|ssDNA",
"rep_protein__YP_003858571__Bat_adeno-associated_virus_YNM__727962",
"Rep78__YP_009507366__California_sea_lion_adeno-associated_virus_1__1073950",
"Rep_78_protein__YP_680423__adeno-associated_virus_2__10804",
"Rep_68_protein__YP_680422__adeno-associated_virus_2__10804",
"nonstructural_protein__YP_077179__Adeno-associated_virus_-_8__202813",
"nonstructural_protein__YP_077177__Adeno-associated_virus_-_7__202812",
"nonstructural_protein_1__YP_009241376__Megabat_bufavirus_1__1756191",
"nonstructural_protein_1__YP_009508802__Wuharv_parvovirus_1__1245561",
"NS1__YP_009508804__Cutavirus__1867125",
"NS1__YP_009058894__Bufavirus-3__1391667",
"BAS32595|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|207678|1391667|ssDNA",
"nonstructural_protein__YP_009130650__Eulipotyphla_protoparvovirus_1__2170148",
"NS1__YP_009186840__Rat_bufavirus_SY-2015__1763507",
"non-structural_protein__partial__YP_009666130__Protoparvovirus_Zsana_2013_HUN__1755694",
"NS1__YP_009272690__Sea_otter_parvovirus_1__1882382",
"NS1__YP_009507341__Gray_fox_amdovirus__1093101",
"putative_non-structural_protein_1__YP_009361877__Skunk_amdoparvovirus__1961072",
"NS1__YP_009315908__Amdoparvovirus_sp.__1908805",
"NS1__YP_009110759__Raccoon_dog_amdovirus__1513315",
"Parvovirus_rep_homolog__NP_050269__Human_betaherpesvirus_6B__32604",
"NP_042987|10239|2731341|2731360|2731361|2731363|548681|3044472|10357|40272|3050296|32603",
"WAQ80635|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|10806|2555903|3003963|ssDNA",
"KAJ7355041|131567|2759|33154|33208|6072|6073|6101|6102|6125|123763|46746|214966|174260",
"QTE03877|10239|2731342|2732092|2732415|2732422|2732534|10780|2732887|2733231|2736646|2794492|ssDNA",
"NS1__YP_009551964__Porcine_parvovirus_7__1820046",
"WNO11827|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|1940570|ssDNA",
"non-structural_protein_1__YP_010086594__Rat_parvovirus_2__1885571",
"nonstructural_protein_NS1__YP_009553675__Mouse_kidney_parvovirus__2316143",
"nonstructural_protein_1__YP_009328889__Desmodus_rotundus_parvovirus__1926498",
"WAX26075|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|207678|3003986|ssDNA",
"XP_038063171|131567|2759|33154|33208|6072|33213|33511|7586|133551|7587|7588|41243|41166|7592|35076|46514",
"KXJ25075|131567|2759|33154|33208|6072|6073|6101|6102|6103|42823|1720308|2652724",
"UMO75531|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|207678|1885570|2917168|ssDNA",
"YP_010087281|10239|2731342|2732092|2732415|2732422|2732534|10780|2732887|3044736|3052253|2662396|ssDNA",
"XP_052817762|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6447|6544|2785011|6599|735337|2785015|2908833|278205|6601|6602|6603|6604",
"QTZ83163|10239|2731342|2732092|2732415|2732422|2732534|10780|2732887|2733233|2788254|2794551|ssDNA",
"WAX26081|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|207678|3003987|ssDNA",
"QID88579|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|207678|1885570|2529481|2448649|ssDNA",
"WAX26070|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|207678|3003985|ssDNA",
"QVW56858|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|2794482|ssDNA",
"GFU19424|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6893|6905|74971|74974|74975|450948|6914|299642",
"KAF8789558|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6893|6905|74971|74974|74975|6913|94028|94029",
"KAF8789384|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6893|6905|74971|74974|74975|6913|94028|94029",
"GBO16608|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6893|6905|74971|74974|74975|6913|29012|182803",
"QKE54997|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|1940570|ssDNA",
"ASM94078|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|2021901|ssDNA",
"QKE54987|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|1940570|ssDNA",
"QKE54932|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|1940570|ssDNA",
"WAQ80633|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|1940570|ssDNA",
"KAH3837678|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6447|6544|2785011|6599|735337|2785015|2908833|278205|105711|45950|45951|45954",
"KAI0217859|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6340|6341|105389|105391|6362|41324|6423|104711",
"KAI0234581|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6340|6341|105389|105391|6362|41324|6423|104711",
"QTZ83199|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|2794540|ssDNA",
"QTE04048|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|2794538|ssDNA",
"YP_010802435|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|185880|2672571|ssDNA",
"XP_047486854|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6657|2172821|6681|72041|6682|6683|6684|111520|6685|133894|139456",
"AGO44085|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|1513223|1513224|ssDNA",
"AJD76763|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|185880|1593240|ssDNA",
"KAF2368535|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6657|2172821|6681|72041|6820|6821|1732196|1732206|199476|92169|1923958|1923959",
"AXQ04869|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|185880|2304518|ssDNA",
"AXQ04852|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|185880|2304518|ssDNA",
"KAJ8891357|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33341|7020|213547|523720|55199|262157|523929|614101",
"XP_023727852|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33341|6970|85823|1049657|1912919|46562|105801|60568|105785",
"CDI96499|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6157|6199|6200|6201|6208|6209|6211",
"CAH8618923|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6157|6178|6179|6180|31244|31245|6181|6188",
"CAH8618910|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6157|6178|6179|6180|31244|31245|6181|6188",
"CAH8664152|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6157|6178|6179|27871|27872|1776223|73421|57077|57078",
"CAI2737803|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6157|6178|6179|27871|27872|1776223|73421|57077|57078",
"VDP40914|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6157|6178|6179|27871|27841|404429|99737|27847|27848",
"KAA3680354|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6157|6178|6179|27871|116925|34502|34503|34504",
"KAA3679444|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6157|6178|6179|27871|116925|34502|34503|34504",
"KAA3669889|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6157|6178|6179|27871|116925|34502|34503|34504",
"MBS3768428|131567|2|1783270|456828|2030808",
"KAK2710715|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6657|6658|116556|6659|38009|6660|6661",
"EFX61411|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6657|6658|116557|84337|6665|116561|77658|6668|6669",
"QTE03826|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|10806|2555903|2794458|ssDNA",
"AWV66973|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|10806|2555903|2050976|ssDNA",
"AWV66983|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|10806|2555903|2050976|ssDNA",
"KAI2802364|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6933|6946|83137|6951|66561|41303|40696|40697",
"KAI2800364|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6933|6946|83137|6951|66561|41303|40696|40697",
"WGL41075|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|10806|2555903|3042446|ssDNA",
"QTE04070|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|2794532|ssDNA",
"XP_018907625|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33377|7036|33379|7037|7038",
"UQB76443|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|2940904|ssDNA",
"XP_050527957|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33380|33382|33384|58001|58002",
"nonstructural_protein_1__YP_008658568__Acheta_domestica_mini_ambidensovirus__1404345",
"QTE04075|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|2794533|ssDNA",
"QNS31045|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|185880|2771466|ssDNA",
"ASU47551|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|185880|2027355|ssDNA",
"UQT02526|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|185880|2936550|ssDNA",
"putative_nonstructural_protein_NS1__YP_009256211__Diaphorina_citri_densovirus__1776153",
"KAI5693964|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33375|1585420|121844|121845",
"CAH0562759|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33392|7041|41084|41088|71526|116151|577241|1431902|1431903",
"CAH1106986|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33392|7041|41084|41088|71528|27439|63710|131578|294691|2598218",
"putative_NS1__YP_009552708__Bombus_cryptarum_densovirus__2094261",
"QKE54882|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|1940570|ssDNA",
"XP_018915236|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33377|7036|33379|7037|7038",
"CAH0395963|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33377|7036|33379|7037|7038",
"QTE04065|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|40122|2766834|2794562|ssDNA",
"NP_542609|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|40122|3052255|46253|ssDNA",
"YP_004678720|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|40122|3052259|1045778|ssDNA",
"XP_040065318|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6933|6934|6935|297308|6939|426442|6944|6945",
"XP_037517702|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6933|6934|6935|297308|6939|426437|34630|426455|578835|34632",
"XP_049268860|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6933|6934|6935|297308|6939|426437|34630|426455|578835|34632",
"XP_049525173|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6933|6934|6935|297308|6939|426437|34619|543639",
"XP_054917332|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6933|6934|6935|297308|6939|426437|34619|34620",
"QTE04129|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|10806|2555903|2794461|ssDNA",
"KAG8176787|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6893|6905|74971|74974|74975|81835|247617|447591|931172",
"KAG8176088|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6893|6905|74971|74974|74975|81835|247617|447591|931172",
"XP_047116525|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33341|6993|7001|1955150|70910|92621|7002|37267|7008|274613",
"GFR09370|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6893|6905|74971|74974|74975|450948|2585208|2740835",
"GBN37837|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|6843|6854|6893|6905|74971|74974|74975|6913|29012|182803",
"NS1__YP_008766862__Solenopsis_invicta_densovirus__1414671",
"putative_nonstructural_protein__NP_694843__Planococcus_citri_densovirus__159153",
"ARV85890|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|2733229|3052742|159153|ssDNA",
"QTE03821|10239|2731342|2732092|2732415|2732422|2732534|10780|535600|2794518|ssDNA",
"XP_050443865|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33380|33382|33383|38118|749396|133065",
"CAI6374886|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33380|33385|27482|133076|33386|13130|13131",
"CAI6374191|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33380|33385|27482|133076|33386|13130|13131",
"KAE9530106|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33380|33385|27482|133076|33387|80764|464929|307491",
"YP_009362129|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|10806|1979751|551224|ssDNA",
"nonstructural_protein__YP_009507340__Sea_star-associated_densovirus__1979755",
"non-structural_protein_1__YP_009134732__Cherax_quadricarinatus_densovirus__1642018",
"QTE04088|10239|2731342|2732092|2732415|2732422|2732534|10780|40120|10806|2555903|2794449|ssDNA",
"nonstructural_protein_NS-1__YP_002887625__Culex_pipiens_densovirus__185638",
"WEY02526|10239|2731342|2732092|2732415|2732422|2732534|10780|40119|1506574|3052707|10788|ssDNA",
"nonstructural_protein_NS1__YP_007003823__Pseudoplusia_includens_densovirus__185637",
"nonstructural_protein_NS1__NP_046813__Diatraea_saccharalis_densovirus__72003"])

In [40]:
cl_28 = cleanup_underscores(["DAS70704|10239|2731341|2731360|2731618|2731619|2788787|2832643",
"DAX18138|10239|2731341|2731360|2731618|2731619|2788787|2832643",
"WP_291631833|131567|2|1783272|1239|186801|186802|31979|1485|2614128|1506",
"MBP3431438|131567|2|1783272|1239|186801|348841|2044939",
"RTL07789|131567|2|2323|1783234|95901|1104668|2030827",
"WP_200601543|131567|2|1224|28211|356|119045|407|2738439",
"WP_200338444|131567|2|1224|28211|356|2844403|85274|1088",
"WKY48942|131567|2|1783272|1239|186801|186802|186806|207769|3064258",
"MBO7527664|131567|2|1783272|1239|186801|348841|2044939",
"RHO64986|131567|2|1783272|1239|186801|186802|31979|580596|2633649|2292004",
"WP_295249102|131567|2|1783272|1239|186801|186802|216572|1263|2608920|41978",
"DAN70698|10239|2731341|2731360|2731618|2731619|2788787|2832643",
"MCK9529451|131567|2|1224|1236|33811|1913989",
"TFG97417|131567|2157|1935183|1706441|2053491",
"hypothetical_protein_BNJ_00086__YP_009352506__Kaumoebavirus_Viruses.__X",
"QHN71438|10239|12429|2204151|2700071",
"XP_004333849|131567|2759|554915|555280|1485168|555407|33677|5754|5755|1257118",
"MDE5721999|131567|2|1783272|1239|186801|348841|2044939",
"WP_303911026|131567|2|1783270|68336|976|200643|171549|815|816|85831",
"WP_251547217|131567|2|1783272|1239|186801|186802|2941492|2941493|2941511",
"MCL2593098|131567|2|1783272|1239|186801|186802|1185407|1185411|2660712",
"MCD7723722|131567|2|1783272|1239|186801|186802|39779|1898207",
"RGF93587|131567|2|1783272|1239|84086|2292896",
"WP_022231141|131567|2|1783272|1239|186801|186802|216572|3073565|2885356",
"WP_302810064|131567|2|1783272|1239|186801|186802|186806|1730|39496",
"MCI9565622|131567|2|1783272|1239|186801|186802|186806|1730|2624479|142586",
"MBS5669385|131567|2|1783272|1239|186801|186802|31979|1485|2614128|1506",
"MBP3488832|131567|2|1783272|1239|186801|186802|186803|841|2637578|2049040",
"MBO5239443|131567|2|1783272|1239|186801|186802|186803|186928|1898203",
"MCM1495987|131567|2|1783270|68336|976|200643|171549|815|816|2646097|29523",
"MBO5239161|131567|2|1783272|1239|186801|186802|186803|186928|1898203",
"MBP3544005|131567|2|1783272|1239|186801|186802|186803|186928|1898203",
"MBP1548086|131567|2|1783272|1239|186801|186802|216572|473772|2485925",
"MCI5584695|131567|2|1783272|1239|186801|186802|186803|186928|1898203",
"OLA00688|131567|2|1783272|1239|186801|186802|39779|1897050",
"WP_255882465|131567|2|1783272|1239|186801|186802|216572|1263|2608920",
"MCH5199706|131567|2|1783272|1239|186801|186802|216572|473772|2485925",
"MCL2076835|131567|2|1783272|1239|186801|186802|216572|473772|2485925",
"WP_283606935|131567|2|1783272|1239|186801|186802|31979|1485|2614128|2991836",
"WP_187301472|131567|2|1783272|1239|186801|186802|538999|543314|2944199|2763644",
"MBS4795577|131567|2|1783272|1239|186801|186802|39779|1898207",
"MBS6271790|131567|2|1783272|1239|186801|186802|31979|189971|1898204",
"PWL72038|131567|2|1783272|1239|186801|186802|39779|1898207",
"MBS5088849|131567|2|1783272|1239|186801|186802|31979|189971|1898204",
"WP_263042104|131567|2|1783272|1239|186801|186802|216572|1918385|2620728|2983399",
"TQD78492|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3744|3745|171637|721813|3749|106549",
"CAI9111570|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71274|91888|4055|24966|169617|169663|1667295|43535|43536|529605",
"KAH0658879|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71274|91888|4069|4070|424551|424574|4107|4113",
"KAF4397478|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3744|3481|3482|3483",
"RXH69607|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3744|3745|171637|721813|3749|3750",
"KAA8548226|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71274|41934|4289|4290|561372",
"KAI6671390|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91836|41944|3931|1699513|1699522|178174|1042139",
"KAJ6949292|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3646|3688|238069|3689|444605",
"KAJ7013843|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3646|3688|238069|3689|444605",
"CAE6203357|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91836|3699|3700|980083|3701|38785",
"KAF1859939|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|72025|3803|3814|2231393|2231384|2231385|163729|3869|3870",
"GKV37181|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91836|41938|40588|65009|3068352|152421",
"TQE04184|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3744|3745|171637|721813|3749|106549",
"XP_044559165|131567|2759|2611352|5752|2601529|2601530|5765|5761|5763",
"uridine_kinase__YP_007354317__Acanthamoeba_polyphaga_moumouvirus__1269028",
"YP_010782092|10239|2732004|2732005|2732007|2732523|2732554|549779|3044648|2094720|2126985|dsDNA",
"KAF4666815|131567|2759|2698737|33630|2497438|27997|27998|27999|28000|330153",
"ONH69696|131567|2759|33154|4751|451864|4890|716545|147537|4891|4892|115784|604195|36022",
"XP_047843893|131567|2759|33154|4751|451864|4890|716545|147538|716546|715989|147550|222543|5125|474942|1052105|2060973",
"MDR0942773|131567|2|1224|28211|1921002|1921007|2053559",
"ORF_10__YP_001876447__Beluga_whale_coronavirus_SW1__694015",
"QII89030|10239|2559587|2732396|2732408|2732506|76804|2499399|11118|2501931|694013|2509476|2724084|2716128|ssRNA___",
"MCX6764301|131567|2|2323|1783234|1783273|1794811|1817911|2093793",
"A240L__YP_009703091__African_swine_fever_virus__10497",
"YP_009702935|10239|2732004|2732005|2732007|2732525|2732526|137992|39743|10497|dsDNA",
"thymidylate_kinase__YP_003358216__Anguillid_herpesvirus_1__150286",
"KAF4693247|131567|2759|2698737|33630|2497438|27997|27998|27999|28000|32597",
"AYO41447|131567|2759|33154|4751|451864|5204|452284|1538075|162474|742845|55193|76775|425264",
"XP_027482480|131567|2759|33154|4751|451864|5204|452284|1538075|162474|742845|55193|76775",
"KAJ1662809|131567|2759|33154|4751|112252|1913638|451828|2219690|4861|4862|4863|2641067|2512252",
"KAJ1949035|131567|2759|33154|4751|112252|1913638|451828|2219690|4861|4862|4867|4868",
"CRL00243|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33392|7147|7148|43786|41828|7149|43808|315556|568069",
"KAJ8978249|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33392|7041|41084|41088|71528|34667|79514|192380|1323399|1323400",
"CAH7712160|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33392|7041|41084|41088|71528|27439|64387|256835|64390|146774",
"XP_021377737|131567|2759|33154|33208|6072|33213|33317|2697495|1206795|6447|6544|2785011|6545|106218|106219|6566|186466|6573",
"KAI1235731|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|8457|32561|1329799|8492|436486|436489|436491|436492|8782|8825|9126|9170|85070|245042",
"GLD65411|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|1489388|123365|123366|123367|123368|123369|1489872|1489904|1489905|8184|8186|270547",
"KHJ44075|131567|2759|33154|33208|6072|33213|33317|1206794|6231|119088|1457286|6329|119093|36086|68888",
"SPR00040|131567|2759|2698737|543769|2604748|2779609|37357|37358|37359|37360",
"KAG9284144|131567|2759|33154|4751|112252|1913637|214504|214506|1133283|214505|50955|50956",
"OZJ02075|131567|2759|33154|4751|112252|1913637|451507|2212702|4869|2052803|2026842|1938954",
"KAJ1654042|131567|2759|33154|4751|112252|1913638|451828|2219692|78918|78919|78922|2789359",
"KAF9920374|131567|2759|33154|4751|112252|1913637|1137986|2212732|214503|4854|2779861|979765",
"KAG9323964|131567|2759|33154|4751|112252|1913637|1137986|2212732|214503|4854|4855|64518",
"XP_057303228|131567|2759|33154|33208|6072|6073|6074|37516|406427|406428|6094|13092|13093",
"XP_001742500|131567|2759|33154|28009|1924738|81529|81525|81824|431895",
"CAD5335339|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91836|3699|3700|980083|3701|3702",
"XP_009120455|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91836|3699|3700|981071|3705|3711",
"KAG5376332|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91836|3699|3700|981071|3705|3711|1813537",
"MBA0653360|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91836|41938|3629|214907|3633|34286",
"KAJ6424055|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3646|3688|238069|40685|889485",
"KAJ0464973|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71274|91882|4209|4210|102804|911341|102814|4231|4232",
"KAI3793895|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71274|91882|4209|4210|102804|911341|219129|176612|185202",
"WMV38862|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71274|91888|4069|4070|424551|424574|4107|315347",
"KAK1398698|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71274|91882|4036|364270|4037|241778|241780|489404|1589896|40917|360622",
"KAF6173689|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|41768|39161|39163|39325",
"WKA04251|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91834|403667|3602|2304100|3603|29760",
"THG04627|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71274|41945|27065|4441|4442|542762",
"POF26496|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3502|3503|3511|58331",
"POF26494|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3502|3503|3511|58331",
"KAF3971524|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3502|3503|21019|60419",
"KAI4346363|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|72025|3803|1978181|163092|2115957|3805|167791",
"KAK2423568|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|72025|3803|3814|2231393|2231382|2233838|2233839|163742|3898|3899",
"KAF2309099|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91835|3646|3977|235631|235882|3980|3981",
"KAK0579620|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91836|41937|23672|1977916|1977919|4022|4024",
"KAI9177279|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|71240|91827|1437201|71275|91836|41937|23672|1977916|1977919|4022|4023",
"ORF40-like_protein__YP_009552967__Bufonid_herpesvirus_1__2282206",
"YP_656695|10239|2731341|2731360|2731361|2731363|548681|548682|3044671|3050346|85655|dsDNA",
"NRK-1__YP_009345740__Mythimna_unipuncta_granulovirus_B__2169746",
"ORF117__YP_003517857__Lymantria_xylina_nucleopolyhedrovirus__166921",
"YP_002268093|10239|2840056|2840070|10442|558016|3047352|208013|dsDNA",
"hypothetical_protein__YP_006908569__Epinotia_aporema_granulovirus__166056",
"YP_010799455|10239|2840056|2840070|10442|558017|3051998|307448|dsDNA",
"hypothetical_protein_SlsnVgp096__YP_009505905__Spodoptera_littoralis_nucleopolyhedrovirus__10456",
"AYU75285|10239|2840056|2840070|10442|558016|3048336|10456|dsDNA",
"UIZ27122|131567|2759|2698737|33634|4762|4776|4777|70742|542832",
"XP_018906690|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33377|7036|33379|7037|7038",
"CAF2559857|131567|2759|33154|33208|6072|33213|33317|2697495|2697496|10190|2816136|44578|231623|2631648|2762512",
"CAF3362328|131567|2759|33154|33208|6072|33213|33317|2697495|2697496|10190|2816136|44578|231623|392032",
"KAA8593798|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|1489388|123365|123366|123367|123368|123369|1489872|1489922|8111|8112|8165|698016|54318|909696|54343",
"KAG7321966|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|186634|32519|186626|186628|7995|1489793|31013|156982|337641",
"XP_018598201|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|1489343|31089|41712|27723|27726|113540",
"XP_030676944|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314146|9443|376913|314293|9526|314295|9577|325165|61853",
"XP_011801738|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314146|9443|376913|314293|9526|314294|9527|9569|9570|54131|336983",
"XP_008529410|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314145|9787|9788|9789|9798",
"hypothetical_protein_HVRV_s11_gp2__YP_002790895__Homalodisca_vitripennis_reovirus__411854",
"ADN64701|10239|2559587|2732396|2732405|2732459|2732541|2946186|10985|411855|411854",
"HZV_115-like_protein__YP_002321369__Oryctes_rhinoceros_nudivirus__92521",
"YP_009553421|10239|2840056|2840070|1511852|1511853|3051615|2053981|dsDNA",
"HZV_115-like_protein__YP_010087650__Homarus_gammarus_nudivirus__2509616",
"UHB41734|10239|2840056|2840070|1511852|1110703|2904783|dsDNA",
"QLI62370|10239|2840056|2840070|1511852|1110703|2707358|dsDNA",
"nicotinamide_riboside_kinase_1__YP_009001526__Anomala_cuprea_entomopoxvirus__62099",
"YP_009255362|10239|2840056|2840070|10442|558016|3047737|1850906|dsDNA",
"DUTP_pyrophosphatase_fused__YP_009165712__Perigonia_lusca_single_nucleopolyhedrovirus__1675865",
"NP_046187|10239|2840056|2840070|10442|558016|3048238|262177|dsDNA",
"YP_009116691|10239|2840056|2840070|1511852|2842638|3048365|1546257|dsDNA",
"YP_004956782|10239|2840056|2840070|1511852|1511854|3052000|1128424|dsDNA",
"QLI62409|10239|2840056|2840070|1511852|1110703|2707358|dsDNA",
"guanosine_monophosphate_kinase__YP_009051876__Penaeus_monodon_nudivirus__1529056",
"UVT30827|10239|2840056|2840070|10442|35255|2974676|dsDNA",
"YP_009051903|10239|2840056|2840070|1511852|2842693|3048260|1529056|dsDNA",
"KAJ8670212|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33392|7399|7400|1955251|7422|108385|272339|77301|131215",
"UDM55355|10239|2840056|2840070|1511852|1110703|2742594|dsDNA",
"deoxynucleotide_monophosphate_-dNMP-_kinase__YP_009162466__Salmon_gill_poxvirus__1680908",
"YP_001111301|10239|2840056|2840070|1511852|1511853|3047860|432587|dsDNA",
"AHW98267|10239|2840056|2840070|1511852|1110703|1487700|dsDNA",
"YP_002321428|10239|2840056|2840070|1511852|1511853|3048239|92521|dsDNA",
"guanosine_monophosphate_kinase__YP_009553147__Drosophila_innubila_nudivirus__2057187",
"YP_010797710|10239|2840056|2840070|1511852|1511853|3051617|2486603|dsDNA",
"KAJ8670218|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33392|7399|7400|1955251|7422|108385|272339|77301|131215",
"YP_001111311|10239|2840056|2840070|1511852|1511853|3047860|432587|dsDNA",
"KAE9522046|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33380|33385|27482|133076|33387|80764|464929|307491",
"GrBNV_gp44-like_protein__YP_002321436__Oryctes_rhinoceros_nudivirus__92521",
"YP_010797719|10239|2840056|2840070|1511852|1511853|3051617|2486603|dsDNA",
"YP_004956780|10239|2840056|2840070|1511852|1511854|3052000|1128424|dsDNA",
"YP_009116661|10239|2840056|2840070|1511852|2842638|3048365|1546257|dsDNA",
"WP_273963153|131567|2|1783272|1239|186801|186802|31979|44258|2629145|3023090",
"p-loop_NTPase__YP_009051884__Penaeus_monodon_nudivirus__1529056",
"YP_010801214|10239|2840056|2840070|1511852|2842693|3052206|2880837|dsDNA",
"hypothetical_protein_pv_135__YP_009001037__Pithovirus_sibericum__1450746",
"CAH6418282|10239|186616|340016",
"QKE50461|10239|12429|1477405",
"YP_009447900|10239|2731341|2731360|2731361|2731363|548681|548682|3044737|3050302|508441|dsDNA",
"KAH0515926|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314146|314147|9989|1963758|337687|337677|39087|10053|79684",
"KAI5770794|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314145|33554|379584|3072906|9655|2892069|48419|48420|2888765",
"XP_055168296|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314145|33554|379584|9608|34879|34880",
"ELW64014|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314146|9392|9393|9394|246437",
"ELK18391|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314145|9397|30559|9398|77225|9401|9402",
"XP_053935213|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|8457|32561|1329799|8492|436486|436489|436491|436492|8782|8825|8940|8941|33592|55661",
"KAJ7419830|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|8457|32561|1329799|8492|436486|436489|436491|436492|8782|8825|9126|28728|371929|371930",
"XP_048175562|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|8457|32561|1329799|8492|436486|436489|436491|436492|8782|8825|9126|192204|28725|30420|134902",
"XP_039415337|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|8457|32561|1329799|8492|436486|436489|436491|436492|8782|8825|9126|192204|28725|30420|181096|932674",
"KAF7252443|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|8457|32561|8504|8509|1329961|1329950|1329912|1329911|8548|1330544|1329920|8555|8556|61221",
"XP_025241049|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314146|9443|376913|314293|9526|314294|9527|9528|9564|9565",
"XP_059778867|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314145|91561|2653789|9721|9761|9765|9766|2746895",
"CAG5867613|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|1489388|123365|123366|123367|123368|123369|1489872|1489908|1489913|8075|461499|238742|238743|238744",
"TRY94982|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|186634|32519|186626|186627|7952|30727|2743709|2743711|432408|623744",
"TRY94981|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|186634|32519|186626|186627|7952|30727|2743709|2743711|432408|623744",
"CAG5085234|131567|2759|33154|33208|6072|33213|33511|7711|7712|30302|2507557|41302|34763|34765",
"XP_013761608|131567|2759|554296|2925400|172820|877559|529818|461836",
"KRX10429|131567|2759|2698737|33630|5878|431838|6020|35094|198613|35102|35103|266149",
"MCX6099012|131567|2|2323|1783234|67810|1046990|2306012",
"RYH12842|131567|2157|93506|1906665",
"YP_008052468|10239|2732004|2732005|2732007|2732523|2732524|10501|181086|358403|251749|dsDNA",
"MBP02608|131567|2|1224|28211|204441|41295|41296|1898112",
"Deoxyribonucleoside_kinase__YP_009173510__Chrysochromulina_ericina_virus__455364",
"MAU36512|131567|2|1783270|68336|976|117743|200644|403978|2021391",
"KAG0566724|131567|2759|33090|35493|131221|3193|3208|404260|3214|114657|1929018|3223|3224|3225",
"QDZ21423|131567|2759|33090|3041|2302911|2302912|2302913|2302914|1764295",
"XP_002507121|131567|2759|33090|3041|1035538|13792|41873|38832|296587",
"KAI8468469|131567|2759|33090|3041|2692248|3166|2812636|35491|35466|34111|39955",
"XP_005644541|131567|2759|33090|3041|2692248|75966|75981|2511161|41891|248742|574566",
"GMH42028|131567|2759|33090|3041|33103|2546214|33104|2791029|3127|3128|2562701|3041901",
"KAI3424829|131567|2759|33090|3041|2692248|75966|35460|35461|2511126|3071|3077",
"PRW45341|131567|2759|33090|3041|2692248|75966|35460|35461|2511126|3071|3076",
"KAI7842119|131567|2759|33090|3041|2692248|75966|35460|35461|2511126|3071|2649997",
"CPXV195_protein__NP_619976__Cowpox_virus__10243",
"AGJ91384|10239|2732004|2732005|2732007|2732525|2732527|10240|10241|10242|10245|dsDNA",
"KFO20089|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314146|314147|9989|33550|10167|423606|885580",
"XP_055090031|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314146|9443|376913|314293|9526|314295|9577|325166|9590",
"XP_057564612|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314145|91561|2653789|2653790|9831|9832|9833|575201",
"KAI4569179|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314145|91561|9845|35500|9895|9963|9935|2918886",
"TRZ12769|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|8457|32561|1329799|8492|436486|436489|436491|436492|8782|8825|9126|2116661|36297|36298|364589",
"XP_054608327|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|1489388|123365|123366|123367|123368|123369|1489872|1489908|1489913|28738|45443|405002|28779|105023",
"XP_031430355|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|186634|282425|32446|1489460|55118|7948|7949|7950",
"XP_048108489|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|186634|282425|32446|1489460|55118|55119|34772|278164",
"XP_016372210|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|186634|32519|186626|186627|7952|30727|7953|2743694|75365|307959",
"XP_045907727|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|1489388|123365|123366|123367|123368|123369|1489872|1489922|1489940|1545897|8180|27705|147949",
"PWA25449|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|1489388|123365|123366|123367|123368|123369|1489872|1489908|1489913|28738|8087|8079|586240|33527|33528",
"XP_041847328|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|1489388|123365|123366|123367|123368|123369|1489872|1489908|1489913|8075|30700|32459|1250792",
"MBQ9785456|131567|2|1783272|1239|186801|348841|2044939",
"WP_299284275|131567|2|1783272|200795|1495646|1495647|1495648|1495649|2648435|1969742",
"MCH9039512|131567|2|1783272|200795|189774|2026724",
"ONK55465|131567|2759|33090|35493|131221|3193|58023|78536|58024|3398|1437183|4447|1437197|73496|40552|703533|4685|4686",
"WP_182958758|131567|2|1224|28211|204441|433|89583|1286189",
"WP_151542027|131567|2|1783272|201174|1760|85012|2012|1988|1803903",
"GLZ10533|131567|2|1783272|201174|1760|85012|2012|1988|2626254|3032204",
"MDE2654370|131567|2|1783270|142182|234665|2026742",
"YP_009480733|10239|12429|2204151|51368|2060084|2107707",
"YP_009120260|10239|12429|2204151|51368|2060084|1605721",
"YP_009481662|10239|12429|2204151|51368|2060084|2107708",
"WP_312422935|131567|2|1783272|1239|909932|909929|1843490|244825|244830",
"RQW12190|131567|2|1783272|1239|91061|1385|186822|44249|1850366",
"ULO08348|131567|2|1783272|1239|91061|1385|186822|44249|185978|2758563",
"YP_009165833|10239|12429|2204151|51368|1100043",
"BCU09350|10239|12429|2204151|51368|2801461",
"BCT22638|10239|2732004|2732005|2732007|2732525|2732527|10240|40069|1653087|dsDNA",
"QBK87376|10239|2732004|2732005|2732007|2732523|2732555|944644|1513461|2506605|dsDNA",
"putative_deoxynucleotide_monophosphate_kinase__YP_004347186__Lausannevirus__999883",
"AVP72248|10239|2731341|2731360|2731361|2731363|548681|548682|172653|431038|1862332|dsDNA",
"WP_231503965|131567|2|1783272|1239|91061|1385|186817|1386|185979|1340430",
"MCI4437323|131567|2157|1783275|28889|183924|114380|2272|334767|2644577|2268142",
"MDI6715314|131567|2|40117|2811502|2811503|2811504|28261|2645936|2067987",
"NLK08748|131567|2|1783272|1239|84086|1879010",
"WP_171299720|131567|2|1783272|1239|91061|186826|81852|1350|44008",
"DAI78595|10239|2731341|2731360|2731618|2731619|2788787|2832643",
"WP_242072784|131567|2|1783272|1239|91061|1385|186822|55080|1465",
"WP_141264560|131567|2|1783272|1239|186801|3039167|3039168|44260|2676739",
"MCL6592000|131567|2|1783272|1239|84086|1879010",
"MBU3913882|131567|2157|1783276|192989|2303497|2026764",
"MBC7107099|131567|2|1783272|1239|84086|1879010",
"BDR74226|131567|2|1783272|1239|186801|186802|31979|1485|1513",
"YP_009052244|10239|2732004|2732005|2732007|2732523|2732554|3044476|3044747|3060070|1474867",
"WP_075660462|131567|2|1783272|1239|1737404|1737405|1570339|162289|1261639",
"VBB17879|10239|2732004|2732005|2732007|2732523|2732554|549779|1977630|3044860|3060817|2420051|dsDNA",
"YP_010781651|10239|2732004|2732005|2732007|2732523|2732554|549779|3044648|2094720|2126985|dsDNA",
"ARF09209|10239|2732004|2732005|2732007|2732523|2732554|549779|1977630|2788772|1977635|1977631|dsDNA",
"WP_119833200|131567|2|1224|28211|204441|2829815|191|2320860",
"WP_287101303|131567|2|1783270|68336|976|1853228|1853229|563835|1004300|2643788|2501297",
"NET30721|131567|2|1783272|1798711|1117|3028117|3079757|1892249|43988|2649277|2607788",
"MBR4316369|131567|2|1224|28211|33807|1913988",
"putative_deoxynucleoside_monophosphate_kinase__YP_003969712__Cafeteria_roenbergensis_virus_BV-PW1__693272",
"DAH68156|10239|2731341|2731360|2731618|2731619|2788787|2832643",
"MCP4485646|131567|2|1783270|68336|976|117743|200644|49546|61432|1871037",
"KAG5183115|131567|2759|2698737|33634|2696291|569578|2833|2978|2979|2980|303371",
"MCC8538641|131567|2|1224|1236|135614|32033|338|56463",
"WP_279641401|131567|2|1224|1236|72274|135621|2901164|136846|578833|316",
"WP_136540337|131567|2|1224|28211|356|82115|227290|379|1368430",
"WP_174514028|131567|2|1224|28211|356|45404|120652|227605",
"WP_312414151|131567|2|1224|28211|356|82115|323620|2643062|1870904",
"WP_198939218|131567|2|1224|28211|356|82115|227290|357|2632611|1923829",
"WP_240154358|131567|2|1224|28216|80840|80864|283|2638500|2608341",
"MCB1712299|131567|2|1224|1236|91347|543|401618|2630307|2725488",
"WP_292897370|131567|2|1783272|201174|1760|85006|85023|33882|2609290|51671",
"MDA8317546|131567|2|1783272|201174|1752188|2900548",
"WP_267843406|131567|2|1783272|201174|1760|85011|2062|1883|2593676|2996462",
"WJN62600|10239|2731341|2731360|2731618|2731619|1982875|2948901|2961173|3038334",
"WP_251153769|131567|2|1783272|201174|1760|85006|85017|157920|2624466|2937992",
"AWY04861|10239|2731341|2731360|2731618|2731619|2788787|2201433",
"WP_310022844|131567|2|1783272|201174|1760|85006|85023|33882|156977",
])

In [45]:
# Record identifiers for the `cl_735` set, handed to `cleanup_underscores`
# (defined elsewhere in this notebook) and bound to `cl_735`.
# Two label shapes are mixed in the list:
#   * pipe-delimited:        accession|id|id|...[|ssRNA_-_]
#   * underscore-delimited:  description__accession__organism__number
# NOTE(review): presumably the numeric fields are NCBI taxonomy ids and
# `cleanup_underscores` normalises the underscore-style labels - confirm
# against the function definition.
cl_735 = cleanup_underscores(
    [
        "KAI5693314|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33375|1585420|121844|121845",
        "UVF62196|10239|2559587|2732396|2497569|2497571|2497577|2499411|11308|35324|2972735|ssRNA_-_",
        "BES90385|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33343|33345|33347|33349|33351|33354|33355|30083|236347|236915|236919|355586|355587",
        "UQS95349|10239|2559587|2585030|2936537",
        "APG77894|10239|2559587|2585030|1922348|1923006",
        "CAH0386963|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|7524|33373|33377|7036|33379|7037|7038",
        "UCR92664|10239|2559587|2732396|2497569|2497570|2497574|11157|11270|35303|1983568|ssRNA_-_",
        "CAG9814606|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33392|7041|41084|41088|71528|27439|63707|63708|80248|80249",
        "hemagglutinin_protein__YP_009110688__Wellfleet_Bay_virus__1566309",
        "hemagglutinin__YP_009987463__Lake_Chad_virus__688438",
        "hemagglutinin__YP_009996582__Quaranjavirus_johnstonense__688437",
        "ACY56283|10239|2559587|2732396|2497569|2497571|2497577|2499411|11308|1299308|688437|ssRNA_-_",
        "APG77884|10239|2559587|2585030|1922348|1922890",
        "UDL13964|10239|2559587|2732396|2497569|2497571|2497577|2499411|11308|35324|2886237|ssRNA_-_",
        "KAJ1518775|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|30262|38130|45049|45053|153976|439357|439358",
        "KAJ1520364|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33342|30262|38130|45049|45053|153976|439357|439358",
        "glycoprotein__YP_009553284__Oz_virus__2137161",
        "GP64__YP_009666390__Lonomia_obliqua_multiple_nucleopolyhedrovirus__134394",
        "GP64__YP_009182316__Diatraea_saccharalis_granulovirus__1675862",
        "GP64__YP_008378382__Choristoneura_rosaceana_nucleopolyhedrovirus__58094",
        "major_budded_virus_envelope_glycoprotein__YP_009316051__Anticarsia_gemmatalis_multiple_nucleopolyhedrovirus__268591",
        "major_budded_virus_envelope_glycoprotein__YP_009118511__Condylorrhiza_vestigialis_MNPV__1592576",
        "gp64_protein__YP_007250533__Thysanoplusia_orichalcea_nucleopolyhedrovirus__101850",
        "gp64__YP_009666553__Oxyplax_ochracea_nucleopolyhedrovirus__2083176",
        "GP64__YP_009255283__Catopsilia_pomona_nucleopolyhedrovirus__1850906",
    ]
)

In [49]:
# Record identifiers for the `cl_ligT` set, handed to `cleanup_underscores`
# (defined elsewhere in this notebook) and bound to `cl_ligT`.
# Two label shapes are mixed in the list:
#   * pipe-delimited:        accession|id|id|...[|dsDNA or |ssRNA___]
#   * underscore-delimited:  hand-written reference labels such as
#                            "MERS__NS4b__YP_009047207_"
# Unlike the underscore labels in the other sets, the ones here end in "_"
# and carry no trailing numeric field.
# NOTE(review): presumably these are the curated reference sequences and
# `cleanup_underscores` leaves them without an appended lineage - confirm.
# NOTE(review): "Iridovirus_AM__YP_009021100_" and
# "Iridio_AM__DH26_gp015__YP_009021100_" share the same accession; verify
# that both entries are intended.
cl_ligT = cleanup_underscores(
    [
        "UQJ95078|10239|2731341|2731360|2731618|2731619|2946170|1985328|2212402|2935722",
        "Phage_T4__Acb1__NP_049750.1_",
        "YP_009203881|10239|2731341|2731360|2731618|2731619|2946170|1198136|1985329|1985341|1675602",
        "CAG7580038|10239|12333|156614|707152",
        "Marseillevirus__MAR_ORF259__YP_003406995_",
        "AYV76224|10239|2732004|2732005|2732007|2732523|2732554|549779|1977630|2788772|2487775|dsDNA",
        "Pithovirus__DUF1868__Uniprot__A0A481Z2W6_",
        "QBK89835|10239|12429|2204151|51368|2023203|2506586|dsDNA",
        "Iridovirus_AM__YP_009021100_",
        "Iridio_AM__DH26_gp015__YP_009021100_",
        "WCC63425|10239|2559587|2732396|2732408|2732506|76804|2499399|11118|2501931|2730119|3018868|ssRNA___",
        "WCC63318|10239|2559587|2732396|2732408|2732506|76804|2499399|11118|2501931|2730119|3018874|ssRNA___",
        "WDE20358|10239|2559587|2732396|2732408|2732506|76804|2499399|11118|2501931|694002|696098|1928434|ssRNA___",
        "ALK80245|10239|2559587|2732396|2732408|2732506|76804|2499399|11118|2501931|694002|2509494|1335626|ssRNA___",
        "MERS__NS4b__YP_009047207_",
        "AHI48830|10239|2559587|2732396|2732408|2732506|76804|2499399|11118|2501931|694002|2509494|1335626|ssRNA___",
        "Mimivirus__WcbI__Uniprot__A0A2H4UTY7__",
        "WCZ55762|10239|2559587|2732396|2732408|2732506|76804|2499399|11118|693995|2664420|3027598|ssRNA___",
        "Pigeonpox__PDE__YP_009046269_",
        "QRI42760|10239|2732004|2732005|2732007|2732525|2732527|10240|40069|2810136|dsDNA",
        "ARC95201|10239|2559587|2732396|2732408|2732506|76804|2499399|11118|2501931|694002|2509481|694003|42005|ssRNA___",
        "MHV__NS2b__YP_009824980.1_",
        "AYR18614|10239|2559587|2732396|2732408|2732506|76804|2499399|11118|2501931|694002|696098|1928434|ssRNA___",
        "BCD71876|10239|2559587|2732396|2732405|2732459|2732541|2946186|10912|28875|10941",
        "Rotavirus_A__VP3__YP_002302228_",
        "WBT97834|10239|2559587|2732396|2732405|2732459|2732541|2946186|10912|28875",
        "YP_009665195|10239|2559587|2732396|2732408|2732506|76804|2499403|2508209|694017|11155|2509508|329862|ssRNA___",
        "Porcine_Torovirus__polyprotein__YP_008798230_",
        "QRD99327|10239|2559587|2732396|2732408|2732506|76804|2499403|2508209|694017|11155|1837216|2529475|ssRNA___",
    ]
)

In [50]:
cl_nucleoside_transporter_sequences = cleanup_underscores(["XP_012184276|131567|2759|33154|4751|451864|5204|5302|155619|355688|5303|2983427|599838|599839",
"KAG6836905|131567|2759|33154|4751|451864|5204|5302|155619|452333|5338|2982303|930979|5450|530044",
"KAF4597187|131567|2759|33154|4751|451864|5204|5302|155619|452333|5338|2983527|104366|5320|28995",
"PPQ68704|131567|2759|33154|4751|451864|5204|5302|155619|452333|5338|2982305|40562|71950|93625",
"KAF5379817|131567|2759|33154|4751|451864|5204|5302|155619|452333|5338|2982303|930979|180950|117010",
"KAJ3514656|131567|2759|33154|4751|451864|5204|5302|155619|452333|5338|2982305|40562|5399|84603",
"KAG2017562|131567|2759|33154|4751|451864|5204|5302|155619|452333|5338|2982305|184208|184431|5346|1132390",
"KAF4575320|131567|2759|33154|4751|451864|5204|5302|155619|452333|5338|2983527|104366|5320|28995",
"KAG2751110|131567|2759|33154|4751|451864|5204|5302|155619|452333|68889|227332|227336|5379|48565|1400700",
"KAJ3557587|131567|2759|33154|4751|451864|5204|5302|155619|355688|5303|81064|5307|194682",
"BEI94135|131567|2759|33154|4751|451864|5204|5302|155616|1851469|1759442|1838142|279322",
"XP_014564768|131567|2759|33154|4751|451864|5204|29000|432025|432026|165795|34348|34349|764103",
"CUA69723|131567|2759|33154|4751|451864|5204|5302|155619|355688|36064|5250|1322061|456999",
"KAA1086463|131567|2759|33154|4751|451864|5204|29000|162484|5258|5262|5296|5297|56615",
"KAA1076410|131567|2759|33154|4751|451864|5204|29000|162484|5258|5262|5296|5297|56615",
"KAJ3417250|131567|2759|33154|4751|112252|4761|2683659|451435|451442|1142503|2996260|2732419",
"XP_031027600|131567|2759|33154|4751|112252|4761|2683659|451435|2231171|286113|286114|1806994",
"CAG8481881|131567|2759|33154|4751|112252|1913637|214504|214506|214508|144536|144537|144538",
"KAG0945163|131567|2759|33154|4751|112252|1913637|451507|2212703|4827|1344963|1344955|4842|64495",
"CAG4710330|131567|2759|2611352|5752|2601529|2601530|5765|5761|5763",
"MBL1148629|131567|2|1783272|67819|1042316|2033014",
"KAG7304688|131567|2759|33154|33208|6072|33213|33317|1206794|88770|6656|197563|197562|6960|50557|85512|7496|33340|33392|85604|7088|41191|41196|41197|37567|37582|51653|51654|51655",
"XP_003289468|131567|2759|554915|2605435|142796|33083|2058949|2058185|5782|5786",
"KAF6000735|131567|2759|2763|2797|265318|265316|2759657|2690220",
"KAJ5079553|131567|2759|2611341|2683626|2055135|1746090",
"PRP80930|131567|2759|554915|2605435|2605334|2681888|435412|65007|1890364",
"PRP88178|131567|2759|554915|2605435|2605334|2681888|435412|65007|1890364",
"F22__YP_009173937__Felis_catus_gammaherpesvirus_1__1452540",
"CAG6016670|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|1489388|123365|123366|123367|123368|123369|1489872|1489908|1489913|8075|461499|238742|238743|238744",
"KAE8299918|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|7898|186623|41665|32443|1489341|186625|1489388|123365|123366|123367|123368|123369|1489872|1489922|1489923|30870|215357|215358",
"XP_012598815|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314146|9443|376911|376915|30615|13149|30608",
"ENT1_human_XP_011512643_web_colabfold_model1",
"ASJ26355|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314146|9443|376913|314293|9526|314295|9604|207598|9605|9606",
"OWK13347|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314145|91561|9845|35500|9850|34878|9859|9860|46360",
"KAI4552365|131567|2759|33154|33208|6072|33213|33511|7711|89593|7742|7776|117570|117571|8287|1338369|32523|32524|40674|32525|9347|1437010|314145|91561|9845|35500|9895|9963|9935|2918886",
"YP_003084378|10239|2731341|2731360|2731361|2731363|548681|3044472|10293|180252|104388",
"envelope_protein_UL43__NP_066875__Gallid_alphaherpesvirus_3__35250",
"YP_010795642|10239|2731341|2731360|2731361|2731363|548681|3044472|10293|180252|3050288|35250",
"envelope_protein_UL43__YP_009054920__Equid_alphaherpesvirus_3__80341",
"YP_010801416|10239|2731341|2731360|2731361|2731363|548681|3044472|10293|10319|342940|173566",
"envelope_protein_UL43__YP_068346__Suid_alphaherpesvirus_1__10345",
"APT68444|10239|2731341|2731360|2731361|2731363|548681|3044472|10293|10319|3050355|10345",
"envelope_protein_UL43__NP_040138__Human_alphaherpesvirus_3__10335",
"AKG56230|10239|2731341|2731360|2731361|2731363|548681|3044472|10293|10319|3050294|10335",
"BMRF2__YP_001129455__Human_herpesvirus_4_type_2__12509",
"ORF58__NP_065557__Alcelaphine_gammaherpesvirus_1__35252",
"ORF58__YP_001129415__Human_gammaherpesvirus_8__37296",
"CAC84355|10239|2731341|2731360|2731361|2731363|548681|3044472|10374|10379|3050350|10381",
"YP_008319892|10239|12429|2204151|51368|2060084|1349409",
"YP_008319560|10239|12429|2204151|51368|2060084|1349409",
"XP_004995597|131567|2759|33154|28009|1924738|81529|86017|946362",
"WP_231932931|131567|2|1783257|203682|203683|2691354|2691359|2795782|2528022",
"CAE7280677|131567|2759|2698737|33630|2864|89954|252141|2949|2951",
])

## TABLE FORMAT TAXID AND ASSIGNED COLORS

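- This step takes the txid information collected from the sequences and puts it in table format: one table per cluster, with one row per sequence ID and its assigned colors (`color_sk`, `color_vt`).
- The resulting tables are ready to be copied and pasted into the iTOL annotation tool.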

In [31]:
# iTOL color table for cluster cl_55: the sequence ids go in column "id",
# and each id is looked up in the per-cluster 'color_hex' mappings of
# super_kingdom_colors (-> color_sk) and viral_taxa_colors (-> color_vt).
df_cl55 = pd.DataFrame(data=cl_55, columns=["id"]).assign(
    color_sk=lambda table: table["id"].map(super_kingdom_colors["cl_55"]["color_hex"]),
    color_vt=lambda table: table["id"].map(viral_taxa_colors["cl_55"]["color_hex"]),
)

In [39]:
# iTOL color table for cluster cl_56: the sequence ids go in column "id",
# and each id is looked up in the per-cluster 'color_hex' mappings of
# super_kingdom_colors (-> color_sk) and viral_taxa_colors (-> color_vt).
df_cl56 = pd.DataFrame(data=cl_56, columns=["id"]).assign(
    color_sk=lambda table: table["id"].map(super_kingdom_colors["cl_56"]["color_hex"]),
    color_vt=lambda table: table["id"].map(viral_taxa_colors["cl_56"]["color_hex"]),
)

In [41]:
# iTOL color table for cluster cl_28: the sequence ids go in column "id",
# and each id is looked up in the per-cluster 'color_hex' mappings of
# super_kingdom_colors (-> color_sk) and viral_taxa_colors (-> color_vt).
df_cl28 = pd.DataFrame(data=cl_28, columns=["id"]).assign(
    color_sk=lambda table: table["id"].map(super_kingdom_colors["cl_28"]["color_hex"]),
    color_vt=lambda table: table["id"].map(viral_taxa_colors["cl_28"]["color_hex"]),
)

In [46]:
# iTOL color table for cluster cl_735: the sequence ids go in column "id",
# and each id is looked up in the per-cluster 'color_hex' mappings of
# super_kingdom_colors (-> color_sk) and viral_taxa_colors (-> color_vt).
df_cl735 = pd.DataFrame(data=cl_735, columns=["id"]).assign(
    color_sk=lambda table: table["id"].map(super_kingdom_colors["cl_735"]["color_hex"]),
    color_vt=lambda table: table["id"].map(viral_taxa_colors["cl_735"]["color_hex"]),
)

In [48]:
# iTOL color table for the nucleoside-transporter sequence set: the sequence
# ids go in column "id", and each id is looked up in the per-set 'color_hex'
# mappings of super_kingdom_colors (-> color_sk) and viral_taxa_colors
# (-> color_vt).
df_clnucleoside_transporter_sequences = pd.DataFrame(
    data=cl_nucleoside_transporter_sequences,
    columns=["id"],
).assign(
    color_sk=lambda table: table["id"].map(
        super_kingdom_colors["cl_nucleoside_transporter_sequences"]["color_hex"]
    ),
    color_vt=lambda table: table["id"].map(
        viral_taxa_colors["cl_nucleoside_transporter_sequences"]["color_hex"]
    ),
)

In [51]:
# iTOL color table for the ligT sequence set: the sequence ids go in column
# "id", and each id is looked up in the per-set 'color_hex' mappings of
# super_kingdom_colors (-> color_sk) and viral_taxa_colors (-> color_vt).
df_clligT = pd.DataFrame(data=cl_ligT, columns=["id"]).assign(
    color_sk=lambda table: table["id"].map(super_kingdom_colors["cl_ligT"]["color_hex"]),
    color_vt=lambda table: table["id"].map(viral_taxa_colors["cl_ligT"]["color_hex"]),
)