# Information-Theoretic Benchmarking of Species Tree Inference

This notebook presents a comparative workflow for species tree inference using two different character sets: traditional amino acid sequences and foldtree-encoded structural features. The goal is to benchmark the phylogenetic signal carried by each character set using information-theoretic approaches.

**Workflow Overview:**
- Construct a species tree using a standard amino acid-based pipeline (multiple sequence alignment, concatenation, and maximum likelihood inference).
- Construct an equivalent species tree using foldtree-encoded data.
- For both trees, compute column-wise log-likelihoods and character frequencies.
- Quantify the information content and phylogenetic signal of each character set by analyzing the distribution of likelihoods and character frequencies.

This approach enables a direct comparison of how much evolutionary information is captured by sequence versus structure-based encodings, providing an objective benchmark for future phylogenomic analyses.

In [1]:
cd /home/dmoi/projects/foldtree2/

/home/dmoi/projects/foldtree2


In [2]:
# Directory (relative to the working directory) where this notebook stores its
# cached tables, e.g. the OMA mapping CSVs and the xref pickle written by later cells.
save_dir = './foldtree2/notebooks/benchmarks/Information_benchmark/'

In [3]:
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
import requests
import random
import subprocess
from pathlib import Path
import concurrent.futures
import multiprocessing
import pandas as pd
import requests
from Bio import SeqIO
import re
import time

In [4]:
# Forwarded to grab_struct(..., overwrite=overwrite) in the structure download cell.
# NOTE(review): presumably controls whether already-downloaded structures are fetched again — confirm in AFDB_tools.
overwrite = False


In [5]:
import glob
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes every
# step that iterates over the marker files (ID extraction, concatenation order,
# partition order) reproducible across machines and re-runs.
markers = sorted(glob.glob('./families/Information_benchmark/marker_genes/marker_genes/*.fa'))
print( f"Found {len(markers)} marker genes" )

Found 500 marker genes


In [6]:
#use autoreload to reload modules
%load_ext autoreload
%autoreload 2

In [8]:
# Specify your model and associated matrices
from foldtree2 import ft2treebuilder as treebuilder

model = 'mergeddecoder_foldtree2_test'
model_path = os.path.join("models", model)  # Path to your model (without .pkl)

# Companion files share the model path as a filename prefix.
mafftmat = model_path + "_mafftmat.mtx"
submat = model_path + "_submat.txt"
charmaps = model_path + "_pair_counts.pkl"
aapropcsv = os.path.join('foldtree2', "config", "aaindex1.csv")  # Path to your amino acid properties CSV file

# Initialize the treebuilder class
tb = treebuilder.treebuilder(
	model=model_path,
	mafftmat=mafftmat,
	submat=submat,
	charmaps=charmaps,
	aapropcsv=aapropcsv,
	raxml_path='./foldtree2/raxml-ng/raxml-ng',
	maffttext2hex='./foldtree2/mafft_tools/maffttext2hex',
	maffthex2text='./foldtree2/mafft_tools/hex2maffttext',
)

# Check that the matrix paths are correct
assert os.path.exists(tb.mafftmat), f"MAFFT matrix not found at {tb.mafftmat}"
assert os.path.exists(tb.submat), f"Substitution matrix not found at {tb.submat}"

# Check that the external tools are present
assert os.path.exists(tb.raxml_path), f"RAxML-NG not found at {tb.raxml_path}"
assert os.path.exists(tb.maffttext2hex), f"MAFFT text to hex converter not found at {tb.maffttext2hex}"
assert os.path.exists(tb.maffthex2text), f"MAFFT hex to text converter not found at {tb.maffthex2text}"

  self.raxmlchars = """0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ! " # $ % & ' ( ) * + , / : ; < = > @ [ \ ] ^ _ { | } ~"""


loading charmaps from models/mergeddecoder_foldtree2_test_pair_counts.pkl


In [9]:
import subprocess
import os

def run_raxml_lg_alignment(alignment_file, output_prefix="raxml_lg_tree", raxml_path="raxml-ng"):
	"""
	Infer a tree from an amino acid alignment with RAxML-NG (model LG+G+I, one thread).

	Parameters:
	-----------
	alignment_file : str
		Aligned FASTA file handed to RAxML-NG via --msa
	output_prefix : str
		Value passed to --prefix; the tree is expected at
		'<output_prefix>.raxml.bestTree'
	raxml_path : str
		RAxML-NG executable to invoke (default: 'raxml-ng')

	Returns:
	--------
	str or None: Path of the best-tree file if it exists after the run,
	otherwise None

	Raises:
	-------
	subprocess.CalledProcessError: if RAxML-NG exits with a non-zero status
	"""
	command = [raxml_path]
	command += ["--msa", alignment_file]
	command += ["--model", "LG+G+I"]
	command += ["--prefix", output_prefix]
	command += ["--threads", "1"]

	print(f"Running: {' '.join(command)}")
	subprocess.run(command, check=True)

	best_tree = f"{output_prefix}.raxml.bestTree"
	if not os.path.exists(best_tree):
		print("RAxML-NG did not produce a tree file.")
		return None

	print(f"Inferred tree saved to {best_tree}")
	return best_tree

In [10]:
def align_fasta_with_mafft(fasta_file):
	"""
	Align a single FASTA file using MAFFT (--auto, one thread).

	The alignment is written next to the input as '<stem>.aligned.fa'.
	MAFFT is invoked with an argument list (no shell), so paths containing
	spaces or shell metacharacters are passed through unchanged.

	Parameters:
	-----------
	fasta_file : str
		Path to input FASTA file

	Returns:
	--------
	tuple: (input_path, output_path, success_status)
		success_status is False when MAFFT exits non-zero, cannot be
		launched, or the output file cannot be opened.
	"""
	# Create output filename - same directory but with .aligned.fa extension
	input_path = Path(fasta_file)
	output_path = input_path.with_name(f"{input_path.stem}.aligned.fa")

	cmd = ["mafft", "--auto", "--thread", "1", str(fasta_file)]
	print(f"Aligning: {fasta_file}")

	try:
		# MAFFT writes the alignment to stdout; stream it straight into the output file.
		with open(output_path, "w") as out_handle:
			subprocess.run(cmd, stdout=out_handle, stderr=subprocess.PIPE, check=True)
		return (fasta_file, str(output_path), True)
	except subprocess.CalledProcessError as e:
		print(f"Error aligning {fasta_file}: {e}")
		# Surface MAFFT's own diagnostics instead of silently discarding them.
		if e.stderr:
			print(e.stderr.decode(errors="replace").strip())
		return (fasta_file, str(output_path), False)
	except OSError as e:
		# mafft executable missing / not runnable, or output file not writable.
		print(f"Error aligning {fasta_file}: {e}")
		return (fasta_file, str(output_path), False)

# Get number of available cores (leave 1 core free for system processes)
max_workers = max(1, multiprocessing.cpu_count() - 1)
print(f"Using {max_workers} cores for alignments")

# Result lists for the alignment step.
# NOTE(review): nothing in this cell fills them — presumably a parallel
# alignment loop populates them elsewhere; confirm.
aligned_files = []
failed_files = []

Using 31 cores for alignments
Using 31 cores for alignments


In [11]:
def concatenate_alignments(alignment_files, output_file, partition_model="GTR+G"):
	"""
	Concatenate multiple alignment files into a single supermatrix alignment.

	Besides returning the result, this writes the concatenated alignment to
	`output_file` (FASTA) and a partition file to `<output_file>.partition`.
	Alignments that cannot be read are reported and skipped.

	Parameters:
	-----------
	alignment_files : list
		List of paths to alignment files in FASTA format
	output_file : str
		Path to save the concatenated alignment
	partition_model : str
		Model string written in front of every partition line
		(default: 'GTR+G', the value that was previously hard-coded)

	Returns:
	--------
	tuple
		(concatenated_alignment, partition_info)
		- concatenated_alignment: The final MultipleSeqAlignment object
		  (None when `alignment_files` is empty)
		- partition_info: Dictionary mapping gene name to its 1-based,
		  inclusive {'start', 'end'} column range
	"""
	if not alignment_files:
		print("No alignment files provided")
		return None, {}

	# all_species[row_id][gene_name] -> aligned sequence string
	all_species = {}
	partition_info = {}
	current_position = 1

	# Process each alignment file
	for i, aln_file in enumerate(alignment_files):
		try:
			gene_name = Path(aln_file).stem.replace('.aligned', '')
			alignment = AlignIO.read(aln_file, "fasta")
			aln_length = alignment.get_alignment_length()

			# Collect this gene's rows before touching shared state, so a failure
			# part-way through cannot leave a registered partition without
			# advancing the column counter.
			gene_rows = {}
			for record in alignment:
				# NOTE(review): record.id is the first whitespace-delimited token of
				# the header. With headers like '>MOUSE45461 | OMA754554 | ...' that
				# token has no '|', so rows would be keyed by protein ID rather than
				# by species — confirm the headers carry a species key here.
				species_id = record.id.split('|')[-1]
				gene_rows[species_id] = str(record.seq)
		except Exception as e:
			print(f"Error processing {aln_file}: {str(e)}")
			continue

		# Store partition information
		partition_info[gene_name] = {
			'start': current_position,
			'end': current_position + aln_length - 1
		}
		for species_id, sequence in gene_rows.items():
			all_species.setdefault(species_id, {})[gene_name] = sequence

		current_position += aln_length
		print(f"Processed alignment {i+1}/{len(alignment_files)}: {gene_name} ({aln_length} columns)")

	# Create the concatenated alignment
	concatenated_records = []
	gene_names = list(partition_info.keys())

	for species_id, genes in all_species.items():
		# Build the row gene by gene; absent genes become all-gap blocks of the
		# gene's alignment length.
		pieces = []
		for gene in gene_names:
			if gene in genes:
				pieces.append(genes[gene])
			else:
				gene_length = partition_info[gene]['end'] - partition_info[gene]['start'] + 1
				pieces.append("-" * gene_length)

		concatenated_records.append(SeqRecord(
			Seq("".join(pieces)),
			id=species_id,
			description=f"Concatenated {len(gene_names)} genes"
		))

	concatenated_alignment = MultipleSeqAlignment(concatenated_records)

	# Save alignment to file
	with open(output_file, "w") as handle:
		AlignIO.write(concatenated_alignment, handle, "fasta")

	# Create a partition file for RAxML-NG
	partition_file = f"{output_file}.partition"
	with open(partition_file, "w") as handle:
		for gene, pos in partition_info.items():
			handle.write(f"{partition_model}, {gene} = {pos['start']}-{pos['end']}\n")

	print(f"Created concatenated alignment with {len(concatenated_records)} species and {concatenated_alignment.get_alignment_length()} columns")
	print(f"Partition file saved to {partition_file}")
	return concatenated_alignment, partition_info

In [12]:
def extract_oma_ids_from_fasta(fasta_file):
	"""
	Extract OMA identifiers from a FASTA file
	ID format example:

	>MOUSE45461 | OMA754554 | COQ5_MOUSE | [Mus musculus]

	The identifier is the text before the first '|' of each header,
	stripped of surrounding whitespace ('MOUSE45461' in the example).

	Parameters:
	-----------
	fasta_file : str
		Path to FASTA file

	Returns:
	--------
	tuple: (oma_ids, oma_files)
		- oma_ids: list of str, one identifier per record, in file order
		- oma_files: dict mapping every identifier to `fasta_file`
		Both are empty when the file contains no records.
	"""
	oma_ids = []
	oma_files = {}
	for record in SeqIO.parse(fasta_file, "fasta"):
		# Extract OMA ID from the FASTA header using the pipe-separated format
		oma_id = record.description.split('|')[0].strip()
		oma_ids.append(oma_id)
		# Register every record (not only the last one) against its source file.
		oma_files[oma_id] = fasta_file
	return oma_ids , oma_files
# Gather the OMA IDs and the ID -> source-file map across every marker FASTA
oma_ids = []
oma_files = {}
for marker in markers:
	ids, files = extract_oma_ids_from_fasta(marker)
	oma_ids += ids
	oma_files.update(files)

# De-duplicate; note that converting through a set does not preserve order
oma_ids = list(set(oma_ids))
print(f"Extracted {len(oma_ids)} unique OMA IDs from marker genes")

# Turn the ID -> file map into a two-column table (oma_id, fasta_file) and save it
oma_files = (
	pd.DataFrame.from_dict(oma_files, orient='index', columns=['fasta_file'])
	.rename_axis('oma_id')
	.reset_index()
)
oma_files.to_csv("oma_markergene_files.csv", index=False)


Extracted 8377 unique OMA IDs from marker genes


In [13]:
print( oma_ids[:10] )  # Print first 10 OMA IDs for verification

['NAEGR13256', 'YEAST02461', 'HUMAN102995', 'NAEGR05161', 'ACACA06967', 'HUMAN03132', 'ARATH17090', 'PARTE06860', 'CAPO306280', 'DICDI11709']


In [14]:
import json
import tqdm
def bulk_map_oma_to_uniprot(oma_ids, batch_size=100, retry_limit=3 , verbose=True):
	"""
	Look up a list of OMA identifiers through the OMA bulk-retrieve API.

	IDs are posted in batches; each batch is attempted up to `retry_limit`
	times with a one-second pause after every failed attempt.

	Parameters:
	-----------
	oma_ids : list of str
		List of OMA identifiers (e.g., OMA123456)
	batch_size : int
		Number of IDs per batch (max 1000)
	retry_limit : int
		Number of times to try each batch before giving up on it
	verbose : bool
		If True, print every successfully retrieved batch

	Returns:
	--------
	pandas.DataFrame: One row per returned query. For successful batches the
	'index' column holds the query ID and the remaining columns are the fields
	of the API's 'target' object. Every ID of a batch that failed all attempts
	contributes a placeholder row with 'oma_id' set and 'uniprot_id' = None.
	An empty frame with columns ['oma_id', 'uniprot_id'] is returned when
	there is nothing to report.
	"""
	empty_result = pd.DataFrame(columns=['oma_id', 'uniprot_id'])
	if len(oma_ids) == 0:
		return empty_result

	url = "https://omabrowser.org/api/protein/bulk_retrieve/"
	all_results = []

	for i in tqdm.tqdm(range(0, len(oma_ids), batch_size)):
		batch = oma_ids[i:i+batch_size]
		for attempt in range(retry_limit):
			try:
				# NOTE(review): no timeout is set, so a stalled connection blocks indefinitely.
				response = requests.post(url, json={"ids": batch})
				if response.ok:
					data = response.json()
					# Transform the response into a DataFrame keyed by query ID
					results = {query['query_id']: query['target'] for query in data}
					batch_df = pd.DataFrame.from_dict(results, orient='index').reset_index()
					if verbose:
						print(f"Batch {i//batch_size+1} processed successfully with {len(results)} mappings")
						print(batch_df)
					all_results.append(batch_df)
					break  # Success, break retry loop
				else:
					print(f"Error fetching batch {i//batch_size+1}: HTTP {response.status_code}")
			except Exception as e:
				print(f"Error in batch {i//batch_size+1}: {str(e)}")
			time.sleep(1)
		else:
			# All retries failed: keep a placeholder row for every ID in the batch.
			# Only DataFrames may go into all_results, because pd.concat rejects dicts.
			all_results.append(pd.DataFrame(
				[{'oma_id': oma_id, 'uniprot_id': None} for oma_id in batch]
			))

	# Concatenate all results into a single DataFrame
	if all_results:
		return pd.concat(all_results, ignore_index=True)
	return empty_result


	

In [15]:
import os
# Make sure the output directory exists before any results are written into it
if not os.path.exists(save_dir):
	print(f"Creating directory {save_dir} for marker gene files...")
	os.makedirs(save_dir, exist_ok=True)


In [16]:
# Cached OMA lookup table: reuse the CSV when present, otherwise query the API and store the result
mapping_file = os.path.join(save_dir, "oma_to_uniprot_mapping.csv")
if os.path.exists(mapping_file):
	print("Mapping file already exists, loading it...")
	mapping_df = pd.read_csv(mapping_file, index_col=0)
	print(f"Loaded mapping file with {len(mapping_df)} entries")
	print(mapping_df.head())
else:
	print("Mapping file does not exist, creating it now...")
	# Retrieve the records for all OMA IDs found in the marker genes
	mapping_df = bulk_map_oma_to_uniprot(oma_ids , verbose=False)
	# Persist the result so later runs take the fast path
	mapping_df.to_csv(mapping_file, index=True)
	

Mapping file already exists, loading it...
Loaded mapping file with 8377 entries
        index  entry_nr                                     entry_url  \
0  TRYB205898   7201087   https://omabrowser.org/api/protein/7201087/   
1  DROME24395  17556944  https://omabrowser.org/api/protein/17556944/   
2  YEAST00066   8927628   https://omabrowser.org/api/protein/8927628/   
3  THAPS06704  20296723  https://omabrowser.org/api/protein/20296723/   
4  DROME17519  17550068  https://omabrowser.org/api/protein/17550068/   

        omaid  canonicalid                      sequence_md5  sequence_length  \
0  TRYB205898       Q57ZU2  c077db9e5ee8aa5e763b0daf676859b6              860   
1  DROME24395   GATA_DROME  6f214dfc8f1dbf0393dd24e6aac8ea1a              508   
2  YEAST00066  HSP71_YEAST  7cdc576cd47040f8a215326a8e29e066              642   
3  THAPS06704       B8BYD7  c493f1360799e587e42ffc17c5f33a57             1103   
4  DROME17519       Q9VVI0  4827054d86ce99be2f09ad82c7666978              5

In [17]:

# Open a local OMA browser database file through pyoma.
# NOTE(review): absolute, machine-specific path — point it at your own OmaServer.h5.
import pyoma.browser.db
db = pyoma.browser.db.Database("/home/dmoi/datasets/OMA/OmaServer.h5")
# NOTE(review): the return value is discarded and this is not the cell's last
# expression, so the release name is never displayed.
db.get_release_name()
# NOTE(review): resolver is not used by any of the cells visible here.
resolver = pyoma.browser.db.IDResolver(db)
# mapper is used below via omaid_to_entry_nr (OMA ID -> entry number);
# linkout is used below via iter_xrefs_for_entry_nr (entry number -> xref dicts).
mapper = db.id_mapper['OMA']
linkout = db.id_mapper['Linkout'] 




Cannot load SequenceSearch. Any future call to seq_search will fail!
Traceback (most recent call last):
  File "/home/dmoi/miniforge3/envs/pyg/lib/python3.12/site-packages/pyoma/browser/db.py", line 2340, in __init__
    self.seq_idx = self.seq_idx()
                   ^^^^^^^^^^^^^^
  File "/home/dmoi/miniforge3/envs/pyg/lib/python3.12/site-packages/tables/link.py", line 427, in __call__
    self.extfile = tb.open_file(filename, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dmoi/miniforge3/envs/pyg/lib/python3.12/site-packages/pyoma/browser/db.py", line 137, in synchronized_open_file
    return _tables_file._original_open_file(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dmoi/miniforge3/envs/pyg/lib/python3.12/site-packages/tables/file.py", line 325, in open_file
    return File(filename, mode, title, root_uep, filters, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/hom

In [18]:

entry = 'CHLRE00910'  # Example OMA ID (the entry number is derived from it on the next line)
entry_nr = mapper.omaid_to_entry_nr(entry)
print(f"Entry number for {entry}: {entry_nr}")


Entry number for CHLRE00910: 20577393


In [19]:
# Show every cross-reference recorded for the example OMA ID
entry = 'CHLRE00910'  # Example OMA ID
entry_nr = mapper.omaid_to_entry_nr(entry)
xrefs = linkout.iter_xrefs_for_entry_nr(entry_nr)
print(list(xrefs))


[{'source': 'UniProtKB/TrEMBL', 'xref': 'A0A2K3DKT0', 'url': 'http://uniprot.org/uniprot/A0A2K3DKT0'}, {'source': 'EntrezGene', 'xref': '5718125', 'url': 'http://www.ncbi.nlm.nih.gov/gene/5718125'}]


In [20]:
def get_uniprot_ids_from_oma(oma_id, mapper, linkout):
	"""
	Resolve an OMA ID to the UniProt accessions among its cross-references.

	The ID is first converted to an entry number with `mapper`, then the
	cross-references for that entry are read from `linkout` and only those
	whose source is UniProtKB/Swiss-Prot or UniProtKB/TrEMBL are kept.

	Parameters:
	-----------
	oma_id : str
		OMA identifier (e.g., 'CHLRE00910')
	mapper : pyoma.browser.db.OmaIdMapper
		Object providing omaid_to_entry_nr(oma_id)
	linkout : pyoma.browser.idmapper.LinkoutIdMapper
		Object providing iter_xrefs_for_entry_nr(entry_nr)

	Returns:
	--------
	list: UniProt xref values in the order they are yielded; an empty list
	when there are none or when any step raises (the error is printed)
	"""
	uniprot_sources = ['UniProtKB/Swiss-Prot', 'UniProtKB/TrEMBL']
	try:
		entry_number = mapper.omaid_to_entry_nr(oma_id)

		accessions = []
		for xref in linkout.iter_xrefs_for_entry_nr(entry_number):
			if xref['source'] in uniprot_sources:
				accessions.append(xref['xref'])
		return accessions

	except Exception as e:
		print(f"Error processing {oma_id}: {str(e)}")
		return []

# Test the function with the example entry
uniprot_ids = get_uniprot_ids_from_oma(entry, mapper, linkout)
print(f"UniProt IDs for {entry}: {uniprot_ids}")

UniProt IDs for CHLRE00910: ['A0A2K3DKT0']


In [21]:
print( mapping_df.head() )

        index  entry_nr                                     entry_url  \
0  TRYB205898   7201087   https://omabrowser.org/api/protein/7201087/   
1  DROME24395  17556944  https://omabrowser.org/api/protein/17556944/   
2  YEAST00066   8927628   https://omabrowser.org/api/protein/8927628/   
3  THAPS06704  20296723  https://omabrowser.org/api/protein/20296723/   
4  DROME17519  17550068  https://omabrowser.org/api/protein/17550068/   

        omaid  canonicalid                      sequence_md5  sequence_length  \
0  TRYB205898       Q57ZU2  c077db9e5ee8aa5e763b0daf676859b6              860   
1  DROME24395   GATA_DROME  6f214dfc8f1dbf0393dd24e6aac8ea1a              508   
2  YEAST00066  HSP71_YEAST  7cdc576cd47040f8a215326a8e29e066              642   
3  THAPS06704       B8BYD7  c493f1360799e587e42ffc17c5f33a57             1103   
4  DROME17519       Q9VVI0  4827054d86ce99be2f09ad82c7666978              553   

                                             species  oma_group  \
0  {'co

In [22]:
def ret_refs(oma_ids, mapper, linkout):
	"""
	Look up the UniProt references of several OMA IDs.

	Calls get_uniprot_ids_from_oma once per ID, in iteration order.

	Parameters:
	-----------
	oma_ids : list
		OMA identifiers to look up
	mapper : pyoma.browser.db.OmaIdMapper
		OMA ID mapper instance, forwarded unchanged
	linkout : pyoma.browser.idmapper.LinkoutIdMapper
		Linkout mapper instance, forwarded unchanged

	Returns:
	--------
	dict: OMA ID -> list of UniProt references (a repeated ID keeps the
	result of its last lookup)
	"""
	refs_by_id = {}
	for oma_id in oma_ids:
		refs_by_id[oma_id] = get_uniprot_ids_from_oma(oma_id, mapper, linkout)
	return refs_by_id

# Spot-check the lookup on a handful of OMA IDs.
# A fixed random_state keeps the sample (and this cell's output) identical across
# re-runs, and capping n avoids a ValueError when the table has fewer than 10 rows.
sample_oma_ids = mapping_df['index'].sample(min(10, len(mapping_df)), random_state=0).tolist()
print(f"Sample OMA IDs: {sample_oma_ids}")

sample_refs = ret_refs(sample_oma_ids, mapper, linkout)
print("\nSample references:")
for oma_id, refs in sample_refs.items():
	print(f"{oma_id}: {refs}")

Sample OMA IDs: ['CHLRE16464', 'CAPO305630', 'MOUSE31178', 'DROME09011', 'HUMAN52035', 'HUMAN21192', 'ARATH15907', 'PARTE35951', 'CYAME04251', 'CYAME01940']

Sample references:
CHLRE16464: ['A8JHC3']
CAPO305630: ['A0A0D2WMH7']
MOUSE31178: ['Q60737', 'Q61177', 'Q6NSS6']
DROME09011: ['Q960Q8']
HUMAN52035: ['A6PVX3', 'P55036']
HUMAN21192: ['A0A024R6K8', 'G3V227', 'G3V277', 'G3V2C0', 'G3V2F2', 'G3V313', 'G3V339', 'G3V3H8', 'G3V3P2', 'G3V3R3', 'G3V3S7', 'G3V3X0', 'G3V3Y5', 'G3V423', 'G3V456', 'G3V4C7', 'G3V4N8', 'G3V4S4', 'G3V5H5', 'G3V5U1', 'G3V5W1', 'P23381', 'P78534']
ARATH15907: ['A0A178VTX1', 'Q8LPJ7']
PARTE35951: ['A0DY77']
CYAME04251: []
CYAME01940: []


In [23]:
from collections import defaultdict
import pandas as pd

def parse_xref_list_to_row(xref_list):
	"""
	Collapse a list of xref dicts into a one-row DataFrame.

	Every distinct 'source' becomes a column. A source seen once stores its
	'xref' value directly; a source seen several times stores the list of
	values. The row is indexed by the 'omaid' of the first dict.

	Parameters:
	-----------
	xref_list : list of dict
		Dicts with 'source' and 'xref' keys; 'omaid' is read from the first one.

	Returns:
	--------
	pandas.DataFrame: Single-row frame with index named 'omaid', or an empty
	DataFrame when `xref_list` is empty.
	"""
	if not xref_list:
		return pd.DataFrame()

	row_key = xref_list[0].get('omaid')

	# Group xref values under their source, keeping first-seen order
	values_by_source = {}
	for entry in xref_list:
		values_by_source.setdefault(entry['source'], []).append(entry['xref'])

	# Unwrap sources that occur only once
	row = {}
	for source, values in values_by_source.items():
		row[source] = values[0] if len(values) == 1 else values
	row['omaid'] = row_key

	return pd.DataFrame([row]).set_index('omaid')

In [24]:
mapping_path = os.path.join(save_dir, "oma_to_uniprot_mapping.csv")
mapping_df = pd.read_csv(mapping_path, index_col=0)
example = mapping_df.iloc[0]
print( example)

# OMA ID -> UniProt reference list, cached as a pickle next to the mapping CSV
xref_path = os.path.join(save_dir, "xref_data.pkl")
if os.path.exists(xref_path):
	print("Loading xref data from pickle file...")
	# NOTE(review): unpickling can execute arbitrary code; only load a cache you generated yourself.
	xref_data = pd.read_pickle(xref_path)
else:
	print("Xref data pickle not found, generating it...")
	# One lookup through mapper/linkout per OMA ID, so this takes a while
	# for thousands of IDs.
	xref_data = ret_refs(mapping_df.omaid.to_list() , mapper, linkout)
	# Persist the result; without this the slow branch runs on every execution.
	pd.to_pickle(xref_data, xref_path)

# Show a short preview instead of dumping the whole mapping into the output
print(f"xref data available for {len(xref_data)} OMA IDs")
print(list(xref_data.items())[:5])

index                                                               TRYB205898
entry_nr                                                               7201087
entry_url                          https://omabrowser.org/api/protein/7201087/
omaid                                                               TRYB205898
canonicalid                                                             Q57ZU2
sequence_md5                                  c077db9e5ee8aa5e763b0daf676859b6
sequence_length                                                            860
species                      {'code': 'TRYB2', 'taxon_id': 185431, 'species...
oma_group                                                              1428576
oma_hog_id                                                     HOG:E0802393.2b
chromosome                                                                   5
locus                            {'start': 932578, 'end': 935160, 'strand': 1}
is_main_isoform                                     

In [25]:
# Look each row's OMA ID up in xref_data and store the result as a new column;
# IDs that are absent from xref_data end up as NaN.
mapping_df['uniprot_refs'] = mapping_df['omaid'].map(xref_data)

In [26]:
#save with uniprot_refs
map_out = os.path.join(save_dir, "oma_to_uniprot_mapping_with_refs.csv")
mapping_df.to_csv(map_out, index=False)

In [27]:
print( mapping_df.head() )

        index  entry_nr                                     entry_url  \
0  TRYB205898   7201087   https://omabrowser.org/api/protein/7201087/   
1  DROME24395  17556944  https://omabrowser.org/api/protein/17556944/   
2  YEAST00066   8927628   https://omabrowser.org/api/protein/8927628/   
3  THAPS06704  20296723  https://omabrowser.org/api/protein/20296723/   
4  DROME17519  17550068  https://omabrowser.org/api/protein/17550068/   

        omaid  canonicalid                      sequence_md5  sequence_length  \
0  TRYB205898       Q57ZU2  c077db9e5ee8aa5e763b0daf676859b6              860   
1  DROME24395   GATA_DROME  6f214dfc8f1dbf0393dd24e6aac8ea1a              508   
2  YEAST00066  HSP71_YEAST  7cdc576cd47040f8a215326a8e29e066              642   
3  THAPS06704       B8BYD7  c493f1360799e587e42ffc17c5f33a57             1103   
4  DROME17519       Q9VVI0  4827054d86ce99be2f09ad82c7666978              553   

                                             species  oma_group  \
0  {'co

In [28]:
# Use the first listed UniProt reference as the representative for each row
# (None when the value is not a non-empty list)
mapping_df['1stref'] = mapping_df['uniprot_refs'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)
# Eliminate rows with no 1stref. The .copy() makes the filtered frame independent
# of the original, so the column assignments in later cells do not operate on a
# slice (SettingWithCopyWarning).
mapping_df = mapping_df[mapping_df['1stref'].notnull()].copy()
print(mapping_df.head())

        index  entry_nr                                     entry_url  \
0  TRYB205898   7201087   https://omabrowser.org/api/protein/7201087/   
1  DROME24395  17556944  https://omabrowser.org/api/protein/17556944/   
2  YEAST00066   8927628   https://omabrowser.org/api/protein/8927628/   
3  THAPS06704  20296723  https://omabrowser.org/api/protein/20296723/   
4  DROME17519  17550068  https://omabrowser.org/api/protein/17550068/   

        omaid  canonicalid                      sequence_md5  sequence_length  \
0  TRYB205898       Q57ZU2  c077db9e5ee8aa5e763b0daf676859b6              860   
1  DROME24395   GATA_DROME  6f214dfc8f1dbf0393dd24e6aac8ea1a              508   
2  YEAST00066  HSP71_YEAST  7cdc576cd47040f8a215326a8e29e066              642   
3  THAPS06704       B8BYD7  c493f1360799e587e42ffc17c5f33a57             1103   
4  DROME17519       Q9VVI0  4827054d86ce99be2f09ad82c7666978              553   

                                             species  oma_group  \
0  {'co

In [29]:
# Per-row target directory for structure files, derived from the row's oma_group value
# (rows sharing an oma_group share a folder).
mapping_df['struct_folder'] = mapping_df['oma_group'].apply(lambda x: f"./families/Information_benchmark/marker_genes/{x}/structs")

In [30]:
from foldtree2.src.AFDB_tools import grab_struct
import tqdm

In [32]:
# Set dl = False to skip the downloads and reuse the table saved by a previous run
dl = True
map_out = os.path.join(save_dir, "oma_to_uniprot_mapping_with_structs.csv")
if dl:
	# Download the AlphaFold structure for each row's first UniProt reference.
	# total= gives tqdm a real progress bar instead of a bare iteration counter.
	for idx, row in tqdm.tqdm(mapping_df.iterrows(), total=len(mapping_df)):
		struct_folder = row['struct_folder']
		os.makedirs(struct_folder, exist_ok=True)
		# Download the structure
		retpath = grab_struct(row['1stref'], struct_folder, overwrite=overwrite)
		# Record whatever grab_struct returned for this row
		mapping_df.at[idx, 'struct_path'] = retpath
	# Save the mapping_df with struct paths
	mapping_df.to_csv(map_out, index=False)
else:
	# Load the mapping_df with struct paths written by an earlier run.
	# NOTE(review): list-valued columns (e.g. uniprot_refs) come back from CSV as strings.
	mapping_df = pd.read_csv(map_out)

15it [00:00, 77.87it/s]

error downloading structure for A0A8H4BYN2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BYN2-F1-model_v4.pdb
error downloading structure for A0A2K3DAU6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3DAU6-F1-model_v4.pdb
error downloading structure for O15881 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-O15881-F1-model_v4.pdb


182it [00:00, 431.40it/s]

error downloading structure for L8HD42 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8HD42-F1-model_v4.pdb
error downloading structure for X6P1L5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6P1L5-F1-model_v4.pdb
error downloading structure for A0A1X7UM21 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7UM21-F1-model_v4.pdb


228it [00:00, 326.26it/s]

error downloading structure for D8LHW4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LHW4-F1-model_v4.pdb
error downloading structure for G3M399 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-G3M399-F1-model_v4.pdb
error downloading structure for Q7S8V0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7S8V0-F1-model_v4.pdb


373it [00:00, 411.86it/s]

error downloading structure for A0A178W7K6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178W7K6-F1-model_v4.pdb
error downloading structure for A0A178UF03 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178UF03-F1-model_v4.pdb
error downloading structure for A0A0B4KHL2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0B4KHL2-F1-model_v4.pdb
error downloading structure for A0A8V8TNX6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TNX6-F1-model_v4.pdb


430it [00:01, 388.86it/s]

error downloading structure for A0A9L9PXM3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A9L9PXM3-F1-model_v4.pdb
error downloading structure for A0A2K3CS00 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3CS00-F1-model_v4.pdb
error downloading structure for A0A8V8TRG9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TRG9-F1-model_v4.pdb


578it [00:01, 557.82it/s]

error downloading structure for B8C4M1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8C4M1-F1-model_v4.pdb
error downloading structure for D8LRP2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LRP2-F1-model_v4.pdb
error downloading structure for X6N3M5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6N3M5-F1-model_v4.pdb


684it [00:01, 423.39it/s]

error downloading structure for M9PFS0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-M9PFS0-F1-model_v4.pdb
error downloading structure for Q7RWE3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7RWE3-F1-model_v4.pdb
error downloading structure for A0A2K3CRZ5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3CRZ5-F1-model_v4.pdb


889it [00:01, 648.05it/s]

error downloading structure for D8LQN4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LQN4-F1-model_v4.pdb
error downloading structure for X6MGV5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6MGV5-F1-model_v4.pdb
error downloading structure for D8LEU4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LEU4-F1-model_v4.pdb


966it [00:02, 606.88it/s]

error downloading structure for A9RTW1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A9RTW1-F1-model_v4.pdb
error downloading structure for A0D7P3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0D7P3-F1-model_v4.pdb
error downloading structure for A0A2K1L0X0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1L0X0-F1-model_v4.pdb


1035it [00:02, 468.52it/s]

error downloading structure for A0A8H4F898 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4F898-F1-model_v4.pdb
error downloading structure for B8BVD1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8BVD1-F1-model_v4.pdb
error downloading structure for A0A1X7UQD0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7UQD0-F1-model_v4.pdb


1180it [00:02, 599.24it/s]

error downloading structure for A0A2K1JLW1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1JLW1-F1-model_v4.pdb
error downloading structure for A0A0G2JRU0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0G2JRU0-F1-model_v4.pdb
error downloading structure for B5YNG2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B5YNG2-F1-model_v4.pdb


1347it [00:02, 550.30it/s]

error downloading structure for A0A0D2WJE0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2WJE0-F1-model_v4.pdb
error downloading structure for D7FI10 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D7FI10-F1-model_v4.pdb
error downloading structure for L8H0U7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8H0U7-F1-model_v4.pdb


1408it [00:03, 457.14it/s]

error downloading structure for A0A8H4BX66 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BX66-F1-model_v4.pdb
error downloading structure for F5H112 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-F5H112-F1-model_v4.pdb
error downloading structure for A0A178UNQ0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178UNQ0-F1-model_v4.pdb
error downloading structure for A0A8H4BWX1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BWX1-F1-model_v4.pdb
error downloading structure for A8J3Y6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A8J3Y6-F1-model_v4.pdb
error downloading structure for Q7SBD3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7SBD3-F1-model_v4.pdb


1500it [00:03, 328.35it/s]

error downloading structure for A0DJB3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0DJB3-F1-model_v4.pdb
error downloading structure for A0A178UPW5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178UPW5-F1-model_v4.pdb
error downloading structure for X6NQP0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6NQP0-F1-model_v4.pdb


1538it [00:03, 316.81it/s]

error downloading structure for A6NMQ1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A6NMQ1-F1-model_v4.pdb
error downloading structure for A0A0D2UQX2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2UQX2-F1-model_v4.pdb
error downloading structure for G5E866 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-G5E866-F1-model_v4.pdb


1656it [00:03, 350.63it/s]

error downloading structure for B3H4G2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B3H4G2-F1-model_v4.pdb
error downloading structure for A0EII8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0EII8-F1-model_v4.pdb
error downloading structure for B8C4Z8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8C4Z8-F1-model_v4.pdb


1735it [00:04, 374.78it/s]

error downloading structure for D8LQX5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LQX5-F1-model_v4.pdb
error downloading structure for A0A2K3D577 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3D577-F1-model_v4.pdb
error downloading structure for X6LSP0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6LSP0-F1-model_v4.pdb


1874it [00:04, 408.88it/s]

error downloading structure for A0A0D2UHS0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2UHS0-F1-model_v4.pdb
error downloading structure for A0C766 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0C766-F1-model_v4.pdb
error downloading structure for A0A994J6E8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A994J6E8-F1-model_v4.pdb


1916it [00:04, 344.64it/s]

error downloading structure for L8GWB5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GWB5-F1-model_v4.pdb
error downloading structure for B8CAU9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8CAU9-F1-model_v4.pdb
error downloading structure for A0A8H4BXW5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BXW5-F1-model_v4.pdb


1976it [00:04, 357.37it/s]

error downloading structure for A0A2K3E2B8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3E2B8-F1-model_v4.pdb
error downloading structure for A0CXW7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0CXW7-F1-model_v4.pdb
error downloading structure for A0A0D2VTX0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VTX0-F1-model_v4.pdb


2066it [00:05, 337.47it/s]

error downloading structure for A0A2K3CV27 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3CV27-F1-model_v4.pdb
error downloading structure for D8LTY2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LTY2-F1-model_v4.pdb
error downloading structure for Q9U6I2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q9U6I2-F1-model_v4.pdb


2244it [00:05, 640.93it/s]

error downloading structure for B8C728 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8C728-F1-model_v4.pdb
error downloading structure for Q7S6V7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7S6V7-F1-model_v4.pdb
error downloading structure for A0A0D2X0Z2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2X0Z2-F1-model_v4.pdb
error downloading structure for A9RFW2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A9RFW2-F1-model_v4.pdb
error downloading structure for A0A0D2UP26 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2UP26-F1-model_v4.pdb
error downloading structure for A0A0D2WIH6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2WIH6-F1-model_v4.pdb
error downloading structure for A0A8H4FAH9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4FAH9-F1-model_v4.pdb


2376it [00:05, 365.44it/s]

error downloading structure for Q8MS45 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q8MS45-F1-model_v4.pdb
error downloading structure for A0A2R8Y705 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2R8Y705-F1-model_v4.pdb
error downloading structure for A0CY10 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0CY10-F1-model_v4.pdb


2520it [00:05, 476.70it/s]

error downloading structure for A0A2K3E5A0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3E5A0-F1-model_v4.pdb
error downloading structure for A0A178WN52 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178WN52-F1-model_v4.pdb
error downloading structure for A0A2K1IHM2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1IHM2-F1-model_v4.pdb


2774it [00:06, 591.83it/s]

error downloading structure for A0A8H4BX09 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BX09-F1-model_v4.pdb
error downloading structure for L8GTL5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GTL5-F1-model_v4.pdb
error downloading structure for A0A1X7VVZ9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7VVZ9-F1-model_v4.pdb


2853it [00:06, 576.95it/s]

error downloading structure for X6NDB8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6NDB8-F1-model_v4.pdb
error downloading structure for A0A8V8TQE5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TQE5-F1-model_v4.pdb
error downloading structure for D2V0G9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2V0G9-F1-model_v4.pdb


2985it [00:06, 510.43it/s]

error downloading structure for A0A0D2VFR5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VFR5-F1-model_v4.pdb
error downloading structure for A0A8H4BZ97 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BZ97-F1-model_v4.pdb
error downloading structure for D8LRY3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LRY3-F1-model_v4.pdb


3065it [00:06, 520.78it/s]

error downloading structure for A0A178UHA5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178UHA5-F1-model_v4.pdb
error downloading structure for Q7S1T6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7S1T6-F1-model_v4.pdb
error downloading structure for A0A0D2X573 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2X573-F1-model_v4.pdb


3196it [00:07, 480.37it/s]

error downloading structure for A0A1X7VNC8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7VNC8-F1-model_v4.pdb
error downloading structure for A0BZJ9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0BZJ9-F1-model_v4.pdb
error downloading structure for A0A0D2WVM9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2WVM9-F1-model_v4.pdb


3246it [00:07, 443.84it/s]

error downloading structure for D2V5V4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2V5V4-F1-model_v4.pdb
error downloading structure for B8BTL9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8BTL9-F1-model_v4.pdb
error downloading structure for A0A9L9PWW3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A9L9PWW3-F1-model_v4.pdb


3440it [00:07, 528.43it/s]

error downloading structure for A0A8H8ULD4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H8ULD4-F1-model_v4.pdb
error downloading structure for C9JJJ9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-C9JJJ9-F1-model_v4.pdb
error downloading structure for O08847 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-O08847-F1-model_v4.pdb


3494it [00:07, 477.15it/s]

error downloading structure for A0A2K1KKV4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1KKV4-F1-model_v4.pdb
error downloading structure for L8H8A6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8H8A6-F1-model_v4.pdb
error downloading structure for A0CWW3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0CWW3-F1-model_v4.pdb


3638it [00:07, 610.27it/s]

error downloading structure for A0C1F3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0C1F3-F1-model_v4.pdb
error downloading structure for A0A0D2WQT4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2WQT4-F1-model_v4.pdb
error downloading structure for X6MD81 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6MD81-F1-model_v4.pdb


3765it [00:08, 469.45it/s]

error downloading structure for A0A1X7VIA2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7VIA2-F1-model_v4.pdb
error downloading structure for A0BQ90 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0BQ90-F1-model_v4.pdb
error downloading structure for A0A1C7ZN09 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1C7ZN09-F1-model_v4.pdb


3814it [00:08, 381.24it/s]

error downloading structure for A0A6Q8PGB0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A6Q8PGB0-F1-model_v4.pdb
error downloading structure for A0A2K3D4R2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3D4R2-F1-model_v4.pdb
error downloading structure for A0A178WJP3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178WJP3-F1-model_v4.pdb


3938it [00:08, 452.32it/s]

error downloading structure for V5INF5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-V5INF5-F1-model_v4.pdb
error downloading structure for D2VN06 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2VN06-F1-model_v4.pdb
error downloading structure for Q7RXJ6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7RXJ6-F1-model_v4.pdb


3989it [00:08, 427.23it/s]

error downloading structure for D2UY12 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2UY12-F1-model_v4.pdb
error downloading structure for Q872G8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q872G8-F1-model_v4.pdb
error downloading structure for A0A2K3DR41 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3DR41-F1-model_v4.pdb


4146it [00:09, 480.76it/s]

error downloading structure for A0A8H4BYJ9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BYJ9-F1-model_v4.pdb
error downloading structure for L8GZ75 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GZ75-F1-model_v4.pdb
error downloading structure for L8GWY4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GWY4-F1-model_v4.pdb


4273it [00:09, 479.40it/s]

error downloading structure for Q7S8K0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7S8K0-F1-model_v4.pdb
error downloading structure for D2V3Y4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2V3Y4-F1-model_v4.pdb
error downloading structure for I1V4Y8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-I1V4Y8-F1-model_v4.pdb


4322it [00:09, 379.87it/s]

error downloading structure for A0A2K1J2Q4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1J2Q4-F1-model_v4.pdb
error downloading structure for Q8IKE1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q8IKE1-F1-model_v4.pdb
error downloading structure for B7ZKR9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B7ZKR9-F1-model_v4.pdb


4455it [00:09, 526.61it/s]

error downloading structure for A0A0D2UDQ5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2UDQ5-F1-model_v4.pdb
error downloading structure for A0A1X7VQ62 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7VQ62-F1-model_v4.pdb
error downloading structure for A0A994J5K1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A994J5K1-F1-model_v4.pdb


4511it [00:10, 421.56it/s]

error downloading structure for Q8I1X5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q8I1X5-F1-model_v4.pdb
error downloading structure for A0A5S9T906 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A5S9T906-F1-model_v4.pdb
error downloading structure for A0A2K1KEY3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1KEY3-F1-model_v4.pdb


4600it [00:10, 331.12it/s]

error downloading structure for Q7SBL5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7SBL5-F1-model_v4.pdb
error downloading structure for A0A8V8TQ34 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TQ34-F1-model_v4.pdb
error downloading structure for A0A8V8TNX3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TNX3-F1-model_v4.pdb
error downloading structure for A0A8H4BUB1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BUB1-F1-model_v4.pdb


4677it [00:10, 379.69it/s]

error downloading structure for B8C8T3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8C8T3-F1-model_v4.pdb
error downloading structure for A9RYA1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A9RYA1-F1-model_v4.pdb
error downloading structure for A0A4D6K3W6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A4D6K3W6-F1-model_v4.pdb


4774it [00:10, 352.13it/s]

error downloading structure for A0A2K1KED1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1KED1-F1-model_v4.pdb
error downloading structure for Q7SBT9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7SBT9-F1-model_v4.pdb
error downloading structure for B8BW81 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8BW81-F1-model_v4.pdb


4989it [00:11, 622.78it/s]

error downloading structure for A0A1I9LME9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1I9LME9-F1-model_v4.pdb
error downloading structure for D2VT72 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2VT72-F1-model_v4.pdb
error downloading structure for Q8IFP1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q8IFP1-F1-model_v4.pdb


5063it [00:11, 472.85it/s]

error downloading structure for V5IM93 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-V5IM93-F1-model_v4.pdb
error downloading structure for A0A2K1IX27 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1IX27-F1-model_v4.pdb
error downloading structure for A0A178W7J7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178W7J7-F1-model_v4.pdb
error downloading structure for A9UNH9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A9UNH9-F1-model_v4.pdb


5124it [00:11, 470.12it/s]

error downloading structure for A0A2K1JKR7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1JKR7-F1-model_v4.pdb
error downloading structure for A0A0D2VGW4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VGW4-F1-model_v4.pdb
error downloading structure for A0A0D2VTB7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VTB7-F1-model_v4.pdb


5181it [00:11, 323.43it/s]

error downloading structure for A0A2K3D1T9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3D1T9-F1-model_v4.pdb
error downloading structure for A0A8H4BV16 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BV16-F1-model_v4.pdb
error downloading structure for A0A0D2X586 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2X586-F1-model_v4.pdb
error downloading structure for A0A8H8ULD3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H8ULD3-F1-model_v4.pdb


5245it [00:12, 341.45it/s]

error downloading structure for H7C0A0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-H7C0A0-F1-model_v4.pdb
error downloading structure for A0A8H8UKY7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H8UKY7-F1-model_v4.pdb
error downloading structure for Q7SBU3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7SBU3-F1-model_v4.pdb


5610it [00:12, 777.15it/s]

error downloading structure for A0A0B4KFH4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0B4KFH4-F1-model_v4.pdb
error downloading structure for D7FNG4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D7FNG4-F1-model_v4.pdb
error downloading structure for Q80XH7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q80XH7-F1-model_v4.pdb


5740it [00:12, 779.87it/s]

error downloading structure for A0A178V904 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178V904-F1-model_v4.pdb
error downloading structure for A0A2K3E5Q8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3E5Q8-F1-model_v4.pdb
error downloading structure for D7FU80 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D7FU80-F1-model_v4.pdb


5832it [00:12, 577.59it/s]

error downloading structure for A0A8H4BWD9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BWD9-F1-model_v4.pdb
error downloading structure for A8K8N7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A8K8N7-F1-model_v4.pdb
error downloading structure for A0A8Q3SIS9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8Q3SIS9-F1-model_v4.pdb


5907it [00:12, 560.01it/s]

error downloading structure for A0A1X7VP53 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7VP53-F1-model_v4.pdb
error downloading structure for A0A1X7V9A9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7V9A9-F1-model_v4.pdb
error downloading structure for J9R021 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-J9R021-F1-model_v4.pdb


6059it [00:13, 536.83it/s]

error downloading structure for B8CE77 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8CE77-F1-model_v4.pdb
error downloading structure for Q84KP6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q84KP6-F1-model_v4.pdb
error downloading structure for A0A8D9I6U5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8D9I6U5-F1-model_v4.pdb


6152it [00:13, 548.25it/s]

error downloading structure for D7G3I9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D7G3I9-F1-model_v4.pdb
error downloading structure for Q4V5E9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q4V5E9-F1-model_v4.pdb
error downloading structure for D8LES2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LES2-F1-model_v4.pdb


6265it [00:13, 472.15it/s]

error downloading structure for B8BVU2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8BVU2-F1-model_v4.pdb
error downloading structure for A0A178WHI8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178WHI8-F1-model_v4.pdb
error downloading structure for D2W323 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2W323-F1-model_v4.pdb


6314it [00:13, 372.25it/s]

error downloading structure for L8HD92 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8HD92-F1-model_v4.pdb
error downloading structure for A0A8H4FAP6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4FAP6-F1-model_v4.pdb
error downloading structure for B8C8U9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8C8U9-F1-model_v4.pdb


6355it [00:14, 344.92it/s]

error downloading structure for A7E261 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A7E261-F1-model_v4.pdb
error downloading structure for L8GIV8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GIV8-F1-model_v4.pdb
error downloading structure for D2UZM0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2UZM0-F1-model_v4.pdb


6491it [00:14, 390.33it/s]

error downloading structure for A0A8V8TNZ9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TNZ9-F1-model_v4.pdb
error downloading structure for A0A0D2VJQ0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VJQ0-F1-model_v4.pdb
error downloading structure for A0A2K3DJF9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3DJF9-F1-model_v4.pdb
error downloading structure for A0A0D2VYG0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VYG0-F1-model_v4.pdb
error downloading structure for L8H0V8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8H0V8-F1-model_v4.pdb
error downloading structure for D7FN81 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D7FN81-F1-model_v4.pdb


6581it [00:14, 298.72it/s]

error downloading structure for Q7S1D7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7S1D7-F1-model_v4.pdb
error downloading structure for A0A2K1JLZ0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1JLZ0-F1-model_v4.pdb
error downloading structure for L8GY46 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GY46-F1-model_v4.pdb


6669it [00:15, 359.73it/s]

error downloading structure for A0A4D6K4F6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A4D6K4F6-F1-model_v4.pdb
error downloading structure for H0YKI9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-H0YKI9-F1-model_v4.pdb
error downloading structure for A0A2K1IRD9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1IRD9-F1-model_v4.pdb


6708it [00:15, 278.16it/s]

error downloading structure for A0A2K1JHT7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1JHT7-F1-model_v4.pdb
error downloading structure for L8HFM7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8HFM7-F1-model_v4.pdb
error downloading structure for A0BYP5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0BYP5-F1-model_v4.pdb


6740it [00:15, 268.06it/s]

error downloading structure for D8LQ01 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LQ01-F1-model_v4.pdb
error downloading structure for A0A8H4FA39 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4FA39-F1-model_v4.pdb
error downloading structure for A0A994J3Z0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A994J3Z0-F1-model_v4.pdb


6912it [00:15, 463.27it/s]

error downloading structure for A0A8H4C1K9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4C1K9-F1-model_v4.pdb
error downloading structure for L8H1Z5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8H1Z5-F1-model_v4.pdb
error downloading structure for A0A494C1A5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A494C1A5-F1-model_v4.pdb


7058it [00:15, 490.19it/s]

error downloading structure for A9RIM7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A9RIM7-F1-model_v4.pdb
error downloading structure for D8LIP1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LIP1-F1-model_v4.pdb
error downloading structure for A0A1X7V288 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7V288-F1-model_v4.pdb


7193it [00:16, 682.85it/s]

error downloading structure for A0A0D2X319 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2X319-F1-model_v4.pdb
error downloading structure for D2V097 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2V097-F1-model_v4.pdb
error downloading structure for A0A2K1JG38 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1JG38-F1-model_v4.pdb


7337it [00:16, 515.30it/s]

error downloading structure for A0A178VV92 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178VV92-F1-model_v4.pdb
error downloading structure for B6IDH0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B6IDH0-F1-model_v4.pdb
error downloading structure for D2VNK8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2VNK8-F1-model_v4.pdb


7414it [00:16, 521.67it/s]

error downloading structure for A0A8H4BWC8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BWC8-F1-model_v4.pdb
error downloading structure for A0A178W8K9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178W8K9-F1-model_v4.pdb
error downloading structure for A0A8H4BW10 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BW10-F1-model_v4.pdb


7499it [00:16, 531.80it/s]

error downloading structure for A0A178W1F6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178W1F6-F1-model_v4.pdb
error downloading structure for D2VJC0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2VJC0-F1-model_v4.pdb
error downloading structure for A0D8K9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0D8K9-F1-model_v4.pdb


7556it [00:17, 381.15it/s]

error downloading structure for A0A2K1L907 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1L907-F1-model_v4.pdb
error downloading structure for A0A8Q3SHT1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8Q3SHT1-F1-model_v4.pdb
error downloading structure for A8K6F0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A8K6F0-F1-model_v4.pdb


7751it [00:17, 447.41it/s]

error downloading structure for A0A8H8UMD1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H8UMD1-F1-model_v4.pdb
error downloading structure for D2V0Z5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2V0Z5-F1-model_v4.pdb
error downloading structure for A0A0D2X192 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2X192-F1-model_v4.pdb





In [33]:
# Keep only the entries for which a structure path is recorded
has_structure = mapping_df['struct_path'].notna()
mapping_df = mapping_df.loc[has_structure]
print(f"Final mapping DataFrame with structures: {mapping_df.shape[0]} entries")

Final mapping DataFrame with structures: 7521 entries


In [31]:
# Emit one identifiers.txt per marker gene family, listing the family's unique uniprot ids
for sub in mapping_df['oma_group'].unique():
	family_mask = mapping_df['oma_group'] == sub
	sub_df = mapping_df.loc[family_mask]
	identifiers_file = f"./families/Information_benchmark/marker_genes/{sub}/identifiers.txt"
	with open(identifiers_file, 'w') as f:
		# one id per line
		f.writelines(f"{uniprot_id}\n" for uniprot_id in sub_df['1stref'].unique())

In [32]:
from Bio import PDB

def create_fasta_from_pdbs(pdb_folder, output_fasta, chain_id='A', species_mapping=None):
	"""
	Create a FASTA file from PDB structures by extracting amino acid sequences using Biopython.

	Every ``*.pdb`` file directly inside ``pdb_folder`` is parsed and the residues of
	``chain_id`` in the first model are translated to one-letter codes. Residues whose
	name is not one of the 20 standard amino acids are left out of the sequence.
	Files are processed in sorted path order so the record order is reproducible.
	Structures that lack the requested chain, or whose chain yields an empty sequence,
	are skipped and reported in a summary line; files that raise while being processed
	are reported individually and do not abort the run.

	Parameters:
	-----------
	pdb_folder : str
		Path to folder containing PDB files
	output_fasta : str
		Path to output FASTA file (only written when at least one sequence was extracted)
	chain_id : str, optional
		Chain ID to extract sequence from (default: 'A')
	species_mapping : dict, optional
		Dictionary mapping PDB IDs (file name without the ``.pdb`` extension) to species
		codes; matching records get the ID ``"<pdb_id>|<species>"``

	Returns:
	--------
	list: List of SeqRecord objects that were written to FASTA (empty if none)
	"""

	# Standard amino acid 3-letter to 1-letter conversion
	aa_codes = {
		'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E',
		'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
		'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N',
		'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S',
		'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
	}

	records = []
	skipped = []  # files parsed successfully but contributing no sequence
	parser = PDB.PDBParser(QUIET=True)

	# glob returns paths in arbitrary order; sort so the FASTA record order is reproducible
	pdb_files = sorted(glob.glob(os.path.join(pdb_folder, "*.pdb")))

	for pdb_file in pdb_files:
		try:
			# Strip only the trailing extension (str.replace would remove every '.pdb' in the name)
			pdb_id = os.path.splitext(os.path.basename(pdb_file))[0]

			structure = parser.get_structure(pdb_id, pdb_file)

			# Only the first model is used
			model = structure[0]

			if chain_id not in model:
				skipped.append(pdb_file)
				continue

			# Translate standard residues; anything not in aa_codes is dropped
			residue_names = (residue.get_resname() for residue in model[chain_id])
			sequence = ''.join(aa_codes[name] for name in residue_names if name in aa_codes)

			if not sequence:
				skipped.append(pdb_file)
				continue

			# Create record ID with species code if available
			if species_mapping and pdb_id in species_mapping:
				record_id = f"{pdb_id}|{species_mapping[pdb_id]}"
				description = f"Chain {chain_id} sequence from {pdb_file} - Species: {species_mapping[pdb_id]}"
			else:
				record_id = pdb_id
				description = f"Chain {chain_id} sequence from {pdb_file}"

			record = SeqRecord(
				Seq(sequence),
				id=record_id,
				description=description
			)
			records.append(record)

		except Exception as e:
			# Best effort: one unreadable structure must not abort the whole family
			print(f"Error processing {pdb_file}: {str(e)}")

	if skipped:
		print(f"Skipped {len(skipped)} structure(s) without a usable chain {chain_id}")

	# Write all sequences to FASTA file
	if records:
		with open(output_fasta, 'w') as handle:
			SeqIO.write(records, handle, 'fasta')
		print(f"Created FASTA file with {len(records)} sequences at {output_fasta}")
	else:
		print("No sequences were extracted from PDB files")

	return records

In [33]:
#create a FASTA file for each marker gene family and use the species mapping from the sub dataframe

for sub in mapping_df['oma_group'].unique():
	sub_df = mapping_df[mapping_df['oma_group'] == sub]
	struct_folder = sub_df['struct_folder'].iloc[0]  # Use the first struct folder for this family
	output_fasta = f"./families/Information_benchmark/marker_genes/{sub}/sequences.fasta"
	
	# Create species mapping for this family: '1stref' id -> first five characters of the OMA id
	species_mapping = dict(zip(sub_df['1stref'], sub_df['omaid'].apply(lambda x: x[:5])))
	# Honour the notebook-wide `overwrite` flag, like the structure-encoding cell does,
	# so that overwrite = True actually regenerates an existing sequences.fasta
	if overwrite or not os.path.exists(output_fasta):
		print(f"Creating FASTA file for {sub} in {output_fasta}")
		create_fasta_from_pdbs(struct_folder, output_fasta, chain_id='A', species_mapping=species_mapping)
	else:
		print(f"FASTA file already exists for {sub}, skipping creation")

FASTA file already exists for 1428576, skipping creation
FASTA file already exists for 966408, skipping creation
FASTA file already exists for 1067505, skipping creation
FASTA file already exists for 1314601, skipping creation
FASTA file already exists for 1067338, skipping creation
FASTA file already exists for 848449, skipping creation
FASTA file already exists for 1392015, skipping creation
FASTA file already exists for 756269, skipping creation
FASTA file already exists for 1344491, skipping creation
FASTA file already exists for 1372179, skipping creation
FASTA file already exists for 1067608, skipping creation
FASTA file already exists for 1324983, skipping creation
FASTA file already exists for 1427916, skipping creation
FASTA file already exists for 1067292, skipping creation
FASTA file already exists for 1384924, skipping creation
FASTA file already exists for 770664, skipping creation
FASTA file already exists for 1396383, skipping creation
FASTA file already exists for 10132

In [34]:
 #use normal mafft to align the sequences
import glob
from concurrent.futures import ThreadPoolExecutor
aln = False  # set to True to run the alignments; False leaves existing alignments untouched
if aln:
	# Get all FASTA files in the marker gene families
	fasta_files = glob.glob("./families/Information_benchmark/marker_genes/*/sequences.fasta")
	print(f"Found {len(fasta_files)} FASTA files for alignment")
	# Align all FASTA files in parallel
	max_workers = 10  # number of alignments allowed to run concurrently

	with ThreadPoolExecutor(max_workers=max_workers) as executor:
		# Executor.map consumes its input iterable immediately when submitting the tasks,
		# so a tqdm around the *input* would hit 100% before any alignment finishes.
		# Wrapping the result iterator makes the bar advance as alignments complete.
		results = list(tqdm.tqdm(executor.map(align_fasta_with_mafft, fasta_files), total=len(fasta_files)))

In [36]:
#for each marker gene, encode the structs, align and make a tree using treebuilder
import foldtree2 as ft2

# For every OMA group: encode its structures, align the encodings and build a
# tree through the treebuilder instance `tb`, unless the best tree is already
# on disk and `overwrite` is off.
for fam in tqdm.tqdm(mapping_df['oma_group'].unique()):
	fam_df = mapping_df[mapping_df['oma_group'] == fam]
	# All rows of a family are taken to share one structure folder; use the first.
	struct_folder = fam_df['struct_folder'].iloc[0]
	# The RAxML best-tree file marks a family as already processed.
	outfile = f"./families/Information_benchmark/marker_genes/{fam}/encoded.ASCIIaln.txt.raxml.bestTree"
	outdir = os.path.dirname(outfile)

	# Guard clause: nothing to do when the tree exists and no overwrite is requested.
	if os.path.exists(outfile) and not overwrite:
		print(f"Output file {outfile} already exists, skipping encoding for {fam}")
		continue

	outfasta = os.path.join(outdir, 'encoded.fasta')
	print(f"Encoding structures for {fam} into {outfasta}")

	try:
		results = tb.structs2tree(
			structs=struct_folder+'/*.pdb',
			outdir=outdir,
			overwrite=overwrite,
			mafftmat=mafftmat,
			submat=submat,
			aapropcsv=aapropcsv,
		)
		# results is a dictionary with keys 'encoded_fasta', 'tree',
		# 'alignment', 'mafft_aln', 'asciifile', 'hexfasta'
		encoded_fasta = results['encoded_fasta']
		treefile = results['tree']
		alnfasta = results['alignment']
		mafftaln = results['mafft_aln']
		asciifile = results['asciifile']
		hexfasta = results['hexfasta']
		print(f"Encoded FASTA: {encoded_fasta}")
		print(f"Tree file: {treefile}")
		print(f"Alignment FASTA: {alnfasta}")
		print(f"MAFFT alignment: {mafftaln}")
		print(f"ASCII file: {asciifile}")
		print(f"Hex FASTA: {hexfasta}")
	except Exception as e:
		# Best effort: report the failure and move on to the next family.
		print(f"Error processing {fam}: {str(e)}")

 86%|▊| 428/500 [00

Output file ./families/Information_benchmark/marker_genes/1428576/encoded.ASCIIaln.txt.raxml.bestTree already exists, skipping encoding for 1428576
Output file ./families/Information_benchmark/marker_genes/966408/encoded.ASCIIaln.txt.raxml.bestTree already exists, skipping encoding for 966408
Output file ./families/Information_benchmark/marker_genes/1067505/encoded.ASCIIaln.txt.raxml.bestTree already exists, skipping encoding for 1067505
Output file ./families/Information_benchmark/marker_genes/1314601/encoded.ASCIIaln.txt.raxml.bestTree already exists, skipping encoding for 1314601
Output file ./families/Information_benchmark/marker_genes/1067338/encoded.ASCIIaln.txt.raxml.bestTree already exists, skipping encoding for 1067338
Output file ./families/Information_benchmark/marker_genes/848449/encoded.ASCIIaln.txt.raxml.bestTree already exists, skipping encoding for 848449
Output file ./families/Information_benchmark/marker_genes/1392015/encoded.ASCIIaln.txt.raxml.bestTree already exists



converting structures



 86%|▊| 428/500 [00
[A
[A
100%|█| 3/3 [00:56<
Encoding structures to FASTA: 3it [00:56, 18.89s/it]


converting to hex for mafft
outfile for hex : ./families/Information_benchmark/marker_genes/768489/encoded.hex
converting to ascii for mafft
outfile for ascii : ./families/Information_benchmark/marker_genes/768489/encoded.ASCII
./foldtree2/mafft_tools/hex2maffttext ./families/Information_benchmark/marker_genes/768489/encoded.hex > ./families/Information_benchmark/marker_genes/768489/encoded.ASCII
asciifile: ./families/Information_benchmark/marker_genes/768489/encoded.ASCII
running mafft
mafft --text --thread -1 --localpair --maxiterate 1000 --textmatrix models/mergeddecoder_foldtree2_test_mafftmat.mtx ./families/Information_benchmark/marker_genes/768489/encoded.ASCII  > ./families/Information_benchmark/marker_genes/768489/encoded.ASCIIaln.txt


OS = linux
The number of physical cores =  16
outputhat23=16
treein = 0
compacttree = 0
stacksize: 8192 kb
nalphabets = 256
nused=
All-to-all alignment.
tbfast-pair (text) Version 7.526
alg=L, model=Extended, 2.00, -0.10, +0.10, noshift, amax=0.0
16 thread(s)

outputhat23=16
Loading 'hat3.seed' ... 
done.
Writing hat3 for iterative refinement
nalphabets = 256
nused=
Gap Penalty = -1.53, +0.00, +0.00
tbutree = 1, compacttree = 0
Constructing a UPGMA tree ... 
    0 / 3
done.

Progressive alignment ... 
STEP     2 /2 (thread    1) 
done.
tbfast (text) Version 7.526
alg=A, model=Extended, 1.53, -0.00, -0.00, noshift, amax=0.0
16 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 8
randomseed = 0
blosum -2 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
nadd = 16
Loading 'hat3' ... done.
nalphabets = 256
nused=

    0 / 3
Segment   1/  1    1-2702
002-0002-1 (thread    5) identical     001-0001-1 (thread    4) identical     001-0002-1 (thread    1) identi

converting mafft aln to hex fasta
./foldtree2/mafft_tools/maffttext2hex ./families/Information_benchmark/marker_genes/768489/encoded.ASCIIaln.txt > ./families/Information_benchmark/marker_genes/768489/encoded.ASCIIaln.txt.hex
running raxml-ng
./foldtree2/raxml-ng/raxml-ng --model MULTI22_GTR{models/mergeddecoder_foldtree2_test_submat.txt} --redo  --all --bs-trees 20 --seed 12345 --threads auto{32} --workers auto --msa ./families/Information_benchmark/marker_genes/768489/encoded.ASCIIaln.txt.raxml_aln.fasta --prefix ./families/Information_benchmark/marker_genes/768489/encoded.ASCIIaln.txt --force perf_threads

RAxML-NG v. 1.2.2-master released on 30.04.2024 by The Exelixis Lab.
Developed by: Alexey M. Kozlov and Alexandros Stamatakis.
Contributors: Diego Darriba, Tomas Flouri, Benoit Morel, Sarah Lutteropp, Ben Bettisworth, Julia Haag, Anastasis Togkousidis.
Latest version: https://github.com/amkozlov/raxml-ng
Questions/problems/suggestions? Please visit: https://groups.google.com/forum



converting structures



[A
[A
[A
[A
[A
100%|█| 5/5 [00:30<
Encoding structures to FASTA: 5it [00:30,  6.10s/it]


converting to hex for mafft
outfile for hex : ./families/Information_benchmark/marker_genes/1396892/encoded.hex
converting to ascii for mafft
outfile for ascii : ./families/Information_benchmark/marker_genes/1396892/encoded.ASCII
./foldtree2/mafft_tools/hex2maffttext ./families/Information_benchmark/marker_genes/1396892/encoded.hex > ./families/Information_benchmark/marker_genes/1396892/encoded.ASCII
asciifile: ./families/Information_benchmark/marker_genes/1396892/encoded.ASCII
running mafft
mafft --text --thread -1 --localpair --maxiterate 1000 --textmatrix models/mergeddecoder_foldtree2_test_mafftmat.mtx ./families/Information_benchmark/marker_genes/1396892/encoded.ASCII  > ./families/Information_benchmark/marker_genes/1396892/encoded.ASCIIaln.txt


OS = linux
The number of physical cores =  16
outputhat23=16
treein = 0
compacttree = 0
stacksize: 8192 kb
nalphabets = 256
nused=
All-to-all alignment.
tbfast-pair (text) Version 7.526
alg=L, model=Extended, 2.00, -0.10, +0.10, noshift, amax=0.0
16 thread(s)

outputhat23=16
Loading 'hat3.seed' ... 
done.
Writing hat3 for iterative refinement
nalphabets = 256
nused=
Gap Penalty = -1.53, +0.00, +0.00
tbutree = 1, compacttree = 0
Constructing a UPGMA tree ... 
    0 / 5
done.

Progressive alignment ... 
STEP     1 /4 (thread    1) 
Reallocating (by thread 5) ..done. *alloclen = 3726
STEP     4 /4 (thread    6) 
done.
tbfast (text) Version 7.526
alg=A, model=Extended, 1.53, -0.00, -0.00, noshift, amax=0.0
16 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 8
randomseed = 0
blosum -2 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
nadd = 16
Loading 'hat3' ... done.
nalphabets = 256
nused=

    0 / 5
Segment   1/  1    1-1474
004-0005-0 (thread    5) ide

converting mafft aln to hex fasta
./foldtree2/mafft_tools/maffttext2hex ./families/Information_benchmark/marker_genes/1396892/encoded.ASCIIaln.txt > ./families/Information_benchmark/marker_genes/1396892/encoded.ASCIIaln.txt.hex
running raxml-ng
./foldtree2/raxml-ng/raxml-ng --model MULTI22_GTR{models/mergeddecoder_foldtree2_test_submat.txt} --redo  --all --bs-trees 20 --seed 12345 --threads auto{32} --workers auto --msa ./families/Information_benchmark/marker_genes/1396892/encoded.ASCIIaln.txt.raxml_aln.fasta --prefix ./families/Information_benchmark/marker_genes/1396892/encoded.ASCIIaln.txt --force perf_threads

RAxML-NG v. 1.2.2-master released on 30.04.2024 by The Exelixis Lab.
Developed by: Alexey M. Kozlov and Alexandros Stamatakis.
Contributors: Diego Darriba, Tomas Flouri, Benoit Morel, Sarah Lutteropp, Ben Bettisworth, Julia Haag, Anastasis Togkousidis.
Latest version: https://github.com/amkozlov/raxml-ng
Questions/problems/suggestions? Please visit: https://groups.google.com/f

100%|▉| 499/500 [01

[00:00:00] [worker #17] Bootstrap tree #18, logLikelihood: -15204.927209
[00:00:00] [worker #16] Bootstrap tree #17, logLikelihood: -15159.241963
[00:00:00] [worker #8] Bootstrap tree #9, logLikelihood: -14891.741727
[00:00:00] [worker #19] Bootstrap tree #20, logLikelihood: -15067.664345
[00:00:00] [worker #3] Bootstrap tree #4, logLikelihood: -15359.332633
[00:00:00] [worker #13] Bootstrap tree #14, logLikelihood: -15232.471873
[00:00:00] [worker #5] Bootstrap tree #6, logLikelihood: -15237.010550
[00:00:00] [worker #0] Bootstrap tree #1, logLikelihood: -15210.814649
[00:00:00] [worker #10] Bootstrap tree #11, logLikelihood: -15115.434352
[00:00:00] [worker #15] Bootstrap tree #16, logLikelihood: -14937.145286
[00:00:00] [worker #18] Bootstrap tree #19, logLikelihood: -15202.776446
[00:00:00] [worker #9] Bootstrap tree #10, logLikelihood: -15266.138264
[00:00:01] [worker #12] Bootstrap tree #13, logLikelihood: -15180.071846
[00:00:01] [worker #6] Bootstrap tree #7, logLikelihood: -14



converting structures



[A
[A
[A
[A
[A
100%|█| 5/5 [00:42<
Encoding structures to FASTA: 5it [00:42,  8.58s/it]


converting to hex for mafft
outfile for hex : ./families/Information_benchmark/marker_genes/1424224/encoded.hex
converting to ascii for mafft
outfile for ascii : ./families/Information_benchmark/marker_genes/1424224/encoded.ASCII
./foldtree2/mafft_tools/hex2maffttext ./families/Information_benchmark/marker_genes/1424224/encoded.hex > ./families/Information_benchmark/marker_genes/1424224/encoded.ASCII
asciifile: ./families/Information_benchmark/marker_genes/1424224/encoded.ASCII
running mafft
mafft --text --thread -1 --localpair --maxiterate 1000 --textmatrix models/mergeddecoder_foldtree2_test_mafftmat.mtx ./families/Information_benchmark/marker_genes/1424224/encoded.ASCII  > ./families/Information_benchmark/marker_genes/1424224/encoded.ASCIIaln.txt


OS = linux
The number of physical cores =  16
outputhat23=16
treein = 0
compacttree = 0
stacksize: 8192 kb
nalphabets = 256
nused=
All-to-all alignment.
tbfast-pair (text) Version 7.526
alg=L, model=Extended, 2.00, -0.10, +0.10, noshift, amax=0.0
16 thread(s)

outputhat23=16
Loading 'hat3.seed' ... 
done.
Writing hat3 for iterative refinement
nalphabets = 256
nused=
Gap Penalty = -1.53, +0.00, +0.00
tbutree = 1, compacttree = 0
Constructing a UPGMA tree ... 
    0 / 5
done.

Progressive alignment ... 
STEP     2 /4 (thread    0) 
Reallocating (by thread 3) ..done. *alloclen = 4547
STEP     4 /4 (thread    2) 
done.
tbfast (text) Version 7.526
alg=A, model=Extended, 1.53, -0.00, -0.00, noshift, amax=0.0
16 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 8
randomseed = 0
blosum -2 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
nadd = 16
Loading 'hat3' ... done.
nalphabets = 256
nused=

    0 / 5
Segment   1/  1    1-1932
004-0006-1 (thread    3) wor

converting mafft aln to hex fasta
./foldtree2/mafft_tools/maffttext2hex ./families/Information_benchmark/marker_genes/1424224/encoded.ASCIIaln.txt > ./families/Information_benchmark/marker_genes/1424224/encoded.ASCIIaln.txt.hex
running raxml-ng
./foldtree2/raxml-ng/raxml-ng --model MULTI22_GTR{models/mergeddecoder_foldtree2_test_submat.txt} --redo  --all --bs-trees 20 --seed 12345 --threads auto{32} --workers auto --msa ./families/Information_benchmark/marker_genes/1424224/encoded.ASCIIaln.txt.raxml_aln.fasta --prefix ./families/Information_benchmark/marker_genes/1424224/encoded.ASCIIaln.txt --force perf_threads

RAxML-NG v. 1.2.2-master released on 30.04.2024 by The Exelixis Lab.
Developed by: Alexey M. Kozlov and Alexandros Stamatakis.
Contributors: Diego Darriba, Tomas Flouri, Benoit Morel, Sarah Lutteropp, Ben Bettisworth, Julia Haag, Anastasis Togkousidis.
Latest version: https://github.com/amkozlov/raxml-ng
Questions/problems/suggestions? Please visit: https://groups.google.com/f

100%|█| 500/500 [02

[00:00:01] [worker #3] Bootstrap tree #20, logLikelihood: -19469.831330
[00:00:01] [worker #0] Bootstrap tree #17, logLikelihood: -19470.504756
[00:00:01] [worker #2] Bootstrap tree #19, logLikelihood: -19459.080079
[00:00:01] [worker #1] Bootstrap tree #18, logLikelihood: -19191.280608

Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: NONE
   Base frequencies (user): 0.022373 0.014977 0.037758 0.027621 0.000018 0.065800 0.050222 0.040233 0.055168 0.042505 0.052079 0.079477 0.015546 0.165890 0.025259 0.072871 0.059498 0.017111 0.020838 0.065841 0.012980 0.055936 
   Substitution rates (user): 0.066803 0.068463 0.114533 0.134028 0.052453 0.044379 0.077626 0.061272 0.057495 0.047960 0.053751 0.069987 0.036514 0.088754 0.043550 0.072451 0.075666 0.059274 0.062504 0.069100 0.062370 0.068490 0.066493 0.040043 0.047035 0.065493 0.043314 0.042310 0.053859 0.041257 0.055141 0.074390 0.051561 0.048722 0.070380 0.045608 0.042372 0.070640 0.034222 0.072247 0.037389 0.136




In [38]:
import glob
import os
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [39]:
# Collect the per-family sequence alignments. glob returns paths in arbitrary
# (filesystem-dependent) order, so sort to keep downstream processing
# reproducible between runs and machines.
# NOTE(review): the MAFFT helper output in this notebook reports writing
# "sequences.aligned.fa" -- confirm "aligned.fasta" is the intended file name.
alignment_files = sorted(glob.glob("./families/Information_benchmark/marker_genes/*/aligned.fasta"))
# Collect the per-family structural alignments (the FASTA files passed to
# RAxML-NG as --msa by the structure pipeline), sorted for the same reason.
structural_alignment_files = sorted(glob.glob("./families/Information_benchmark/marker_genes/*/encoded.ASCIIaln.txt.raxml_aln.fasta"))



In [None]:
# Create an amino-acid tree for each marker gene: align the family's
# sequences with MAFFT in normal mode, then run RAxML-NG on the alignment.
raxml_ng_path = "./foldtree2/raxml-ng/raxml-ng"  # loop-invariant, set once

for fam in tqdm.tqdm(mapping_df['oma_group'].unique()):
	fam_dir = f"./families/Information_benchmark/marker_genes/{fam}"
	raxml_tree_file = f"{fam_dir}/raxml_lg_tree.raxml.bestTree"

	# Skip families whose best tree is already on disk. The previous check
	# tested ".../{fam}/raxml", which does not match the "raxml_lg_tree" output
	# prefix, so every family was re-aligned and re-inferred on each run.
	# NOTE(review): with overwrite=True the run is repeated over existing
	# outputs -- confirm run_raxml_lg_alignment lets RAxML-NG overwrite them.
	if os.path.exists(raxml_tree_file) and not overwrite:
		print(f"RAxML tree {raxml_tree_file} already exists, skipping tree creation for {fam}")
		continue

	# FASTA of the family's sequences, written earlier from the PDB structures.
	output_fasta = f"{fam_dir}/sequences.fasta"
	# The second element of the helper's return value is used as the aligned
	# FASTA path; None signals a failed alignment.
	alnfast = align_fasta_with_mafft(output_fasta)[1]
	if alnfast is None:
		print(f"Alignment failed for {output_fasta}, skipping tree creation for {fam}")
		continue

	print(f"Creating RAxML tree for {fam} using aligned FASTA: {alnfast}")
	# Output files are written under the "raxml_lg_tree" prefix in the family
	# directory, which is where raxml_tree_file above is expected to appear.
	tree = run_raxml_lg_alignment(alnfast, output_prefix=f"{fam_dir}/raxml_lg_tree", raxml_path=raxml_ng_path)

  0%|                                                                     | 0/500 [00:00<?, ?it/s]

Aligning: ./families/Information_benchmark/marker_genes/1428576/sequences.fasta
Creating RAxML tree for 1428576 using aligned FASTA: families/Information_benchmark/marker_genes/1428576/sequences.aligned.fa
Running: ./foldtree2/raxml-ng/raxml-ng --msa families/Information_benchmark/marker_genes/1428576/sequences.aligned.fa --model LG+G+I --prefix ./families/Information_benchmark/marker_genes/1428576/raxml_lg_tree --threads 1

RAxML-NG v. 1.2.2-master released on 30.04.2024 by The Exelixis Lab.
Developed by: Alexey M. Kozlov and Alexandros Stamatakis.
Contributors: Diego Darriba, Tomas Flouri, Benoit Morel, Sarah Lutteropp, Ben Bettisworth, Julia Haag, Anastasis Togkousidis.
Latest version: https://github.com/amkozlov/raxml-ng
Questions/problems/suggestions? Please visit: https://groups.google.com/forum/#!forum/raxml

System: Intel(R) Xeon(R) Silver 4110 CPU @ 2.10GHz, 16 cores, 251 GB RAM

RAxML-NG was called at 20-Aug-2025 12:11:11 as follows:

./foldtree2/raxml-ng/raxml-ng --msa famil

  0%|                                                         | 1/500 [02:26<20:20:45, 146.79s/it]

All ML trees saved to: /home/dmoi/projects/foldtree2/families/Information_benchmark/marker_genes/1428576/raxml_lg_tree.raxml.mlTrees
Optimized model saved to: /home/dmoi/projects/foldtree2/families/Information_benchmark/marker_genes/1428576/raxml_lg_tree.raxml.bestModel

Execution log saved to: /home/dmoi/projects/foldtree2/families/Information_benchmark/marker_genes/1428576/raxml_lg_tree.raxml.log

Analysis started: 20-Aug-2025 12:11:11 / finished: 20-Aug-2025 12:13:33

Elapsed time: 141.513 seconds

Inferred tree saved to ./families/Information_benchmark/marker_genes/1428576/raxml_lg_tree.raxml.bestTree
Aligning: ./families/Information_benchmark/marker_genes/966408/sequences.fasta
Creating RAxML tree for 966408 using aligned FASTA: families/Information_benchmark/marker_genes/966408/sequences.aligned.fa
Running: ./foldtree2/raxml-ng/raxml-ng --msa families/Information_benchmark/marker_genes/966408/sequences.aligned.fa --model LG+G+I --prefix ./families/Information_benchmark/marker_gen

  0%|▏                                                        | 2/500 [03:48<15:02:19, 108.71s/it]

[00:01:20 -13663.158059] SLOW spr round 3 (radius: 15)
[00:01:20 -13663.158058] Model parameter optimization (eps = 0.100000)

[00:01:20] ML tree search #20, logLikelihood: -13663.157974


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.260802 (ML),  weights&rates: (0.250000,0.185899) (0.250000,0.543209) (0.250000,1.032631) (0.250000,2.238261) 
   P-inv (ML): 0.051430
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170

  1%|▎                                                         | 3/500 [04:30<10:48:19, 78.27s/it]

[00:00:41 -7929.039719] Model parameter optimization (eps = 0.100000)

[00:00:41] ML tree search #20, logLikelihood: -7929.035479


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 0.833069 (ML),  weights&rates: (0.250000,0.102574) (0.250000,0.420115) (0.250000,0.965713) (0.250000,2.511599) 
   P-inv (ML): 0.206759
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.509238

  1%|▍                                                        | 4/500 [06:59<14:36:36, 106.04s/it]

[00:02:24 -27317.160545] Model parameter optimization (eps = 0.100000)

[00:02:24] ML tree search #20, logLikelihood: -27317.107798


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.419552 (ML),  weights&rates: (0.250000,0.212623) (0.250000,0.574594) (0.250000,1.045326) (0.250000,2.167457) 
   P-inv (ML): 0.072716
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.5092

  1%|▌                                                        | 5/500 [08:42<14:25:40, 104.93s/it]

[00:01:40 -15995.060391] Model parameter optimization (eps = 0.100000)

[00:01:40] ML tree search #20, logLikelihood: -15995.060224


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.668925 (ML),  weights&rates: (0.250000,0.250278) (0.250000,0.614455) (0.250000,1.058974) (0.250000,2.076293) 
   P-inv (ML): 0.020842
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.5092

  1%|▋                                                         | 6/500 [10:02<13:14:17, 96.47s/it]

[00:01:18 -14129.999032] SLOW spr round 3 (radius: 15)
[00:01:18 -14129.999032] Model parameter optimization (eps = 0.100000)

[00:01:18] ML tree search #20, logLikelihood: -14129.998820


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.591991 (ML),  weights&rates: (0.250000,0.239190) (0.250000,0.603185) (0.250000,1.055396) (0.250000,2.102229) 
   P-inv (ML): 0.043435
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170

  1%|▊                                                         | 7/500 [10:42<10:40:01, 77.89s/it]


[00:00:38] ML tree search #20, logLikelihood: -6253.886677


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.253244 (ML),  weights&rates: (0.250000,0.184569) (0.250000,0.541570) (0.250000,1.031921) (0.250000,2.241940) 
   P-inv (ML): 0.063528
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.509238 0.191503 0.068427 2.145078 0.371004 0.089525 0.161787 4.008358 2.0006

  2%|▉                                                         | 8/500 [12:54<13:01:48, 95.34s/it]

[00:02:07 -23187.250154] Model parameter optimization (eps = 0.100000)

[00:02:07] ML tree search #20, logLikelihood: -23187.166812


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.036115 (ML),  weights&rates: (0.250000,0.144103) (0.250000,0.487344) (0.250000,1.005735) (0.250000,2.362817) 
   P-inv (ML): 0.023533
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.5092

  2%|█                                                         | 9/500 [14:38<13:22:42, 98.09s/it]

[00:01:40 -21927.142206] Model parameter optimization (eps = 0.100000)

[00:01:40] ML tree search #20, logLikelihood: -21927.137791


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.835160 (ML),  weights&rates: (0.250000,0.272771) (0.250000,0.636256) (0.250000,1.065259) (0.250000,2.025714) 
   P-inv (ML): 0.033615
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.5092

  2%|█▏                                                       | 10/500 [15:34<11:33:53, 84.97s/it]

[00:00:54 -7507.847897] SLOW spr round 3 (radius: 15)
[00:00:54 -7507.847890] Model parameter optimization (eps = 0.100000)

[00:00:54] ML tree search #20, logLikelihood: -7507.752232


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 0.517121 (ML),  weights&rates: (0.250000,0.036603) (0.250000,0.262946) (0.250000,0.832337) (0.250000,2.868113) 
   P-inv (ML): 0.140171
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  2%|█▎                                                       | 11/500 [17:00<11:35:44, 85.37s/it]

[00:01:24 -12906.995350] SLOW spr round 3 (radius: 15)
[00:01:24 -12906.994942] Model parameter optimization (eps = 0.100000)

[00:01:24] ML tree search #20, logLikelihood: -12906.994279


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.815390 (ML),  weights&rates: (0.250000,0.270196) (0.250000,0.633828) (0.250000,1.064601) (0.250000,2.031376) 
   P-inv (ML): 0.007764
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170

  2%|█▎                                                       | 12/500 [17:48<10:01:04, 73.90s/it]

[00:00:46 -8260.986224] SLOW spr round 3 (radius: 15)
[00:00:46 -8260.986224] Model parameter optimization (eps = 0.100000)

[00:00:46] ML tree search #20, logLikelihood: -8260.986206


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 2.417164 (ML),  weights&rates: (0.250000,0.338460) (0.250000,0.693121) (0.250000,1.077623) (0.250000,1.890796) 
   P-inv (ML): 0.035287
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  3%|█▌                                                        | 13/500 [18:01<7:29:27, 55.37s/it]

[00:00:12 -2244.456888] SLOW spr round 2 (radius: 10)
[00:00:12 -2244.456679] Model parameter optimization (eps = 0.100000)

[00:00:12] ML tree search #20, logLikelihood: -2244.456090


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 0.823449 (ML),  weights&rates: (0.250000,0.100538) (0.250000,0.416406) (0.250000,0.963246) (0.250000,2.519809) 
   P-inv (ML): 0.000000
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  3%|█▌                                                        | 14/500 [19:03<7:45:21, 57.45s/it]

[00:01:01 -9515.365337] SLOW spr round 3 (radius: 15)
[00:01:01 -9515.365311] Model parameter optimization (eps = 0.100000)

[00:01:01] ML tree search #20, logLikelihood: -9515.329370


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.261331 (ML),  weights&rates: (0.250000,0.185992) (0.250000,0.543324) (0.250000,1.032681) (0.250000,2.238004) 
   P-inv (ML): 0.013895
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  3%|█▋                                                        | 15/500 [19:29<6:27:04, 47.89s/it]

[00:00:25 -4200.931861] SLOW spr round 3 (radius: 15)
[00:00:25 -4200.931059] Model parameter optimization (eps = 0.100000)

[00:00:25] ML tree search #20, logLikelihood: -4200.900007


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 0.804591 (ML),  weights&rates: (0.250000,0.096535) (0.250000,0.408976) (0.250000,0.958219) (0.250000,2.536270) 
   P-inv (ML): 0.202881
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  3%|█▊                                                       | 16/500 [22:16<11:16:26, 83.86s/it]


[00:02:42] ML tree search #20, logLikelihood: -24699.114422


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.365374 (ML),  weights&rates: (0.250000,0.203756) (0.250000,0.564498) (0.250000,1.041430) (0.250000,2.190316) 
   P-inv (ML): 0.022328
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.509238 0.191503 0.068427 2.145078 0.371004 0.089525 0.161787 4.008358 2.000

  3%|█▉                                                       | 17/500 [23:33<10:56:56, 81.61s/it]

[00:01:15 -12398.595414] SLOW spr round 3 (radius: 15)
[00:01:15 -12398.595413] Model parameter optimization (eps = 0.100000)

[00:01:15] ML tree search #20, logLikelihood: -12398.595317


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 2.231950 (ML),  weights&rates: (0.250000,0.319475) (0.250000,0.677611) (0.250000,1.074841) (0.250000,1.928073) 
   P-inv (ML): 0.017550
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170

  4%|██                                                       | 18/500 [25:22<12:03:45, 90.09s/it]

[00:01:46 -18830.462786] SLOW spr round 3 (radius: 15)
[00:01:46 -18830.462778] Model parameter optimization (eps = 0.100000)

[00:01:47] ML tree search #20, logLikelihood: -18830.462519


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.336164 (ML),  weights&rates: (0.250000,0.198868) (0.250000,0.558802) (0.250000,1.039153) (0.250000,2.203177) 
   P-inv (ML): 0.033076
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170

  4%|██▏                                                      | 19/500 [26:13<10:28:14, 78.37s/it]

[00:00:47 -23000.781499] Model parameter optimization (eps = 0.100000)

[00:00:47] ML tree search #20, logLikelihood: -23000.754070


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 2.128711 (ML),  weights&rates: (0.250000,0.308170) (0.250000,0.668038) (0.250000,1.072900) (0.250000,1.950892) 
   P-inv (ML): 0.021426
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.5092

  4%|██▎                                                       | 20/500 [26:43<8:30:31, 63.82s/it]

[00:00:28 -6805.936052] SLOW spr round 2 (radius: 10)
[00:00:29 -6805.936051] Model parameter optimization (eps = 0.100000)

[00:00:29] ML tree search #20, logLikelihood: -6805.931293


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 0.878346 (ML),  weights&rates: (0.250000,0.112089) (0.250000,0.436864) (0.250000,0.976505) (0.250000,2.474543) 
   P-inv (ML): 0.220457
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  4%|██▍                                                      | 21/500 [28:29<10:09:52, 76.39s/it]

[00:01:43 -18427.150536] SLOW spr round 3 (radius: 15)
[00:01:43 -18427.150530] Model parameter optimization (eps = 0.100000)

[00:01:44] ML tree search #20, logLikelihood: -18427.144041


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.465108 (ML),  weights&rates: (0.250000,0.219882) (0.250000,0.582644) (0.250000,1.048305) (0.250000,2.149169) 
   P-inv (ML): 0.042440
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170

  4%|██▌                                                       | 22/500 [28:56<8:10:59, 61.63s/it]

[00:00:26 -4509.293947] SLOW spr round 3 (radius: 15)
[00:00:26 -4509.293946] Model parameter optimization (eps = 0.100000)

[00:00:26] ML tree search #20, logLikelihood: -4509.292906


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.549637 (ML),  weights&rates: (0.250000,0.232889) (0.250000,0.596615) (0.250000,1.053208) (0.250000,2.117287) 
   P-inv (ML): 0.070836
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  5%|██▋                                                       | 23/500 [29:30<7:02:32, 53.15s/it]

[00:00:32 -4480.970290] SLOW spr round 3 (radius: 15)
[00:00:32 -4480.970181] Model parameter optimization (eps = 0.100000)

[00:00:32] ML tree search #20, logLikelihood: -4480.504786


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 0.724418 (ML),  weights&rates: (0.250000,0.079410) (0.250000,0.374827) (0.250000,0.933592) (0.250000,2.612172) 
   P-inv (ML): 0.000000
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  5%|██▊                                                       | 24/500 [30:25<7:07:54, 53.94s/it]

[00:00:54 -7808.307658] SLOW spr round 3 (radius: 15)
[00:00:54 -7808.307650] Model parameter optimization (eps = 0.100000)

[00:00:54] ML tree search #20, logLikelihood: -7808.215314


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.012431 (ML),  weights&rates: (0.250000,0.139428) (0.250000,0.480457) (0.250000,1.002030) (0.250000,2.378086) 
   P-inv (ML): 0.038392
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  5%|██▉                                                       | 25/500 [30:59<6:18:35, 47.82s/it]

[00:00:32 -4563.626889] SLOW spr round 3 (radius: 15)
[00:00:32 -4563.626394] Model parameter optimization (eps = 0.100000)

[00:00:32] ML tree search #20, logLikelihood: -4563.623331


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.297302 (ML),  weights&rates: (0.250000,0.192245) (0.250000,0.550929) (0.250000,1.035913) (0.250000,2.220913) 
   P-inv (ML): 0.110174
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  5%|██▉                                                     | 26/500 [35:09<14:16:10, 108.38s/it]


[00:04:02] ML tree search #20, logLikelihood: -31939.454657


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.732363 (ML),  weights&rates: (0.250000,0.259091) (0.250000,0.623159) (0.250000,1.061584) (0.250000,2.056166) 
   P-inv (ML): 0.012489
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.509238 0.191503 0.068427 2.145078 0.371004 0.089525 0.161787 4.008358 2.000

  5%|███                                                     | 27/500 [36:38<13:29:32, 102.69s/it]

[00:01:27 -17577.561607] Model parameter optimization (eps = 0.100000)

[00:01:27] ML tree search #20, logLikelihood: -17577.560239


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.089437 (ML),  weights&rates: (0.250000,0.154446) (0.250000,0.502077) (0.250000,1.013371) (0.250000,2.330106) 
   P-inv (ML): 0.030925
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.5092

  6%|███▏                                                     | 28/500 [37:11<10:43:36, 81.81s/it]

[00:00:32 -4580.676074] SLOW spr round 3 (radius: 15)
[00:00:32 -4580.676074] Model parameter optimization (eps = 0.100000)

[00:00:32] ML tree search #20, logLikelihood: -4580.655573


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 0.850222 (ML),  weights&rates: (0.250000,0.106192) (0.250000,0.426595) (0.250000,0.969955) (0.250000,2.497258) 
   P-inv (ML): 0.118080
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  6%|███▎                                                     | 29/500 [39:18<12:29:19, 95.45s/it]

   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.509238 0.191503 0.068427 2.145078 0.371004 0.089525 0.161787 4.008358 2.000679 0.045376 0.612025 0.083688 0.062556 0.523386 5.243870 0.844926 0.927114 0.010690 0.015076 0.282959 0.025548 0.017416 0.394456 1.240275 0.425860 0.029890 0.135107 0.037967 0.084808 0.003499 0.569265 0.640543 0.320627 0.594007 0.013266 0.893680 1.105251 0.075382 2.784478 1.143480 0.670128 1.165532 1.959291 4.128591 0.267959 4.813505 0.072854 0.582457 3.234294 1.672569 0.035855 0.624294 1.223828 1.080136 0.236199 0.257336 0.210332 0.348847 0.423881 0.044265 0.069673 1.807177 0.173735 0.018811 0.419409 0.6119

  6%|███▎                                                    | 30/500 [41:35<14:05:21, 107.92s/it]


[00:02:15] ML tree search #20, logLikelihood: -14571.332691


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.164588 (ML),  weights&rates: (0.250000,0.168577) (0.250000,0.521190) (0.250000,1.022694) (0.250000,2.287539) 
   P-inv (ML): 0.121829
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.509238 0.191503 0.068427 2.145078 0.371004 0.089525 0.161787 4.008358 2.000

  6%|███▌                                                     | 31/500 [42:39<12:19:51, 94.65s/it]

[00:01:02 -8917.322517] SLOW spr round 3 (radius: 15)
[00:01:02 -8917.322515] Model parameter optimization (eps = 0.100000)

[00:01:02] ML tree search #20, logLikelihood: -8917.290901


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 0.548153 (ML),  weights&rates: (0.250000,0.042623) (0.250000,0.282194) (0.250000,0.852374) (0.250000,2.822809) 
   P-inv (ML): 0.156621
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  6%|███▌                                                    | 32/500 [46:00<16:26:16, 126.45s/it]


[00:03:15] ML tree search #20, logLikelihood: -30621.738712


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 2.149655 (ML),  weights&rates: (0.250000,0.310508) (0.250000,0.670039) (0.250000,1.073320) (0.250000,1.946133) 
   P-inv (ML): 0.025547
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.509238 0.191503 0.068427 2.145078 0.371004 0.089525 0.161787 4.008358 2.000

  7%|███▋                                                    | 33/500 [46:57<13:41:40, 105.57s/it]

[00:00:55 -9363.700701] SLOW spr round 3 (radius: 15)
[00:00:55 -9363.700694] Model parameter optimization (eps = 0.100000)

[00:00:55] ML tree search #20, logLikelihood: -9363.694488


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.729362 (ML),  weights&rates: (0.250000,0.258680) (0.250000,0.622759) (0.250000,1.061467) (0.250000,2.057094) 
   P-inv (ML): 0.017684
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  7%|███▊                                                    | 34/500 [49:12<14:48:58, 114.46s/it]

[00:02:11 -22344.805378] SLOW spr round 3 (radius: 15)
[00:02:12 -22344.805378] Model parameter optimization (eps = 0.100000)

[00:02:12] ML tree search #20, logLikelihood: -22344.805255


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.259846 (ML),  weights&rates: (0.250000,0.185731) (0.250000,0.543003) (0.250000,1.032542) (0.250000,2.238724) 
   P-inv (ML): 0.141751
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170

  7%|███▉                                                     | 35/500 [49:45<11:37:36, 90.01s/it]

[00:00:32 -4453.147589] SLOW spr round 3 (radius: 15)
[00:00:32 -4453.147589] Model parameter optimization (eps = 0.100000)

[00:00:32] ML tree search #20, logLikelihood: -4453.145011


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.110441 (ML),  weights&rates: (0.250000,0.158448) (0.250000,0.507604) (0.250000,1.016135) (0.250000,2.317813) 
   P-inv (ML): 0.147214
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  7%|████                                                     | 36/500 [51:17<11:40:58, 90.64s/it]

[00:01:30 -16326.515711] Model parameter optimization (eps = 0.100000)

[00:01:30] ML tree search #20, logLikelihood: -16326.448664


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 0.878616 (ML),  weights&rates: (0.250000,0.112145) (0.250000,0.436960) (0.250000,0.976566) (0.250000,2.474329) 
   P-inv (ML): 0.037380
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.5092

  7%|████▏                                                    | 37/500 [52:10<10:12:42, 79.40s/it]

[00:00:52 -8810.599248] SLOW spr round 3 (radius: 15)
[00:00:52 -8810.599170] Model parameter optimization (eps = 0.100000)

[00:00:52] ML tree search #20, logLikelihood: -8810.595593


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 2.226725 (ML),  weights&rates: (0.250000,0.318916) (0.250000,0.677143) (0.250000,1.074750) (0.250000,1.929191) 
   P-inv (ML): 0.024655
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  8%|████▍                                                     | 38/500 [53:24<9:58:51, 77.77s/it]

[00:01:12 -15700.917184] Model parameter optimization (eps = 0.100000)

[00:01:12] ML tree search #20, logLikelihood: -15700.916253


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.587188 (ML),  weights&rates: (0.250000,0.238482) (0.250000,0.602454) (0.250000,1.055156) (0.250000,2.103908) 
   P-inv (ML): 0.072327
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887 5.076149 0.528768 1.695752 0.541712 1.437645 4.5092

  8%|████▌                                                     | 39/500 [53:38<7:29:42, 58.53s/it]

[00:00:12 -1098.113317] SLOW spr round 2 (radius: 10)
[00:00:13 -1098.112968] SLOW spr round 3 (radius: 15)
[00:00:13 -1098.112948] Model parameter optimization (eps = 0.100000)

[00:00:13] ML tree search #20, logLikelihood: -1097.915940


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 0.677588 (ML),  weights&rates: (0.250000,0.069409) (0.250000,0.352761) (0.250000,0.916271) (0.250000,2.661559) 
   P-inv (ML): 0.100664
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722

  8%|████▋                                                     | 40/500 [54:15<6:39:50, 52.15s/it]

[00:00:36 -6076.156351] SLOW spr round 3 (radius: 15)
[00:00:36 -6076.156349] Model parameter optimization (eps = 0.100000)

[00:00:36] ML tree search #20, logLikelihood: -6076.154030


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.583218 (ML),  weights&rates: (0.250000,0.237896) (0.250000,0.601846) (0.250000,1.054957) (0.250000,2.105301) 
   P-inv (ML): 0.084032
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

  8%|████▊                                                     | 41/500 [55:55<8:29:22, 66.59s/it]

[00:01:38 -13668.889708] SLOW spr round 3 (radius: 15)
[00:01:38 -13668.889697] Model parameter optimization (eps = 0.100000)

[00:01:38] ML tree search #20, logLikelihood: -13668.885727


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.269482 (ML),  weights&rates: (0.250000,0.187419) (0.250000,0.545075) (0.250000,1.033434) (0.250000,2.234072) 
   P-inv (ML): 0.102162
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170

  8%|████▊                                                     | 42/500 [56:24<7:01:09, 55.17s/it]

[00:00:27 -5062.596760] SLOW spr round 2 (radius: 10)
[00:00:27 -5062.596561] Model parameter optimization (eps = 0.100000)

[00:00:27] ML tree search #20, logLikelihood: -5062.589476


Optimized model parameters:

   Partition 0: noname
   Rate heterogeneity: GAMMA (4 cats, mean),  alpha: 1.469454 (ML),  weights&rates: (0.250000,0.220565) (0.250000,0.583392) (0.250000,1.048576) (0.250000,2.147467) 
   P-inv (ML): 0.084817
   Base frequencies (model): 0.079066 0.055941 0.041977 0.053052 0.012937 0.040767 0.071586 0.057337 0.022355 0.062157 0.099081 0.064600 0.022951 0.042302 0.044040 0.061197 0.053287 0.012066 0.034155 0.069147 
   Substitution rates (model): 0.425093 0.276818 0.395144 2.489084 0.969894 1.038545 2.066040 0.358858 0.149830 0.395337 0.536518 1.124035 0.253701 1.177651 4.727182 2.139501 0.180717 0.218959 2.547870 0.751878 0.123954 0.534551 2.807908 0.363970 0.390192 2.426601 0.126991 0.301848 6.326067 0.484133 0.052722 0.332533 0.858151 0.578987 0.593607 0.314440 0.170887

In [None]:
def create_supermatrix(alignment_files, output_file , mapping_df=None):
	"""
	Create a supermatrix alignment from multiple gene alignments.

	Every readable FASTA alignment contributes one block of columns. Records are
	grouped by species, taken from the second "|"-separated field of the record
	id (the whole id is used when it contains no "|"). When a gene holds several
	records for the same species only the first one is kept, and species that are
	absent from a gene are padded with gaps ("-") over that gene's columns.

	Parameters:
	-----------
	alignment_files : list
		List of paths to alignment files in FASTA format. Missing or unreadable
		files are reported and skipped.
	output_file : str
		Path to save the concatenated alignment (written in FASTA format).
		The parent directory is created if it does not exist.
	mapping_df : pandas.DataFrame, optional
		DataFrame containing mapping information for structural alignments.
		Accepted for interface compatibility; it is not used by this function.

	Returns:
	--------
	tuple
		(concatenated_alignment, partition_info)
		- concatenated_alignment: The final MultipleSeqAlignment object, or None
		  when no input alignment could be used (nothing is written in that case)
		- partition_info: Dictionary mapping gene name (file name without
		  extension) to a (start, end) tuple of 1-based inclusive column
		  boundaries, for partition file creation
	"""
	# One entry per usable gene: (gene_name, n_columns, {species: aligned sequence})
	gene_blocks = []
	# Species in first-seen order so the output row order is deterministic
	species_order = []
	species_seen = set()

	for aln in alignment_files:
		if not os.path.exists(aln):
			print(f"Alignment file {aln} does not exist, skipping")
			continue
		#read the alignment and map each aln string to an identifier
		try:
			aln_obj = AlignIO.read(aln, "fasta")
			print(f"Read alignment {aln} with {len(aln_obj)} sequences and {aln_obj.get_alignment_length()} columns")
		except Exception as e:
			print(f"Error reading alignment {aln}: {str(e)}")
			continue

		#find which line to use for each identifier from the species
		seq_by_species = {}
		n_duplicates = 0
		for record in aln_obj:
			id_fields = record.id.split("|")
			# Fall back to the full id instead of raising IndexError on ids without "|"
			spec = id_fields[1] if len(id_fields) > 1 else record.id
			if spec in seq_by_species:
				# Keep a single representative per species so every row has the same length
				n_duplicates += 1
				continue
			seq_by_species[spec] = str(record.seq)
			if spec not in species_seen:
				species_seen.add(spec)
				species_order.append(spec)
		if n_duplicates:
			print(f"Alignment {aln}: ignored {n_duplicates} additional sequences from species already present")

		gene_name = os.path.splitext(os.path.basename(aln))[0]
		gene_blocks.append((gene_name, aln_obj.get_alignment_length(), seq_by_species))

	if not gene_blocks:
		print("No usable alignments found, supermatrix not created")
		return None, {}

	# Column boundaries of each gene inside the concatenated alignment
	partition_info = {}
	next_start = 1
	for idx, (gene_name, n_cols, _) in enumerate(gene_blocks):
		# Two files with the same base name would otherwise overwrite each other
		key = gene_name if gene_name not in partition_info else f"{gene_name}_{idx}"
		partition_info[key] = (next_start, next_start + n_cols - 1)
		next_start += n_cols

	records = []
	for spec in species_order:
		pieces = [seqs.get(spec, "-" * n_cols) for _, n_cols, seqs in gene_blocks]
		records.append(SeqRecord(Seq("".join(pieces)), id=spec, description=""))
	concatenated_alignment = MultipleSeqAlignment(records)

	out_dir = os.path.dirname(output_file)
	if out_dir:
		os.makedirs(out_dir, exist_ok=True)
	AlignIO.write(concatenated_alignment, output_file, "fasta")
	print(f"Wrote supermatrix with {len(records)} species and {next_start - 1} columns to {output_file}")

	return concatenated_alignment, partition_info

In [None]:
def run_site_likelihood_analysis(aln , tree , model,  output_prefix = None):
	"""
	Evaluate an alignment on a fixed tree with RAxML-NG, requesting site likelihoods.

	Parameters:
	-----------
	aln : str or path-like
		Alignment passed to raxml-ng via --msa
	tree : str or path-like
		Fixed tree passed to raxml-ng via --tree
	model : str
		Model specification passed to raxml-ng via --model
	output_prefix : str or path-like, optional
		Prefix passed to raxml-ng via --prefix (default: "./raxmlng_results/example").
		Its parent directory is created when it does not exist.

	Returns:
	--------
	None

	Raises:
	-------
	subprocess.CalledProcessError
		If raxml-ng exits with a non-zero status (the call uses check=True)
	"""
	print("Running site likelihood analysis...")
	if output_prefix is None:
		output_prefix = "./raxmlng_results/example"

	# A bare prefix such as "run1" has an empty dirname, and os.makedirs("")
	# raises FileNotFoundError, so only touch the directory when there is one.
	output_dir = os.path.dirname(output_prefix)
	if not output_dir:
		print("Output prefix has no directory component; using the current directory.")
	elif os.path.exists(output_dir):
		print(f"Output directory {output_dir} already exists.")
	else:
		print(f"Creating output directory: {output_dir}")
		os.makedirs(output_dir, exist_ok=True)

	# NOTE(review): verify the flag spelling against `raxml-ng --help`; the
	# per-site likelihood option may be spelled `--sitelh` in the installed version.
	# Arguments are coerced to str so pathlib.Path inputs work in the join below.
	cmd = [
		"raxml-ng",
		"--force",
		"--evaluate",
		"--msa", str(aln),
		"--model", str(model),
		"--tree", str(tree),
		"--site-lh",
		"--prefix", str(output_prefix)
	]
	print(f"Running: {' '.join(cmd)}")
	subprocess.run(cmd, check=True)




In [None]:
import re
from ete3 import Tree

# ASTER leaf-label format: <species_name>_<letter> -- one label per protein; the letter changes for each additional protein from the same species.

def convert_newick_to_aster_format_ete3(newick_file, mapping_df, output_file):
	"""
	Convert a Newick tree with UniProt codes to ASTER format using mapping_df and ete3.
	Each leaf is renamed to "<species>_<suffix>", where the suffix is unique per
	protein of the same species: A..Z for the first 26, then AA, AB, ...

	Parameters:
	-----------
	newick_file : str
		Path to input Newick file with UniProt codes as leaf names
	mapping_df : pandas.DataFrame
		DataFrame containing '1stref' (UniProt code) and 'species' (either a dict
		with a 'code' key or the species code itself)
	output_file : str
		Path to output Newick file in ASTER format

	Returns:
	--------
	str: Path to the output file
	"""
	def _letter_suffix(index):
		"""Return the spreadsheet-style label for a 0-based index (A..Z, AA, AB, ...)."""
		label = ""
		index += 1
		while index > 0:
			index, remainder = divmod(index - 1, 26)
			label = chr(ord('A') + remainder) + label
		return label

	# Build mapping: UniProt -> species code
	uniprot_to_species = {}
	for _, row in mapping_df.iterrows():
		species_entry = row['species']
		species_code = species_entry['code'] if isinstance(species_entry, dict) else species_entry
		uniprot_to_species[row['1stref']] = species_code

	# Load tree using ete3
	tree = Tree(newick_file, format=1)

	# Rename each leaf in place. Working on leaf objects rather than a
	# name -> label dict keeps labels unique even if two leaves share a name.
	# A plain chr(ord('A') + n) would run into '[', '\\', ']' after 26 proteins,
	# which are not safe inside Newick labels; _letter_suffix avoids that.
	proteins_per_species = {}
	for leaf in tree.iter_leaves():
		species = uniprot_to_species.get(leaf.name, "UNK")  # unmapped leaves share "UNK"
		index = proteins_per_species.get(species, 0)
		proteins_per_species[species] = index + 1
		leaf.name = f"{species}_{_letter_suffix(index)}"

	# Write output
	tree.write(outfile=output_file, format=1)
	return output_file


In [None]:
import subprocess
import os

#bin/astral4 -o OUTPUT_FILE INPUT_FILE 2>LOG_FILE
def run_astral(input_file, output_file, astral_path="astral.jar", log_file=None):
	"""
	Infer a species tree from gene trees by launching the ASTRAL jar with Java.

	NOTE(review): the comment preceding this function shows a different
	invocation (bin/astral4 -o OUTPUT_FILE INPUT_FILE); confirm which one the
	installed ASTRAL/ASTER build expects.

	Parameters:
	-----------
	input_file : str
		File passed to ASTRAL with -i (gene trees, one per line, Newick format)
	output_file : str
		File passed to ASTRAL with -o (species tree, Newick format)
	astral_path : str
		Location of the ASTRAL jar (default: 'astral.jar')
	log_file : str or None
		When given, stderr of the Java process is written to this file

	Returns:
	--------
	str or None: output_file if it exists after the run, otherwise None
	"""
	astral_cmd = ["java", "-jar", astral_path]
	astral_cmd += ["-i", input_file]
	astral_cmd += ["-o", output_file]

	if log_file is None:
		subprocess.run(astral_cmd, check=True)
	else:
		# Redirect stderr into the log for the duration of the run
		with open(log_file, "w") as log_handle:
			subprocess.run(astral_cmd, stderr=log_handle, check=True)

	if not os.path.exists(output_file):
		print("ASTRAL did not produce an output file.")
		return None

	print(f"ASTRAL tree saved to {output_file}")
	return output_file

In [None]:
import os
import subprocess
from pathlib import Path
import pandas as pd
import numpy as np
import re
from Bio import AlignIO

def extract_site_likelihoods(log_file):
	"""
	Read site-wise log-likelihood values from a text log.

	Everything up to and including the first line starting with 'Site ' is
	skipped. After that, the second whitespace-separated field of each line is
	collected as a float until a line containing 'Sum' is reached. Blank lines,
	repeated 'Site ' headers, lines with fewer than two fields and lines whose
	second field is not numeric are ignored.

	Parameters:
	-----------
	log_file : str
		Path to the log file containing site likelihoods

	Returns:
	--------
	list of floats: Site log-likelihood values (empty if no 'Site ' header exists)
	"""
	values = []

	with open(log_file, 'r') as handle:
		# Advance the file iterator past the first 'Site ' header line
		next((ln for ln in handle if ln.startswith('Site ')), None)

		for raw in handle:
			stripped = raw.strip()
			if stripped == '' or raw.startswith('Site '):
				continue
			if 'Sum' in raw:  # End of site likelihoods section
				break

			fields = stripped.split()
			if len(fields) < 2:
				continue
			try:
				values.append(float(fields[1]))
			except ValueError:
				pass

	return values

def create_column_likelihood_dataframe(alignment_file, tree_file, log_file,  output_dir=None):
	"""
	Pair every alignment column with the site log-likelihood parsed from a log.

	Sites are matched by position: column i of the alignment is paired with the
	i-th likelihood value. Only positions that have both a column and a
	likelihood are emitted, so the result has min(alignment length, number of
	likelihoods) rows.

	Parameters:
	-----------
	alignment_file : str
		Path to the aligned FASTA file
	tree_file : str
		Path to the tree file in Newick format (not used by this function)
	log_file : str
		Path to the log file handed to extract_site_likelihoods
	output_dir : str or None
		Defaults to the directory of alignment_file; not otherwise used here

	Returns:
	--------
	pandas.DataFrame: One row per site with 'Site' (1-based), 'Alignment_Column'
	(the column's characters joined in record order) and 'Log_Likelihood'
	"""
	if output_dir is None:
		output_dir = os.path.dirname(alignment_file)

	msa = AlignIO.read(alignment_file, "fasta")
	site_lnl = extract_site_likelihoods(log_file)

	rows = []
	# zip stops at the shorter of the two, mirroring the positional pairing
	for site_idx, lnl in zip(range(msa.get_alignment_length()), site_lnl):
		residues = ''.join(rec.seq[site_idx] for rec in msa)
		rows.append({
			'Site': site_idx + 1,
			'Alignment_Column': residues,
			'Log_Likelihood': lnl
		})

	return pd.DataFrame(rows)


In [None]:
def alignment_character_proportions(alignment):
	"""
	Quantify the proportion of each character at every position in an alignment.

	Parameters:
	-----------
	alignment : Bio.Align.MultipleSeqAlignment
		Alignment object (e.g., from AlignIO.read). Any iterable of records with
		a `.seq` that supports indexing and a `get_alignment_length()` method works.

	Returns:
	--------
	pandas.DataFrame: Rows are positions (0-based index named "Position"),
	columns are characters, values are proportions. Characters absent from a
	position have proportion 0.
	"""
	# Imported here so the function works on a fresh kernel: Counter is not
	# brought into scope by the notebook's visible import cells.
	from collections import Counter

	per_position = []
	for i in range(alignment.get_alignment_length()):
		counts = Counter(record.seq[i] for record in alignment)
		total = sum(counts.values())
		per_position.append({char: n / total for char, n in counts.items()})

	# Characters missing at a position come out as NaN; they mean "never observed"
	df = pd.DataFrame(per_position).fillna(0)
	df.index.name = "Position"
	return df


def discretize_proportions(df, granularity=10):
	"""
	Map each proportion in a DataFrame to the index of the bin it falls in.

	The interval [0, 1] is cut into `granularity` equal-width bins. Bins are
	closed on the right, so 0 maps to bin 0 and 1 maps to bin `granularity`.

	Parameters:
	-----------
	df : pandas.DataFrame
		DataFrame of character proportions (output of alignment_character_proportions)
	granularity : int
		Number of bins (e.g., 10 for 0.1 steps)

	Returns:
	--------
	pandas.DataFrame: Discretized proportions (integer bins)
	"""
	edges = np.linspace(0, 1, granularity + 1)

	def _bin_index(series):
		# right=True: a value equal to an edge belongs to the bin ending there
		return np.digitize(series, edges, right=True)

	return df.apply(_bin_index, axis=0)



In [None]:
from scipy.stats import entropy
import numpy as np

#concatenate the likelihoods and column proportions

#double integral over the likelihoods and proportions to get MI

def compute_mutual_information(proportions, likelihoods, bins=20):
	"""
	Compute mutual information between character proportions and site likelihoods.

	Both variables are discretized into `bins` equal-width bins (proportions over
	[0, 1], likelihoods over their observed range) and the mutual information of
	the resulting contingency table is returned.

	Parameters:
	-----------
	proportions : array-like
		Proportion values for a character across sites (e.g., column from proportions DataFrame)
	likelihoods : array-like
		Log-likelihood values for corresponding sites
	bins : int
		Number of bins to discretize both variables

	Returns:
	--------
	float: Mutual information (in nats). 0.0 is returned when there are no
	usable sites or when the likelihoods are constant, since a constant
	variable carries no information.

	Raises:
	-------
	ValueError
		If the two inputs do not have the same number of values
	"""
	prop = np.asarray(proportions, dtype=float).ravel()
	like = np.asarray(likelihoods, dtype=float).ravel()
	if prop.shape != like.shape:
		raise ValueError(
			f"proportions and likelihoods must have the same length, got {prop.size} and {like.size}"
		)

	# Drop sites where either value is NaN/inf; they cannot be binned
	usable = np.isfinite(prop) & np.isfinite(like)
	prop, like = prop[usable], like[usable]
	if prop.size == 0:
		return 0.0

	like_min, like_max = like.min(), like.max()
	if like_min == like_max:
		return 0.0

	# Bin the raw values once, with explicit edges. Binning pre-digitized indices
	# a second time merges neighbouring classes and biases the estimate.
	prop_edges = np.linspace(0, 1, bins + 1)
	like_edges = np.linspace(like_min, like_max, bins + 1)
	joint_hist, _, _ = np.histogram2d(prop, like, bins=[prop_edges, like_edges])

	total = joint_hist.sum()
	if total == 0:  # every proportion fell outside [0, 1]
		return 0.0
	joint_prob = joint_hist / total

	# Marginals, shaped so their product broadcasts to the joint table
	prop_prob = joint_prob.sum(axis=1, keepdims=True)
	like_prob = joint_prob.sum(axis=0, keepdims=True)
	expected = prop_prob * like_prob

	# Empty cells contribute nothing (0 * log 0 := 0)
	occupied = joint_prob > 0
	mi = np.sum(joint_prob[occupied] * np.log(joint_prob[occupied] / expected[occupied]))

	# Guard against tiny negative values from floating-point rounding
	return max(0.0, float(mi))

In [None]:
import glob
import os
import subprocess
from pathlib import Path


# NOTE(review): an unfinished "for fam in" statement stood here and made the
# whole cell a SyntaxError. It is dropped until the intended loop is written.

# Directory containing encoded FASTA files
encoded_dir = './families/encoded_fastas/'  # Change to your directory
os.makedirs(encoded_dir, exist_ok=True)

# Suffix given to alignment outputs; files carrying it are outputs, not inputs
aligned_suffix = ".aligned.fasta"

# Find all encoded FASTA files. "*.fasta" also matches "*.aligned.fasta" written
# by an earlier run, so those are filtered out to keep the cell re-runnable.
encoded_fastas = sorted(
	fasta_path for fasta_path in Path(encoded_dir).glob("*.fasta")
	if not fasta_path.name.endswith(aligned_suffix)
)
print(f"Found {len(encoded_fastas)} encoded FASTA files.")

# Align each encoded FASTA file using MAFFT via treebuilder's static method
for fasta_file in encoded_fastas:
	aligned_path = str(fasta_file.with_name(f"{fasta_file.stem}{aligned_suffix}"))
	# Honour the notebook-level `overwrite` flag: keep existing alignments unless asked
	if os.path.exists(aligned_path) and not overwrite:
		print(f"Skipping {fasta_file}: {aligned_path} already exists")
		continue
	tb.run_mafft_textaln(str(fasta_file), outaln=aligned_path, matrix=mafftmat)
	print(f"Aligned {fasta_file} -> {aligned_path}")




In [None]:
#be