In [13]:
# NOTE(review): hard-coded absolute path to one user's checkout, and a bare `cd` only works
# through IPython automagic. Every relative path used later in this notebook resolves
# against this directory, so it must be adjusted before running on another machine.
cd /home/dmoi/projects/foldtree2/

/home/dmoi/projects/foldtree2


# Information-Theoretic Benchmarking of Species Tree Inference

This notebook presents a comparative workflow for species tree inference using two different character sets: traditional amino acid sequences and foldtree-encoded structural features. The goal is to benchmark the phylogenetic signal carried by each character set using information-theoretic approaches.

**Workflow Overview:**
- Construct a species tree using a standard amino acid-based pipeline (multiple sequence alignment, concatenation, and maximum likelihood inference).
- Construct an equivalent species tree using foldtree-encoded data.
- For both trees, compute column-wise log-likelihoods and character frequencies.
- Quantify the information content and phylogenetic signal of each character set by analyzing the distribution of likelihoods and character frequencies.

This approach enables a direct comparison of how much evolutionary information is captured by sequence versus structure-based encodings, providing an objective benchmark for future phylogenomic analyses.

In [None]:
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
from pathlib import Path
import requests
import random
import subprocess
from pathlib import Path
import concurrent.futures
import multiprocessing
import pandas as pd
import requests
from Bio import SeqIO
import re
import time

In [15]:
import glob
# Raw marker-gene FASTA files. align_fasta_with_mafft writes '<stem>.aligned.fa' next to each
# input, and those outputs also match '*.fa', so they are filtered out here to keep the list
# stable across re-runs. Sorting makes slices such as markers[:10] reproducible, because
# glob.glob returns entries in arbitrary order.
markers = sorted(
	path
	for path in glob.glob( './families/marker_genes/marker_genes/*.fa')
	if not path.endswith('.aligned.fa')
)
print( f"Found {len(markers)} marker genes" )

Found 500 marker genes


In [None]:
# Configure the FoldTree2 treebuilder from a model path and its two companion matrix files
from foldtree2.ft2treebuilder import treebuilder

model_path = "../../models/your_trained_model"  # Path to your model (without .pkl)

# Both matrix files share the model's path prefix and differ only in their suffix
mafftmat = f"{model_path}_mafftmat.mtx"
submat = f"{model_path}_submat.txt"

# `tb` is the instance the encoded-FASTA alignment cell calls run_mafft_textaln on
tb = treebuilder(model=model_path, mafftmat=mafftmat, submat=submat)

In [None]:
def align_fasta_with_mafft(fasta_file):
	"""
	Align a single FASTA file using MAFFT

	The alignment is written next to the input as ``<stem>.aligned.fa``.
	MAFFT is started without a shell: the input path is passed as one argument
	and stdout is redirected to the output file from Python, so paths containing
	spaces or shell metacharacters are handled literally.

	Parameters:
	-----------
	fasta_file : str
		Path to input FASTA file

	Returns:
	--------
	tuple: (input_path, output_path, success_status)
		success_status is False when MAFFT exits non-zero, cannot be started,
		or the output file cannot be opened.
	"""
	# Create output filename - same directory but with .aligned.fa extension
	input_path = Path(fasta_file)
	output_path = input_path.with_name(f"{input_path.stem}.aligned.fa")

	# Argument list instead of a shell string: nothing in the path is interpreted by a shell
	cmd = ["mafft", "--auto", "--thread", "1", str(fasta_file)]
	print(f"Aligning: {fasta_file}")

	try:
		with open(output_path, "w") as out_handle:
			subprocess.run(cmd, check=True, stdout=out_handle, stderr=subprocess.PIPE)
		return (fasta_file, str(output_path), True)
	except (subprocess.CalledProcessError, OSError) as e:
		# OSError covers a missing mafft executable and an unwritable output location,
		# which the shell-based version reported as a non-zero exit status.
		print(f"Error aligning {fasta_file}: {e}")
		stderr_bytes = getattr(e, "stderr", None)
		if stderr_bytes:
			# Only the tail is shown; it is where the actual error message ends up
			tail = stderr_bytes.decode(errors="replace").strip().splitlines()[-5:]
			print("\n".join(tail))
		# Do not leave an empty or truncated alignment behind for later globbing
		try:
			output_path.unlink()
		except OSError:
			pass
		return (fasta_file, str(output_path), False)

# Get number of available cores (leave 1 core free for system processes)
max_workers = max(1, multiprocessing.cpu_count() - 1)
print(f"Using {max_workers} cores for alignments")

# Accumulators for alignment outcomes.
# NOTE(review): nothing in this cell fills them - the parallel alignment run itself
# is not implemented here.
aligned_files = []
failed_files = []

In [None]:
def concatenate_alignments(alignment_files, output_file, model="GTR+G", species_id_fn=None):
	"""
	Concatenate multiple alignment files into a single supermatrix alignment.

	Each readable alignment becomes one partition. A species missing from a gene is
	padded with gaps of that gene's length. Besides the FASTA supermatrix, a
	partition file ``<output_file>.partition`` is written for RAxML-NG.

	Parameters:
	-----------
	alignment_files : list
		List of paths to alignment files in FASTA format
	output_file : str
		Path to save the concatenated alignment
	model : str
		Model string written in front of every partition line. The default
		"GTR+G" is kept for backward compatibility; pass a model suited to the
		alphabet of the alignments being concatenated.
	species_id_fn : callable or None
		Maps a sequence record to the row identifier used to join genes.
		Default: the last '|'-separated field of ``record.id``.
		NOTE(review): for headers shaped like the example in
		extract_oma_ids_from_fasta ('MOUSE45461 | OMA754554 | ...') the record id
		looks like a per-protein identifier, which would give one row per
		protein instead of one per species - confirm and pass a custom function
		if so.

	Returns:
	--------
	tuple
		(concatenated_alignment, partition_info)
		- concatenated_alignment: The final MultipleSeqAlignment object
		- partition_info: Dictionary with gene boundaries for partition file creation
		(None, {}) when alignment_files is empty.
	"""
	if not alignment_files:
		print("No alignment files provided")
		return None, {}

	if species_id_fn is None:
		def species_id_fn(record):
			return record.id.split('|')[-1]

	# species id -> {gene name -> aligned sequence}
	all_species = {}
	partition_info = {}
	current_position = 1

	# Process each alignment file
	for i, aln_file in enumerate(alignment_files):
		try:
			gene_name = Path(aln_file).stem.replace('.aligned', '')
			if gene_name in partition_info:
				# A repeated name would overwrite the first partition while the column
				# counter keeps advancing, leaving coordinates past the real matrix.
				print(f"Skipping {aln_file}: gene name '{gene_name}' already used by an earlier alignment")
				continue
			alignment = AlignIO.read(aln_file, "fasta")
			aln_length = alignment.get_alignment_length()
			# Read the whole gene before touching shared state, so a failure halfway
			# through a file cannot register a partition without its sequences.
			gene_sequences = {species_id_fn(record): str(record.seq) for record in alignment}
		except Exception as e:
			print(f"Error processing {aln_file}: {str(e)}")
			continue

		partition_info[gene_name] = {
			'start': current_position,
			'end': current_position + aln_length - 1
		}
		for species_id, sequence in gene_sequences.items():
			all_species.setdefault(species_id, {})[gene_name] = sequence
		current_position += aln_length
		print(f"Processed alignment {i+1}/{len(alignment_files)}: {gene_name} ({aln_length} columns)")

	# Create the concatenated alignment
	gene_names = list(partition_info)
	# Gap block per gene, used for species that lack the gene
	gap_blocks = {
		gene: "-" * (pos['end'] - pos['start'] + 1)
		for gene, pos in partition_info.items()
	}
	concatenated_records = [
		SeqRecord(
			Seq("".join(genes.get(gene, gap_blocks[gene]) for gene in gene_names)),
			id=species_id,
			description=f"Concatenated {len(gene_names)} genes"
		)
		for species_id, genes in all_species.items()
	]
	concatenated_alignment = MultipleSeqAlignment(concatenated_records)

	# Save alignment to file
	with open(output_file, "w") as handle:
		AlignIO.write(concatenated_alignment, handle, "fasta")

	# Create a partition file for RAxML-NG
	partition_file = f"{output_file}.partition"
	with open(partition_file, "w") as handle:
		for gene, pos in partition_info.items():
			handle.write(f"{model}, {gene} = {pos['start']}-{pos['end']}\n")

	print(f"Created concatenated alignment with {len(concatenated_records)} species and {concatenated_alignment.get_alignment_length()} columns")
	print(f"Partition file saved to {partition_file}")
	return concatenated_alignment, partition_info

In [None]:
def extract_oma_ids_from_fasta(fasta_file):
	"""
	Extract identifiers from the headers of a FASTA file

	The identifier is the first '|'-separated field of each header, stripped of
	surrounding whitespace.
	ID format example:

	>MOUSE45461 | OMA754554 | COQ5_MOUSE | [Mus musculus]

	NOTE(review): for the example above this yields 'MOUSE45461', not the
	'OMA754554' field - confirm which of the two downstream code expects.

	Parameters:
	-----------
	fasta_file : str
		Path to FASTA file

	Returns:
	--------
	list of str: one identifier per record, in file order

	Parsing errors are not caught here; they propagate to the caller.
	"""
	oma_ids = []
	for record in SeqIO.parse(fasta_file, "fasta"):
		# record.description is the full header line, so the split sees every field
		first_field = record.description.split('|')[0]
		oma_ids.append(first_field.strip())
	return oma_ids


In [None]:
def map_oma_to_uniprot(oma_id, retry_limit=3, timeout=30):
	"""
	Map an OMA identifier to UniProt using the OMA API

	Parameters:
	-----------
	oma_id : str
		OMA identifier (e.g., OMA123456)
	retry_limit : int
		Maximum number of request attempts
	timeout : float
		Seconds to wait for the server on each attempt, so that an unresponsive
		server cannot block a worker thread indefinitely

	Returns:
	--------
	dict: Mapping information. On success the keys are 'oma_id', 'uniprot_id',
	'species', 'taxon_id', 'sequence_length' and 'protein_name'; if every attempt
	fails only 'oma_id' and 'uniprot_id' (None) are present.

	NOTE(review): the URL is always built as '.../api/protein/OMA<rest>', so an
	identifier without the 'OMA' prefix (e.g. 'MOUSE45461') is requested as
	'OMAMOUSE45461'. The response keys read below ('xrefs', 'species.name', ...)
	are likewise taken as-is from the original code - verify both against the
	OMA REST API.
	"""
	# Strip an optional 'OMA' / 'OMA:' prefix
	oma_numeric = re.sub(r'OMA:?', '', oma_id)

	# OMA API endpoint for protein information
	api_url = f"https://omabrowser.org/api/protein/OMA{oma_numeric}"
	uniprot_sources = ('UniProtKB/Swiss-Prot', 'UniProtKB/TrEMBL')

	for attempt in range(retry_limit):
		try:
			response = requests.get(api_url, timeout=timeout)
			if response.ok:
				data = response.json()
				# First cross-reference whose source is one of the UniProt databases
				uniprot_id = next(
					(xref.get('id') for xref in data.get('xrefs', [])
					 if xref.get('source') in uniprot_sources),
					None
				)
				species = data.get('species', {})
				return {
					'oma_id': f"OMA{oma_numeric}",
					'uniprot_id': uniprot_id,
					'species': species.get('name', ''),
					'taxon_id': species.get('taxon_id', ''),
					'sequence_length': len(data.get('sequence', '')),
					'protein_name': data.get('protein_name', '')
				}
			print(f"Error fetching data for {oma_id}: HTTP {response.status_code}")
			# Client errors other than 429 (rate limiting) will not change on a retry
			if 400 <= response.status_code < 500 and response.status_code != 429:
				break
		except Exception as e:
			print(f"Error mapping {oma_id} to UniProt: {str(e)}")

		# Wait before retrying, but not after the final attempt
		if attempt < retry_limit - 1:
			time.sleep(1)

	# Return minimal info if all attempts failed
	return {'oma_id': f"OMA{oma_numeric}", 'uniprot_id': None}

def process_marker_gene(marker_file):
	"""
	Look up every identifier found in one marker gene FASTA file.

	Identifiers come from extract_oma_ids_from_fasta and are resolved one at a
	time with map_oma_to_uniprot, pausing 0.2 s after each lookup.

	Returns a dict with the input path ('marker_file'), the file stem
	('gene_name') and the list of lookup results ('mappings').
	"""
	collected = []
	for identifier in extract_oma_ids_from_fasta(marker_file):
		collected.append(map_oma_to_uniprot(identifier))
		# Throttle consecutive requests to the API
		time.sleep(0.2)

	return dict(
		marker_file=marker_file,
		gene_name=Path(marker_file).stem,
		mappings=collected,
	)


In [None]:

# Process marker genes to extract OMA IDs and map them to UniProt
print(f"Processing {len(markers)} marker gene files...")

# Use max 8 workers to avoid overwhelming the API
max_workers = min(8, multiprocessing.cpu_count())
marker_data = []

# Process a sample of markers for testing (adjust as needed)
sample_markers = markers[:10]  # Process first 10 markers for testing

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
	# Keep the submitted path with each future so a failure can name its input
	futures = {executor.submit(process_marker_gene, marker): marker for marker in sample_markers}
	for future in concurrent.futures.as_completed(futures):
		try:
			result = future.result()
			marker_data.append(result)
			print(f"Processed {result['gene_name']}: {len(result['mappings'])} sequences mapped")
		except Exception as e:
			print(f"Error processing marker gene {futures[future]}: {str(e)}")

# Convert mappings to DataFrame for analysis.
# Each row is a copy, so the dicts stored in marker_data are not modified.
mapping_rows = [
	{**mapping, 'marker_gene': marker['gene_name']}
	for marker in marker_data
	for mapping in marker['mappings']
]

# Fixed column set: failed lookups only carry 'oma_id' and 'uniprot_id', so without it
# 'species' (and the other fields) can be missing entirely when no lookup succeeded.
mapping_columns = [
	'oma_id', 'uniprot_id', 'species', 'taxon_id',
	'sequence_length', 'protein_name', 'marker_gene'
]
mapping_df = pd.DataFrame(mapping_rows, columns=mapping_columns)

# Display summary of mappings
total_sequences = len(mapping_rows)
mapped_sequences = int(mapping_df['uniprot_id'].notna().sum())
# Guard the percentage against an empty result set
mapped_percent = mapped_sequences / total_sequences * 100 if total_sequences else 0.0
print(f"\nMapping summary:")
print(f"Total sequences processed: {total_sequences}")
print(f"UniProt mappings found: {mapped_sequences} ({mapped_percent:.1f}%)")
print(f"Unique species: {mapping_df['species'].nunique()}")

# Save mapping results
mapping_df.to_csv("oma_uniprot_mappings.csv", index=False)
print("Mapping data saved to oma_uniprot_mappings.csv")

In [None]:
def run_site_likelihood_analysis(aln , tree , model,  output_prefix = None):
	"""
	Run raxml-ng on a fixed tree with site likelihood output requested.

	Parameters:
	-----------
	aln : str or path-like
		Alignment passed to --msa
	tree : str or path-like
		Tree file passed to --tree
	model : str
		Model string passed to --model
	output_prefix : str or None
		Value for --prefix; defaults to "./raxmlng_results/example". The directory
		part is created if missing; a prefix without a directory part writes to
		the current working directory.

	Raises:
	-------
	subprocess.CalledProcessError if raxml-ng exits with a non-zero status.

	NOTE(review): the flag set (--force --evaluate --site-lh) is kept exactly as
	written; confirm the spelling of the site-likelihood option against the
	installed raxml-ng version.
	"""
	print("Running site likelihood analysis...")
	if output_prefix is None:
		output_prefix = "./raxmlng_results/example"
	output_prefix = str(output_prefix)

	# Ensure output directory exists
	import os
	output_dir = os.path.dirname(output_prefix)
	if not output_dir:
		# Bare prefix such as "run1": os.makedirs('') would raise, and there is
		# nothing to create because the files land in the working directory.
		pass
	elif os.path.exists(output_dir):
		print(f"Output directory {output_dir} already exists.")
	else:
		print(f"Creating output directory: {output_dir}")
		os.makedirs(output_dir, exist_ok=True)

	# str() so that pathlib.Path arguments work in both the join and the call
	cmd = [
		"raxml-ng",
		"--force",
		"--evaluate",
		"--msa", str(aln),
		"--model", str(model),
		"--tree", str(tree),
		"--site-lh",
		"--prefix", output_prefix
	]
	print(f"Running: {' '.join(cmd)}")
	subprocess.run(cmd, check=True)


In [None]:
#create a folder for the structures of each group of marker genes



In [None]:
#run the tree for the amino acid sequences

In [None]:
import os
import subprocess
from pathlib import Path
import pandas as pd
import numpy as np
import re
from Bio import AlignIO

def extract_site_likelihoods(log_file):
	"""
	Read per-site log-likelihood values from a text log

	Everything up to and including the first line that starts with 'Site ' is
	ignored. After that, the second whitespace-separated field of each line is
	collected as a float until a line containing 'Sum' is reached. Blank lines,
	further 'Site ' lines, lines with fewer than two fields and lines whose
	second field is not a number are skipped.

	Parameters:
	-----------
	log_file : str
		Path to the log file containing site likelihoods

	Returns:
	--------
	list of floats: Site log-likelihood values (empty if no 'Site ' header is found)
	"""
	values = []
	header_seen = False

	with open(log_file, 'r') as handle:
		for raw in handle:
			if not header_seen:
				# Still in the preamble: only watch for the table header
				header_seen = raw.startswith('Site ')
				continue

			if raw.startswith('Site ') or not raw.strip():
				continue
			if 'Sum' in raw:  # End of site likelihoods section
				break

			fields = raw.split()
			if len(fields) < 2:
				continue
			try:
				values.append(float(fields[1]))
			except ValueError:
				pass

	return values

def create_column_likelihood_dataframe(alignment_file, tree_file, log_file,  output_dir=None):
	"""
	Pair each alignment column with its site log-likelihood

	The alignment is read as FASTA and the likelihoods are parsed from log_file
	with extract_site_likelihoods. Rows are produced for the first
	min(alignment length, number of likelihoods) sites; any excess on either
	side is dropped without a message.

	Parameters:
	-----------
	alignment_file : str
		Path to the aligned FASTA file
	tree_file : str
		Path to the tree file in Newick format (not used by this function)
	log_file : str
		Path to the log file holding the site likelihoods
	output_dir : str or None
		Defaults to the directory of alignment_file (not used further here)

	Returns:
	--------
	pandas.DataFrame: columns 'Site' (1-based), 'Alignment_Column' (the residues
	of that column joined in record order) and 'Log_Likelihood'
	"""
	if output_dir is None:
		output_dir = os.path.dirname(alignment_file)

	msa = AlignIO.read(alignment_file, "fasta")
	site_lnl = extract_site_likelihoods(log_file)

	# Only sites that have both a column and a likelihood become rows
	n_rows = min(msa.get_alignment_length(), len(site_lnl))
	rows = [
		{
			'Site': idx + 1,
			'Alignment_Column': ''.join(rec.seq[idx] for rec in msa),
			'Log_Likelihood': site_lnl[idx]
		}
		for idx in range(n_rows)
	]
	return pd.DataFrame(rows)


In [None]:

#create supermatrix using alignment
#mafft with ft2
#mafft normal
#foldmason


#concat and run raxmlng for each super

#visualize the trees

#calculate likelihood scores for each column




## Align FoldTree2-encoded FASTA Files with MAFFT
This cell will align all encoded FASTA files in a directory using MAFFT, producing aligned FASTA files for downstream benchmarking.

In [None]:
import glob
import os
import subprocess
from pathlib import Path


# NOTE(review): an unfinished `for fam in` loop header stood here and made the whole cell a
# SyntaxError. It is removed until the intended per-family iteration is specified.

# Directory containing encoded FASTA files
encoded_dir = './families/encoded_fastas/'  # Change to your directory
os.makedirs(encoded_dir, exist_ok=True)

# Find all encoded FASTA inputs. Outputs of this cell ('<stem>.aligned.fasta') live in the same
# directory and also match '*.fasta'; they are skipped so a re-run does not align them again
# and produce '<stem>.aligned.aligned.fasta'. Sorted for a stable processing order.
encoded_fastas = sorted(
	path
	for path in Path(encoded_dir).glob("*.fasta")
	if not path.name.endswith(".aligned.fasta")
)
print(f"Found {len(encoded_fastas)} encoded FASTA files.")

# Align each encoded FASTA file with MAFFT through the treebuilder instance `tb`
for fasta_file in encoded_fastas:
	aligned_path = str(fasta_file.with_name(f"{fasta_file.stem}.aligned.fasta"))
	tb.run_mafft_textaln(str(fasta_file), outaln=aligned_path, matrix=mafftmat)
	print(f"Aligned {fasta_file} -> {aligned_path}")




In [None]:
#be