In [42]:

# Setup AlphaFold Benchmark

# 5 structural families

# collect input sets with good tax range

# use AA and foldtree 2 to reconstruct all internal nodes

#alphafold the amino acid sequences inferred from both

#recover plddt values for each ancestral reconstruction

#plddt as a measure of confidence in the ancestral reconstruction overall

#plddt ( mean, var , skew ) vs distance from root

## AlphaFold Benchmark Notebook Overview

This notebook benchmarks ancestral sequence reconstruction using AlphaFold.  
The workflow includes:

- Selecting 5 structural protein families with broad taxonomic diversity.
- Collecting input sets for each family.
- Reconstructing all internal ancestral nodes using two methods: amino acid (AA) and FoldTree2.
- Predicting structures for inferred ancestral sequences with AlphaFold.
- Extracting pLDDT confidence scores for each reconstruction.
- Analyzing pLDDT statistics (mean, variance, skewness) as a function of evolutionary distance from the root.

The goal is to assess the reliability of ancestral sequence reconstruction by comparing AlphaFold confidence metrics across different methods and evolutionary depths.

In [43]:
cd /home/dmoi/projects/foldtree2/

/home/dmoi/projects/foldtree2


In [44]:
overwrite = False
benchmark_folder='alphafold_benchmark'


In [45]:
#use autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
# List of eukaryotic species with broad taxonomic spread, emphasizing protists and early-branching lineages, with additional plants

eukaryotic_species = [
	# Excavata (early-branching protists)
	"Naegleria gruberi",      # Heterolobosea
	"Trypanosoma brucei",     # Euglenozoa
	"Giardia lamblia",        # Diplomonadida

	# Amoebozoa
	"Dictyostelium discoideum",  # Slime mold
	"Acanthamoeba castellanii",  # Free-living amoeba

	# Archaeplastida (plants and algae)
	"Chlamydomonas reinhardtii", # Green alga
	"Cyanidioschyzon merolae",   # Red alga
	"Arabidopsis thaliana",      # Land plant (model dicot)
	"Oryza sativa",              # Rice (monocot)
	"Physcomitrella patens",     # Moss (bryophyte)
	"Selaginella moellendorffii",# Lycophyte (early vascular plant)
	"Marchantia polymorpha",     # Liverwort (early land plant)

	# SAR (Stramenopiles, Alveolates, Rhizaria)
	"Thalassiosira pseudonana",  # Diatom (Stramenopile)
	"Plasmodium falciparum",     # Apicomplexan (Alveolate)
	"Tetrahymena thermophila",   # Ciliate (Alveolate)
	"Bigelowiella natans",       # Chlorarachniophyte (Rhizaria)

	# Opisthokonta
	"Saccharomyces cerevisiae",  # Fungi (yeast)
	"Neurospora crassa",         # Fungi (filamentous)
	"Monosiga brevicollis",      # Choanoflagellate
	"Homo sapiens",              # Animal (mammal)
	"Drosophila melanogaster",   # Animal (insect)
	"Caenorhabditis elegans",    # Animal (nematode)

	# Haptophytes & Cryptophytes (other early-branching groups)
	"Emiliania huxleyi",         # Haptophyte
	"Guillardia theta",          # Cryptophyte
]

print(eukaryotic_species)

from ete3 import NCBITaxa, Tree

ncbi = NCBITaxa()

# Get taxids for each species
species_to_taxid = {}
for sp in eukaryotic_species:
	try:
		taxid = ncbi.get_name_translator([sp])[sp][0]
		species_to_taxid[sp] = taxid
	except Exception as e:
		print(f"Could not find taxid for {sp}: {e}")

print("Species to TaxID mapping:")
for sp, taxid in species_to_taxid.items():
	print(f"{sp}: {taxid}")

# Get the NCBI tree topology for these species
taxid_list = list(species_to_taxid.values())
tree = ncbi.get_topology(taxid_list)
print(tree.get_ascii(attributes=["sci_name"]))

['Naegleria gruberi', 'Trypanosoma brucei', 'Giardia lamblia', 'Dictyostelium discoideum', 'Acanthamoeba castellanii', 'Chlamydomonas reinhardtii', 'Cyanidioschyzon merolae', 'Arabidopsis thaliana', 'Oryza sativa', 'Physcomitrella patens', 'Selaginella moellendorffii', 'Marchantia polymorpha', 'Thalassiosira pseudonana', 'Plasmodium falciparum', 'Tetrahymena thermophila', 'Bigelowiella natans', 'Saccharomyces cerevisiae', 'Neurospora crassa', 'Monosiga brevicollis', 'Homo sapiens', 'Drosophila melanogaster', 'Caenorhabditis elegans', 'Emiliania huxleyi', 'Guillardia theta']
Species to TaxID mapping:
Naegleria gruberi: 5762
Trypanosoma brucei: 5691
Giardia lamblia: 5741
Dictyostelium discoideum: 44689
Acanthamoeba castellanii: 5755
Chlamydomonas reinhardtii: 3055
Cyanidioschyzon merolae: 45157
Arabidopsis thaliana: 3702
Oryza sativa: 4530
Physcomitrella patens: 3218
Selaginella moellendorffii: 88036
Marchantia polymorpha: 3197
Thalassiosira pseudonana: 35128
Plasmodium falciparum: 5833


In [47]:
# Get distances from each internal node to the root in the ete3 tree

def get_internal_node_distances(tree):
	node_distances = {}
	root = tree
	for node in tree.traverse("postorder"):
		if not node.is_leaf():
			# ete3: get distance from node to root
			dist = 0.0
			current = node
			while current.up is not None:
				dist += current.dist if current.dist is not None else 0.0
				current = current.up
			node_distances[node.name if node.name else str(node)] = dist
	return node_distances

internal_node_distances = get_internal_node_distances(tree)
print(internal_node_distances)

{'2611352': 1.0, '1206794': 3.0, '33213': 2.0, '716545': 2.0, '33154': 1.0, '33630': 2.0, '2698737': 1.0, '554915': 1.0, '1437183': 4.0, '58023': 3.0, '3193': 2.0, '33090': 1.0, '2759': 0.0}


In [48]:
import requests
from io import StringIO
import pandas as pd
search_terms = ['rhodopsin', 'RuBisCO', 'FO ATP synthase', 'F1 ATP synthase' , 'Hap2' ]
def get_swissprot_eukaryota_entries(search_terms, reviewed=True, max_entries=1000):
	results = {}
	base_url = 'http://rest.uniprot.org/uniprotkb/stream?'
	search_terms = search_terms.replace(' ', '+')  # Replace spaces with '+' for URL encoding
	#query = f'{search_terms} AND taxonomy_id:2759'
	query = f'{search_terms} AND taxonomy_id:2759 AND database:alphafolddb'
	print(f"Querying UniProt for: {query}")
	if reviewed:
		query += ' AND reviewed:true'
	params = {
		'query': query,
	'fields': 'accession,id,protein_name,organism_name,sequence,lineage,',
		'format': 'tsv',
		'size': max_entries
	}
	response = requests.get(base_url, params=params)
	if response.status_code != 200:
		print(f"Error fetching data for {search_terms}: {response.status_code}")
		return None
	if not response.text.strip():
		print(f"No data returned for search term: {search_terms}")
		return None

	# Parse the response into a DataFrame
	results = pd.read_csv(StringIO(response.text), sep='\t')
	results['search_term'] = search_terms  # Add search term column
	results['taxonomy_id'] = 2759
	return results



In [49]:
import os
if overwrite:
	# Remove existing results folder if it exists
	import shutil
	if os.path.exists(benchmark_folder):
		shutil.rmtree(benchmark_folder)
		print(f"Removed existing folder: {benchmark_folder}")


In [50]:

import os
if not os.path.exists(benchmark_folder):
	os.makedirs(benchmark_folder)

for term in search_terms:
	print(f"\nSearch results for '{term}':")
	resultsdf = get_swissprot_eukaryota_entries(term)
	print(resultsdf)
	#save the results to a CSV file
	termfolder = term.replace(' ', '_')
	termfolder = os.path.join(benchmark_folder,termfolder)
	if not os.path.exists(termfolder):
		os.makedirs(termfolder)
	# Save the results to a CSV file
	results_file = os.path.join(termfolder, f"{term.replace(' ', '_')}_results.csv")
	resultsdf.to_csv(results_file, index=False)
	print(f"Results saved to {results_file}")



Search results for 'rhodopsin':
Querying UniProt for: rhodopsin AND taxonomy_id:2759 AND database:alphafolddb
           Entry   Entry Name  \
0     A0A0K3AWM6   MOM5_CAEEL   
1     A0A2R9YJI3  GPR22_DANRE   
2         A0T2N3   APJB_DANRE   
3         A1Z7G7   LPHN_DROME   
4         A2ARI4   LGR4_MOUSE   
...          ...          ...   
3182      Q09964   YS94_CAEEL   
3183      Q19473  SRD51_CAEEL   
3184      Q19474  SRD50_CAEEL   
3185      Q19508  SRD46_CAEEL   
3186      Q19975  SRD34_CAEEL   

                                          Protein names  \
0                                         Protein mom-5   
1                         G-protein coupled receptor 22   
2     Apelin receptor B (Angiotensin II receptor-lik...   
3                                      Latrophilin Cirl   
4     Leucine-rich repeat-containing G-protein coupl...   
...                                                 ...   
3182        Putative G-protein coupled receptor B0244.4   
3183  Serpentine rec

In [51]:

import glob
from src import AFDB_tools
import tqdm
#get the structures for each search term
if overwrite:
	for term in search_terms:
		termfolder = term.replace(' ', '_')
		structfolder = os.path.join(benchmark_folder, termfolder, 'input_structs')
		results_file = os.path.join(benchmark_folder , termfolder, f"{term.replace(' ', '_')}_results.csv")
		if not os.path.exists(results_file):
			print(f"Results file {results_file} does not exist. Skipping.")
			continue
		resultsdf = pd.read_csv(results_file)
		print(f"Processing results for {term} with {len(resultsdf)} entries.")
		print( len( glob.glob(os.path.join(structfolder, '*.pdb')) ), " structures already downloaded.")
		for index, row in tqdm.tqdm(resultsdf.iterrows() , total=len(resultsdf) , desc=f"Processing {term}"):
			uniprot_id = row['Entry']
			if not os.path.isfile(os.path.join(structfolder, uniprot_id + '.pdb')):
				AFDB_tools.grab_struct(uniprot_id , structfolder= structfolder + '/' , overwrite=False )

In [52]:
#use foldseek to cluster the structures
import os
import subprocess
overwrite = True  # Set to True to overwrite existing results
def run_foldseek(query_folder, target_folder, tmp_folder , foldseek_path='foldseek'):
	# Ensure output folder exists
	if not os.path.exists(output_folder):
		os.makedirs(output_folder)
		
	#command example:foldseek easy-cluster example/ res tmp -c 0.9 
	command = [foldseek_path, 'easy-cluster', query_folder, target_folder, tmp_folder, '-c', '0.9'
	]
	print(f"Running command: {' '.join(command)}")
	try:
		subprocess.run(command, check=True)
		print(f"Foldseek clustering completed successfully for {query_folder}.")
	except subprocess.CalledProcessError as e:
		print(f"Error running foldseek: {e}")
		raise

# Run foldseek clustering for each term
for term in search_terms:
	termfolder = term.replace(' ', '_')
	termfolder = os.path.join(benchmark_folder, termfolder)
	print(f"Processing term: {term} in folder: {termfolder}")
	structfolder = os.path.join(termfolder, 'input_structs')
	print(f"Input structures folder: {structfolder}")
	#print number of PDB files in the input folder
	pdb_files = glob.glob(os.path.join(structfolder, '*.pdb'))
	print(f"Number of PDB files in {structfolder}: {len(pdb_files)}")
	temp_folder = os.path.join(termfolder, 'tmp')
	print(f"Temporary folder for foldseek: {temp_folder}")
	if overwrite:
		# Remove existing output folder if it exists
		termfolder = term.replace(' ', '_')
		output_folder = os.path.join(benchmark_folder, termfolder,  'foldseek_output')
		if os.path.exists(output_folder):
			import shutil
			shutil.rmtree(output_folder)
			print(f"Removed existing output folder: {output_folder}")
	
	print(f"Running foldseek for {term} in {termfolder}")
	run_foldseek(structfolder, output_folder , tmp_folder=temp_folder)

Processing term: rhodopsin in folder: alphafold_benchmark/rhodopsin
Input structures folder: alphafold_benchmark/rhodopsin/input_structs
Number of PDB files in alphafold_benchmark/rhodopsin/input_structs: 3187
Temporary folder for foldseek: alphafold_benchmark/rhodopsin/tmp
Removed existing output folder: alphafold_benchmark/rhodopsin/foldseek_output
Running foldseek for rhodopsin in rhodopsin
Running command: foldseek easy-cluster alphafold_benchmark/rhodopsin/input_structs alphafold_benchmark/rhodopsin/foldseek_output alphafold_benchmark/rhodopsin/tmp -c 0.9
alphafold_benchmark/rhodopsin/foldseek_output exists and will be overwritten
easy-cluster alphafold_benchmark/rhodopsin/input_structs alphafold_benchmark/rhodopsin/foldseek_output alphafold_benchmark/rhodopsin/tmp -c 0.9 

MMseqs Version:                     	9.427df8a
Substitution matrix                 	aa:3di.out,nucl:3di.out
Seed substitution matrix            	aa:3di.out,nucl:3di.out
Sensitivity                         	4
k-

In [53]:
print("Foldseek clustering completed for all terms.")

Foldseek clustering completed for all terms.


In [34]:
#read input_structs_cluster.tsv and copy the cluster heads to a new folder
import pandas as pd
def copy_cluster_heads(input_file, input_folder , output_folder , verbose=True):
	if not os.path.exists(output_folder):
		os.makedirs(output_folder)
	df = pd.read_csv(input_file, sep='\t' , header=None)
	print(f"Read {len(df)} rows from {input_file}")
	if verbose:
		print("First few rows of the DataFrame:")
		print(df.head())
	# Get unique cluster IDs
	cluster_ids = df[0].unique()
	print(f"Found {len(cluster_ids)} unique clusters.")
	for cluster_id in cluster_ids:
		cluster_df = df[df[0] == cluster_id]
		if not cluster_df.empty:
			head_row = cluster_df.iloc[0]
			structure_path = os.path.join(input_folder, head_row[1] + '.pdb')
			if os.path.exists(structure_path):
				dest_path = os.path.join(output_folder, f"{head_row[0]}.pdb")
				print(f"Copying {structure_path} to {dest_path}")
				subprocess.run(['cp', structure_path, dest_path])
			else:
				print(f"Structure file {structure_path} does not exist. Skipping.")

for term in search_terms:
	termfolder = term.replace(' ', '_')
	termfolder = os.path.join(benchmark_folder, termfolder)
	# Define the input file path
	input_file = os.path.join(termfolder, 'foldseek_output_cluster.tsv')
	input_folder = os.path.join(termfolder, 'input_structs')
	output_folder = os.path.join(termfolder, 'cluster_heads')
	copy_cluster_heads(input_file, input_folder, output_folder, verbose=True)


Read 3187 rows from alphafold_benchmark/rhodopsin/foldseek_output_cluster.tsv
First few rows of the DataFrame:
        0       1
0  O55240  O55240
1  O60241  O60241
2  O60241  Q8CGM1
3  O60241  O14514
4  O60241  C0HL12
Found 178 unique clusters.
Copying alphafold_benchmark/rhodopsin/input_structs/O55240.pdb to alphafold_benchmark/rhodopsin/cluster_heads/O55240.pdb
Copying alphafold_benchmark/rhodopsin/input_structs/O60241.pdb to alphafold_benchmark/rhodopsin/cluster_heads/O60241.pdb
Copying alphafold_benchmark/rhodopsin/input_structs/O70430.pdb to alphafold_benchmark/rhodopsin/cluster_heads/O70430.pdb
Copying alphafold_benchmark/rhodopsin/input_structs/O70432.pdb to alphafold_benchmark/rhodopsin/cluster_heads/O70432.pdb
Copying alphafold_benchmark/rhodopsin/input_structs/O75154.pdb to alphafold_benchmark/rhodopsin/cluster_heads/O75154.pdb
Copying alphafold_benchmark/rhodopsin/input_structs/O77830.pdb to alphafold_benchmark/rhodopsin/cluster_heads/O77830.pdb
Copying alphafold_benchmark/

In [55]:
import glob
from Bio.PDB import PDBParser
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os
import tqdm
from scipy.stats import describe

#families directories
def pdb_to_fasta(pdb_file, fasta_file):
	parser = PDBParser(QUIET=True)
	three_to_one = {'ALA':'A', 'CYS':'C', 'ASP':'D', 'GLU':'E',
				'PHE':'F', 'GLY':'G', 'HIS':'H', 'ILE':'I',
				'LYS':'K', 'LEU':'L', 'MET':'M', 'ASN':'N',
				'PRO':'P', 'GLN':'Q', 'ARG':'R', 'SER':'S',
				'THR':'T', 'VAL':'V', 'TRP':'W', 'TYR':'Y'}

	try:
		structure = parser.get_structure('protein', pdb_file)
		# Get first chain sequence
		for model in structure:
			for chain in model:
				seq = ''
				for residue in chain:
					# Only process amino acid residues with CA atoms
					if 'CA' in residue and residue.get_resname() in three_to_one:
						try:
							aa = three_to_one.get(residue.get_resname(), 'X')
							seq += aa
						except Exception:
							# Use X for unknown/modified amino acids
							seq += 'X'
				if seq:  # Only create record if sequence was found
					pdb_id = os.path.splitext(os.path.basename(pdb_file))[0]
					record = SeqRecord(
						Seq(seq),
						id=f"{pdb_id}_{chain.id}",
						description=f"Chain {chain.id} from {pdb_id}"
					)
					return record
		return None
	except Exception as e:
		print(f"Error processing {pdb_file}: {e}")
		return None

def pdbs_to_fasta(pdb_files, fasta_file):
	with open(fasta_file, 'w') as fasta_out:
		for pdb_file in tqdm.tqdm(pdb_files, desc="Converting PDB to FASTA"):
			record = pdb_to_fasta(pdb_file, fasta_file)
			if record:
				SeqIO.write(record, fasta_out, "fasta")
	return fasta_file

fams = glob.glob('./alphafold_benchmark/*/')
for term in search_terms:
	termfolder = term.replace(' ', '_')
	fmt_term = term.replace(' ', '_')
	termfolder = os.path.join(benchmark_folder, termfolder)
	structs = glob.glob(termfolder + '/cluster_heads/*.pdb')
	if os.path.exists(termfolder + '/structs.fasta') and not overwrite:
		print(f"Skipping {termfolder} - fasta already exists.")
		continue
	fasta_file = termfolder + f'/{fmt_term}_AA.fasta'
	if os.path.exists(fasta_file):
		os.remove(fasta_file)
	pdbs_to_fasta(structs, fasta_file)
	print(f"Converted {len(structs)} PDB files to FASTA in {fasta_file}")

Converting PDB to FASTA: 100%|█| 178/178 [00:08<00:00,


Converted 178 PDB files to FASTA in alphafold_benchmark/rhodopsin/rhodopsin_AA.fasta


Converting PDB to FASTA: 100%|█| 70/70 [00:02<00:00, 2


Converted 70 PDB files to FASTA in alphafold_benchmark/RuBisCO/RuBisCO_AA.fasta


Converting PDB to FASTA: 100%|█| 5/5 [00:00<00:00, 69.


Converted 5 PDB files to FASTA in alphafold_benchmark/FO_ATP_synthase/FO_ATP_synthase_AA.fasta


Converting PDB to FASTA: 100%|█| 97/97 [00:02<00:00, 4


Converted 97 PDB files to FASTA in alphafold_benchmark/F1_ATP_synthase/F1_ATP_synthase_AA.fasta


Converting PDB to FASTA: 100%|█| 31/31 [00:01<00:00, 2

Converted 31 PDB files to FASTA in alphafold_benchmark/Hap2/Hap2_AA.fasta





In [57]:
import subprocess
def align_AA(fasta_path, output_dir, mafft_path='mafft'):
	# Align with MAFFT
	aligned_fasta = os.path.join(output_dir, 'aligned.fasta')
	with open(aligned_fasta, 'w') as out_f:
		subprocess.run([mafft_path, '--auto', fasta_path], stdout=out_f, check=True)
	return aligned_fasta

In [58]:
#align the fasta files
for term in search_terms:
	termfolder = term.replace(' ', '_')
	fmt_term = term.replace(' ', '_')
	termfolder = os.path.join(benchmark_folder, termfolder)
	fasta_file = os.path.join(termfolder, f'{fmt_term}_AA.fasta')
	if not os.path.exists(fasta_file):
		print(f"FASTA file {fasta_file} does not exist. Skipping alignment.")
		continue
	output_dir = termfolder
	aligned_fasta = align_AA(fasta_file, output_dir)
	print(f"Aligned FASTA saved to {aligned_fasta}")


nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..
  101 / 178
done.

Constructing a UPGMA tree (efffree=0) ... 
  170 / 178
done.

Progressive alignment 1/2... 
STEP    54 / 177 
Reallocating..done. *alloclen = 6207
STEP    76 / 177 
Reallocating..done. *alloclen = 7588
STEP   128 / 177 
Reallocating..done. *alloclen = 9078
STEP   177 / 177 
done.

Making a distance matrix from msa.. 
  100 / 178
done.

Constructing a UPGMA tree (efffree=1) ... 
  170 / 178
done.

Progressive alignment 2/2... 
STEP   129 / 177 
Reallocating..done. *alloclen = 6680
STEP   133 / 177 
Reallocating..done. *alloclen = 7763
STEP   153 / 177 
Reallocating..done. *alloclen = 9822
STEP   177 / 177 
done.

disttbfast (aa) Version 7.526
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)

distout=h
rescale = 1
dndpre (aa) Version 7.526
alg=X, model=BLOSUM62, 1.53, +0.12, -0.00, noshift, ama

Aligned FASTA saved to alphafold_benchmark/rhodopsin/aligned.fasta


outputhat23=16
treein = 0
compacttree = 0
stacksize: 8192 kb
rescale = 1
All-to-all alignment.
tbfast-pair (aa) Version 7.526
alg=L, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

outputhat23=16
Loading 'hat3.seed' ... 
done.
Writing hat3 for iterative refinement
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00
tbutree = 1, compacttree = 0
Constructing a UPGMA tree ... 
   60 / 70
done.

Progressive alignment ... 
STEP    41 /69 
Reallocating..done. *alloclen = 4544
STEP    69 /69 
done.
tbfast (aa) Version 7.526
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
1 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
nadd = 16
Loading 'hat3' ... done.
rescale = 1

   60 / 70
Segment   1/  1    1-3041
STEP 012-018-1  rejected..    identical.    identical.    identical.    rejected. identical.    accepted. identical.    rejected. rejected. rejected. rejec

Aligned FASTA saved to alphafold_benchmark/RuBisCO/aligned.fasta


outputhat23=16
treein = 0
compacttree = 0
stacksize: 8192 kb
rescale = 1
All-to-all alignment.
tbfast-pair (aa) Version 7.526
alg=L, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

outputhat23=16
Loading 'hat3.seed' ... 
done.
Writing hat3 for iterative refinement
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00
tbutree = 1, compacttree = 0
Constructing a UPGMA tree ... 
    0 / 5
done.

Progressive alignment ... 
STEP     4 /4 
done.
tbfast (aa) Version 7.526
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
1 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
nadd = 16
Loading 'hat3' ... done.
rescale = 1

    0 / 5
Segment   1/  1    1- 373
STEP 004-001-0  rejected..   
Converged.

done
dvtditr (aa) Version 7.526
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 L-INS-i (Probably most accurate, very slow)
 Iterat

Aligned FASTA saved to alphafold_benchmark/FO_ATP_synthase/aligned.fasta


outputhat23=16
treein = 0
compacttree = 0
stacksize: 8192 kb
rescale = 1
All-to-all alignment.
tbfast-pair (aa) Version 7.526
alg=L, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

outputhat23=16
Loading 'hat3.seed' ... 
done.
Writing hat3 for iterative refinement
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00
tbutree = 1, compacttree = 0
Constructing a UPGMA tree ... 
   90 / 97
done.

Progressive alignment ... 
STEP    96 /96 
done.
tbfast (aa) Version 7.526
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
1 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
nadd = 16
Loading 'hat3' ... done.
rescale = 1

   90 / 97
Segment   1/  1    1-2748
STEP 016-051-0  rejected..    identical.    identical.    rejected. identical.    accepted. identical.    rejected. accepted. identical.    identical.    rejected. rejected. accepted. rejected. rejected. reje

Aligned FASTA saved to alphafold_benchmark/F1_ATP_synthase/aligned.fasta


outputhat23=16
treein = 0
compacttree = 0
stacksize: 8192 kb
rescale = 1
All-to-all alignment.
tbfast-pair (aa) Version 7.526
alg=L, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

outputhat23=16
Loading 'hat3.seed' ... 
done.
Writing hat3 for iterative refinement
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00
tbutree = 1, compacttree = 0
Constructing a UPGMA tree ... 
   20 / 31
done.

Progressive alignment ... 
STEP    30 /30 
done.
tbfast (aa) Version 7.526
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
1 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 16
sueff_global = 0.100000
nadd = 16
Loading 'hat3' ... done.
rescale = 1

   20 / 31
Segment   1/  1    1-2078
STEP 008-015-0  rejected..   epted. rejected. identical.    rejected. accepted. rejected. rejected. rejected. rejected. rejected. rejected. rejected. rejected. rejected. rejected. rejected. rejected. rejected

Aligned FASTA saved to alphafold_benchmark/Hap2/aligned.fasta


STEP 008-014-0  rejected.
Converged.

done
dvtditr (aa) Version 7.526
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
0 thread(s)


Strategy:
 L-INS-i (Probably most accurate, very slow)
 Iterative refinement method (<16) with LOCAL pairwise alignment information

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft --help', 'mafft --man' and the mafft page.

The default gap scoring scheme has been changed in version 7.110 (2013 Oct).
It tends to insert more gaps into gap-rich regions than previous versions.
To disable this change, add the --leavegappyregion option.



In [None]:

def build_tree_ng(fasta_path, output_dir,  raxmlng_path='raxml-ng', model='LG+G+I', ancestral_states=True):
	# Run RAxML-NG
	tree_prefix = os.path.join(output_dir, 'raxmlng')
	raxmlng_cmd = [
		raxmlng_path,
		'--msa', fasta_path,
		'--model', model,
		'--prefix', tree_prefix,
		'--seed', '12345'
	]
	if ancestral_states:
		raxmlng_cmd += ['--ancestral']
	subprocess.run(raxmlng_cmd, check=True)
	print(f"Alignment written to {aligned_fasta}")
	print(f"RAxML-NG output in {output_dir}")

def build_states_ng(fasta_path, treefile, output_dir,  raxmlng_path='raxml-ng', model='LG+G+I', ancestral_states=True):
	# Run RAxML-NG
	tree_prefix = os.path.join(output_dir, 'raxmlng')
	raxmlng_cmd = [
		raxmlng_path,
		'--msa', fasta_path,
		'--model', model,
		'--prefix', tree_prefix,
		'--seed', '12345',
		'--tree', treefile,
	]
	if ancestral_states:
		raxmlng_cmd += ['--ancestral']
	subprocess.run(raxmlng_cmd, check=True)
	print(f"Alignment written to {aligned_fasta}")
	print(f"RAxML-NG output in {output_dir}")

In [65]:
#build tree for each term
for term in search_terms:
	termfolder = term.replace(' ', '_')
	fmt_term = term.replace(' ', '_')
	termfolder = os.path.join(benchmark_folder, termfolder)
	fasta_file = os.path.join(termfolder, 'aligned.fasta')
	if not os.path.exists(fasta_file):
		print(f"FASTA file {fasta_file} does not exist. Skipping tree building.")
		continue
	output_dir = termfolder
	print( fasta_file, output_dir)
	build_tree_ng( fasta_file, output_dir , raxmlng_path= '/home/dmoi/projects/foldtree2/raxmlng/raxml-ng' , model='LG+G+I', ancestral_states=False)
	# Check if the tree was built successfully
	tree_file = os.path.join(output_dir, 'raxmlng.tree')
	if not os.path.exists(tree_file):
		print(f"RAxML-NG tree file {tree_file} does not exist. Tree building may have failed.")
		continue
	# Print the tree file path
	print(f"RAxML-NG tree file: {tree_file}")
	#run ancestral reconstruction
	build_tree_ng( fasta_file, output_dir , raxmlng_path= './raxmlng/raxml-ng' , model='LG+G+I', ancestral_states=True)
	print(f"RAxML-NG tree built for {term} in {output_dir}")

alphafold_benchmark/rhodopsin/aligned.fasta alphafold_benchmark/rhodopsin

RAxML-NG v. 1.2.2-master released on 30.04.2024 by The Exelixis Lab.
Developed by: Alexey M. Kozlov and Alexandros Stamatakis.
Contributors: Diego Darriba, Tomas Flouri, Benoit Morel, Sarah Lutteropp, Ben Bettisworth, Julia Haag, Anastasis Togkousidis.
Latest version: https://github.com/amkozlov/raxml-ng
Questions/problems/suggestions? Please visit: https://groups.google.com/forum/#!forum/raxml

System: Intel(R) Xeon(R) Silver 4110 CPU @ 2.10GHz, 16 cores, 251 GB RAM

RAxML-NG was called at 08-Jul-2025 10:41:26 as follows:

/home/dmoi/projects/foldtree2/raxmlng/raxml-ng --msa alphafold_benchmark/rhodopsin/aligned.fasta --model LG+G+I --prefix alphafold_benchmark/rhodopsin/raxmlng --seed 12345

Analysis options:
  run mode: ML tree search
  start tree(s): random (10) + parsimony (10)
  random seed: 12345
  tip-inner: OFF
  pattern compression: ON
  per-rate scalers: OFF
  site repeats: ON
  logLH epsilon: general

KeyboardInterrupt: 

In [None]:
#prepare alphafold run with AA reconstructed sequences
for term in search_terms:
    #make a folder for the alphafold run
    termfolder = term.replace(' ', '_')
	fmt_term = term.replace(' ', '_')
	termfolder = os.path.join(benchmark_folder, termfolder)
	output_dir = os.path.join(termfolder, 'alphafold_run_AA')
	if not os.path.exists(output_dir):
		os.makedirs(output_dir)
	#make a fasta for each entry in the renconstructed sequences
	reconstructed_fasta = os.path.join(output_dir, 'reconstructed_sequences_AA.fasta')
	if not os.path.exists(reconstructed_fasta):
		print(f"Reconstructed FASTA file {reconstructed_fasta} does not exist. Skipping AlphaFold run.")
		continue
	#parse the reconstructed sequences with biopython
	from Bio import SeqIO
	reconstructed_sequences = []
	for record in SeqIO.parse(reconstructed_fasta, "fasta"):
		reconstructed_sequences.append(record)
	for record in reconstructed_sequences:
		record.id = record.id.replace(' ', '_')  # Replace spaces with underscores in IDs
		record.description = ""  # Clear description for simplicity
		with open(os.path.join(output_dir, f"{record.id}.fasta"), "w") as out_f:
			SeqIO.write(record, out_f, "fasta")
	print(f"Reconstructed sequences saved to {output_dir}")

In [None]:
#get fastas from ancestral reconstruction
import os
import glob
from Bio import SeqIO	

def extract_ancestral_sequences(fasta_file, output_folder):
	if not os.path.exists(output_folder):
		os.makedirs(output_folder)
	
	for record in SeqIO.parse(fasta_file, "fasta"):
		if 'ancestral' in record.id:
			output_file = os.path.join(output_folder, f"{record.id}.fasta")
			SeqIO.write(record, output_file, "fasta")
			print(f"Extracted ancestral sequence: {record.id} to {output_file}")
	
	print(f"Ancestral sequences extracted to {output_folder}")

In [None]:
#setup alphafold run

## Run FoldTree2 on a Protein Family

In [16]:
import os

# Example: Run FoldTree2 on a family directory
family_dir = './alphafold_benchmark/families/example_family/'  # Change to your family path
model_path = '../../models/your_trained_model'  # Path to your trained model (without .pkl)
mafftmat = model_path + '_mafftmat.mtx'
submat = model_path + '_submat.txt'
output_dir = os.path.join(family_dir, 'foldtree2_results')

os.makedirs(output_dir, exist_ok=True)

cmd = f"python ../../ft2treebuilder.py --model {model_path} --mafftmat {mafftmat} --submat {submat} --structures '{family_dir}/structs/*.pdb' --outdir {output_dir} --ancestral"
print('Run this command in your shell:')
print(cmd)
# Optionally, to run from the notebook (uncomment the next line):
# !{cmd}


Run this command in your shell:
python ../../ft2treebuilder.py --model ../../models/your_trained_model --mafftmat ../../models/your_trained_model_mafftmat.mtx --submat ../../models/your_trained_model_submat.txt --structures './alphafold_benchmark/families/example_family//structs/*.pdb' --outdir ./alphafold_benchmark/families/example_family/foldtree2_results --ancestral


In [17]:
#convert phylip files to fasta
def phylip_to_fasta(phylip_file, fasta_file):
	with open(phylip_file, 'r') as infile, open(fasta_file, 'w') as outfile:
		lines = infile.readlines()
		num_seqs = int(lines[0].split()[0])
		seq_length = int(lines[0].split()[1])
		for i in range(1, num_seqs + 1):
			parts = lines[i].split()
			seq_id = parts[0]
			seq = ''.join(parts[1:])
			record = SeqRecord(Seq(seq), id=seq_id, description="")
			SeqIO.write(record, outfile, "fasta")

In [18]:
#convert the ancestral sequences to separate fastas for folding with alphafold

def split_fasta_by_id(fasta_file, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    for record in tqdm.tqdm(SeqIO.parse(fasta_file, "fasta")):
        output_file = os.path.join(output_dir, f"{record.id}.fasta")
        SeqIO.write(record, output_file, "fasta")


for fam in fams:
    print(fam)
    ancestral_fasta = fam + '/ancestral.fasta'
    if not os.path.exists(ancestral_fasta):
        print(f"Skipping {fam} - ancestral fasta does not exist.")
        continue
    output_dir = fam + '/alphafold'
    split_fasta_by_id(ancestral_fasta, output_dir)
    print(f"Split ancestral sequences into separate FASTA files in {output_dir}")

In [19]:
# run alphafold separate on each ancestral sequence
# whichever method you prefer outside of this notebook

In [20]:
# grab all of the best models from the output dir

def ret_plddt_from_pdb(pdb_file):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('protein', pdb_file)
    plddt_values = []
    for model in structure:
        for chain in model:
            for residue in chain:
                if 'CA' in residue:
                    plddt = residue.xtra.get('plddt', None)
                    if plddt is not None:
                        plddt_values.append(pldddt)
    return plddt_values

resdf = {}

for fam in fams:
    print(fam)
    output_dir = fam + '/alphafold'
    if not os.path.exists(output_dir):
        print(f"Skipping {fam} - alphafold output directory does not exist.")
        continue
    best_models = glob.glob(output_dir + '/*/model_0.pdb')
    if not best_models:
        print(f"No best models found in {output_dir}.")
        continue
    for model in tqdm.tqdm(best_models):
        plddt_values = ret_plddt_from_pdb(model)
        if plddt_values:
            prot = os.path.basename(model).split('.')[0].split('/')[-1]
            stats = describe(plddt_values) 
            if prot not in resdf:
                resdf[prot] = {}
            resdf[prot]['mean'] = stats.mean
            resdf[prot]['variance'] = stats.variance
            resdf[prot]['skewness'] = stats.skewness
            resdf[prot]['max'] = stats.max
            resdf[prot]['min'] = stats.max
            resdf[prot]['plddt'] = plddt_values
            resdf[prot]['fam'] = fam



In [21]:
from Bio import Phylo

def get_node_to_prot_mapping(tree_file):
    tree = Phylo.read(tree_file, "newick")
    node_to_prot = {}
    for clade in tree.find_clades(order='level'):
        if clade.name:
            node_to_prot[clade.name] = clade
    return node_to_prot

def normalize_tree_branch_lengths(tree):
    total_length = sum(clade.branch_length for clade in tree.find_clades() if clade.branch_length)
    if total_length == 0:
        return tree  # Avoid division by zero
    for clade in tree.find_clades():
        if clade.branch_length:
            clade.branch_length /= total_length
    return tree

def get_distance_to_root(tree, node_name):
    clade = None
    for c in tree.find_clades():
        if c.name == node_name:
            clade = c
            break
    if clade is None:
        raise ValueError(f"Node {node_name} not found in tree.")
    distance = 0.0
    while clade != tree.root:
        parent = tree.get_path(clade)[-2] if len(tree.get_path(clade)) > 1 else tree.root
        if clade.branch_length:
            distance += clade.branch_length
        clade = parent
    return distance

# Assign distance to root for each protein/node in resdf
for prot, data in resdf.items():
    fam = data['fam']
    tree_file = os.path.join(fam, 'raxmlng.bestTree')  # adjust if tree filename differs
    if not os.path.exists(tree_file):
        print(f"Tree file not found for {fam}")
        data['distance_to_root'] = None
        continue
    tree = Phylo.read(tree_file, "newick")
    normalize_tree_branch_lengths(tree)
    try:
        distance = get_distance_to_root(tree, prot)
    except Exception as e:
        print(f"Error for {prot} in {fam}: {e}")
        distance = None
    data['distance_to_root'] = distance

In [22]:
import pandas as pd

import matplotlib.pyplot as plt

# Convert resdf to a DataFrame for easier plotting
df_stats = pd.DataFrame.from_dict(resdf, orient='index')
df_stats = df_stats.dropna(subset=['distance_to_root'])

# Plot by family
families = df_stats['fam'].unique()
plt.figure(figsize=(10, 6))
for fam in families:
    fam_df = df_stats[df_stats['fam'] == fam]
    plt.scatter(fam_df['distance_to_root'], fam_df['mean'], label=os.path.basename(fam), alpha=0.7)
plt.xlabel('Distance to Root')
plt.ylabel('Mean pLDDT')
plt.title('Mean pLDDT vs Distance to Root by Family')
plt.legend()
plt.show()

# Plot globally
plt.figure(figsize=(8, 5))
plt.scatter(df_stats['distance_to_root'], df_stats['mean'], alpha=0.7)
plt.xlabel('Distance to Root')
plt.ylabel('Mean pLDDT')
plt.title('Mean pLDDT vs Distance to Root (Global)')
plt.show()

KeyError: ['distance_to_root']

In [None]:
#visualize the trees

#calculate the tcs values

#

In [None]:
#recover fasta from the ancestral sequences
