In [None]:

# Setup AlphaFold Benchmark

# 5 structural families

# collect input sets with a broad taxonomic range

# use AA and FoldTree2 to reconstruct all internal nodes

# run AlphaFold on the amino acid sequences inferred by both methods

# recover pLDDT values for each ancestral reconstruction

# pLDDT as a measure of confidence in the ancestral reconstruction overall

# pLDDT (mean, var, skew) vs distance from root

## AlphaFold Benchmark Notebook Overview

This notebook benchmarks ancestral sequence reconstruction using AlphaFold.  
The workflow includes:

- Selecting 5 structural protein families with broad taxonomic diversity.
- Collecting input sets for each family.
- Reconstructing all internal ancestral nodes using two methods: amino acid (AA) and FoldTree2.
- Predicting structures for inferred ancestral sequences with AlphaFold.
- Extracting pLDDT confidence scores for each reconstruction.
- Analyzing pLDDT statistics (mean, variance, skewness) as a function of evolutionary distance from the root.

The goal is to assess the reliability of ancestral sequence reconstruction by comparing AlphaFold confidence metrics across different methods and evolutionary depths.

In [None]:
# Configuration flag: when True, per-family FASTA files are regenerated even if
# they already exist. (Was `overwrite == True`, a comparison that raised
# NameError on a fresh kernel instead of defining the flag.)
overwrite = True

In [None]:
import glob
from Bio.PDB import PDBParser
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import os
import tqdm
from scipy.stats import describe

#families directories
def pdb_to_fasta(pdb_file, fasta_file):
	"""Extract chain sequences from one or more PDB files into a single FASTA file.

	Args:
		pdb_file: Path to a PDB file, or an iterable of such paths.
		fasta_file: Destination FASTA path. It is written once, with every
			record, so earlier chains/structures are no longer overwritten.

	Returns:
		The number of sequence records written.

	Notes:
		- Only residues that have a CA atom are included.
		- Residue names without a one-letter code are written as 'X'.
		- Only the first model of each structure is used so that multi-model
		  files do not produce duplicate record ids.
		- Chains that yield no residues are skipped.
		- Record ids are "<pdb file name>_<chain id>".
	"""
	# Local import keeps this cell self-contained; seq1 maps three-letter
	# residue names to one-letter codes and falls back to `undef_code`.
	from Bio.SeqUtils import seq1

	# Accept either a single path or a collection of paths.
	if isinstance(pdb_file, (str, os.PathLike)):
		pdb_paths = [pdb_file]
	else:
		pdb_paths = list(pdb_file)

	parser = PDBParser(QUIET=True)
	records = []
	for path in pdb_paths:
		structure = parser.get_structure('protein', path)
		model = next(iter(structure), None)
		if model is None:
			continue  # no coordinates could be parsed from this file
		for chain in model:
			seq = ''.join(
				seq1(residue.get_resname(), undef_code='X')
				for residue in chain
				if 'CA' in residue
			)
			if not seq:
				continue
			records.append(
				SeqRecord(Seq(seq), id=f"{os.path.basename(path)}_{chain.id}", description="")
			)

	# Single write call: SeqIO.write on a filename truncates the file each time.
	SeqIO.write(records, fasta_file, "fasta")
	return len(records)


# Family directories; sorted so that runs are processed in a stable order.
fams = sorted(glob.glob('./alphafold_benchmark/families/*/'))
for fam in fams:
	print(fam)
	structs = sorted(glob.glob(os.path.join(fam, 'structs', '*.pdb')))
	# Use ONE path for both the "already done" check and the output. The
	# original checked structs.fasta but wrote to a literal '{fam}.fasta'
	# (missing f-prefix), so the skip logic could never trigger.
	fasta_file = os.path.join(fam, 'structs.fasta')
	if os.path.exists(fasta_file) and not overwrite:
		print(f"Skipping {fam} - fasta already exists.")
		continue
	if not structs:
		print(f"Skipping {fam} - no PDB files found in structs/.")
		continue
	if os.path.exists(fasta_file):
		os.remove(fasta_file)
	pdb_to_fasta(structs, fasta_file)
	print(f"Converted {len(structs)} PDB files to FASTA in {fasta_file}")

In [None]:
import subprocess

def align_and_build_tree_ng(fasta_path, output_dir, mafft_path='mafft', raxmlng_path='raxml-ng', model='LG+G', ancestral_states=False):
	"""Align sequences with MAFFT and infer a tree with RAxML-NG.

	Args:
		fasta_path: Unaligned input FASTA.
		output_dir: Directory for all outputs; created if it does not exist.
		mafft_path: MAFFT executable.
		raxmlng_path: RAxML-NG executable.
		model: Substitution model passed to RAxML-NG via --model.
		ancestral_states: When True, run a second RAxML-NG invocation that
			reconstructs ancestral states on the inferred tree.

	Raises:
		subprocess.CalledProcessError: if MAFFT or RAxML-NG exits non-zero.

	Outputs (inside output_dir):
		aligned.fasta            MAFFT alignment
		raxmlng.*                tree-search results
		raxmlng_ancestral.*      ancestral reconstruction (only if requested)
	"""
	os.makedirs(output_dir, exist_ok=True)

	# Align with MAFFT; the alignment is captured from stdout.
	aligned_fasta = os.path.join(output_dir, 'aligned.fasta')
	with open(aligned_fasta, 'w') as out_f:
		subprocess.run([mafft_path, '--auto', fasta_path], stdout=out_f, check=True)

	# Run the RAxML-NG tree search.
	tree_prefix = os.path.join(output_dir, 'raxmlng')
	raxmlng_cmd = [
		raxmlng_path,
		'--msa', aligned_fasta,
		'--model', model,
		'--prefix', tree_prefix,
		'--seed', '12345'
	]
	subprocess.run(raxmlng_cmd, check=True)

	if ancestral_states:
		# --ancestral is its own RAxML-NG command and needs an input tree, so it
		# cannot simply be appended to the search command. Run it as a second
		# step on the best tree, under a separate prefix so the result files
		# of the search are not clobbered.
		# NOTE(review): assumes the search writes <prefix>.raxml.bestTree - confirm
		# against the installed RAxML-NG version.
		best_tree = tree_prefix + '.raxml.bestTree'
		ancestral_cmd = [
			raxmlng_path,
			'--ancestral',
			'--msa', aligned_fasta,
			'--tree', best_tree,
			'--model', model,
			'--prefix', tree_prefix + '_ancestral'
		]
		subprocess.run(ancestral_cmd, check=True)

	print(f"Alignment written to {aligned_fasta}")
	print(f"RAxML-NG output in {output_dir}")


In [None]:
import os


## Run FoldTree2 on a Protein Family

In [None]:
import os

# Example: build (and print) the FoldTree2 command line for one family directory.
family_dir = './alphafold_benchmark/families/example_family/'  # Change to your family path
model_path = '../../models/your_trained_model'  # Path to your trained model (without .pkl)

# Companion matrices live next to the model and share its path stem.
mafftmat = f"{model_path}_mafftmat.mtx"
submat = f"{model_path}_submat.txt"

# Results go into a sub-directory of the family, created on demand.
output_dir = os.path.join(family_dir, 'foldtree2_results')
os.makedirs(output_dir, exist_ok=True)

# The structures glob is single-quoted so the shell hands it to the script unexpanded.
cmd_parts = [
    "python ../../ft2treebuilder.py",
    f"--model {model_path}",
    f"--mafftmat {mafftmat}",
    f"--submat {submat}",
    f"--structures '{family_dir}/structs/*.pdb'",
    f"--outdir {output_dir}",
    "--ancestral",
]
cmd = " ".join(cmd_parts)

print('Run this command in your shell:')
print(cmd)
# Optionally, to run from the notebook (uncomment the next line):
# !{cmd}


In [None]:
#convert phylip files to fasta
def phylip_to_fasta(phylip_file, fasta_file):
	"""Convert a sequential PHYLIP alignment to FASTA.

	The first non-blank line must be the header "<num_seqs> <seq_length>".
	Each of the following num_seqs non-blank lines holds a whitespace-separated
	name followed by the sequence (whitespace inside the sequence is removed).

	Args:
		phylip_file: Input PHYLIP path.
		fasta_file: Output FASTA path. It is only written after the whole input
			has been parsed, so malformed input does not leave a partial file.

	Raises:
		ValueError: if the header is missing/malformed, the file holds fewer
			records than declared, or a sequence length disagrees with the
			header (which typically means the file is interleaved or uses
			fixed-width names without a separating space - neither is supported).
	"""
	with open(phylip_file, 'r') as infile:
		# Blank lines carry no data and previously caused an IndexError.
		lines = [line for line in infile if line.strip()]

	if not lines:
		raise ValueError(f"{phylip_file} is empty.")
	header = lines[0].split()
	if len(header) < 2:
		raise ValueError(f"{phylip_file}: malformed PHYLIP header {lines[0]!r}.")
	num_seqs = int(header[0])
	seq_length = int(header[1])

	data_lines = lines[1:num_seqs + 1]
	if len(data_lines) < num_seqs:
		raise ValueError(
			f"{phylip_file}: header declares {num_seqs} sequences but only "
			f"{len(data_lines)} were found."
		)

	records = []
	for line in data_lines:
		parts = line.split()
		seq_id = parts[0]
		seq = ''.join(parts[1:])
		if len(seq) != seq_length:
			raise ValueError(
				f"{phylip_file}: sequence {seq_id!r} has length {len(seq)}, "
				f"expected {seq_length} (interleaved or fixed-width PHYLIP is not supported)."
			)
		records.append(SeqRecord(Seq(seq), id=seq_id, description=""))

	with open(fasta_file, 'w') as outfile:
		SeqIO.write(records, outfile, "fasta")

In [None]:
#convert the ancestral sequences to separate fastas for folding with alphafold

def split_fasta_by_id(fasta_file, output_dir):
    """Write every record of a multi-FASTA file to its own FASTA file.

    Args:
        fasta_file: Multi-record FASTA to read.
        output_dir: Directory receiving one "<record id>.fasta" per record;
            created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    entries = SeqIO.parse(fasta_file, "fasta")
    # tqdm wraps the record iterator to report progress.
    for entry in tqdm.tqdm(entries):
        destination = os.path.join(output_dir, f"{entry.id}.fasta")
        SeqIO.write(entry, destination, "fasta")


# For every family that has an ancestral.fasta, split it into one FASTA per
# sequence under <family>/alphafold; families without the file are skipped.
for fam in fams:
    print(fam)
    ancestral_fasta = fam + '/ancestral.fasta'
    if os.path.exists(ancestral_fasta):
        output_dir = fam + '/alphafold'
        split_fasta_by_id(ancestral_fasta, output_dir)
        print(f"Split ancestral sequences into separate FASTA files in {output_dir}")
    else:
        print(f"Skipping {fam} - ancestral fasta does not exist.")

In [None]:
# Run AlphaFold separately on each ancestral sequence,
# using whichever method you prefer, outside of this notebook.

In [None]:
# grab all of the best models from the output dir

def ret_plddt_from_pdb(pdb_file):
    """Return the per-residue pLDDT values stored in an AlphaFold PDB file.

    AlphaFold writes the per-residue pLDDT into the B-factor column, so the
    value is read from each residue's CA atom. A 'plddt' entry in
    ``residue.xtra`` is still honoured if some upstream step provided one.

    Args:
        pdb_file: Path to a predicted-structure PDB file.

    Returns:
        A list of pLDDT values, one per residue that has a CA atom (across all
        models and chains in the file). Empty if no such residue exists.
    """
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('protein', pdb_file)
    plddt_values = []
    for model in structure:
        for chain in model:
            for residue in chain:
                if 'CA' not in residue:
                    continue
                plddt = residue.xtra.get('plddt', None)
                if plddt is None:
                    # Standard location of pLDDT in AlphaFold output.
                    plddt = residue['CA'].get_bfactor()
                if plddt is not None:
                    plddt_values.append(plddt)
    return plddt_values

# Per-protein pLDDT summary statistics, keyed by protein / node name.
# NOTE(review): keys are bare node names, so identically named nodes in two
# families overwrite each other; a warning is printed when that happens.
resdf = {}

for fam in fams:
    print(fam)
    output_dir = fam + '/alphafold'
    if not os.path.exists(output_dir):
        print(f"Skipping {fam} - alphafold output directory does not exist.")
        continue
    best_models = glob.glob(output_dir + '/*/model_0.pdb')
    if not best_models:
        print(f"No best models found in {output_dir}.")
        continue
    for model in tqdm.tqdm(best_models):
        plddt_values = ret_plddt_from_pdb(model)
        if not plddt_values:
            continue
        # Every file is called model_0.pdb; the protein name is the name of
        # the directory that contains it (<output_dir>/<prot>/model_0.pdb).
        prot = os.path.basename(os.path.dirname(model))
        stats = describe(plddt_values)
        # scipy's describe() reports the extremes as a (min, max) tuple.
        plddt_min, plddt_max = stats.minmax
        if prot in resdf and resdf[prot].get('fam') != fam:
            print(f"Warning: {prot} already recorded for {resdf[prot].get('fam')}; overwriting with {fam}.")
        resdf[prot] = {
            'mean': stats.mean,
            'variance': stats.variance,
            'skewness': stats.skewness,
            'max': plddt_max,
            'min': plddt_min,
            'plddt': plddt_values,
            'fam': fam,
        }



In [None]:
from Bio import Phylo

def get_node_to_prot_mapping(tree_file):
    """Map clade names to clade objects for a Newick tree.

    Args:
        tree_file: Path to a Newick tree file.

    Returns:
        dict of {clade name: clade} covering every named clade, visited in
        level order; if a name occurs more than once the last clade wins.
    """
    parsed = Phylo.read(tree_file, "newick")
    return {
        node.name: node
        for node in parsed.find_clades(order='level')
        if node.name
    }

def normalize_tree_branch_lengths(tree):
    """Rescale branch lengths in place so that they sum to 1.

    Clades whose branch length is missing or zero are left untouched. If the
    total length is zero nothing is changed.

    Args:
        tree: Object exposing ``find_clades()`` whose clades carry a
            ``branch_length`` attribute.

    Returns:
        The same tree object that was passed in.
    """
    lengths = [node.branch_length for node in tree.find_clades() if node.branch_length]
    scale = sum(lengths)
    if scale != 0:  # guard against division by zero
        for node in tree.find_clades():
            if node.branch_length:
                node.branch_length /= scale
    return tree

def get_distance_to_root(tree, node_name):
    """Sum the branch lengths on the path from the root to a named node.

    Args:
        tree: Tree exposing ``find_clades()`` and ``get_path(clade)``.
        node_name: Name of the target clade; the first clade with this name
            is used.

    Returns:
        The summed branch length as a float. Missing (None) or zero branch
        lengths contribute nothing; the root itself has distance 0.0.

    Raises:
        ValueError: if no clade is named ``node_name``.
    """
    clade = next((c for c in tree.find_clades() if c.name == node_name), None)
    if clade is None:
        raise ValueError(f"Node {node_name} not found in tree.")
    # get_path() already yields every clade between the root (exclusive) and
    # the target (inclusive), so a single lookup replaces the former
    # parent-by-parent walk that recomputed the path twice per step.
    path = tree.get_path(clade) or []
    distance = 0.0
    # Accumulate from the target upwards, the same order as the original walk.
    for step in reversed(path):
        if step.branch_length:
            distance += step.branch_length
    return distance

# Assign distance to root for each protein/node in resdf.
# Each family's tree is read and normalized once and then reused, instead of
# being re-parsed for every protein; None marks a family without a tree file.
_trees_by_fam = {}
for prot, data in resdf.items():
    fam = data['fam']
    if fam not in _trees_by_fam:
        tree_file = os.path.join(fam, 'raxmlng.bestTree')  # adjust if tree filename differs
        if os.path.exists(tree_file):
            _trees_by_fam[fam] = normalize_tree_branch_lengths(Phylo.read(tree_file, "newick"))
        else:
            print(f"Tree file not found for {fam}")
            _trees_by_fam[fam] = None
    tree = _trees_by_fam[fam]
    if tree is None:
        data['distance_to_root'] = None
        continue
    try:
        distance = get_distance_to_root(tree, prot)
    except Exception as e:
        # Best effort: a node missing from the tree is reported and recorded
        # as None rather than aborting the whole loop.
        print(f"Error for {prot} in {fam}: {e}")
        distance = None
    data['distance_to_root'] = distance

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

# Convert resdf to a DataFrame for easier plotting
df_stats = pd.DataFrame.from_dict(resdf, orient='index')
if 'distance_to_root' in df_stats.columns:
    df_stats = df_stats.dropna(subset=['distance_to_root'])
else:
    # No record carries a distance (e.g. resdf is empty): keep an empty frame
    # instead of letting dropna raise KeyError.
    df_stats = df_stats.iloc[0:0]

if df_stats.empty:
    print("No pLDDT statistics with a distance to root are available - nothing to plot.")
else:
    # Plot by family
    families = df_stats['fam'].unique()
    fig, ax = plt.subplots(figsize=(10, 6))
    for fam in families:
        fam_df = df_stats[df_stats['fam'] == fam]
        # Family paths end with a separator, for which basename() alone
        # returns ''; normalize first so the legend shows the directory name.
        fam_label = os.path.basename(os.path.normpath(fam))
        ax.scatter(fam_df['distance_to_root'].astype(float), fam_df['mean'], label=fam_label, alpha=0.7)
    ax.set_xlabel('Distance to Root')
    ax.set_ylabel('Mean pLDDT')
    ax.set_title('Mean pLDDT vs Distance to Root by Family')
    ax.legend()
    plt.show()

    # Plot globally
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(df_stats['distance_to_root'].astype(float), df_stats['mean'], alpha=0.7)
    ax.set_xlabel('Distance to Root')
    ax.set_ylabel('Mean pLDDT')
    ax.set_title('Mean pLDDT vs Distance to Root (Global)')
    plt.show()

In [None]:
#visualize the trees

#calculate the tcs values

#

In [None]:
#recover fasta from the ancestral sequences
