In [2]:
import requests
import pandas as pd
from io import StringIO
import os
from tqdm import tqdm

# Root folder that holds every benchmarking artefact
base_dir = '../../datasets/foldtree2/minitcs/benchmark_data'

# Sub-folders for the downloaded cluster table and the UniProt metadata
clusters_dir = os.path.join(base_dir, 'clusters')
metadata_dir = os.path.join(base_dir, 'metadata')

# Create the whole layout up front; folders that already exist are left alone
for _folder in (base_dir, clusters_dir, metadata_dir):
    os.makedirs(_folder, exist_ok=True)

# Function to download AFDB clusters
def download_afdb_clusters(
    cluster_url="https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UniProt.clusters.tsv",
    save_path=os.path.join(clusters_dir, "afdb_clusters.tsv"),
):
    """
    Download the AlphaFold Database cluster table and load it as a DataFrame.

    The response body is streamed to disk in chunks instead of being held in
    memory, and it is written to a temporary ``<save_path>.part`` file that is
    only moved into place once the transfer has finished, so an interrupted
    download never leaves a truncated file at ``save_path``.

    Args:
        cluster_url: URL of the tab-separated cluster file.
        save_path: Destination of the downloaded file.

    Returns:
        A DataFrame read from the saved file (no header row is expected; the
        columns are named ``cluster_id``, ``representative`` and ``members``),
        or None when the server answers with a non-200 status or the request
        fails at the network level.
    """
    print(f"Downloading AFDB clusters from {cluster_url}...")
    partial_path = f"{save_path}.part"

    try:
        # (connect, read) timeouts: without them a stalled server hangs the cell forever
        with requests.get(cluster_url, stream=True, timeout=(10, 300)) as response:
            if response.status_code != 200:
                print(f"Failed to download clusters. Status code: {response.status_code}")
                return None
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
    except requests.RequestException as e:
        # Same failure contract as a bad status code: report and return None
        print(f"Failed to download clusters: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return None

    # Publish the file only after it has been written completely
    os.replace(partial_path, save_path)
    print(f"Successfully downloaded clusters to {save_path}")

    # Read the clusters into a DataFrame
    clusters_df = pd.read_csv(
        save_path,
        sep='\t',
        header=None,
        names=['cluster_id', 'representative', 'members'],
    )
    return clusters_df

# Function to get metadata from UniProt
def get_uniprot_metadata(uniprot_ids, batch_size=100):
    """
    Fetch UniProtKB metadata for a collection of accessions.

    Accessions are sent to the UniProt REST *search* endpoint in batches, each
    batch as one ``accession:A OR accession:B ...`` query, and the TSV replies
    are concatenated. A batch that fails is reported and skipped so that one
    bad request does not discard the others.

    Args:
        uniprot_ids: Accessions to look up (any iterable; copied to a list).
        batch_size: Number of accessions per request. Must be at least 1.
            Values above 500 are reduced to 500, the largest page the search
            endpoint returns in a single response.

    Returns:
        The concatenated DataFrame, which is also written to
        ``uniprot_metadata.tsv`` inside the module-level ``metadata_dir``,
        or None when no batch could be retrieved.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    base_url = "https://rest.uniprot.org/uniprotkb/search"
    max_page_size = 500
    step = min(batch_size, max_page_size)
    uniprot_ids = list(uniprot_ids)  # slicing below needs a sequence
    all_metadata = []

    # Process in batches to avoid overwhelming the API
    for i in tqdm(range(0, len(uniprot_ids), step), desc="Fetching UniProt metadata"):
        batch_ids = uniprot_ids[i:i + step]
        batch_number = i // step + 1

        params = {
            'query': " OR ".join(f"accession:{uid}" for uid in batch_ids),
            'format': 'tsv',
            'fields': 'accession,id,protein_name,gene_names,organism_name,length,reviewed,lineage_ids',
            # The search endpoint is paginated; without an explicit size only
            # the default first page comes back and the rest of the batch is lost.
            'size': len(batch_ids),
        }

        try:
            response = requests.get(base_url, params=params, timeout=60)
        except requests.RequestException as e:
            print(f"Exception occurred during batch {batch_number}: {str(e)}")
            continue

        if response.status_code != 200:
            print(f"Error fetching batch {batch_number}. Status code: {response.status_code}")
            continue

        try:
            batch_df = pd.read_csv(StringIO(response.text), sep='\t')
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
            print(f"Exception occurred during batch {batch_number}: {str(e)}")
            continue

        all_metadata.append(batch_df)

    if not all_metadata:
        return None

    metadata_df = pd.concat(all_metadata, ignore_index=True)
    metadata_df.to_csv(os.path.join(metadata_dir, 'uniprot_metadata.tsv'), sep='\t', index=False)
    return metadata_df

# Function to separate clusters into individual folders and save metadata
def separate_clusters(clusters_df, metadata_df=None):
    """
    Write one folder per cluster with its info, member list and metadata.

    For every row of ``clusters_df`` a folder ``cluster_<cluster_id>`` is
    created under ``individual_clusters`` in the module-level ``clusters_dir``.
    Each folder receives:

    * ``cluster_info.tsv``: the row itself (cluster_id, representative, members)
    * ``members.tsv``: one member accession per line, column ``uniprot_id``
    * ``metadata.tsv``: rows of ``metadata_df`` whose ``Entry`` value is one
      of the members (only when ``metadata_df`` is given and at least one
      row matches)

    Args:
        clusters_df: DataFrame with ``cluster_id``, ``representative`` and
            ``members`` columns; ``members`` is a comma-separated string.
        metadata_df: Optional DataFrame with an ``Entry`` column holding
            accessions.
    """
    clusters_output_dir = os.path.join(clusters_dir, 'individual_clusters')
    os.makedirs(clusters_output_dir, exist_ok=True)

    print("Separating clusters into individual folders...")
    for _, row in tqdm(clusters_df.iterrows(), total=len(clusters_df), desc="Creating cluster folders"):
        cluster_id = row['cluster_id']
        cluster_folder = os.path.join(clusters_output_dir, f"cluster_{cluster_id}")
        os.makedirs(cluster_folder, exist_ok=True)

        # Save cluster information
        cluster_info = pd.DataFrame({
            'cluster_id': [cluster_id],
            'representative': [row['representative']],
            'members': [row['members']],
        })
        cluster_info.to_csv(os.path.join(cluster_folder, 'cluster_info.tsv'), sep='\t', index=False)

        # An empty members cell is read back as NaN (a float), which has no
        # .split(); treat it as a cluster without listed members. Whitespace
        # around accessions is dropped so they match the metadata exactly.
        raw_members = row['members']
        if pd.isnull(raw_members):
            members = []
        else:
            members = [m.strip() for m in str(raw_members).split(',') if m.strip()]

        # Save list of members as separate file
        members_df = pd.DataFrame({'uniprot_id': members})
        members_df.to_csv(os.path.join(cluster_folder, 'members.tsv'), sep='\t', index=False)

        if metadata_df is None:
            continue

        # Keep only the metadata rows that belong to this cluster
        cluster_metadata = metadata_df[metadata_df['Entry'].isin(members)]
        if not cluster_metadata.empty:
            cluster_metadata.to_csv(os.path.join(cluster_folder, 'metadata.tsv'), sep='\t', index=False)

# Main execution
# Step 1: fetch the cluster table
clusters_df = download_afdb_clusters()

if clusters_df is not None:
    # Pool every accession that occurs as a representative or as a member
    all_proteins = set()
    for representative, member_field in zip(clusters_df['representative'], clusters_df['members']):
        all_proteins.add(representative)
        all_proteins.update(member_field.split(','))

    protein_list = list(all_proteins)
    print(f"Fetching metadata for {len(protein_list)} proteins...")

    # Step 2: look the pooled accessions up in UniProt
    metadata_df = get_uniprot_metadata(protein_list)

    if metadata_df is None:
        # Step 3 (fallback): no metadata came back, still write the cluster folders
        separate_clusters(clusters_df)
        print("Completed cluster separation without metadata")
    else:
        print(f"Successfully retrieved metadata for {len(metadata_df)} proteins")
        # separate_clusters looks accessions up in an 'Entry' column, so make
        # sure that name is present when only 'accession' was returned
        needs_rename = 'accession' in metadata_df.columns and 'Entry' not in metadata_df.columns
        if needs_rename:
            metadata_df.rename(columns={'accession': 'Entry'}, inplace=True)

        # Step 3: write the cluster folders together with their metadata
        separate_clusters(clusters_df, metadata_df)
        print("Completed cluster separation with metadata")

PermissionError: [Errno 13] Permission denied: '../../datasets'

# Run ft2treebuilder and Maximum Likelihood Tree for Each Input Family
This section will iterate over each input family, run the ft2treebuilder pipeline, and also generate a standard maximum likelihood tree for comparison.

In [None]:
import os
import glob
import subprocess
from pathlib import Path

# Input: one PDB file per family (adjust the path as needed)
families_dir = '../families/'  # Adjust path as needed
pdb_families = sorted(glob.glob(os.path.join(families_dir, '*.pdb')))
print(f"Found {len(pdb_families)} PDB files for families.")

# Where the per-family results are written
output_dir = '../families/tree_results/'
os.makedirs(output_dir, exist_ok=True)

# Model prefix (no .pkl extension) and the matrices stored alongside it
model_path = '../models/monodecoder_model'  # Path without .pkl
mafftmat = f"{model_path}_mafftmat.mtx"
submat = f"{model_path}_submat.txt"

# RAxML-NG executable used for the tree inference
raxml_path = '../raxml-ng'  # Path to RAxML-NG executable


In [None]:
def run_ft2treebuilder(pdb_file, model_path, mafftmat, submat, output_dir, raxml_path):
    """Run ft2treebuilder for a single PDB family.

    The command is passed to ``subprocess.run`` as an argument list, not as a
    shell string, so paths containing spaces or shell metacharacters reach the
    script unchanged. As a consequence no shell expansion (``~``, ``$VAR``)
    is applied to the arguments.

    Args:
        pdb_file: Path of the family's PDB file; its stem names the family.
        model_path: Value for ``--model``.
        mafftmat: Value for ``--mafftmat``.
        submat: Value for ``--submat``.
        output_dir: Parent directory; a sub-directory named after the family
            is created inside it and passed as ``--outdir``.
        raxml_path: Value for ``--raxmlpath``.

    Returns:
        Tuple ``(family_outdir, family_name)`` for the follow-up ML tree step.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
        FileNotFoundError: If the ``python`` executable cannot be found.
    """
    import shlex  # only needed to render the command for the log line

    family_name = Path(pdb_file).stem
    family_outdir = os.path.join(output_dir, family_name)
    os.makedirs(family_outdir, exist_ok=True)

    cmd = [
        "python", "../ft2treebuilder.py",
        "--model", str(model_path),
        "--mafftmat", str(mafftmat),
        "--submat", str(submat),
        "--structures", str(pdb_file),
        "--outdir", family_outdir,
        "--raxmlpath", str(raxml_path),
    ]
    print(f"Running: {' '.join(shlex.quote(arg) for arg in cmd)}")
    subprocess.run(cmd, check=True)

    # Return the family output directory and name for ML tree generation
    return family_outdir, family_name

def run_ml_tree(family_outdir, family_name, raxml_path):
    """Run a standard maximum likelihood tree using RAxML-NG.

    The alignment is ``encoded.fasta`` inside ``family_outdir``; if that file
    is missing, the alphabetically first ``*.fasta`` in the folder is used
    instead, so the choice is the same on every run. RAxML-NG is started with
    an argument list rather than a shell string, so paths with spaces or
    shell metacharacters are passed through unchanged; ``raxml_path`` must
    therefore be the executable alone, without extra arguments.

    Args:
        family_outdir: Folder holding the family's ft2treebuilder output.
        family_name: Family label, used only in the skip message.
        raxml_path: Path of the RAxML-NG executable.

    Returns:
        True if RAxML-NG was run, False if no FASTA file was found.

    Raises:
        subprocess.CalledProcessError: If RAxML-NG exits with a non-zero status.
        FileNotFoundError: If ``raxml_path`` does not point to an executable.
    """
    import shlex  # only needed to render the command for the log line

    # Find the encoded fasta output
    encoded_fasta = os.path.join(family_outdir, 'encoded.fasta')
    if not os.path.exists(encoded_fasta):
        # Fall back to any FASTA in the folder; sorted for a reproducible pick
        candidates = sorted(Path(family_outdir).glob('*.fasta'))
        if candidates:
            encoded_fasta = str(candidates[0])

    if not os.path.exists(encoded_fasta):
        print(f"Encoded fasta not found for {family_name}, skipping ML tree.")
        return False

    ml_tree_prefix = os.path.join(family_outdir, 'mltree')
    ml_cmd = [
        str(raxml_path),
        "--msa", encoded_fasta,
        "--model", "GTR+G",
        "--prefix", ml_tree_prefix,
        "--threads", "2",
        "--seed", "12345",
        "--redo",
    ]
    print(f"Running ML tree: {' '.join(shlex.quote(arg) for arg in ml_cmd)}")
    subprocess.run(ml_cmd, check=True)
    return True

# Walk through every family: structure-based tree first, ML baseline second
for pdb_file in tqdm(pdb_families, desc="Running ft2treebuilder on families"):
    # The ft2treebuilder step reports where it wrote its output and the family name
    family_outdir, family_name = run_ft2treebuilder(
        pdb_file=pdb_file,
        model_path=model_path,
        mafftmat=mafftmat,
        submat=submat,
        output_dir=output_dir,
        raxml_path=raxml_path,
    )

    # The ML tree is inferred from the files found in that same folder
    run_ml_tree(
        family_outdir=family_outdir,
        family_name=family_name,
        raxml_path=raxml_path,
    )


In [None]:
# For each AlphaFold DB cluster, load the tree, add lineage info, and compute tree scores

import os
import glob
import pandas as pd
from ete3 import Tree
import sys

# Ensure src is in the path for importing treescore
sys.path.append(os.path.abspath("../src"))
import treescore

# Path to the separated clusters: the 'individual_clusters' folder inside
# clusters_dir, i.e. the folder that separate_clusters() writes to.
# NOTE: clusters_dir is not defined in this cell; it is created by the first
# cell of the notebook, which has to be run before this one.
clusters_output_dir = os.path.join(clusters_dir, 'individual_clusters')

# Helper: Try to find a tree file in a cluster folder
def find_tree_file(cluster_folder):
    """Return the path of a tree file inside ``cluster_folder``, or None.

    Extensions are tried in order of preference: ``.nwk``, ``.tree``,
    ``.tre``, ``.newick``. When several files share the preferred extension
    the alphabetically first one is returned, so repeated runs always pick
    the same file (``glob.glob`` itself gives no ordering guarantee).
    """
    for pattern in ('*.nwk', '*.tree', '*.tre', '*.newick'):
        matches = sorted(glob.glob(os.path.join(cluster_folder, pattern)))
        if matches:
            return matches[0]
    return None

# Helper: Load lineage info from metadata.tsv
def load_lineages(metadata_path):
    """Map each UniProt ID in a metadata TSV to the set of its lineage tokens.

    The ID column is the first one present among ``Entry``, ``accession`` and
    ``query``; the lineage column is the first one present among
    ``lineage_ids`` and ``Taxonomic lineage (Ids)``. The lineage value is
    split on commas; surrounding whitespace is removed from every token and
    empty tokens are dropped, so ``"1, 2"`` and ``"1,2"`` give the same set.
    A missing lineage value gives an empty set.

    Args:
        metadata_path: Path of a tab-separated metadata file.

    Returns:
        dict mapping ID (str) to a set of lineage token strings.

    Raises:
        ValueError: If no ID column or no lineage column is found.
    """
    # Read everything as text so a purely numeric lineage column is not
    # turned into floats (which would yield tokens such as '9606.0')
    df = pd.read_csv(metadata_path, sep='\t', dtype=str)

    id_col = next((c for c in ('Entry', 'accession', 'query') if c in df.columns), None)
    lineage_col = next(
        (c for c in ('lineage_ids', 'Taxonomic lineage (Ids)') if c in df.columns),
        None,
    )
    if id_col is None or lineage_col is None:
        raise ValueError("Could not find UniProt ID or lineage column in metadata")

    def _to_token_set(value):
        # Missing cell -> empty set; otherwise one cleaned token per comma-separated item
        if pd.isnull(value):
            return set()
        return {token.strip() for token in str(value).split(',') if token.strip()}

    return dict(zip(df[id_col], df[lineage_col].map(_to_token_set)))

# Store results for all clusters
cluster_scores = []

# Visit the cluster folders in sorted order: os.listdir gives no ordering
# guarantee, and the rows of the output table should be the same on every run.
for cluster_name in sorted(os.listdir(clusters_output_dir)):
    cluster_folder = os.path.join(clusters_output_dir, cluster_name)
    if not os.path.isdir(cluster_folder):
        continue

    # A cluster can only be scored when both a tree and its metadata exist
    tree_file = find_tree_file(cluster_folder)
    metadata_file = os.path.join(cluster_folder, 'metadata.tsv')
    if not tree_file or not os.path.exists(metadata_file):
        print(f"Skipping {cluster_name}: missing tree or metadata")
        continue

    # Load tree using ete3; a tree that cannot be read skips this cluster only
    try:
        t = Tree(tree_file, format=1)
    except Exception as e:
        print(f"Failed to load tree for {cluster_name}: {e}")
        continue

    # Load lineage info; same best-effort policy as for the tree
    try:
        leaf_lineages = load_lineages(metadata_file)
    except Exception as e:
        print(f"Failed to load lineages for {cluster_name}: {e}")
        continue

    # Add lineage info to leaves
    treescore.label_leaves(t, leaf_lineages)

    # Calculate taxonomy overlap scores
    treescore.getTaxOverlap(t)
    treescore.getTaxOverlap_root(t)

    # Collect scores from the root node; either value is None when the
    # attribute is absent after the treescore calls
    score = getattr(t, 'score', None)
    root_score = getattr(t, 'root_score', None)
    cluster_scores.append({
        'cluster': cluster_name,
        'score': score,
        'root_score': root_score,
    })

    print(f"Cluster: {cluster_name}, score: {score}, root_score: {root_score}")

# Convert results to a DataFrame and save. The columns are named explicitly
# so the file still has its header row when no cluster could be scored.
scores_df = pd.DataFrame(cluster_scores, columns=['cluster', 'score', 'root_score'])
scores_df.to_csv(os.path.join(clusters_output_dir, 'cluster_tree_scores.tsv'), sep='\t', index=False)
print("Saved cluster tree scores to cluster_tree_scores.tsv")


# Output Summary
For each input family, the following outputs are written to that family's own subfolder of the `tree_results` directory:
- ft2treebuilder output tree (using model-based encoding)
- Standard maximum likelihood tree (using RAxML-NG on the encoded fasta; its output files are prefixed `mltree`)

You can now visualize or compare these trees for downstream analysis.