In [None]:
import requests
import pandas as pd
from io import StringIO
import os
from tqdm import tqdm

# Make sure the data directories exist before anything is written into them
for directory in ('clusters', 'metadata'):
	os.makedirs(directory, exist_ok=True)


# Function to get metadata from UniProt
def get_uniprot_metadata(uniprot_ids, batch_size=100, timeout=30):
	"""
	Fetch tabular metadata from UniProtKB for a collection of accessions.

	The accessions are split into batches. Each batch is sent to the UniProt
	search endpoint as one ``accession:A OR accession:B ...`` query and the
	TSV response is parsed into a DataFrame. A batch that fails (non-200
	status, network error, unparsable body) is reported with ``print`` and
	skipped, so the result may cover only part of the input.

	Args:
		uniprot_ids: Iterable of UniProt accessions (list, tuple, NumPy array, ...).
		batch_size: Number of accessions per request. Keep it at or below 500,
			the largest page size the search endpoint accepts.
		timeout: Seconds to wait for each HTTP request before giving up.

	Returns:
		A DataFrame holding the rows of every successful batch, which is also
		written to ``metadata/uniprot_metadata.tsv``; ``None`` when no batch
		succeeded (including when ``uniprot_ids`` is empty).

	Raises:
		ValueError: If ``batch_size`` is smaller than 1.
	"""
	if batch_size < 1:
		raise ValueError("batch_size must be a positive integer")

	base_url = "https://rest.uniprot.org/uniprotkb/search"
	# Copy into a list so generators and other non-sliceable iterables work too
	uniprot_ids = list(uniprot_ids)
	all_metadata = []

	# Process in batches to avoid overwhelming the API
	for i in tqdm(range(0, len(uniprot_ids), batch_size), desc="Fetching UniProt metadata"):
		batch_ids = uniprot_ids[i:i+batch_size]
		query = " OR ".join(f"accession:{uid}" for uid in batch_ids)

		params = {
			'query': query,
			'format': 'tsv',
			'fields': 'accession,id,protein_name,gene_names,organism_name,length,reviewed,lineage_ids',
			# The search endpoint returns only 25 hits per page unless told
			# otherwise; without this every batch would be silently truncated.
			'size': len(batch_ids),
		}

		try:
			# A timeout keeps one stalled connection from hanging the whole run
			response = requests.get(base_url, params=params, timeout=timeout)
			if response.status_code == 200:
				batch_df = pd.read_csv(StringIO(response.text), sep='\t')
				all_metadata.append(batch_df)
			else:
				print(f"Error fetching batch {i//batch_size + 1}. Status code: {response.status_code}")
		except Exception as e:
			# Deliberately best-effort: report the failed batch and keep going
			print(f"Exception occurred during batch {i//batch_size + 1}: {str(e)}")

	if not all_metadata:
		return None

	metadata_df = pd.concat(all_metadata, ignore_index=True)
	# Create the target directory here so the function does not depend on
	# another cell having been run first
	os.makedirs('metadata', exist_ok=True)
	metadata_df.to_csv('metadata/uniprot_metadata.tsv', sep='\t', index=False)
	return metadata_df

# Main execution
# Step 1: fetch the AFDB cluster table
# NOTE(review): download_afdb_clusters is defined outside this cell and
# presumably returns None when the download fails - confirm.
clusters_df = download_afdb_clusters()

if clusters_df is not None:
	# Step 2: limit the demo to the first 1000 distinct representatives (adjust as needed)
	unique_representatives = clusters_df['representative'].unique()
	representative_ids = unique_representatives[:1000]
	n_representatives = len(representative_ids)
	print(f"Fetching metadata for {n_representatives} representative proteins...")

	# Step 3: look the representatives up in UniProt
	metadata_df = get_uniprot_metadata(representative_ids)

	if metadata_df is not None:
		n_retrieved = len(metadata_df)
		print(f"Successfully retrieved metadata for {n_retrieved} proteins")
		display(metadata_df.head())

Downloading AFDB clusters from https://ftp.ebi.ac.uk/pub/databases/alphafold/latest/UniProt.clusters.tsv...
Failed to download clusters. Status code: 404


# Run ft2treebuilder and Maximum Likelihood Tree for Each Input Family
This section will iterate over each input family, run the ft2treebuilder pipeline, and also generate a standard maximum likelihood tree for comparison.

In [None]:
import glob
import os
import shlex
import subprocess
from pathlib import Path

# Directory holding one PDB file per input family (adjust path as needed)
families_dir = '../families/'
pdb_pattern = os.path.join(families_dir, '*.pdb')
pdb_families = sorted(glob.glob(pdb_pattern))
print(f"Found {len(pdb_families)} PDB files for families.")

# Results are collected under this directory
output_dir = '../families/tree_results/'
os.makedirs(output_dir, exist_ok=True)

# Model prefix (path without .pkl) and the matrix files that share it (adjust as needed)
model_path = '../models/monodecoder_model'
mafftmat = f"{model_path}_mafftmat.mtx"
submat = f"{model_path}_submat.txt"

# Path to the RAxML-NG executable
raxml_path = '../raxml-ng'


In [None]:
def run_ft2treebuilder(pdb_file, model_path, mafftmat, submat, output_dir, raxml_path):
	"""Run ``../ft2treebuilder.py`` on a single PDB family file.

	A subdirectory named after the PDB file's stem is created inside
	``output_dir`` and passed to the script as ``--outdir``.

	The command is executed as an argument list without a shell, so paths
	containing spaces, quotes or shell metacharacters are passed through
	verbatim (and ``~`` / ``$VAR`` are not expanded).

	Args:
		pdb_file: Path to the family's PDB file (``--structures``).
		model_path: Value passed as ``--model``.
		mafftmat: Value passed as ``--mafftmat``.
		submat: Value passed as ``--submat``.
		output_dir: Parent directory for the per-family output directory.
		raxml_path: Value passed as ``--raxmlpath``.

	Returns:
		Tuple ``(family_outdir, family_name)``.

	Raises:
		subprocess.CalledProcessError: If the script exits with a non-zero status.
	"""
	family_name = Path(pdb_file).stem
	family_outdir = os.path.join(output_dir, family_name)
	os.makedirs(family_outdir, exist_ok=True)

	cmd = [
		"python", "../ft2treebuilder.py",
		"--model", str(model_path),
		"--mafftmat", str(mafftmat),
		"--submat", str(submat),
		"--structures", str(pdb_file),
		"--outdir", family_outdir,
		"--raxmlpath", str(raxml_path),
	]
	# Quote only for display, so the printed line can be pasted into a shell
	print(f"Running: {' '.join(shlex.quote(arg) for arg in cmd)}")
	subprocess.run(cmd, check=True)

	# Return the family output directory and name for ML tree generation
	return family_outdir, family_name

def run_ml_tree(family_outdir, family_name, raxml_path):
	"""Run a standard maximum likelihood tree using RAxML-NG.

	The alignment is ``encoded.fasta`` inside ``family_outdir``. If that file
	is missing, the alphabetically first ``*.fasta`` file in the directory is
	used instead, so the choice is deterministic. RAxML-NG output is written
	with the prefix ``<family_outdir>/mltree``.

	The command is executed as an argument list without a shell, so paths
	containing spaces or shell metacharacters are passed through verbatim.

	Args:
		family_outdir: Directory holding the family's ft2treebuilder output.
		family_name: Family label, used only in the "skipping" message.
		raxml_path: Path to the RAxML-NG executable.

	Returns:
		``True`` if RAxML-NG was run, ``False`` if no FASTA file was found.

	Raises:
		subprocess.CalledProcessError: If RAxML-NG exits with a non-zero status.
	"""
	# Find the encoded fasta output
	encoded_fasta = os.path.join(family_outdir, 'encoded.fasta')
	if not os.path.exists(encoded_fasta):
		# Path.glob yields entries in arbitrary order; sort before picking one
		candidates = sorted(Path(family_outdir).glob('*.fasta'))
		if not candidates:
			print(f"Encoded fasta not found for {family_name}, skipping ML tree.")
			return False
		encoded_fasta = str(candidates[0])

	ml_tree_prefix = os.path.join(family_outdir, 'mltree')
	# NOTE(review): GTR+G is a nucleotide model - confirm it matches the
	# alphabet of the encoded FASTA.
	ml_cmd = [
		str(raxml_path),
		"--msa", encoded_fasta,
		"--model", "GTR+G",
		"--prefix", ml_tree_prefix,
		"--threads", "2",
		"--seed", "12345",
		"--redo",
	]
	# Quote only for display, so the printed line can be pasted into a shell
	print(f"Running ML tree: {' '.join(shlex.quote(arg) for arg in ml_cmd)}")
	subprocess.run(ml_cmd, check=True)
	return True

# Process every family: build the ft2treebuilder tree first, then the
# standard ML tree from the output it left behind
for pdb_file in tqdm(pdb_families, desc="Running ft2treebuilder on families"):
	family_outdir, family_name = run_ft2treebuilder(
		pdb_file=pdb_file,
		model_path=model_path,
		mafftmat=mafftmat,
		submat=submat,
		output_dir=output_dir,
		raxml_path=raxml_path,
	)

	run_ml_tree(
		family_outdir=family_outdir,
		family_name=family_name,
		raxml_path=raxml_path,
	)


# Output Summary
For each input family, the following outputs are generated in a per-family subdirectory of the `tree_results` directory (named after the family's PDB file):
- ft2treebuilder output tree (using model-based encoding)
- Standard maximum likelihood tree (using RAxML-NG on the encoded fasta)

You can now visualize or compare these trees for downstream analysis.