# Information-Theoretic Benchmarking of Species Tree Inference

This notebook presents a comparative workflow for species tree inference using two different character sets: traditional amino acid sequences and foldtree-encoded structural features. The goal is to benchmark the phylogenetic signal carried by each character set using information-theoretic approaches.

**Workflow Overview:**
- Construct a species tree using a standard amino acid-based pipeline (multiple sequence alignment, concatenation, and maximum likelihood inference).
- Construct an equivalent species tree using foldtree-encoded data.
- For both trees, compute column-wise log-likelihoods and character frequencies.
- Quantify the information content and phylogenetic signal of each character set by analyzing the distribution of likelihoods and character frequencies.

This approach enables a direct comparison of how much evolutionary information is captured by sequence versus structure-based encodings, providing an objective benchmark for future phylogenomic analyses.

In [95]:
cd /home/dmoi/projects/foldtree2/

/home/dmoi/projects/foldtree2


In [96]:
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
import requests
import random
import subprocess
from pathlib import Path
import concurrent.futures
import multiprocessing
import pandas as pd
import requests
from Bio import SeqIO
import re
import time

In [97]:
overwrite = False
model = 'mfnew_128mk2'

In [98]:
import glob
markers = glob.glob( './families/Information_benchmark/marker_genes/marker_genes/*.fa')
print( f"Found {len(markers)} marker genes" )

Found 500 marker genes


In [99]:
#set up a folder for each marker gene family
for marker in markers:
	marker_name = os.path.basename(marker).split('.')[0]
	Path(f'./families/Information_benchmark/marker_genes/{marker_name}').mkdir(parents=True, exist_ok=True)
	#add a structs folder
	Path(f'./families/Information_benchmark/marker_genes/{marker_name}/structs').mkdir(parents=True, exist_ok=True)

In [100]:
#use autoreload to reload modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [101]:
# Specify your model and associated matrices
from ft2treebuilder import treebuilder
model_path = "./models/" + model  # Path to your model (without .pkl)
mafftmat = model_path + "_mafftmat.mtx"
submat = model_path + "_submat.txt"
aapropcsv = './config/aaindex1.csv'  # Path to your amino acid properties CSV file
# Initialize the treebuilder class
tb = treebuilder(model=model_path, mafftmat=mafftmat, submat=submat , aapropcsv=aapropcsv)

In [102]:
def align_fasta_with_mafft(fasta_file):
	"""
	Align a single FASTA file using MAFFT
	
	Parameters:
	-----------
	fasta_file : str
		Path to input FASTA file
	
	Returns:
	--------
	tuple: (input_path, output_path, success_status)
	"""
	# Create output filename - same directory but with .aligned.fa extension
	input_path = Path(fasta_file)
	output_path = input_path.with_name(f"{input_path.stem}.aligned.fa")
	
	# Run MAFFT
	cmd = f"mafft --auto --thread 1 {fasta_file} > {output_path}"
	print(f"Aligning: {fasta_file}")
	
	try:
		subprocess.run(cmd, shell=True, check=True, stderr=subprocess.PIPE)
		return (fasta_file, str(output_path), True)
	except subprocess.CalledProcessError as e:
		print(f"Error aligning {fasta_file}: {e}")
		return (fasta_file, str(output_path), False)

# Get number of available cores (leave 1 core free for system processes)
max_workers = max(1, multiprocessing.cpu_count() - 1)
print(f"Using {max_workers} cores for alignments")

# Process alignments in parallel
aligned_files = []
failed_files = []

def align_fasta_with_mafft(fasta_file):
	"""
	Align a single FASTA file using MAFFT
	
	Parameters:
	-----------
	fasta_file : str
		Path to input FASTA file
	
	Returns:
	--------
	tuple: (input_path, output_path, success_status)
	"""
	# Create output filename - same directory but with .aligned.fa extension
	input_path = Path(fasta_file)
	output_path = input_path.with_name(f"{input_path.stem}.aligned.fa")
	
	# Run MAFFT
	cmd = f"mafft --auto --thread 1 {fasta_file} > {output_path}"
	print(f"Aligning: {fasta_file}")
	
	try:
		subprocess.run(cmd, shell=True, check=True, stderr=subprocess.PIPE)
		return (fasta_file, str(output_path), True)
	except subprocess.CalledProcessError as e:
		print(f"Error aligning {fasta_file}: {e}")
		return (fasta_file, str(output_path), False)

# Get number of available cores (leave 1 core free for system processes)
max_workers = max(1, multiprocessing.cpu_count() - 1)
print(f"Using {max_workers} cores for alignments")
# Process alignments in parallel
aligned_files = []
failed_files = []

Using 31 cores for alignments
Using 31 cores for alignments


In [103]:
def concatenate_alignments(alignment_files, output_file):
	"""
	Concatenate multiple alignment files into a single supermatrix alignment.
	
	Parameters:
	-----------
	alignment_files : list
		List of paths to alignment files in FASTA format
	output_file : str
		Path to save the concatenated alignment
		
	Returns:
	--------
	tuple
		(concatenated_alignment, partition_info)
		- concatenated_alignment: The final MultipleSeqAlignment object
		- partition_info: Dictionary with gene boundaries for partition file creation
	"""
	if not alignment_files:
		print("No alignment files provided")
		return None, {}
	
	# Dictionary to store sequences for each species across all genes
	all_species = {}
	partition_info = {}
	current_position = 1
	
	# Process each alignment file
	for i, aln_file in enumerate(alignment_files):
		try:
			# Load the alignment
			gene_name = Path(aln_file).stem.replace('.aligned', '')
			alignment = AlignIO.read(aln_file, "fasta")
			aln_length = alignment.get_alignment_length()
			
			# Store partition information
			partition_info[gene_name] = {
				'start': current_position,
				'end': current_position + aln_length - 1
			}
			
			# Process each sequence in this alignment
			for record in alignment:
				# Extract species identifier from the sequence header
				species_id = record.id.split('|')[-1]
				
				# Initialize this species entry if it doesn't exist yet
				if species_id not in all_species:
					all_species[species_id] = {}
				
				# Add this gene's sequence for this species
				all_species[species_id][gene_name] = str(record.seq)
			
			current_position += aln_length
			print(f"Processed alignment {i+1}/{len(alignment_files)}: {gene_name} ({aln_length} columns)")
			
		except Exception as e:
			print(f"Error processing {aln_file}: {str(e)}")
	
	# Create the concatenated alignment
	concatenated_records = []
	gene_names = list(partition_info.keys())
	
	for species_id, genes in all_species.items():
		# Build the concatenated sequence for this species
		concat_seq = ""
		for gene in gene_names:
			if gene in genes:
				concat_seq += genes[gene]
			else:
				# If this species doesn't have this gene, add gaps
				gene_length = partition_info[gene]['end'] - partition_info[gene]['start'] + 1
				concat_seq += "-" * gene_length
		
		# Create a SeqRecord for this concatenated sequence
		record = SeqRecord(
			Seq(concat_seq),
			id=species_id,
			description=f"Concatenated {len(gene_names)} genes"
		)
		concatenated_records.append(record)
	
	# Create and save the concatenated alignment
	concatenated_alignment = MultipleSeqAlignment(concatenated_records)
	
	# Save alignment to file
	with open(output_file, "w") as handle:
		AlignIO.write(concatenated_alignment, handle, "fasta")
	
	# Create a partition file for RAxML-NG
	partition_file = f"{output_file}.partition"
	with open(partition_file, "w") as handle:
		for gene, pos in partition_info.items():
			handle.write(f"GTR+G, {gene} = {pos['start']}-{pos['end']}\n")
	
	print(f"Created concatenated alignment with {len(concatenated_records)} species and {concatenated_alignment.get_alignment_length()} columns")
	print(f"Partition file saved to {partition_file}")
	return concatenated_alignment, partition_info

In [104]:
def extract_oma_ids_from_fasta(fasta_file):
	"""
	Extract OMA identifiers from a FASTA file
	ID format example:

	>MOUSE45461 | OMA754554 | COQ5_MOUSE | [Mus musculus]

	
	Parameters:
	-----------
	fasta_file : str
		Path to FASTA file
		
	Returns:
	--------
	list of str: OMA identifiers
	"""
	oma_ids = []
	oma_files = {}
	for record in SeqIO.parse(fasta_file, "fasta"):
		# Extract OMA ID from the FASTA header using the pipe-separated format
		id = record.description.split('|')[0]	
		oma_ids.append(id.strip())
	oma_files[id.strip()] = fasta_file
	return oma_ids , oma_files
# Extract OMA IDs from all marker files
oma_ids = []
oma_files = {}
for marker in markers:
	ids, files = extract_oma_ids_from_fasta(marker)
	oma_ids.extend(ids)
	oma_files.update(files)

oma_ids = list(set(oma_ids))  # Remove duplicates
print(f"Extracted {len(oma_ids)} unique OMA IDs from marker genes")
oma_files = pd.DataFrame.from_dict(oma_files, orient='index', columns=['fasta_file'])
oma_files.index.name = 'oma_id'
oma_files.reset_index(inplace=True)
oma_files.to_csv("oma_markergene_files.csv", index=False)


Extracted 8377 unique OMA IDs from marker genes


In [105]:
print( oma_ids[:10] )  # Print first 10 OMA IDs for verification

['MOUSE16055', 'CAPO308128', 'PARTE15034', 'ACACA02190', 'TRYB202319', 'PLAF704684', 'CAPO305854', 'CYAME02727', 'PHYPA03750', 'CHLRE08248']


In [106]:
import json
import tqdm
def bulk_map_oma_to_uniprot(oma_ids, batch_size=100, retry_limit=3 , verbose=True):
	"""
	Map a list of OMA identifiers to UniProt using the OMA bulk API.

	Parameters:
	-----------
	oma_ids : list of str
		List of OMA identifiers (e.g., OMA123456)
	batch_size : int
		Number of IDs per batch (max 1000)
	retry_limit : int
		Number of times to retry on failure

	Returns:
	--------
	list of dict: Mapping information for each OMA ID
	"""
	url = "https://omabrowser.org/api/protein/bulk_retrieve/"
	all_results = []

	for i in tqdm.tqdm(range(0, len(oma_ids), batch_size)):
		batch = oma_ids[i:i+batch_size]
		for attempt in range(retry_limit):
			try:
				response = requests.post(url, json={"ids": batch})
				if response.ok:
					data = response.json()
					data = json.loads(response.text)  # Ensure we parse the JSON correctly
					#transfor data into dataframe
					results = {}
					for query in data:
						results[query['query_id']] = query['target']
					if verbose:
						print(f"Batch {i//batch_size+1} processed successfully with {len(results)} mappings")
						print(pd.DataFrame.from_dict(results, orient='index').reset_index())
					all_results.append(pd.DataFrame.from_dict(results, orient='index').reset_index())
					break  # Success, break retry loop
				else:
					print(f"Error fetching batch {i//batch_size+1}: HTTP {response.status_code}")
			except Exception as e:
				print(f"Error in batch {i//batch_size+1}: {str(e)}")
			time.sleep(1)
		else:
			# If all retries failed, add minimal info for each ID in the batch
			for oma_id in batch:
				all_results.append({'oma_id': oma_id, 'uniprot_id': None})
			all_results.append(pd.DataFrame([{'oma_id': oma_id, 'uniprot_id': None}]))
	# Concatenate all results into a single DataFrame
	if all_results:
		all_results = pd.concat(all_results, ignore_index=True)
		return all_results
	else:
		return pd.DataFrame(columns=['oma_id', 'uniprot_id'])


	

In [107]:
import os
if not os.path.exists("oma_to_uniprot_mapping.csv"):
	print("Mapping file does not exist, creating it now...")
	#retreive all OMA IDs from the marker genes
	mapping_df = bulk_map_oma_to_uniprot(oma_ids , verbose=False)
	# Save the mapping results to a CSV file
	mapping_df.to_csv("oma_to_uniprot_mapping.csv", index=True)
else:
	print("Mapping file already exists, loading it...")
	mapping_df = pd.read_csv("oma_to_uniprot_mapping.csv", index_col=0)
	print(f"Loaded mapping file with {len(mapping_df)} entries")
	print(mapping_df.head())
	

Mapping file already exists, loading it...
Loaded mapping file with 8377 entries
        index  entry_nr                                     entry_url  \
0  THAPS04302  20294321  https://omabrowser.org/api/protein/20294321/   
1  NAEGR05573   7221363   https://omabrowser.org/api/protein/7221363/   
2  CYAME00980  19565914  https://omabrowser.org/api/protein/19565914/   
3  TRYB204190   7199379   https://omabrowser.org/api/protein/7199379/   
4  CAPO304696   7392554   https://omabrowser.org/api/protein/7392554/   

        omaid   canonicalid                      sequence_md5  \
0  THAPS04302        B8BT93  2ff7284c3f9dbc99b3b15eadcbc04312   
1  NAEGR05573        D2UXH8  287afcc288060dfa301c0957863755bf   
2  CYAME00980       CML272C  63433c408ab217ab86000576cf745a9a   
3  TRYB204190    FEN1_TRYB2  e77b1149fd33a4e06e67374ae3b1bbc5   
4  CAPO304696  XP_004349858  2df47cf6ca86cbe5ed5282ee7ab20f31   

   sequence_length                                            species  \
0              6

In [108]:

import pyoma.browser.db
db = pyoma.browser.db.Database("/home/dmoi/datasets/OMA/OmaServer.h5")
db.get_release_name()
resolver = pyoma.browser.db.IDResolver(db)
mapper = db.id_mapper['OMA']
linkout = db.id_mapper['Linkout'] 




Cannot load SequenceSearch. Any future call to seq_search will fail!
Traceback (most recent call last):
  File "/home/dmoi/miniforge3/envs/pyg/lib/python3.12/site-packages/pyoma/browser/db.py", line 2340, in __init__
    self.seq_idx = self.seq_idx()
                   ^^^^^^^^^^^^^^
  File "/home/dmoi/miniforge3/envs/pyg/lib/python3.12/site-packages/tables/link.py", line 427, in __call__
    self.extfile = tb.open_file(filename, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dmoi/miniforge3/envs/pyg/lib/python3.12/site-packages/pyoma/browser/db.py", line 137, in synchronized_open_file
    return _tables_file._original_open_file(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dmoi/miniforge3/envs/pyg/lib/python3.12/site-packages/tables/file.py", line 325, in open_file
    return File(filename, mode, title, root_uep, filters, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/hom

In [109]:

entry = 'CHLRE00910'  # Example entry number
entry_nr = mapper.omaid_to_entry_nr(entry)
print(f"Entry number for {entry}: {entry_nr}")


Entry number for CHLRE00910: 20577393


In [110]:
#try mapping entry
entry = 'CHLRE00910'  # Example entry number
entry_nr = mapper.omaid_to_entry_nr(entry)
xrefs = linkout.iter_xrefs_for_entry_nr(entry_nr)
print([ r for r in xrefs])


[{'source': 'UniProtKB/TrEMBL', 'xref': 'A0A2K3DKT0', 'url': 'http://uniprot.org/uniprot/A0A2K3DKT0'}, {'source': 'EntrezGene', 'xref': '5718125', 'url': 'http://www.ncbi.nlm.nih.gov/gene/5718125'}]


In [111]:
def get_uniprot_ids_from_oma(oma_id, mapper, linkout):
	"""
	Get UniProt IDs for a given OMA ID by chaining mapper and xref operations.
	
	Parameters:
	-----------
	oma_id : str
		OMA identifier (e.g., 'CHLRE00910')
	mapper : pyoma.browser.db.OmaIdMapper
		OMA ID mapper instance
	linkout : pyoma.browser.idmapper.LinkoutIdMapper
		Linkout mapper instance
	
	Returns:
	--------
	list: UniProt IDs associated with the OMA ID
	"""
	try:
		# Convert OMA ID to entry number
		entry_nr = mapper.omaid_to_entry_nr(oma_id)
		
		# Get all xrefs for this entry
		xrefs = linkout.iter_xrefs_for_entry_nr(entry_nr)
		
		# Filter for UniProt references
		uniprot_ids = [ref['xref'] for ref in xrefs 
					  if ref['source'] in ['UniProtKB/Swiss-Prot', 'UniProtKB/TrEMBL']]
		
		return uniprot_ids
	
	except Exception as e:
		print(f"Error processing {oma_id}: {str(e)}")
		return []

# Test the function with the example entry
uniprot_ids = get_uniprot_ids_from_oma(entry, mapper, linkout)
print(f"UniProt IDs for {entry}: {uniprot_ids}")

UniProt IDs for CHLRE00910: ['A0A2K3DKT0']


In [112]:
print( mapping_df.head() )

        index  entry_nr                                     entry_url  \
0  THAPS04302  20294321  https://omabrowser.org/api/protein/20294321/   
1  NAEGR05573   7221363   https://omabrowser.org/api/protein/7221363/   
2  CYAME00980  19565914  https://omabrowser.org/api/protein/19565914/   
3  TRYB204190   7199379   https://omabrowser.org/api/protein/7199379/   
4  CAPO304696   7392554   https://omabrowser.org/api/protein/7392554/   

        omaid   canonicalid                      sequence_md5  \
0  THAPS04302        B8BT93  2ff7284c3f9dbc99b3b15eadcbc04312   
1  NAEGR05573        D2UXH8  287afcc288060dfa301c0957863755bf   
2  CYAME00980       CML272C  63433c408ab217ab86000576cf745a9a   
3  TRYB204190    FEN1_TRYB2  e77b1149fd33a4e06e67374ae3b1bbc5   
4  CAPO304696  XP_004349858  2df47cf6ca86cbe5ed5282ee7ab20f31   

   sequence_length                                            species  \
0              629  {'code': 'THAPS', 'taxon_id': 296543, 'species...   
1             1206  {'co

In [113]:
def ret_refs(oma_ids, mapper, linkout):
	"""
	Retrieve UniProt references for given OMA IDs from the OMA database.
	
	Parameters:
	-----------
	oma_ids : list
		List of OMA identifiers
	mapper : pyoma.browser.db.OmaIdMapper
		OMA ID mapper instance
	linkout : pyoma.browser.idmapper.LinkoutIdMapper
		Linkout mapper instance
	
	Returns:
	--------
	dict: Dictionary mapping OMA IDs to their UniProt references
	"""
	return {oma_id: get_uniprot_ids_from_oma(oma_id, mapper, linkout) for oma_id in oma_ids}

# Try with some OMA IDs
sample_oma_ids = mapping_df['index'].sample(10).tolist()
print(f"Sample OMA IDs: {sample_oma_ids}")

sample_refs = ret_refs(sample_oma_ids, mapper, linkout)
print("\nSample references:")
for oma_id, refs in sample_refs.items():
	print(f"{oma_id}: {refs}")

Sample OMA IDs: ['DROME06042', 'DROME20126', 'CAEEL04111', 'ARATH16397', 'HUMAN96022', 'CAPO305172', 'DROME20081', 'ECTSI04009', 'CAEEL08520', 'CYAME03712']

Sample references:
DROME06042: ['Q7KKI0']
DROME20126: ['Q9XYZ5']
CAEEL04111: ['Q2WF63']
ARATH16397: ['A0A1P8B0T2']
HUMAN96022: ['A7E261', 'O14841']
CAPO305172: ['A0A0D2WK15']
DROME20081: ['Q9VD29']
ECTSI04009: ['D7G8T7']
CAEEL08520: ['P34580']
CYAME03712: []


In [114]:
from collections import defaultdict
import pandas as pd

def parse_xref_list_to_row(xref_list):
	"""
	Parse a list of xref dicts into a single DataFrame row.
	Each xref source becomes a column, value is the xref (or list if multiple for same source).
	The row index is the OMA ID (omaid).
	
	Parameters:
	-----------
	xref_list : list of dict
		List of xref dictionaries as returned by the OMA API.
		
	Returns:
	--------
	pandas.DataFrame: Single-row DataFrame with omaid as index and xrefs as columns.
	"""

	if not xref_list:
		return pd.DataFrame()

	omaid = xref_list[0].get('omaid')
	xref_dict = defaultdict(list)
	for x in xref_list:
		src = x['source']
		xref_dict[src].append(x['xref'])

	# Flatten single-item lists
	xref_dict = {k: v[0] if len(v) == 1 else v for k, v in xref_dict.items()}
	xref_dict['omaid'] = omaid

	df = pd.DataFrame([xref_dict]).set_index('omaid')
	return df

In [115]:
mapping_df = pd.read_csv("oma_to_uniprot_mapping.csv", index_col=0)
example = mapping_df.iloc[0]
print( example)
#use ret_refs to get xref
#check xref pickle
if os.path.exists("xref_data.pkl"):
	print("Loading xref data from pickle file...")
	xref_data = pd.read_pickle("xref_data.pkl")
else:
	print("Xref data pickle not found, generating it...")
	# Generate xref data
	# This will take some time depending on the number of OMA IDs
	# and the speed of the OMA API
	xref_data = ret_refs(mapping_df.omaid.to_list() , mapper, linkout)
#map xref_data to a DataFrame
print(xref_data)

index                                                               THAPS04302
entry_nr                                                              20294321
entry_url                         https://omabrowser.org/api/protein/20294321/
omaid                                                               THAPS04302
canonicalid                                                             B8BT93
sequence_md5                                  2ff7284c3f9dbc99b3b15eadcbc04312
sequence_length                                                            629
species                      {'code': 'THAPS', 'taxon_id': 296543, 'species...
oma_group                                                               926837
oma_hog_id                                                        HOG:E0802081
chromosome                                                                   2
locus                              {'start': 49583, 'end': 51961, 'strand': 1}
is_main_isoform                                     

In [116]:
mapping_df['uniprot_refs'] = mapping_df['omaid'].map(xref_data)

In [117]:
#save with uniprot_refs
mapping_df.to_csv("oma_to_uniprot_mapping_with_refs.csv", index=False)

In [118]:
print( mapping_df.head() )

        index  entry_nr                                     entry_url  \
0  THAPS04302  20294321  https://omabrowser.org/api/protein/20294321/   
1  NAEGR05573   7221363   https://omabrowser.org/api/protein/7221363/   
2  CYAME00980  19565914  https://omabrowser.org/api/protein/19565914/   
3  TRYB204190   7199379   https://omabrowser.org/api/protein/7199379/   
4  CAPO304696   7392554   https://omabrowser.org/api/protein/7392554/   

        omaid   canonicalid                      sequence_md5  \
0  THAPS04302        B8BT93  2ff7284c3f9dbc99b3b15eadcbc04312   
1  NAEGR05573        D2UXH8  287afcc288060dfa301c0957863755bf   
2  CYAME00980       CML272C  63433c408ab217ab86000576cf745a9a   
3  TRYB204190    FEN1_TRYB2  e77b1149fd33a4e06e67374ae3b1bbc5   
4  CAPO304696  XP_004349858  2df47cf6ca86cbe5ed5282ee7ab20f31   

   sequence_length                                            species  \
0              629  {'code': 'THAPS', 'taxon_id': 296543, 'species...   
1             1206  {'co

In [119]:
mapping_df['1stref'] = mapping_df['uniprot_refs'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)
#eliminate rows with no 1stref
mapping_df = mapping_df[mapping_df['1stref'].notnull()]
print(mapping_df.head())

        index  entry_nr                                     entry_url  \
0  THAPS04302  20294321  https://omabrowser.org/api/protein/20294321/   
1  NAEGR05573   7221363   https://omabrowser.org/api/protein/7221363/   
3  TRYB204190   7199379   https://omabrowser.org/api/protein/7199379/   
4  CAPO304696   7392554   https://omabrowser.org/api/protein/7392554/   
5  DROME27485  17560034  https://omabrowser.org/api/protein/17560034/   

        omaid   canonicalid                      sequence_md5  \
0  THAPS04302        B8BT93  2ff7284c3f9dbc99b3b15eadcbc04312   
1  NAEGR05573        D2UXH8  287afcc288060dfa301c0957863755bf   
3  TRYB204190    FEN1_TRYB2  e77b1149fd33a4e06e67374ae3b1bbc5   
4  CAPO304696  XP_004349858  2df47cf6ca86cbe5ed5282ee7ab20f31   
5  DROME27485    ZPR1_DROME  17474110099554159b2ad79dd5d455b3   

   sequence_length                                            species  \
0              629  {'code': 'THAPS', 'taxon_id': 296543, 'species...   
1             1206  {'co

In [120]:
mapping_df['struct_folder'] = mapping_df['oma_group'].apply(lambda x: f"./families/Information_benchmark/marker_genes/{x}/structs")

In [None]:
from src.AFDB_tools import grab_struct
import tqdm

In [122]:
#download the alphafold structures for each marker gene family
for idx,row in tqdm.tqdm(mapping_df.iterrows()):
	#check if the folder exists
	struct_folder = row['struct_folder']
	if not os.path.exists(struct_folder):
		os.makedirs(struct_folder)
	#download the structure
	retpath = grab_struct(row['1stref'], struct_folder, overwrite=overwrite)
	#add struct path to mapping_df
	mapping_df.at[idx, 'struct_path'] = retpath

245it [00:00, 885.53it/s] 

error downloading structure for D2V0G9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2V0G9-F1-model_v4.pdb
error downloading structure for O15881 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-O15881-F1-model_v4.pdb
error downloading structure for D2UY12 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2UY12-F1-model_v4.pdb


338it [00:00, 622.63it/s]

error downloading structure for A0A2K3DAU6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3DAU6-F1-model_v4.pdb
error downloading structure for A0A8H4F898 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4F898-F1-model_v4.pdb
error downloading structure for A0A0D2UDQ5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2UDQ5-F1-model_v4.pdb
error downloading structure for G5E866 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-G5E866-F1-model_v4.pdb


530it [00:00, 612.33it/s]

error downloading structure for A0A1X7V288 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7V288-F1-model_v4.pdb
error downloading structure for A0D7P3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0D7P3-F1-model_v4.pdb
error downloading structure for X6LSP0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6LSP0-F1-model_v4.pdb


726it [00:00, 924.73it/s]

error downloading structure for A0A178UPW5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178UPW5-F1-model_v4.pdb
error downloading structure for A0DJB3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0DJB3-F1-model_v4.pdb
error downloading structure for A0A178V904 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178V904-F1-model_v4.pdb


835it [00:01, 745.05it/s]

error downloading structure for A0A1X7VNC8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7VNC8-F1-model_v4.pdb
error downloading structure for A9RIM7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A9RIM7-F1-model_v4.pdb
error downloading structure for A0A8H4BUB1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BUB1-F1-model_v4.pdb
error downloading structure for A0A8H8ULD4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H8ULD4-F1-model_v4.pdb


997it [00:01, 553.64it/s]

error downloading structure for A0A2K1JLW1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1JLW1-F1-model_v4.pdb
error downloading structure for A0A0D2X573 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2X573-F1-model_v4.pdb
error downloading structure for A0A8H4BX66 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BX66-F1-model_v4.pdb


1076it [00:01, 552.14it/s]

error downloading structure for D2W323 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2W323-F1-model_v4.pdb
error downloading structure for A0A5S9T906 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A5S9T906-F1-model_v4.pdb
error downloading structure for A0A0D2X319 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2X319-F1-model_v4.pdb
error downloading structure for A0A8H8UMD1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H8UMD1-F1-model_v4.pdb


1191it [00:02, 452.73it/s]

error downloading structure for Q7S8K0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7S8K0-F1-model_v4.pdb
error downloading structure for Q7SBD3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7SBD3-F1-model_v4.pdb
error downloading structure for A0A9L9PXM3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A9L9PXM3-F1-model_v4.pdb


1401it [00:02, 584.54it/s]

error downloading structure for A0A2K3D1T9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3D1T9-F1-model_v4.pdb
error downloading structure for A0A2K1KEY3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1KEY3-F1-model_v4.pdb
error downloading structure for A0A2K3DJF9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3DJF9-F1-model_v4.pdb


1563it [00:02, 710.01it/s]

error downloading structure for D2V5V4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2V5V4-F1-model_v4.pdb
error downloading structure for A0A2K3D577 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3D577-F1-model_v4.pdb
error downloading structure for A0A0B4KHL2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0B4KHL2-F1-model_v4.pdb


1708it [00:02, 584.50it/s]

error downloading structure for D2V0Z5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2V0Z5-F1-model_v4.pdb
error downloading structure for A0BQ90 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0BQ90-F1-model_v4.pdb
error downloading structure for A0A2K1IX27 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1IX27-F1-model_v4.pdb
error downloading structure for A0A2K1KKV4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1KKV4-F1-model_v4.pdb
error downloading structure for A0A8H8UKY7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H8UKY7-F1-model_v4.pdb
error downloading structure for O08847 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-O08847-F1-model_v4.pdb
error downloading structure for J9R021 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-J9R021-F1-model_v4.pdb


1769it [00:03, 366.33it/s]

error downloading structure for H0YKI9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-H0YKI9-F1-model_v4.pdb
error downloading structure for A0A994J5K1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A994J5K1-F1-model_v4.pdb
error downloading structure for A0CY10 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0CY10-F1-model_v4.pdb


1985it [00:03, 557.38it/s]

error downloading structure for D2VJC0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2VJC0-F1-model_v4.pdb
error downloading structure for Q7RXJ6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7RXJ6-F1-model_v4.pdb
error downloading structure for D7FNG4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D7FNG4-F1-model_v4.pdb


2134it [00:03, 533.37it/s]

error downloading structure for X6P1L5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6P1L5-F1-model_v4.pdb
error downloading structure for A0A8Q3SHT1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8Q3SHT1-F1-model_v4.pdb
error downloading structure for L8H0U7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8H0U7-F1-model_v4.pdb


2196it [00:03, 409.13it/s]

error downloading structure for Q4V5E9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q4V5E9-F1-model_v4.pdb
error downloading structure for X6MD81 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6MD81-F1-model_v4.pdb
error downloading structure for A0A2K1IHM2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1IHM2-F1-model_v4.pdb


2295it [00:04, 411.13it/s]

error downloading structure for L8HFM7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8HFM7-F1-model_v4.pdb
error downloading structure for A0A0D2VYG0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VYG0-F1-model_v4.pdb
error downloading structure for B8BW81 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8BW81-F1-model_v4.pdb
error downloading structure for A0A8H8ULD3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H8ULD3-F1-model_v4.pdb
error downloading structure for A0A8H4BX09 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BX09-F1-model_v4.pdb
error downloading structure for L8GIV8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GIV8-F1-model_v4.pdb
error downloading structure for Q84KP6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q84KP6-F1-model_v4.pdb


2342it [00:04, 267.28it/s]

error downloading structure for A9UNH9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A9UNH9-F1-model_v4.pdb
error downloading structure for A9RYA1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A9RYA1-F1-model_v4.pdb
error downloading structure for A0A178W8K9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178W8K9-F1-model_v4.pdb


2585it [00:04, 509.64it/s]

error downloading structure for Q7SBT9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7SBT9-F1-model_v4.pdb
error downloading structure for B8C4M1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8C4M1-F1-model_v4.pdb
error downloading structure for A0D8K9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0D8K9-F1-model_v4.pdb


2646it [00:05, 497.53it/s]

error downloading structure for B7ZKR9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B7ZKR9-F1-model_v4.pdb
error downloading structure for A0A8V8TNX6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TNX6-F1-model_v4.pdb
error downloading structure for A9RTW1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A9RTW1-F1-model_v4.pdb
error downloading structure for A0A1C7ZN09 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1C7ZN09-F1-model_v4.pdb


2751it [00:05, 428.37it/s]

error downloading structure for L8GWY4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GWY4-F1-model_v4.pdb
error downloading structure for A0A994J3Z0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A994J3Z0-F1-model_v4.pdb
error downloading structure for A0A178VV92 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178VV92-F1-model_v4.pdb


2877it [00:05, 543.86it/s]

error downloading structure for A0A2K3E5Q8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3E5Q8-F1-model_v4.pdb
error downloading structure for A0A0D2WJE0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2WJE0-F1-model_v4.pdb
error downloading structure for A0A8H4BZ97 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BZ97-F1-model_v4.pdb


2984it [00:05, 392.78it/s]

error downloading structure for Q7S6V7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7S6V7-F1-model_v4.pdb
error downloading structure for X6N3M5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6N3M5-F1-model_v4.pdb
error downloading structure for A0A8V8TNZ9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TNZ9-F1-model_v4.pdb


3110it [00:06, 438.86it/s]

error downloading structure for H7C0A0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-H7C0A0-F1-model_v4.pdb
error downloading structure for L8GTL5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GTL5-F1-model_v4.pdb
error downloading structure for A0A1X7VVZ9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7VVZ9-F1-model_v4.pdb


3314it [00:06, 652.44it/s]

error downloading structure for A0A8Q3SIS9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8Q3SIS9-F1-model_v4.pdb
error downloading structure for B8CAU9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8CAU9-F1-model_v4.pdb
error downloading structure for B8CE77 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8CE77-F1-model_v4.pdb


3387it [00:06, 530.04it/s]

error downloading structure for A0A8H4BV16 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BV16-F1-model_v4.pdb
error downloading structure for Q80XH7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q80XH7-F1-model_v4.pdb
error downloading structure for A0A8V8TNX3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TNX3-F1-model_v4.pdb


3509it [00:06, 482.07it/s]

error downloading structure for D2VT72 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2VT72-F1-model_v4.pdb
error downloading structure for B8BVD1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8BVD1-F1-model_v4.pdb
error downloading structure for A0A2K1JG38 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1JG38-F1-model_v4.pdb


3636it [00:06, 594.21it/s]

error downloading structure for A0A1I9LME9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1I9LME9-F1-model_v4.pdb
error downloading structure for A0A0D2WQT4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2WQT4-F1-model_v4.pdb
error downloading structure for A0A0D2X192 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2X192-F1-model_v4.pdb


3698it [00:07, 482.75it/s]

error downloading structure for A0CWW3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0CWW3-F1-model_v4.pdb
error downloading structure for A0A2K3D4R2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3D4R2-F1-model_v4.pdb
error downloading structure for A8J3Y6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A8J3Y6-F1-model_v4.pdb


3815it [00:07, 553.93it/s]

error downloading structure for B8C728 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8C728-F1-model_v4.pdb
error downloading structure for D8LRP2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LRP2-F1-model_v4.pdb
error downloading structure for A0A0D2VJQ0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VJQ0-F1-model_v4.pdb


3874it [00:07, 365.34it/s]

error downloading structure for A0A1X7V9A9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7V9A9-F1-model_v4.pdb
error downloading structure for B5YNG2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B5YNG2-F1-model_v4.pdb
error downloading structure for Q7S8V0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7S8V0-F1-model_v4.pdb


3920it [00:07, 355.44it/s]

error downloading structure for A0A2K1IRD9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1IRD9-F1-model_v4.pdb
error downloading structure for A0A1X7UM21 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7UM21-F1-model_v4.pdb
error downloading structure for A0A2K3CV27 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3CV27-F1-model_v4.pdb


4082it [00:08, 441.48it/s]

error downloading structure for A0A0D2X586 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2X586-F1-model_v4.pdb
error downloading structure for A0A0D2VTB7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VTB7-F1-model_v4.pdb
error downloading structure for B8C8T3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8C8T3-F1-model_v4.pdb


4239it [00:08, 602.65it/s]

error downloading structure for A0A8H4BWD9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BWD9-F1-model_v4.pdb
error downloading structure for A0A178W7J7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178W7J7-F1-model_v4.pdb
error downloading structure for Q8MS45 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q8MS45-F1-model_v4.pdb


4386it [00:08, 546.90it/s]

error downloading structure for A0A2K1J2Q4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1J2Q4-F1-model_v4.pdb
error downloading structure for Q7SBU3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7SBU3-F1-model_v4.pdb
error downloading structure for A0A1X7VQ62 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7VQ62-F1-model_v4.pdb


4443it [00:08, 508.29it/s]

error downloading structure for D7G3I9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D7G3I9-F1-model_v4.pdb
error downloading structure for A6NMQ1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A6NMQ1-F1-model_v4.pdb
error downloading structure for Q7RWE3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7RWE3-F1-model_v4.pdb
error downloading structure for L8HD92 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8HD92-F1-model_v4.pdb


4495it [00:09, 335.88it/s]

error downloading structure for F5H112 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-F5H112-F1-model_v4.pdb
error downloading structure for A0A0D2UHS0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2UHS0-F1-model_v4.pdb
error downloading structure for A0A0D2VTX0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VTX0-F1-model_v4.pdb


4574it [00:09, 313.40it/s]

error downloading structure for A0A6Q8PGB0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A6Q8PGB0-F1-model_v4.pdb
error downloading structure for C9JJJ9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-C9JJJ9-F1-model_v4.pdb
error downloading structure for V5IM93 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-V5IM93-F1-model_v4.pdb
error downloading structure for A0EII8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0EII8-F1-model_v4.pdb


4642it [00:09, 291.11it/s]

error downloading structure for Q7S1D7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7S1D7-F1-model_v4.pdb
error downloading structure for A0A8H4BXW5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BXW5-F1-model_v4.pdb
error downloading structure for A0A178WN52 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178WN52-F1-model_v4.pdb
error downloading structure for A0A2R8Y705 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2R8Y705-F1-model_v4.pdb


4684it [00:09, 288.64it/s]

error downloading structure for A0A0D2VFR5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VFR5-F1-model_v4.pdb
error downloading structure for A0A2K3E2B8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3E2B8-F1-model_v4.pdb
error downloading structure for A0A8H4FAP6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4FAP6-F1-model_v4.pdb


4866it [00:10, 457.96it/s]

error downloading structure for A0C1F3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0C1F3-F1-model_v4.pdb
error downloading structure for A0A0D2VGW4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2VGW4-F1-model_v4.pdb
error downloading structure for A0BZJ9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0BZJ9-F1-model_v4.pdb


4913it [00:10, 379.70it/s]

error downloading structure for A0A2K3E5A0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3E5A0-F1-model_v4.pdb
error downloading structure for A0A8H4BW10 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BW10-F1-model_v4.pdb
error downloading structure for A0A2K1JHT7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1JHT7-F1-model_v4.pdb


5077it [00:10, 580.83it/s]

error downloading structure for B6IDH0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B6IDH0-F1-model_v4.pdb
error downloading structure for A0A0D2WIH6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2WIH6-F1-model_v4.pdb
error downloading structure for A0A178W7K6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178W7K6-F1-model_v4.pdb


5225it [00:10, 535.83it/s]

error downloading structure for B8BTL9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8BTL9-F1-model_v4.pdb
error downloading structure for A0A2K3CS00 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3CS00-F1-model_v4.pdb
error downloading structure for A0BYP5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0BYP5-F1-model_v4.pdb


5280it [00:10, 447.45it/s]

error downloading structure for D8LHW4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LHW4-F1-model_v4.pdb
error downloading structure for A0A4D6K4F6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A4D6K4F6-F1-model_v4.pdb
error downloading structure for A9RFW2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A9RFW2-F1-model_v4.pdb
error downloading structure for Q8IKE1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q8IKE1-F1-model_v4.pdb


5395it [00:11, 438.43it/s]

error downloading structure for X6NDB8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6NDB8-F1-model_v4.pdb
error downloading structure for A0A8H4BWC8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BWC8-F1-model_v4.pdb
error downloading structure for D2VN06 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2VN06-F1-model_v4.pdb


5440it [00:11, 411.57it/s]

error downloading structure for A0A9L9PWW3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A9L9PWW3-F1-model_v4.pdb
error downloading structure for A0A4D6K3W6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A4D6K3W6-F1-model_v4.pdb
error downloading structure for L8H1Z5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8H1Z5-F1-model_v4.pdb


5598it [00:11, 466.49it/s]

error downloading structure for A8K6F0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A8K6F0-F1-model_v4.pdb
error downloading structure for X6NQP0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6NQP0-F1-model_v4.pdb
error downloading structure for Q9U6I2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q9U6I2-F1-model_v4.pdb


5645it [00:11, 421.27it/s]

error downloading structure for D2V3Y4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2V3Y4-F1-model_v4.pdb
error downloading structure for A0A8D9I6U5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8D9I6U5-F1-model_v4.pdb
error downloading structure for D8LQX5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LQX5-F1-model_v4.pdb


5688it [00:11, 312.76it/s]

error downloading structure for A0A178WHI8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178WHI8-F1-model_v4.pdb
error downloading structure for A0A8V8TQE5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TQE5-F1-model_v4.pdb
error downloading structure for A0C766 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0C766-F1-model_v4.pdb


5819it [00:12, 455.92it/s]

error downloading structure for Q872G8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q872G8-F1-model_v4.pdb
error downloading structure for D8LIP1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LIP1-F1-model_v4.pdb
error downloading structure for A0A2K1L0X0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1L0X0-F1-model_v4.pdb


5965it [00:12, 484.87it/s]

error downloading structure for A0A0D2WVM9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2WVM9-F1-model_v4.pdb
error downloading structure for Q8I1X5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q8I1X5-F1-model_v4.pdb
error downloading structure for L8GWB5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GWB5-F1-model_v4.pdb


6016it [00:12, 451.66it/s]

error downloading structure for A0A2K1KED1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1KED1-F1-model_v4.pdb
error downloading structure for D2V097 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2V097-F1-model_v4.pdb
error downloading structure for A0A2K3CRZ5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3CRZ5-F1-model_v4.pdb


6128it [00:12, 426.93it/s]

error downloading structure for A0A1X7VP53 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7VP53-F1-model_v4.pdb
error downloading structure for A0A1X7UQD0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7UQD0-F1-model_v4.pdb
error downloading structure for D2VNK8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2VNK8-F1-model_v4.pdb


6404it [00:13, 761.02it/s]

error downloading structure for D7FU80 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D7FU80-F1-model_v4.pdb
error downloading structure for M9PFS0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-M9PFS0-F1-model_v4.pdb
error downloading structure for A0A494C1A5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A494C1A5-F1-model_v4.pdb
error downloading structure for D2UZM0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D2UZM0-F1-model_v4.pdb
error downloading structure for A0A0G2JRU0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0G2JRU0-F1-model_v4.pdb
error downloading structure for A0A994J6E8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A994J6E8-F1-model_v4.pdb
error downloading structure for X6MGV5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-X6MGV5-F1-model_v4.pdb


6490it [00:13, 520.35it/s]

error downloading structure for A0A8H4BYN2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BYN2-F1-model_v4.pdb
error downloading structure for D8LTY2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LTY2-F1-model_v4.pdb
error downloading structure for Q7SBL5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7SBL5-F1-model_v4.pdb
error downloading structure for D7FI10 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D7FI10-F1-model_v4.pdb
error downloading structure for A0A178W1F6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178W1F6-F1-model_v4.pdb
error downloading structure for A7E261 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A7E261-F1-model_v4.pdb


6612it [00:13, 358.55it/s]

error downloading structure for A0A8H4BWX1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BWX1-F1-model_v4.pdb
error downloading structure for B3H4G2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B3H4G2-F1-model_v4.pdb
error downloading structure for A0A8H4FA39 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4FA39-F1-model_v4.pdb
error downloading structure for A0A8H4C1K9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4C1K9-F1-model_v4.pdb


6661it [00:14, 325.64it/s]

error downloading structure for I1V4Y8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-I1V4Y8-F1-model_v4.pdb
error downloading structure for L8H8A6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8H8A6-F1-model_v4.pdb
error downloading structure for L8H0V8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8H0V8-F1-model_v4.pdb
error downloading structure for A0A178UHA5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178UHA5-F1-model_v4.pdb


6703it [00:14, 321.31it/s]

error downloading structure for A0A8V8TQ34 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TQ34-F1-model_v4.pdb
error downloading structure for A8K8N7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A8K8N7-F1-model_v4.pdb
error downloading structure for A0A0D2X0Z2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2X0Z2-F1-model_v4.pdb
error downloading structure for A0A178UF03 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178UF03-F1-model_v4.pdb


6798it [00:14, 280.05it/s]

error downloading structure for D8LQ01 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LQ01-F1-model_v4.pdb
error downloading structure for D8LEU4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LEU4-F1-model_v4.pdb
error downloading structure for A0CXW7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0CXW7-F1-model_v4.pdb


6914it [00:14, 399.43it/s]

error downloading structure for A0A8H4FAH9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4FAH9-F1-model_v4.pdb
error downloading structure for A0A2K1JKR7 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1JKR7-F1-model_v4.pdb
error downloading structure for Q7S1T6 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q7S1T6-F1-model_v4.pdb


7012it [00:15, 371.41it/s]

error downloading structure for B8BVU2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8BVU2-F1-model_v4.pdb
error downloading structure for A0A178UNQ0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178UNQ0-F1-model_v4.pdb
error downloading structure for A0A178WJP3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A178WJP3-F1-model_v4.pdb


7162it [00:15, 543.09it/s]

error downloading structure for A0A0D2UP26 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2UP26-F1-model_v4.pdb
error downloading structure for A0A8V8TRG9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8V8TRG9-F1-model_v4.pdb
error downloading structure for A0A2K3DR41 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K3DR41-F1-model_v4.pdb


7273it [00:15, 440.95it/s]

error downloading structure for A0A0B4KFH4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0B4KFH4-F1-model_v4.pdb
error downloading structure for A0A8H4BYJ9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A8H4BYJ9-F1-model_v4.pdb
error downloading structure for A0A1X7VIA2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A1X7VIA2-F1-model_v4.pdb
error downloading structure for A0A2K1L907 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1L907-F1-model_v4.pdb


7353it [00:15, 465.27it/s]

error downloading structure for L8HD42 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8HD42-F1-model_v4.pdb
error downloading structure for G3M399 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-G3M399-F1-model_v4.pdb
error downloading structure for D7FN81 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D7FN81-F1-model_v4.pdb


7483it [00:16, 475.65it/s]

error downloading structure for A0A0D2UQX2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A0D2UQX2-F1-model_v4.pdb
error downloading structure for A0A2K1JLZ0 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-A0A2K1JLZ0-F1-model_v4.pdb
error downloading structure for B8C4Z8 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8C4Z8-F1-model_v4.pdb


7532it [00:16, 450.10it/s]

error downloading structure for D8LRY3 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LRY3-F1-model_v4.pdb
error downloading structure for L8GY46 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GY46-F1-model_v4.pdb
error downloading structure for Q8IFP1 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-Q8IFP1-F1-model_v4.pdb
error downloading structure for D8LQN4 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LQN4-F1-model_v4.pdb


7724it [00:16, 464.71it/s]

error downloading structure for V5INF5 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-V5INF5-F1-model_v4.pdb
error downloading structure for B8C8U9 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-B8C8U9-F1-model_v4.pdb
error downloading structure for D8LES2 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-D8LES2-F1-model_v4.pdb


7751it [00:16, 460.23it/s]

error downloading structure for L8GZ75 HTTP Error 404: Not Found
https://alphafold.ebi.ac.uk/files/AF-L8GZ75-F1-model_v4.pdb





In [123]:
#drop the rows with no struct_path
mapping_df = mapping_df[mapping_df['struct_path'].notnull()]
print(f"Final mapping DataFrame with structures: {mapping_df.shape[0]} entries")

Final mapping DataFrame with structures: 7521 entries


In [124]:
#write an identifier file for each marker gene family
for sub in mapping_df['oma_group'].unique():
	sub_df = mapping_df[mapping_df['oma_group'] == sub]
	#write the uniprot ids to an identifiers.txt file
	identifiers_file = f"./families/Information_benchmark/marker_genes/{sub}/identifiers.txt"
	with open(identifiers_file, 'w') as f:
		for uniprot_id in sub_df['1stref'].unique():
			f.write(f"{uniprot_id}\n")

In [129]:
from Bio import PDB

def create_fasta_from_pdbs(pdb_folder, output_fasta, chain_id='A', species_mapping=None):
	"""
	Create a FASTA file from PDB structures by extracting amino acid sequences using Biopython.
	
	Parameters:
	-----------
	pdb_folder : str
		Path to folder containing PDB files
	output_fasta : str
		Path to output FASTA file
	chain_id : str, optional
		Chain ID to extract sequence from (default: 'A')
	species_mapping : dict, optional
		Dictionary mapping PDB IDs to species codes
		
	Returns:
	--------
	list: List of SeqRecord objects that were written to FASTA
	"""
	
	# Standard amino acid 3-letter to 1-letter conversion
	aa_codes = {
		'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E',
		'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
		'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N',
		'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S',
		'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'
	}
	
	records = []
	parser = PDB.PDBParser(QUIET=True)
	
	# Find all PDB files in the folder
	pdb_files = glob.glob(os.path.join(pdb_folder, "*.pdb"))
	
	for pdb_file in pdb_files:
		try:
			# Get structure name from filename
			pdb_id = os.path.basename(pdb_file).replace('.pdb', '')
			
			# Parse structure
			structure = parser.get_structure(pdb_id, pdb_file)
			
			# Get first model
			model = structure[0]
			
			# Check if requested chain exists
			if chain_id in model:
				chain = model[chain_id]
				
				# Extract sequence
				sequence = ''
				for residue in chain:
					if residue.get_resname() in aa_codes:
						sequence += aa_codes[residue.get_resname()]
				
				if sequence:
					# Create record ID with species code if available
					if species_mapping and pdb_id in species_mapping:
						record_id = f"{pdb_id}|{species_mapping[pdb_id]}"
						description = f"Chain {chain_id} sequence from {pdb_file} - Species: {species_mapping[pdb_id]}"
					else:
						record_id = pdb_id
						description = f"Chain {chain_id} sequence from {pdb_file}"
					
					# Create SeqRecord object
					record = SeqRecord(
						Seq(sequence),
						id=record_id,
						description=description
					)
					records.append(record)
			
		except Exception as e:
			print(f"Error processing {pdb_file}: {str(e)}")
	
	# Write all sequences to FASTA file
	if records:
		with open(output_fasta, 'w') as handle:
			SeqIO.write(records, handle, 'fasta')
		print(f"Created FASTA file with {len(records)} sequences at {output_fasta}")
	else:
		print("No sequences were extracted from PDB files")
	
	return records

In [131]:
#create a FASTA file for each marker gene family and use the species mapping from the sub datafram

for sub in mapping_df['oma_group'].unique():
	sub_df = mapping_df[mapping_df['oma_group'] == sub]
	struct_folder = sub_df['struct_folder'].iloc[0]  # Use the first struct folder for this family
	output_fasta = f"./families/Information_benchmark/marker_genes/{sub}/sequences.fasta"
	
	# Create species mapping for this family
	species_mapping = dict( (zip(sub_df['1stref'], sub_df['omaid'].apply(lambda x: x[:5]))) )
	
	create_fasta_from_pdbs(struct_folder, output_fasta, chain_id='A', species_mapping=species_mapping)

Created FASTA file with 13 sequences at ./families/Information_benchmark/marker_genes/926837/sequences.fasta
Created FASTA file with 8 sequences at ./families/Information_benchmark/marker_genes/1067390/sequences.fasta
Created FASTA file with 15 sequences at ./families/Information_benchmark/marker_genes/1426798/sequences.fasta
Created FASTA file with 14 sequences at ./families/Information_benchmark/marker_genes/1049443/sequences.fasta
Created FASTA file with 14 sequences at ./families/Information_benchmark/marker_genes/1410217/sequences.fasta
Created FASTA file with 19 sequences at ./families/Information_benchmark/marker_genes/1027516/sequences.fasta
Created FASTA file with 15 sequences at ./families/Information_benchmark/marker_genes/1067309/sequences.fasta
Created FASTA file with 15 sequences at ./families/Information_benchmark/marker_genes/1361066/sequences.fasta
Created FASTA file with 15 sequences at ./families/Information_benchmark/marker_genes/1311493/sequences.fasta
Created FAST

In [None]:
 #use normal mafft to align the sequences
import glob
from concurrent.futures import ThreadPoolExecutor
# Get all FASTA files in the marker gene families
fasta_files = glob.glob("./families/Information_benchmark/marker_genes/*/sequences.fasta")
print(f"Found {len(fasta_files)} FASTA files for alignment")
# Align all FASTA files in parallel
max_workers = 10  # Leave 1 core free for system processes

with ThreadPoolExecutor(max_workers=max_workers) as executor:
	results = list(executor.map(align_fasta_with_mafft, tqdm.tqdm(fasta_files)))

Found 500 FASTA files for alignment


100%|███████████████████████████████████████████████| 500/500 [00:00<00:00, 33057.77it/s]

Aligning: ./families/Information_benchmark/marker_genes/1067473/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/806905/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/948912/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1067594/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/850276/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1407943/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1356514/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1422110/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1426498/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1067568/sequences.fasta





Aligning: ./families/Information_benchmark/marker_genes/1389411/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1418302/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1359625/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1399401/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1301226/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1408429/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1058810/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1350625/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1067292/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/843003/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1426395/sequences.fasta
Aligning: ./families/Information_benchmark/marker_genes/1032056/sequences.fasta
Aligning: ./families/Information_benchmar

In [135]:
import glob
import os
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from pathlib import Path

def concatenate_alignments(alignment_files, output_file):
	"""
	Concatenate multiple alignment files into a single supermatrix alignment.
	
	Parameters:
	-----------
	alignment_files : list
		List of paths to alignment files in FASTA format
	output_file : str
		Path to save the concatenated alignment
		
	Returns:
	--------
	tuple
		(concatenated_alignment, partition_info)
		- concatenated_alignment: The final MultipleSeqAlignment object
		- partition_info: Dictionary with gene boundaries for partition file creation
	"""
	if not alignment_files:
		print("No alignment files provided")
		return None, {}
	
	# Dictionary to store sequences for each species across all genes
	all_species = {}
	partition_info = {}
	current_position = 1
	
	# Process each alignment file
	for i, aln_file in enumerate(tqdm.tqdm(alignment_files, desc="Concatenating alignments")):
		try:
			# Load the alignment
			gene_name = Path(aln_file).stem.replace('.aligned', '')
			alignment = AlignIO.read(aln_file, "fasta")
			aln_length = alignment.get_alignment_length()
			
			# Store partition information
			partition_info[gene_name] = {
				'start': current_position,
				'end': current_position + aln_length - 1
			}
			
			# Process each sequence in this alignment
			for record in alignment:
				# Extract species identifier from the sequence header
				species_id = record.id.split('|')[-1]
				
				# Initialize this species entry if it doesn't exist yet
				if species_id not in all_species:
					all_species[species_id] = {}
				
				# Add this gene's sequence for this species
				all_species[species_id][gene_name] = str(record.seq)
			
			current_position += aln_length
			
		except Exception as e:
			print(f"Error processing {aln_file}: {str(e)}")
	
	# Create the concatenated alignment
	concatenated_records = []
	gene_names = list(partition_info.keys())
	
	for species_id, genes in all_species.items():
		# Build the concatenated sequence for this species
		concat_seq = ""
		for gene in gene_names:
			if gene in genes:
				concat_seq += genes[gene]
			else:
				# If this species doesn't have this gene, add gaps
				gene_length = partition_info[gene]['end'] - partition_info[gene]['start'] + 1
				concat_seq += "-" * gene_length
		
		# Create a SeqRecord for this concatenated sequence
		record = SeqRecord(
			Seq(concat_seq),
			id=species_id,
			description=f"Concatenated {len(gene_names)} genes"
		)
		concatenated_records.append(record)
	
	# Create and save the concatenated alignment
	concatenated_alignment = MultipleSeqAlignment(concatenated_records)
	
	# Save alignment to file
	with open(output_file, "w") as handle:
		AlignIO.write(concatenated_alignment, handle, "fasta")
	
	# Create a partition file for RAxML-NG
	partition_file = f"{output_file}.partition"
	with open(partition_file, "w") as handle:
		for gene, pos in partition_info.items():
			handle.write(f"GTR+G, {gene} = {pos['start']}-{pos['end']}\n")
	
	print(f"Created concatenated alignment with {len(concatenated_records)} species and {concatenated_alignment.get_alignment_length()} columns")
	print(f"Partition file saved to {partition_file}")
	return concatenated_alignment, partition_info

# Find all aligned FASTA files
aligned_fasta_files = glob.glob("./families/Information_benchmark/marker_genes/*/sequences.aligned.fa")
print(f"Found {len(aligned_fasta_files)} aligned FASTA files for concatenation")

# Define the output file
output_file = "./families/Information_benchmark/concatenated_alignment.fasta"

# Concatenate the alignments
concatenated_alignment, partition_info = concatenate_alignments(aligned_fasta_files, output_file)

Found 500 aligned FASTA files for concatenation


Concatenating alignments: 100%|██████████████████████| 500/500 [00:00<00:00, 5191.66it/s]


ValueError: Sequences must all be the same length

In [None]:
def run_site_likelihood_analysis(aln , tree , model,  output_prefix = None):
	"""
	Placeholder function for running site likelihood analysis.
	This function should be implemented based on specific requirements.
	"""
	print("Running site likelihood analysis...")
	#raxml command is  --force --evaluate --msa your_alignment.phy --model GTR+G --tree fixed_tree.newick --site-lh
	# Example: assumes alignment and tree files are available for each HOG
	# Example: assumes alignment and tree files are available
	if output_prefix is None:
		output_prefix = "./raxmlng_results/example"
	# Ensure output directory exists
	import os
	if os.path.exists(os.path.dirname(output_prefix)):
		print(f"Output directory {os.path.dirname(output_prefix)} already exists.")
	else:
		print(f"Creating output directory: {os.path.dirname(output_prefix)}")	
		# Create output directory if it doesn't exist
		os.makedirs(os.path.dirname(output_prefix), exist_ok=True)

	cmd = [
		"raxml-ng",
		"--force",
		"--evaluate",
		"--msa", aln,
		"--model", model,
		"--tree", tree,
		"--site-lh",
		"--prefix", output_prefix
	]
	print(f"Running: {' '.join(cmd)}")
	subprocess.run(cmd, check=True)


In [None]:
import os
import subprocess
from pathlib import Path
import pandas as pd
import numpy as np
import re
from Bio import AlignIO

def extract_site_likelihoods(log_file):
	"""
	Extract site-wise log-likelihood values from RAxML-NG output
	
	Parameters:
	-----------
	log_file : str
		Path to the RAxML-NG log file containing site likelihoods
		
	Returns:
	--------
	list of floats: Site log-likelihood values
	"""
	likelihoods = []
	
	with open(log_file, 'r') as f:
		# Skip to the part with site likelihoods
		for line in f:
			if line.startswith('Site '):
				break
				
		# Parse the likelihood values
		for line in f:
			if not line.strip() or line.startswith('Site '):
				continue
			if 'Sum' in line:  # End of site likelihoods section
				break
				
			parts = line.strip().split()
			if len(parts) >= 2:
				try:
					likelihoods.append(float(parts[1]))
				except ValueError:
					continue
	
	return likelihoods

def create_column_likelihood_dataframe(alignment_file, tree_file, log_file,  output_dir=None):
	"""
	Calculate site likelihoods for an alignment and create a DataFrame with
	alignment columns and their corresponding likelihood values
	
	Parameters:
	-----------
	alignment_file : str
		Path to the aligned FASTA file
	tree_file : str
		Path to the tree file in Newick format
	output_dir : str or None
		Directory to store intermediate files (defaults to same directory as alignment)
		
	Returns:
	--------
	pandas.DataFrame: DataFrame with columns for site index, alignment column, and likelihood
	"""
	if output_dir is None:
		output_dir = os.path.dirname(alignment_file)
	
	# Load the alignment
	alignment = AlignIO.read(alignment_file, "fasta")
	# Extract site likelihoods
	likelihoods = extract_site_likelihoods(log_file)
	# Prepare data for DataFrame
	data = []
	for i in range(alignment.get_alignment_length()):
		if i < len(likelihoods):
			column = [record.seq[i] for record in alignment]
			column_str = ''.join(column)
			data.append({
				'Site': i + 1,
				'Alignment_Column': column_str,
				'Log_Likelihood': likelihoods[i]
			})
	# Create DataFrame
	df = pd.DataFrame(data)
	return df


## Align FoldTree2-encoded FASTA Files with MAFFT
This cell will align all encoded FASTA files in a directory using MAFFT, producing aligned FASTA files for downstream benchmarking.

In [None]:
import glob
import os
import subprocess
from pathlib import Path


for fam in 

# Directory containing encoded FASTA files
encoded_dir = './families/encoded_fastas/'  # Change to your directory
os.makedirs(encoded_dir, exist_ok=True)

# Find all encoded FASTA files
encoded_fastas = list(Path(encoded_dir).glob("*.fasta"))
print(f"Found {len(encoded_fastas)} encoded FASTA files.")

# Align each encoded FASTA file using MAFFT via treebuilder's static method
for fasta_file in encoded_fastas:
	aligned_path = str(fasta_file.with_name(f"{fasta_file.stem}.aligned.fasta"))
	tb.run_mafft_textaln(str(fasta_file), outaln=aligned_path, matrix=mafftmat)
	print(f"Aligned {fasta_file} -> {aligned_path}")




In [None]:
#be