In [6]:
import phylustrator.ale_parser as alep

# Parse one ALE reconciliation file with the packaged parser and print a short
# summary of what was loaded.
# Replace with your actual file path
ale_file = "../data/K00319.ale.uml_rec"
data = alep.parse_ale_file(ale_file)

# len() on an ete3 tree reports the number of leaves only, so walk the whole
# tree to report the number of nodes the message promises.
# NOTE(review): assumes data.species_tree is an ete3 tree — confirm against alep.
print(f"Loaded Species Tree with {len(list(data.species_tree.traverse()))} nodes.")
print(f"Log Likelihood: {data.log_likelihood}")
print(f"Rates: {data.rates}")
print(f"Found {len(data.gene_trees)} reconciled gene trees.")
print("\nFirst 5 rows of Branch Stats:")
# Show the table announced by the heading above (display is provided by the
# notebook kernel); previously the heading was printed with nothing after it.
display(data.branch_stats.head())

ValueError: could not convert string to float: 'ARC_0001'

In [4]:
import pandas as pd
import ete3
import re
from dataclasses import dataclass, field
from typing import List, Dict, Optional

@dataclass
class ALEData:
    """
    Stores parsed information from an ALE output file (.uml_rec).

    Instances are produced by ``parse_ale_file``; each field holds one section
    of the parsed file. Field order is part of the constructor signature.
    """
    # Newick text of the species tree exactly as extracted from the file.
    species_tree_newick: str
    # The same Newick text loaded into an ete3 tree (the parser uses format=1).
    species_tree: ete3.Tree
    # Value parsed from the ">logl" line; the parser leaves it at 0.0 when
    # no such line is found.
    log_likelihood: float
    # Keys written by the parser: "Duplications", "Transfers", "Losses"
    # (empty dict when the file has no "ML" line).
    rates: Dict[str, float]
    # Raw lines found between the "reconciled G-s:" marker and the next
    # "# of" header, stored as strings (expected to be Newick).
    gene_trees: List[str]
    # Keys written by the parser: "Duplications", "Transfers", "Losses",
    # "Speciations".
    totals: Dict[str, float]
    # One row per table line. Columns written by the parser:
    # [S_node, name, Duplications, Transfers, Losses, Originations, copies];
    # S_node and name are kept as strings, the rest are floats.
    branch_stats: pd.DataFrame

def parse_ale_file(filepath: str) -> ALEData:
    """
    Parses an ALE .uml_rec file and returns an ALEData object.

    Blank lines are discarded and the remaining lines are scanned top to
    bottom for the following sections, in this order:

    1. the species tree: the first line starting with "S:" (falls back to the
       second non-blank line when no such line exists),
    2. the ">logl" line and the "ML" rates line,
    3. the gene trees listed after the "reconciled G-s:" marker,
    4. the totals row that follows the first "# of" header after the trees,
    5. the per-branch table that follows the header containing "S_node" or
       both "Duplications" and "copies".

    Args:
        filepath: Path of the .uml_rec file to read.

    Returns:
        ALEData holding the species tree (Newick text and ete3.Tree), the log
        likelihood (0.0 when no ">logl" line is present), the rates (empty
        dict when no "ML" line is present), the gene-tree strings, the totals
        and the branch table. The branch table always has the columns S_node,
        name, Duplications, Transfers, Losses, Originations and copies, even
        when the table is missing or empty.

    Raises:
        ValueError: If a required section is missing or truncated, or if a
            numeric field cannot be converted to float.
    """
    with open(filepath, 'r') as f:
        lines = [raw.strip() for raw in f if raw.strip()]
    n_lines = len(lines)

    # 1. Parse Reference Species Tree
    # Prefer the explicit "S:" marker over a fixed position so that an extra
    # or missing header line does not shift the lookup.
    tree_source = next((ln for ln in lines if ln.startswith("S:")), None)
    if tree_source is None:
        if n_lines < 2:
            raise ValueError(f"{filepath}: no species tree line found")
        tree_source = lines[1]
    tree_line = tree_source.split("\t")[-1]
    # We load it with format=1 to ensure we keep internal node names/IDs.
    species_tree = ete3.Tree(tree_line, format=1)

    # 2. Parse Rates & Likelihood (everything up to the gene-tree marker)
    logl = 0.0
    rates = {}
    marker_idx = None
    for idx in range(1, n_lines):
        line = lines[idx]

        if line.startswith(">logl"):
            # Example: >logl: -1136.19
            _, colon, value = line.partition(":")
            if not colon:
                raise ValueError(f"{filepath}: malformed log-likelihood line: {line!r}")
            logl = float(value.strip())

        elif line.startswith("ML"):
            # Example: ML      0.154163        0.0712109       0.643975
            parts = line.split()
            if len(parts) < 4:
                raise ValueError(f"{filepath}: malformed ML rates line: {line!r}")
            # parts[0] is 'ML'
            rates = {
                "Duplications": float(parts[1]),
                "Transfers": float(parts[2]),
                "Losses": float(parts[3])
            }

        elif "reconciled G-s:" in line:
            # Example: 100 reconciled G-s:
            marker_idx = idx
            break

    if marker_idx is None:
        raise ValueError(f"{filepath}: 'reconciled G-s:' marker not found")

    # 3. Parse Reconciled Gene Trees
    # Every line after the marker is a gene tree until the "# of" totals header.
    gene_trees = []
    totals_header_idx = None
    for idx in range(marker_idx + 1, n_lines):
        if lines[idx].startswith("# of"):
            totals_header_idx = idx
            break
        gene_trees.append(lines[idx])

    if totals_header_idx is None:
        raise ValueError(f"{filepath}: '# of' totals header not found after gene trees")

    # 4. Parse Totals (the row immediately after the header)
    totals_idx = totals_header_idx + 1
    if totals_idx >= n_lines:
        raise ValueError(f"{filepath}: totals row missing after '# of' header")
    t_parts = lines[totals_idx].split()
    if len(t_parts) < 5:
        raise ValueError(f"{filepath}: malformed totals row: {lines[totals_idx]!r}")
    # t_parts[0] is the row label
    totals = {
        "Duplications": float(t_parts[1]),
        "Transfers": float(t_parts[2]),
        "Losses": float(t_parts[3]),
        "Speciations": float(t_parts[4])
    }

    # 5. Parse Branch Stats Table
    columns = ["S_node", "name", "Duplications", "Transfers",
               "Losses", "Originations", "copies"]
    table_start_index = None
    for idx in range(totals_idx, n_lines):
        header = lines[idx]
        if "S_node" in header or ("Duplications" in header and "copies" in header):
            table_start_index = idx + 1
            break

    data_rows = []
    if table_start_index is not None:
        for line in lines[table_start_index:]:
            # ALE table rows are tab or space separated
            cols = line.split()
            if len(cols) < len(columns):
                raise ValueError(f"{filepath}: malformed branch table row: {line!r}")

            # The first two columns are kept as str to match tree node names
            # safely; the next five are numeric. Extra trailing columns are
            # ignored.
            data_rows.append({
                "S_node": cols[0],
                "name": cols[1],
                "Duplications": float(cols[2]),
                "Transfers": float(cols[3]),
                "Losses": float(cols[4]),
                "Originations": float(cols[5]),
                "copies": float(cols[6])
            })

    # Passing columns explicitly keeps the schema stable for an empty table.
    df = pd.DataFrame(data_rows, columns=columns)

    return ALEData(
        species_tree_newick=tree_line,
        species_tree=species_tree,
        log_likelihood=logl,
        rates=rates,
        gene_trees=gene_trees,
        totals=totals,
        branch_stats=df
    )


In [8]:
# Parse the same file again, this time with the parse_ale_file defined in this
# notebook rather than the packaged alep.parse_ale_file.
ale_file = "../data/K00319.ale.uml_rec"
ale_data = parse_ale_file(ale_file)

In [9]:
# Bare last expression: the notebook shows the dataclass repr of the parsed
# result. The repr includes the full species-tree Newick string, so the output
# is long.
ale_data

ALEData(species_tree_newick='(((((((((((BAC_0021:1,BAC_1907:1)3172:1,((BAC_0097:1,BAC_2136:1)3227:1,BAC_2107:1)4165:1)4307:1,BAC_0128:1)4753:1,BAC_1172:1)4988:1,(((BAC_0285:1,BAC_1158:1)3343:1,(((BAC_0408:1,BAC_1498:1)3422:1,BAC_1744:1)4060:1,BAC_1294:1)4634:1)4910:1,BAC_0598:1)5085:1)5144:1,(((BAC_0374:1,BAC_1770:1)3401:1,(BAC_1324:1,BAC_1493:1)3902:1)4378:1,((BAC_0819:1,BAC_1714:1)3657:1,((((BAC_0934:1,BAC_1265:1)3724:1,(BAC_0936:1,BAC_1167:1)3725:1)4504:1,BAC_1297:1)4841:1,BAC_1125:1)5041:1)5182:1)5269:1)5333:1,(((BAC_0058:1,BAC_1169:1)3201:1,BAC_1484:1)3958:1,((BAC_0920:1,BAC_1097:1)3715:1,BAC_1671:1)4033:1)4592:1)5384:1,(((((BAC_0146:1,BAC_1271:1)3258:1,(((BAC_0903:1,BAC_0916:1)3705:1,BAC_1149:1)3818:1,((BAC_0914:1,BAC_0987:1)3711:1,BAC_1127:1)3811:1)4532:1)4854:1,(BAC_0664:1,BAC_1845:1)3574:1)5051:1,(((((((((((BAC_0216:1,(((BAC_0244:1,(BAC_0420:1,BAC_0828:1)3429:1)4388:1,(((BAC_0272:1,((((BAC_0273:1,BAC_0632:1)3336:1,(((BAC_0350:1,BAC_0836:1)3384:1,BAC_0681:1)3583:1,(BAC_0367:1,B