In [1]:
from io import StringIO
from Bio import Phylo
from openpyxl import load_workbook
import re
import numpy as np
import os
import shutil

In [2]:

def import_real_tree(filename):
    """Read the first worksheet of an Excel workbook into a list of rows.

    Parameters
    ----------
    filename : path to the workbook, passed straight to
        ``openpyxl.load_workbook``.

    Returns
    -------
    list of list
        One inner list per worksheet row, holding the ``.value`` of every
        cell in that row, in sheet order.
    """
    first_sheet = load_workbook(filename).worksheets[0]

    # Keep only the cell values; the openpyxl cell objects are discarded.
    return [[cell.value for cell in sheet_row] for sheet_row in first_sheet.rows]

In [3]:
def prune(tree):
    """Strip internal-node labels from a Newick string.

    Any label that directly follows a closing parenthesis and is made of
    letters, digits, dots or underscores is removed when it is followed by
    ``:`` (an internal node with a branch length) or by ``;`` (the root), e.g.
    ``...)NAME_OF_NODE_0001:0.1...`` becomes ``...):0.1...``. Leaf names are
    left untouched, giving the standard format with only leaf names.

    Parameters
    ----------
    tree : str
        Newick text.

    Returns
    -------
    str
        The Newick text without those internal/root labels.
    """
    # Applied in order: first labelled internal nodes, then the labelled root.
    substitutions = (
        (r'\)\s*[A-Za-z0-9\._]+:', '):'),
        (r'\)\s*[A-Za-z0-9\._]+;', ');'),
    )

    pruned = tree
    for pattern, replacement in substitutions:
        pruned = re.sub(pattern, replacement, pruned)

    return pruned

In [4]:
def calculate_branch_length(tree_file):
    """Return the mean and standard deviation of a tree's branch lengths.

    Parameters
    ----------
    tree_file : str or tree object
        A Newick string (parsed here with ``Phylo.read``) or an already
        parsed object exposing ``find_clades()``.

    Returns
    -------
    tuple
        ``(mean, std)`` computed with ``np.mean`` / ``np.std`` over every
        clade whose ``branch_length`` is not ``None``; ``(None, None)`` when
        no clade carries a branch length.
    """
    # Strings are parsed; anything else is used as the tree directly.
    tree = Phylo.read(StringIO(tree_file), 'newick') if isinstance(tree_file, str) else tree_file

    lengths = []
    for clade in tree.find_clades():
        # Clades without a length are left out of the statistics.
        if clade.branch_length is not None:
            lengths.append(clade.branch_length)

    if len(lengths) == 0:
        return None, None

    return np.mean(lengths), np.std(lengths)

In [None]:
# Select up to MAX_TREES_PER_BIN trees per mean-branch-length bin and copy the
# matching '<species>.clean.fasta' file into the output folder, prefixing the
# copy with the tree's mean branch length (4 decimals, '.' replaced by '-').

file_real_trees = r'<path_to_excel_file>\file.xlsx'
directory_output = r'<path_to_output_folder>'
directory_fasta = r'<path_to_fasta_folder>'

# Selection criteria (previously inline magic numbers).
MAX_TREES_PER_BIN = 5
MIN_LEAF_COUNT = 10
MAX_LEAF_COUNT = 40

real_tree_lists = import_real_tree(file_real_trees)

# Drop the header row when there is one. The extra guards avoid crashing on an
# empty sheet or on a first cell that is not text.
if real_tree_lists and isinstance(real_tree_lists[0][0], str) and real_tree_lists[0][0].startswith('Column'):
    real_tree_lists.pop(0)


# 60 evenly spaced edges over [0, 3] -> 59 half-open bins [edge_i, edge_i+1).
# NOTE(review): this gives a bin width of 3/59 (~0.0508), not 0.05 - confirm
# that this is the intended width.
number_list = np.linspace(0, 3, 60)

# Parse and measure every tree once. None of this depends on the bin, so doing
# it inside the bin loop repeated the same work for each of the 59 bins.
tree_summaries = []
for j, row in enumerate(real_tree_lists):

    # Column 0 is used as the species / file stem, column 3 as the Newick text.
    species = row[0]
    tree = Phylo.read(StringIO(prune(row[3])), "newick")

    leaf_count = sum(1 for clade in tree.find_clades() if clade.is_terminal())
    average_branch_length, std_branch_length = calculate_branch_length(tree)

    # calculate_branch_length returns (None, None) for a tree without branch
    # lengths; such a tree cannot fall into any bin, and comparing None with a
    # float would raise TypeError.
    if average_branch_length is None:
        continue

    tree_summaries.append((j, species, leaf_count, average_branch_length))

for i in range(len(number_list)-1):

    lower_edge, upper_edge = number_list[i], number_list[i+1]
    counter = 0
    print('------------------------------------')
    print(lower_edge, upper_edge)

    for j, species, leaf_count, average_branch_length in tree_summaries:

        if not (MIN_LEAF_COUNT <= leaf_count <= MAX_LEAF_COUNT):
            continue
        if not (lower_edge <= average_branch_length < upper_edge):
            continue

        print(j+1, f"Number of leaf nodes: {leaf_count}")
        print('Species', species)
        branch_formatted = f"{average_branch_length:.4f}".replace('.', '-')

        # Check the expected file directly instead of scanning the directory.
        # With the scan, a missing FASTA was skipped silently and the
        # "not found" message could never be printed.
        filename = species + '.clean.fasta'
        src_path = os.path.join(directory_fasta, filename)
        if os.path.exists(src_path):
            dst_filename = f"{branch_formatted}_{filename}"
            dst_path = os.path.join(directory_output, dst_filename)
            shutil.copy(src_path, dst_path)
        else:
            print(f"Source file not found: {src_path}")

        # A selected tree counts towards the bin quota whether or not its
        # FASTA file was found (unchanged from the original behaviour).
        counter += 1
        if counter == MAX_TREES_PER_BIN:
            break

------------------------------------
0.0 0.05084745762711865
9424 Number of leaf nodes: 10
Species Phy008O8DJ_PUCGT
------------------------------------
0.05084745762711865 0.1016949152542373
264 Number of leaf nodes: 14
Species Phy008NZK8_PUCGT
327 Number of leaf nodes: 14
Species Phy008O426_PUCGT
796 Number of leaf nodes: 40
Species Phy008OAEB_PUCGT
1776 Number of leaf nodes: 38
Species Phy008O4K3_PUCGT
1869 Number of leaf nodes: 12
Species Phy008O2XD_PUCGT
------------------------------------
0.1016949152542373 0.15254237288135594
265 Number of leaf nodes: 19
Species Phy008O5P7_PUCGT
318 Number of leaf nodes: 11
Species Phy008NZS8_PUCGT
428 Number of leaf nodes: 22
Species Phy008O3TS_PUCGT
440 Number of leaf nodes: 13
Species Phy008O938_PUCGT
719 Number of leaf nodes: 17
Species Phy008O4C2_PUCGT
------------------------------------
0.15254237288135594 0.2033898305084746
1 Number of leaf nodes: 18
Species Phy008O9U2_PUCGT
75 Number of leaf nodes: 31
Species Phy008O3QR_PUCGT
105 Numbe