In [2]:
import pandas as pd
from dendropy import Tree
import numpy as np
import matplotlib.pyplot as plt

# Dark grey (near-black) hex colour; reused below as the colour for the
# Democratic Republic of the Congo.
COLOR = '#343434'

# Country name -> hex colour. Keys are the country names as spelled in this
# notebook; "Other" is the catch-all entry.
# NOTE(review): the hex values match the Okabe-Ito colour-blind-safe palette --
# confirm that is the intended source before changing any of them.
color_map = {
    "Zambia" : "#009E73", # bluish green
    "Mozambique" : "#56B4E9", # sky blue
    "Nigeria" : "#0072B2",  # blue
    "Cameroon" : "#D55E00", # vermillion (red-orange)
    "Democratic Republic of the Congo" : COLOR, # dark grey (see COLOR above)
    "Uganda" : "#F0E442", # yellow
    "Malawi" : "#E69F00", # orange
    "Other" : "#C8C8C8" # light grey
}


# Calculate phylogenetic diversity per country
Here we calculate the amount of phylogenetic diversity added by the genomic surveillance in each CholGEN member state. 

First, we load in the metadata to identify which tips in the tree are from CholGEN and which aren't.

In [3]:
# Read both supplemental tables, keeping only the columns needed here. Rows
# that come from supplemental_data1.csv are flagged with workshop=True; rows
# from the other table get workshop=False.
frames = list()
for file in ["supplemental_data1.csv", "supplemental_data2.csv"]:
    is_workshop = (file == "supplemental_data1.csv")
    df = pd.read_csv(
        "../../data/" + file,
        usecols=["taxa", "country", "included_analysis"],
    ).assign( workshop=is_workshop )
    frames.append( df )

# Stack the two tables, then keep only rows marked as included in the analysis.
md = pd.concat( frames )
included_mask = md["included_analysis"]
md = md.loc[included_mask].copy()

# taxa label -> workshop flag, for quick per-tip lookups.
workshop_by_taxa = md.set_index( "taxa" )["workshop"]
workshop_dict = workshop_by_taxa.to_dict()

Next, we load the posterior distribution of our phylogenetic reconstruction.

In [None]:
# Open the combined posterior tree file (NEXUS format). The resulting object is
# iterated tree-by-tree in the diversity calculation cell below.
# NOTE(review): `Tree.yield_from_files` presumably returns a one-shot generator,
# so this cell must be re-run before the trees can be iterated a second time.
yielder = Tree.yield_from_files(
    files=["../../beast-analyses/2025-05-28_constant_relaxed.combined.down.trees"],
    schema="nexus",
    preserve_underscores=True,
)

Finally, we perform the calculation of phylogenetic diversity. We iterate through each tree in the posterior and convert branch lengths from units of decimal years to substitutions/site by multiplying each branch by the substitution rate estimated for that branch. For each country, we estimate the phylogenetic diversity as the difference between the total branch length of the tree and the total branch length of a tree in which the CholGEN sequences from that country have been removed. As a null comparison, we repeat the calculation after removing an equal number of sequences drawn at random from all sequences included in the analysis (the `*_random` columns).

In [4]:
# Seed for the random null subsamples, so that the "*_random" columns (and the
# CSV written at the end of this cell) are identical on every run.
RANDOM_SEED = 42
sample_rng = np.random.RandomState( RANDOM_SEED )

countries = ["Nigeria", "Cameroon", "Democratic Republic of the Congo", "Uganda", "Zambia", "Malawi", "Mozambique"]

# Output columns: the tree index, the total tree length, and for every country
# the observed value plus its random-subsample counterpart. Derived from
# `countries` so the two can never drift apart.
columns = ["tree", "total_branch_length"]
for country in countries:
	columns += [country, f"{country}_random"]
results = {col : [] for col in columns}

# Taxa labels of the workshop-flagged sequences from each country.
country_dict = {
	country : md.loc[(md["country"]==country)&md["workshop"],"taxa"].to_list()
	for country in countries
}


def _sum_edge_lengths( tree ):
	"""Return the sum of all non-empty edge lengths of `tree`."""
	return np.sum( [edge.length for edge in tree.preorder_edge_iter() if edge.length] )


for tree_idx, tree in enumerate( yielder ):
	# Rescale every branch in place by its "rate" annotation and accumulate the
	# rescaled lengths. Nodes without an edge length are skipped.
	scaled_lengths = list()
	for node in tree.preorder_node_iter():
		if node.edge_length is None:
			continue
		node.edge_length *= float( node.annotations.get_value( "rate" ) )
		scaled_lengths.append( node.edge_length )

	total_branch_length = np.sum( scaled_lengths )

	for country in countries:
		country_taxa = country_dict[country]

		# Branch length lost when this country's workshop sequences are removed.
		workshop_tree = tree.extract_tree_without_taxa_labels( country_taxa )
		workshop_bl = total_branch_length - _sum_edge_lengths( workshop_tree )

		# Null comparison: remove the same number of taxa, drawn without
		# replacement from all rows of `md`, using the seeded generator.
		random_sample = md.sample( n=len( country_taxa ), replace=False, random_state=sample_rng )["taxa"].to_list()
		random_tree = tree.extract_tree_without_taxa_labels( random_sample )
		random_bl = total_branch_length - _sum_edge_lengths( random_tree )

		results[country].append( workshop_bl )
		results[f"{country}_random"].append( random_bl )

	results["tree"].append( tree_idx )
	results["total_branch_length"].append( total_branch_length )

	if tree_idx % 100 == 0:
		print( f"Completed {tree_idx} trees..." )

# Refuse to overwrite the CSV with an empty table when no trees were processed.
if len( results["tree"] ) == 0:
	raise RuntimeError(
		"No trees were read from `yielder`. If it was already consumed by an "
		"earlier run of this cell, re-create it before running this cell again."
	)

results = pd.DataFrame( results )
results.to_csv( "subs-site-per-country.csv", index=False )

Completed 0 trees...
Completed 100 trees...
Completed 200 trees...
Completed 300 trees...
Completed 400 trees...
Completed 500 trees...
Completed 600 trees...
Completed 700 trees...
