In [1]:
import pandas as pd
from dendropy import Tree

This is a simple script to extract the estimated substitution rates found along branches that descend from multiple MRCAs.

First, we load the tree.

In [2]:
# Read the MCC tree from the BEAST analysis; underscores in taxon labels are
# kept as-is (the `taxa` labels in the metadata contain underscores).
tree = Tree.get(
    path="../../beast-analyses/2024-08-06_constant_relaxed.mcc.tree",
    schema="nexus",
    preserve_underscores=True,
)

Next, we load the metadata. We will be assessing how substitution rate is associated with lineage so we extract the `te` column specifically.

In [12]:
# Read each supplemental table separately, tagging every row with whether it
# came from the first file, then combine the pieces into one frame.
md_parts = []
for file in ["supplemental_data1.csv", "supplemental_data2.csv"]:
    df = pd.read_csv(
        "../../data/" + file,
        usecols=["taxa", "te", "included_analysis"],
    ).assign( workshop=(file == "supplemental_data1.csv") )
    md_parts.append( df )

# Use `included_analysis` as a row mask, then drop exact duplicate rows.
md = pd.concat( md_parts )
md = md.loc[md["included_analysis"]].drop_duplicates()
md.head()

Unnamed: 0,taxa,te,included_analysis,workshop
0,Africa|CMR|CMR_CEN005NA35|T12|2023-04-12,T12,True,True
1,Africa|CMR|CMR_CEN032BM-B48|T12|2023-05-04,T12,True,True
2,Africa|CMR|CMR_E05557442|T12|2023-05-12,T12,True,True
3,Africa|CMR|CMR_CEN030DA43|T12|2023-05-03,T12,True,True
4,Africa|CMR|CMR_CEN027NV44|T12|2023-05-01,T12,True,True


Lastly, we iterate through the third-wave 7PET lineages, identify the MRCA of each, and collect the substitution rates on all branches that descend from that MRCA.

In [13]:
# For every lineage, find the MRCA of its taxa in `tree` and record one row per
# node of the subtree rooted at that MRCA: the lineage, whether the node is
# internal, the node's rate annotation, and the length of its branch.
# The table is written to "lineage_rates.csv".
rate_columns = {
    "lineage" : [],
    "internal" : [],
    "median_rate" : [],
    "branch_length" : []
}

for lineage in ["T9", "T10", "T11", "T12", "T13", "T15"]:
    representatives = md.loc[md["te"]==lineage,"taxa"].to_list()
    # Disabled alternative: define the T13 MRCA from two hand-picked tips.
    #if lineage == "T13":
    #    representatives = ["Africa|KEN|KEN-2007-008|T13|2007-01-01", "Asia|YEM|ERR2269621|T13|2017-01-01"]
    lineage_mrca = tree.mrca( taxon_labels=representatives )
    print( f"{lineage}: {lineage_mrca.distance_from_tip()}")
    # NOTE(review): the iteration yields `lineage_mrca` itself as well, so the
    # branch leading into the MRCA is recorded too -- confirm this is intended.
    for node in lineage_mrca.postorder_iter():
        # Use the read-only `get_value()` lookup. Indexing `annotations[name]`
        # appears to insert an empty annotation into the node when the name is
        # missing (hence the original `== ""` check), which mutates the tree.
        rate = node.annotations.get_value( "rate_median" )
        if rate is None or rate == "":
            # Fall back to the plain `rate` annotation when no median is present.
            rate = node.annotations.get_value( "rate" )
        if rate is None or rate == "":
            raise ValueError(
                f"A node in lineage {lineage} has neither a 'rate_median' nor a 'rate' annotation"
            )
        rate_columns["lineage"].append( lineage )
        rate_columns["internal"].append( node.is_internal() )
        rate_columns["median_rate"].append( float( rate ) )
        rate_columns["branch_length"].append( node.edge_length )

results = pd.DataFrame( rate_columns )
results.to_csv( "lineage_rates.csv", index=False )
results.head()

T9: 24.353003133916896
T10: 28.91582267614507
T11: 23.798626897544487
T12: 16.105707519851478
T13: 18.02554753876225
T15: 5.3616885438717565


Unnamed: 0,lineage,internal,median_rate,branch_length
0,T9,False,2.967276e-07,3.01049
1,T9,False,8.649867e-07,3.01049
2,T9,True,2.763022e-07,0.556136
3,T9,False,1.712785e-07,2.520708
4,T9,False,2.896827e-07,3.520708
