In [None]:
import pandas as pd
import json
from dendropy import Tree

# Extracting substitutions
This notebook is used to extract the nucleotide substitutions that occur along the branches of each lineage from the JSON file generated by `augur ancestral`.

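First, we load the time-resolved tree and the metadata for all sequences included in the analysis. We need the metadata to get the lineage assignments of the taxa in the tree, so only metadata rows whose taxon is present in the tree are kept.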

In [None]:
# Read the Newick tree; preserve_underscores keeps underscores in taxon labels
# instead of converting them to spaces. The tree is then flagged as rooted.
tree = Tree.get(
    path="../../nextstrain/time.tree",
    schema="newick",
    preserve_underscores=True,
)
tree.is_rooted = True
taxa = [taxon.label for taxon in tree.taxon_namespace]

# Read both supplemental tables (taxon name and "te" lineage column only),
# flagging rows that come from supplemental_data1.csv in a "workshop" column.
md = pd.concat( [
    pd.read_csv( "../../data/" + name, usecols=["taxa", "te"] ).assign(
        workshop=(name == "supplemental_data1.csv")
    )
    for name in ["supplemental_data1.csv", "supplemental_data2.csv"]
] )

# Keep only the sequences that are tips of the tree.
md = md.loc[md["taxa"].isin( taxa )].copy()
md.head()

Next, we load the JSON file generated by `augur ancestral`. The complete description of this file can be found on the [augur website](https://docs.nextstrain.org/projects/augur/en/stable/usage/cli/translate.html#example-node-data-json).

In [None]:
# Parse the node-data JSON into a plain dict; per-node entries are read from
# muts["nodes"] further down.
with open( "../../nextstrain/nt-muts.json" ) as handle:
    muts = json.load( handle )

The parsed JSON file contains a dictionary (`nodes`) in which the name of each node in the phylogeny is linked to an entry whose `muts` field lists the mutations that occur on the branch immediately ancestral to that node. For each lineage, we find the most recent common ancestor of its representatives, iterate through the nodes descending from it, and record the mutations that occur on those nodes. Mutations on all remaining nodes of the tree are recorded with the lineage label `background`. The extracted mutations are saved in a CSV file called `substitutions.csv`.

In [None]:
# Lineages whose clades are scanned; mutations on every node outside these
# clades are reported under the lineage label "background".
lineages = ["T9", "T10", "T11", "T12", "T13", "T15"]

# Taxa ignored when locating a lineage's MRCA.
# NOTE(review): presumably an outlier T13 sequence that would otherwise move
# the T13 MRCA -- confirm the reason for the exclusion.
EXCLUDED_REPRESENTATIVES = {"Africa|KEN|KEN-2007-002|T13|2007-01-01"}


def node_label( node ):
    """Return the key used for `node` in the JSON: the node label for internal
    nodes, the taxon label for tips."""
    return node.label if node.is_internal() else node.taxon.label


def record_mutations( table, node, label, lineage ):
    """Append one row to `table` for each mutation listed for `label`.

    `table` is a dict of column lists ("mutation", "node", "internal",
    "lineage"). Nodes whose JSON entry is empty contribute no rows; a node
    that is missing from the JSON altogether raises KeyError.
    """
    entry = muts["nodes"][label]
    if not entry:
        return
    for mut in entry["muts"]:
        table["mutation"].append( mut )
        table["node"].append( label )
        table["internal"].append( node.is_internal() )
        table["lineage"].append( lineage )


records = {
    "mutation" : [],
    "node" : [],
    "internal" : [],
    "lineage" : []
}

# A set gives constant-time membership tests in the background pass below.
visited_nodes = set()
for lin in lineages:
    representatives = [
        taxon for taxon in md.loc[md["te"]==lin,"taxa"].to_list()
        if taxon not in EXCLUDED_REPRESENTATIVES
    ]
    lineage_mrca = tree.mrca( taxon_labels=representatives )
    visited_nodes.add( lineage_mrca.label )
    print( f"{lin}: {lineage_mrca.distance_from_tip()} {lineage_mrca.label}")
    # Walk the MRCA and everything below it.
    for node in lineage_mrca.postorder_iter():
        label = node_label( node )
        visited_nodes.add( label )
        record_mutations( records, node, label, lin )

# Second pass: every node not reached from a lineage MRCA is background.
for node in tree.postorder_node_iter():
    label = node_label( node )
    if label in visited_nodes:
        continue
    record_mutations( records, node, label, "background" )

results = pd.DataFrame( records )
# Split each mutation string into leading non-digit part, numeric position and
# trailing non-digit part, e.g. "A123G" -> ("A", "123", "G").
results[["ref", "position", "sub"]] = results["mutation"].str.extract( r"([^\d\s]+)(\d+)([^\d\s]+)" )
results["position"] = pd.to_numeric( results["position"] )
results["sub_type"] = results["ref"] + results["sub"]
results.to_csv( "substitutions.csv", index=False )