# Structural Phylogenetic

In [1]:
import shutil
from pathlib import Path


def get_paths(protcode="BLDB"):
    paths, names = [], []

    ref_path = Path(f"../../data/{protcode}_ref.csv")
    ref_meta = pd.read_csv(ref_path)
    for entry, asym in zip(list(ref_meta["Entry ID"]), list(ref_meta["Auth Asym ID"])):
        p = Path(f"../../data/{protcode}/processed/{entry}_{asym}.pdb")
        if p.exists():
            paths.append(p)
            names.append(f"{protcode}_{p.stem}")

    for p in Path(f"../../data/{protcode}/generated/BQS").glob("*.pdb"):
        paths.append(p)
        names.append(f"{protcode}_{p.stem}")

    return paths, names

### Structural Phylogenetic Tree (Q-score)

In [2]:
from src.phylogenetics import qs_phylogenetic_tree

bldb_paths, bldb_names = get_paths("BLDB")
cytc_paths, cytc_names = get_paths("CYTC")
gfp_paths, gfp_names = get_paths("GFP")
ras_paths, ras_names = get_paths("RAS")

phylo_tree = qs_phylogenetic_tree(
    pdb_list=bldb_paths + cytc_paths + gfp_paths + ras_paths,
    names=bldb_names + cytc_names + gfp_names + ras_names
)

100%|██████████| 903840/903840 [1:33:44<00:00, 160.70it/s]  


In [3]:
print(phylo_tree["newick"])

WD = "wd"
Path(WD).mkdir(parents=True, exist_ok=True)
with open(f'{WD}/qs-tree.all.newick', 'w', encoding='utf-8') as f:
    f.write(phylo_tree["newick"])

(((GFP_5YR2_A:0.33824,(((GFP_6OG8_B:0.04259,(GFP_4ZF3_A:0.03075,(GFP_6OG9_B:0.01469,((GFP_6UN6_A:0.00804,GFP_6OFN_A:0.00804)Inner414:0.00463,((GFP_4ZF4_A:0.00755,GFP_6OFL_A:0.00755)Inner402:0.00296,GFP_6OFK_A:0.01050)Inner487:0.00217)Inner535:0.00419)Inner580:0.01606)Inner785:0.01184)Inner865:0.00994,GFP_3P28_A:0.05253)Inner921:0.26057,(((GFP_FM_20:0.20713,GFP_FM_2:0.20713)Inner1304:0.00909,GFP_FM_40:0.21622)Inner1307:0.03265,((GFP_6OFO_A:0.10847,GFP_6B7R_B:0.10847)Inner1175:0.09422,(((GFP_SM_12:0.11420,GFP_FM_10:0.11420)Inner1198:0.04065,((((GFP_FM_43:0.09675,(GFP_FM_6:0.08415,GFP_FM_15:0.08415)Inner1076:0.01260)Inner1126:0.02776,GFP_FM_17:0.12450)Inner1225:0.01982,((GFP_FM_24:0.13532,GFP_FM_37:0.13532)Inner1244:0.00612,(GFP_FM_36:0.12612,(GFP_SM_24:0.10697,((GFP_SM_42:0.09731,GFP_SM_18:0.09731)Inner1128:0.00363,((GFP_FM_1:0.07713,GFP_SM_36:0.07713)Inner1041:0.01054,(GFP_FM_21:0.08190,GFP_FM_41:0.08190)Inner1062:0.00577)Inner1092:0.01904)Inner1147:0.00967)Inner1172:0.01914)Inner1227:0

Use Tree Visualization Tools (e.g., https://itol.embl.de/) to visualize the newick format tree.

### Structural Phylogenetic Tree (3Di)

Prepare zipped PDB files for calculation.

In [None]:
WD = Path("./wd/3Di")
WD.mkdir(parents=True, exist_ok=True)


def prepare(protcode, wd=WD):
    for path, name in zip(*get_paths(protcode)):
        shutil.copy(path, wd / f"{name}.pdb")


prepare("BLDB")
prepare("CYTC")
prepare("GFP")
prepare("RAS")

shutil.make_archive(WD.parent / "pdbs", "zip", WD)

Upload the zipped file to FoldTree.
See more at https://github.com/DessimozLab/fold_tree.

### Phylogenetic Tree Summarization

Make sure you have installed Dendropy.

https://jeetsukumaran.github.io/DendroPy/

In [5]:
from src.phylogenetics import normalize

WD = Path("wd")
assert WD.exists()

normalize(input_path=WD / "qs-tree.all.newick", output_path=WD / "qs-tree.all.norm.newick")
normalize(input_path=WD / "3di.foldtree.newick", output_path=WD / "3di.foldtree.norm.newick")

More about SumTrees.

https://jeetsukumaran.github.io/DendroPy/programs/sumtrees.html

In [6]:
!cd {WD} && sumtrees --output=sum.norm.newick --output-tree-format newick --set-edges mean-length --suppress-annotations qs-tree.all.norm.newick 3di.foldtree.norm.newick

|                                 SumTrees                                 |
|                     Phylogenetic Tree Summarization                      |
|                              Version 5.0.1                               |
|                   By Jeet Sukumaran and Mark T. Holder                   |
|                         Using: DendroPy 5.0.1 ()                         |
+--------------------------------------------------------------------------+
|                                 Citation                                 |
|                                 ~~~~~~~~                                 |
| If any stage of your work or analyses relies on code or programs from    |
| this library, either directly or indirectly (e.g., through usage of your |
| own or third-party programs, pipelines, or toolkits which use, rely on,  |
| incorporate, or are otherwise primarily derivative of code/programs in   |
| this library), please cite:                                   

### Phylogenetic Tree Visualization

In [11]:
from ete4.smartview import TreeLayout
from ete4 import Tree
from pathlib import Path
import src.colorscheme as color
import pandas as pd

WD = Path("wd")
tree = Tree(open(str(WD / "sum.norm.newick")))
bldb = pd.read_csv(Path("../../data/BLDB_raw.csv"))

for node in tree.traverse():
    node.support = None


def get_class(pdb):
    result = bldb[bldb['PDB'] == pdb]['Ambler Class']
    return None if result.empty else result.values[0]


def find_bldb_nodes(ambler):
    nodes = []
    for n in tree.traverse():
        if not n.is_leaf:
            continue
        ids = n.name.split("_")
        cls, mtd = ids[0], ids[1]
        if cls != "BLDB" or mtd in ["FM", "SM"]:
            continue
        pdb = mtd
        if ambler == get_class(pdb):
            nodes.append(n)
    return nodes


bldb_A = tree.common_ancestor(find_bldb_nodes("A"))
bldb_B = tree.common_ancestor(find_bldb_nodes("B1") + find_bldb_nodes("B2") + find_bldb_nodes("B3"))
bldb_C = tree.common_ancestor(find_bldb_nodes("C"))
bldb_D = tree.common_ancestor(find_bldb_nodes("D"))


def ns(n):
    n.sm_style["hz_line_width"] = 1.5
    n.sm_style["vt_line_width"] = 1.5
    n.sm_style['hz_line_color'] = color.CS_BG_DARK

    if n == bldb_A:
        nst = NodeStyle()
        nst["bgcolor"] = "#50184E"
        n.set_style(nst)
    elif n == bldb_B:
        nst = NodeStyle()
        nst["bgcolor"] = "#20364F"
        n.set_style(nst)
    elif n == bldb_D:
        nst = NodeStyle()
        nst["bgcolor"] = "#4E9280"
        n.set_style(nst)

    if n.is_leaf:
        ids = n.name.split("_")
        cls, mtd = ids[0], ids[1]
        if mtd == "SM":
            n.sm_style['size'] = 3
            n.sm_style['fgcolor'] = color.CS_FG_A
            n.sm_style["hz_line_width"] = 2
            n.sm_style['hz_line_color'] = color.CS_FG_A
        elif mtd == "FM":
            n.sm_style['size'] = 3
            n.sm_style['fgcolor'] = color.CS_FG_B
            n.sm_style["hz_line_width"] = 2
            n.sm_style['hz_line_color'] = color.CS_FG_B


tree.explore(
    keep_server=True,
    layouts=[
        TreeLayout(
            name="SUM_TREE",
            ns=ns,
            # ts=ts,
            active=True,
            aligned_faces=True
        )
    ])

TreeError: No common ancestor for nodes: []

In [None]:
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D

# Legends
fig, ax = plt.subplots(figsize=(2, 2), dpi=200)
plt.legend(
    handles=[
        Line2D([0], [0],
               label="SM", marker="o", markerfacecolor=color.CS_FG_A,
               markeredgewidth=0, markersize=10, linestyle=""),
        Line2D([0], [0],
               label="FM", marker="o", markerfacecolor=color.CS_FG_B,
               markeredgewidth=0, markersize=10, linestyle=""),
        Line2D([0], [0],
               label="exp.", color=color.CS_BG_DARK, linestyle="-"),
    ],
    # loc='lower right',
    # bbox_to_anchor=(1.28, 0)
)
plt.grid(False)
plt.xticks([])
plt.yticks([])
plt.show()

### Phylogenetic Tree Visualization (Q-score)

In [5]:
from ete4.smartview import TreeLayout
from ete4 import Tree
from pathlib import Path

WD = Path("wd")
tree = Tree(open(str(WD / "qs-tree.all.formatted.newick")))

for node in tree.traverse():
    node.support = None


def ns(n):
    n.sm_style["hz_line_width"] = 1.5
    n.sm_style["vt_line_width"] = 1.5
    n.sm_style['hz_line_color'] = color.CS_BG_DARK

    if n.is_leaf:
        ids = n.name.split("_")
        cls, mtd = ids[0], ids[1]
        if mtd == "SM":
            n.sm_style['size'] = 3
            n.sm_style['fgcolor'] = color.CS_FG_A
            n.sm_style["hz_line_width"] = 2
            n.sm_style['hz_line_color'] = color.CS_FG_A
        elif mtd == "FM":
            n.sm_style['size'] = 3
            n.sm_style['fgcolor'] = color.CS_FG_B
            n.sm_style["hz_line_width"] = 2
            n.sm_style['hz_line_color'] = color.CS_FG_B


tree.explore(
    keep_server=True,
    layouts=[
        TreeLayout(
            name="SUM_TREE",
            ns=ns,
            # ts=ts,
            active=True,
            aligned_faces=True
        )
    ])

Added tree tree-3 with id 0.


### Phylogenetic Tree Visualization (3Di)

In [12]:
from ete4.smartview import NodeStyle, TreeLayout
from ete4 import Tree
from pathlib import Path
import src.colorscheme as color
import pandas as pd

WD = Path("wd")
tree = Tree(open(str(WD / "3di.foldtree.newick")))

for node in tree.traverse():
    node.support = None


def ns(n):
    n.sm_style["hz_line_width"] = 1.5
    n.sm_style["vt_line_width"] = 1.5
    n.sm_style['hz_line_color'] = color.CS_BG_DARK

    if n.is_leaf:
        ids = n.name.split("_")
        cls, mtd = ids[0], ids[1]
        if mtd == "SM":
            n.sm_style['size'] = 3
            n.sm_style['fgcolor'] = color.CS_FG_A
            n.sm_style["hz_line_width"] = 2
            n.sm_style['hz_line_color'] = color.CS_FG_A
        elif mtd == "FM":
            n.sm_style['size'] = 3
            n.sm_style['fgcolor'] = color.CS_FG_B
            n.sm_style["hz_line_width"] = 2
            n.sm_style['hz_line_color'] = color.CS_FG_B


tree.explore(
    keep_server=True,
    layouts=[
        TreeLayout(
            name="SUM_TREE",
            ns=ns,
            # ts=ts,
            active=True,
            aligned_faces=True
        )
    ])

Added tree tree-3 with id 0.
