In [1]:
from os import path
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster import hierarchy
from ete3 import PhyloTree
from scipy.cluster.hierarchy import to_tree
from ete3 import Tree

In [2]:
# Load the centroid table; the first CSV column is used as the row index.
gcf_centroids = pd.read_csv("GCF_centroids.csv", index_col=0)

In [3]:
# An all-zero row named 'zero' with one entry per centroid column.
zero_root = pd.Series(
    0,
    index=gcf_centroids.columns,
    name="zero",
)

In [5]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# Turning the Series into a one-row frame (row label 'zero') and concatenating
# appends the same row and also works on older pandas versions.
_gcf_centroids = pd.concat([gcf_centroids, zero_root.to_frame().T])

In [6]:
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed to `metric`). Euclidean is the default distance and the only one Ward
# linkage accepts, so the argument is omitted; this runs on old and new releases.
# distance_threshold is set so that the fitted model exposes `distances_`,
# which get_linkage_matrix reads.
clusterer = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0,
    linkage="ward",
)

In [7]:
# Fit on the centroid table that includes the appended 'zero' row.
clusterer.fit(_gcf_centroids)

AgglomerativeClustering(distance_threshold=0, n_clusters=None)

In [8]:
def get_linkage_matrix(model, **kwargs):
    """Assemble a linkage matrix from a fitted hierarchical clustering model.

    Parameters
    ----------
    model : object
        Fitted model exposing ``children_`` (one row of two child ids per
        merge), ``distances_`` (one value per merge) and ``labels_`` (one
        entry per input sample).
    **kwargs
        Accepted for call compatibility; not used.

    Returns
    -------
    numpy.ndarray
        Float array with one row per merge: the two child ids, the merge
        distance, and the number of input samples under the merged node.
    """
    merges = model.children_
    leaf_total = len(model.labels_)
    sizes = np.zeros(merges.shape[0])
    for row, children in enumerate(merges):
        # Ids below leaf_total are input samples (size 1); larger ids refer
        # to the earlier merge stored at index ``id - leaf_total``.
        sizes[row] = sum(
            1 if child < leaf_total else sizes[child - leaf_total]
            for child in children
        )
    return np.column_stack([merges, model.distances_, sizes]).astype(float)

In [9]:
# Show the linkage matrix for the fitted clusterer
# (columns: child id, child id, merge distance, sample count).
get_linkage_matrix(clusterer)

array([[ 86.        , 182.        ,   0.        ,   2.        ],
       [281.        , 299.        ,   0.        ,   3.        ],
       [ 67.        , 253.        ,   0.        ,   2.        ],
       ...,
       [437.        , 593.        ,  15.32535848, 231.        ],
       [533.        , 594.        ,  16.28024106, 237.        ],
       [592.        , 595.        ,  21.33559059, 299.        ]])

In [10]:
# Convert the linkage matrix into a SciPy tree; rd=False requests only the
# root node rather than the (root, node list) pair.
scipy_tree = hierarchy.to_tree(
    get_linkage_matrix(clusterer),
    rd=False,
)

In [11]:
def getNewick(node, newick, parentdist, leaf_names):
    """Serialise the subtree rooted at ``node`` as a Newick string.

    The string is assembled back to front: ``newick`` is the text that must
    follow this subtree, and the return value is this subtree's text with
    ``newick`` appended. Call with ``newick=""`` on the root to get a
    complete, ``;``-terminated tree.

    Parameters
    ----------
    node : tree node exposing ``is_leaf()``, ``id``, ``dist``,
        ``get_left()`` and ``get_right()``.
    newick : str
        Text already generated for whatever comes after this subtree.
    parentdist : float
        ``dist`` of the parent node; the branch length written for ``node``
        is ``parentdist - node.dist``, formatted with two decimals.
    leaf_names : sequence
        Leaf labels, indexed by ``node.id``.

    Returns
    -------
    str
        Newick text for this subtree followed by ``newick``.
    """
    branch_length = parentdist - node.dist

    if node.is_leaf():
        return "%s:%.2f%s" % (leaf_names[node.id], branch_length, newick)

    # Closing text for this internal node: the root (empty ``newick``) ends
    # the whole tree, any other node closes its group and adds its length.
    if len(newick) > 0:
        closing = "):%.2f%s" % (branch_length, newick)
    else:
        closing = ");"

    # The left child is generated first and therefore ends up to the right
    # of its sibling in the final string.
    left_part = getNewick(node.get_left(), closing, node.dist, leaf_names)
    both_parts = getNewick(node.get_right(), ",%s" % (left_part), node.dist, leaf_names)
    return "(%s" % (both_parts)

In [12]:
# Serialise the SciPy tree to Newick, labelling leaves with the row index of
# the clustered table, and parse the result with ete3.
ete_tree = PhyloTree(
    getNewick(
        scipy_tree,
        "",
        scipy_tree.dist,
        _gcf_centroids.index.values,
    )
)

In [13]:
# Use the artificial 'zero' leaf (the all-zero row added before clustering)
# as the outgroup of the tree.
ete_tree.set_outgroup('zero')

In [14]:
# Save the outgroup-rooted tree in Newick format.
with open("ete_tree.newick", "w") as newick_file:
    newick_file.write(ete_tree.write())